scraper.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. #!/usr/bin/env python2
  2. from lxml import etree
  3. from urlparse import urljoin
  4. import io
  5. import json
  6. import os
  7. import re
  8. import requests
  9. import sys
  10. file_path = os.path.join(os.path.dirname(os.path.dirname((os.path.abspath(__file__)))), "site/list")
  11. class Hackerspaces(object):
  12. LOCATION_KEY = "coordinate"
  13. LOGO_KEY = "logo"
  14. BASE_URL = "http://hackerspaces.org"
  15. @staticmethod
  16. def absolute_url(path):
  17. return urljoin(Hackerspaces.BASE_URL, path)
  18. @staticmethod
  19. def wiki_page(title):
  20. return urljoin(Hackerspaces.absolute_url('wiki/'), title)
  21. @staticmethod
  22. def country_list(country, offset=0):
  23. return "http://hackerspaces.org/w/api.php?action=ask&query=[[country::{0}]]\n[[Category:Hackerspace]]&format=json&offset={1!s}".format(country, offset)
  24. @staticmethod
  25. def edit_page(title):
  26. url = Hackerspaces.absolute_url(u"w/index.php?title={0}&action=edit".format(title))
  27. return url
  28. @staticmethod
  29. def url_for_file(filename):
  30. """Format the url
  31. """
  32. url = Hackerspaces.wiki_page('File') + u":{0}".format(filename)
  33. tree = Hackerspaces.get_etree(url)
  34. url = tree.xpath('//a[@class="internal"]/@href')
  35. if len(url) == 1:
  36. return url[0]
  37. #regex matching all the non numeric character including
  38. NON_NUMERIC = re.compile('[^,\d.]+')
  39. #url used to fetch information about the space api
  40. SPACE_API = "http://spaceapi.net/directory.json"
  41. def __init__(self, country):
  42. """docstring for """
  43. super(Hackerspaces, self).__init__()
  44. self.country = country
  45. #load the space api directory
  46. @staticmethod
  47. def get_json(url):
  48. print(url)
  49. resp = requests.get(url)
  50. return resp.json()
  51. @staticmethod
  52. def get_etree(url, browser_dump=False):
  53. """Return a etree from an url using request
  54. """
  55. resp = requests.get(url)
  56. if browser_dump:
  57. with open('dump', 'w') as f:
  58. f.write(resp.content)
  59. import webbrowser
  60. webbrowser.open('dump')
  61. exit(0)
  62. tree = etree.parse(io.StringIO(resp.text))
  63. return tree
  64. def get_hackerspaces(self):
  65. offset = 0
  66. hackerspaces = {}
  67. while 1:
  68. hackerspaces_page = Hackerspaces.get_json(Hackerspaces.country_list(self.country, offset=offset))
  69. print hackerspaces_page
  70. hackerspaces.update(hackerspaces_page['query']['results'])
  71. if 'query-continue-offset' in hackerspaces_page.keys():
  72. offset = int(hackerspaces_page['query-continue-offset'])
  73. else:
  74. print "END"
  75. break
  76. hs_names = hackerspaces.keys()
  77. hackerspaces = {}
  78. for name in hs_names:
  79. print name
  80. hackerspaces[name] = self.get_hackerspace(name)
  81. #space api url
  82. space_url = self.get_space_api_url(name)
  83. if space_url:
  84. hackerspaces[name]['space_url'] = space_url
  85. return hackerspaces
  86. def get_space_api_url(self, name):
  87. req = requests.get(Hackerspaces.SPACE_API)
  88. j = json.loads(req.text)
  89. return j.get(name)
  90. def get_hackerspace(self, name):
  91. url = Hackerspaces.edit_page(name)
  92. tree = self.get_etree(url)
  93. text_box = tree.xpath('//*[@id="wpTextbox1"]//text()')
  94. text = "".join(text_box)
  95. #select the first string between {{ and }}
  96. #this use a simple descent parser
  97. count = 0
  98. result = ""
  99. for char in text:
  100. if char == '{':
  101. count += 1
  102. elif char == '}':
  103. count -= 1
  104. if count == 0 and result:
  105. break
  106. elif count == 2:
  107. result += char
  108. #split over | for key value pair
  109. #and split over = for key value
  110. ret = {}
  111. for kv in result.split("|"):
  112. try:
  113. key, value = [el.replace('\n', ' ').strip() for el in kv.split("=")]
  114. if key == Hackerspaces.LOCATION_KEY:
  115. value = Hackerspaces.clean_location(value)
  116. elif key == Hackerspaces.LOGO_KEY:
  117. if value:
  118. value = Hackerspaces.absolute_url(Hackerspaces.url_for_file(value))
  119. if value:
  120. ret[key] = value
  121. except ValueError:
  122. pass
  123. return ret
  124. @staticmethod
  125. def clean_location(location_string):
  126. """This method will remove all non numeric or commas or dot from a string, then
  127. split it on the comma and select only the 2 first element. This has the goal to
  128. clean any string that is malformed
  129. """
  130. return hackerspaces.NON_NUMERIC.sub('', location_string).split(",")[:2]
  131. if __name__ == '__main__':
  132. if len(sys.argv) == 3:
  133. hackerspaces = Hackerspaces(sys.argv[1])
  134. else:
  135. hackerspaces = Hackerspaces('Switzerland')
  136. with open(file_path, 'w') as f:
  137. hs = hackerspaces.get_hackerspaces()
  138. print(hs)
  139. json.dump(hs, f)