scraper.py

from lxml import etree
from urlparse import urljoin
import io
import json
import os
import re
import requests
import sys

file_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "site/list")
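# Illustrative note (assumed layout, not stated in the source): if this file lives
# at <repo>/scraper/scraper.py, file_path resolves to <repo>/site/list.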
class Hackerspaces(object):

    LOCATION_KEY = "coordinate"
    LOGO_KEY = "logo"
    BASE_URL = "http://hackerspaces.org"

    @staticmethod
    def absolute_url(path):
        return urljoin(Hackerspaces.BASE_URL, path)

    @staticmethod
    def wiki_page(title):
        return urljoin(Hackerspaces.absolute_url('wiki/'), title)

    @staticmethod
    def edit_page(title):
        url = Hackerspaces.absolute_url("w/index.php?title={0}&action=edit".format(title))
        return url
    @staticmethod
    def url_for_file(filename):
        """Return the direct URL of an uploaded file, resolved from its File: wiki page.
        """
        url = Hackerspaces.wiki_page('File') + ":{0}".format(filename)
        tree = Hackerspaces.get_etree(url)
        url = tree.xpath('//a[@class="internal"]/@href')
        if len(url) == 1:
            return url[0]
        # falls through to an implicit None when no direct link is found
    # regex matching any run of characters that is not a digit, comma or dot
    NON_NUMERIC = re.compile(r'[^,\d.]+')
    # URL of the Space API directory, used to look up a space's API endpoint
    SPACE_API = "http://spaceapi.net/directory.json"
    def __init__(self, country):
        """Scraper for the hackerspaces listed on the wiki page of `country`."""
        super(Hackerspaces, self).__init__()
        self.country = country
        # the Space API directory is fetched lazily and cached here
        # (see get_space_api_url)
        self._space_api_directory = None
    @staticmethod
    def get_etree(url, browser_dump=False):
        """Return an lxml etree parsed from the page at `url`, fetched with requests.
        """
        print url
        resp = requests.get(url)
        if browser_dump:
            # debugging aid: dump the raw response and open it in a browser
            with open('dump', 'w') as f:
                f.write(resp.content)
            import webbrowser
            webbrowser.open('dump')
            sys.exit(0)
        # the wiki serves HTML, so use lxml's HTML parser instead of the strict XML default
        tree = etree.parse(io.StringIO(resp.text), etree.HTMLParser())
        return tree
    def get_hackerspaces(self):
        """Return a dict mapping each hackerspace listed on the country page to its
        scraped attributes.
        """
        tree = Hackerspaces.get_etree(Hackerspaces.wiki_page(self.country))
        hs_names = tree.xpath('//*[@id="mw-content-text"]/table[1]/tr[2]/td[1]/ul[1]/li//text()')
        #links = tree.xpath('//*[@id="mw-content-text"]/table[1]/tr[2]/td[1]/ul[1]/li/a/@href')
        hackerspaces = {}
        for name in hs_names:
            print name
            hackerspaces[name] = self.get_hackerspace(name)
            # Space API endpoint, if the space is listed in the directory
            space_url = self.get_space_api_url(name)
            if space_url:
                hackerspaces[name]['space_url'] = space_url
        return hackerspaces
    def get_space_api_url(self, name):
        """Look up the Space API endpoint of `name` in the directory, which is
        downloaded once and then cached on the instance.
        """
        if self._space_api_directory is None:
            req = requests.get(Hackerspaces.SPACE_API)
            self._space_api_directory = json.loads(req.text)
        return self._space_api_directory.get(name)
    def get_hackerspace(self, name):
        """Scrape the wiki template of a single hackerspace into a dict."""
        url = Hackerspaces.edit_page(name)
        tree = self.get_etree(url)
        text_box = tree.xpath('//*[@id="wpTextbox1"]//text()')
        text = "".join(text_box)
        # select the first string between {{ and }} using a simple descent parser
        # that tracks the brace nesting depth
        count = 0
        result = ""
        for char in text:
            if char == '{':
                count += 1
            elif char == '}':
                count -= 1
                if count == 0 and result:
                    break
            elif count == 2:
                result += char
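        # Illustrative trace (hypothetical wikitext, not taken from the site): for
        # 'intro {{Hackerspace |name=Foo |country=CH}} outro' the loop above leaves
        # result == 'Hackerspace |name=Foo |country=CH'.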
        # split on | into key=value pairs, then on the first = into key and value
        ret = {}
        for kv in result.split("|"):
            try:
                # split only on the first '=' so that values containing '=' (e.g. URLs)
                # are kept intact
                key, value = [el.replace('\n', ' ').strip() for el in kv.split("=", 1)]
                if key == Hackerspaces.LOCATION_KEY:
                    value = Hackerspaces.clean_location(value)
                elif key == Hackerspaces.LOGO_KEY:
                    if value:
                        value = Hackerspaces.absolute_url(Hackerspaces.url_for_file(value))
                if value:
                    ret[key] = value
            except ValueError:
                # entries without '=' (such as the template name itself) are skipped
                pass
        return ret
    @staticmethod
    def clean_location(location_string):
        """Strip every character that is not a digit, comma or dot, then split on
        commas and keep the first two elements. This cleans up malformed
        coordinate strings.
        """
        return Hackerspaces.NON_NUMERIC.sub('', location_string).split(",")[:2]
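    # Illustrative example (hypothetical input): clean_location("47.3769, 8.5417 (Zurich)")
    # strips the label and whitespace, leaving "47.3769,8.5417", and returns
    # ['47.3769', '8.5417'].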
if __name__ == '__main__':
    # a single optional argument selects the country page to scrape
    if len(sys.argv) >= 2:
        hackerspaces = Hackerspaces(sys.argv[1])
    else:
        hackerspaces = Hackerspaces('switzerland')
    with open(file_path, 'w') as f:
        hs = hackerspaces.get_hackerspaces()
        print(hs)
        json.dump(hs, f)
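# Usage sketch (assumes Python 2 with the `requests` and `lxml` packages installed):
#   python scraper.py germany
# With no argument the script defaults to 'switzerland'. The scraped data is printed
# and written as JSON to the path in `file_path`.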