BJSpider.py

from concurrent.futures import ThreadPoolExecutor
from lxml.html import fromstring, HtmlElement
from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params
from crawler.fields import BulletinBasicFields, SaveCompanyInformation


class BJSpider:
    def __init__(self):
        self.sign = 'bj'
        self.enable_proxy = None
        # Maps each Beijing district code to its region tuple and a list of
        # (year, number of result pages) pairs used to build request URLs.
        self.district_mapping = {
            '110101': {
                'region': ('北京', '北京市', '东城区'),
                'years': [('2020', 25), ('2019', 22), ('2018', 25), ('2017', 26), ('2016', 26), ('2015', 26)]
            },
            '110102': {
                'region': ('北京', '北京市', '西城区'),
                'years': [('2020', 22), ('2019', 19), ('2018', 20), ('2017', 22), ('2016', 1), ('2015', 1)]
            },
            '110105': {
                'region': ('北京', '北京市', '朝阳区'),
                'years': [('2020', 36), ('2019', 37), ('2018', 37), ('2017', 37), ('2016', 37), ('2015', 37)]
            },
            '110108': {
                'region': ('北京', '北京市', '海淀区'),
                'years': [('2020', 42), ('2019', 42), ('2018', 42), ('2017', 39), ('2016', 39), ('2015', 1)]
            },
            '110106': {
                'region': ('北京', '北京市', '丰台区'),
                'years': [('2020', 21), ('2019', 24), ('2018', 23), ('2017', 28), ('2016', 28), ('2015', 1)]
            },
            '110107': {
                'region': ('北京', '北京市', '石景山区'),
                'years': [('2020', 15), ('2019', 15), ('2018', 15), ('2017', 15), ('2016', 15), ('2015', 14)]
            },
            '110109': {
                'region': ('北京', '北京市', '门头沟区'),
                'years': [('2020', 1), ('2019', 15), ('2018', 15), ('2017', 14), ('2016', 1), ('2015', 1)]
            },
            '110111': {
                'region': ('北京', '北京市', '房山区'),
                'years': [('2020', 26), ('2019', 35), ('2018', 35), ('2017', 35), ('2016', 34), ('2015', 1)]
            },
            '110112': {
                'region': ('北京', '北京市', '通州区'),
                'years': [('2020', 19), ('2019', 24), ('2018', 24), ('2017', 24), ('2016', 24), ('2015', 1)]
            },
            '110110': {
                'region': ('北京', '北京市', '顺义区'),
                'years': [('2020', 1), ('2019', 1), ('2018', 1), ('2017', 30), ('2016', 1), ('2015', 1)]
            },
            '110221': {
                'region': ('北京', '北京市', '昌平区'),
                'years': [('2020', 28), ('2019', 35), ('2018', 35), ('2017', 35), ('2016', 35), ('2015', 34)]
            },
            '110224': {
                'region': ('北京', '北京市', '大兴区'),
                'years': [('2020', 29), ('2019', 36), ('2018', 35), ('2017', 34), ('2016', 34), ('2015', 1)]
            },
            '110227': {
                'region': ('北京', '北京市', '怀柔区'),
                'years': [('2020', 13), ('2019', 14), ('2018', 14), ('2017', 14), ('2016', 13), ('2015', 1)]
            },
            '110226': {
                'region': ('北京', '北京市', '平谷区'),
                'years': [('2020', 12), ('2019', 12), ('2018', 12), ('2017', 12), ('2016', 1), ('2015', 1)]
            },
            '110228': {
                'region': ('北京', '北京市', '密云区'),
                'years': [('2020', 15), ('2019', 15), ('2018', 15), ('2017', 15), ('2016', 14), ('2015', 14)]
            },
            '110229': {
                'region': ('北京', '北京市', '延庆区'),
                'years': [('2020', 11), ('2019', 13), ('2018', 13), ('2017', 13), ('2016', 13), ('2015', 1)]
            }
        }

    def extract_text_and_save(
        self,
        element: HtmlElement,
        region: tuple,
        code: str,
        **request_params
    ):
        """
        Extract company rows from the result table and save each one.

        @param element: parsed HTML element of the result page
        @param region: (province, city, county) tuple
        @param code: administrative district code
        """
        # Data rows of the result table, skipping the header row.
        nodes = element.xpath('//*[@class="zong1424"]/table//tr[last()]/td/table//tr[position()>1]')
        province, city, county = region
        for node in nodes:
            # Join the cell text and strip all whitespace.
            social_id = "".join("".join(node.xpath('./td[2]/a/text()')).split())
            company = "".join("".join(node.xpath('./td[3]/a/text()')).split())
            if len(social_id) == 0 and len(company) == 0:
                continue
            item = BulletinBasicFields(
                social_id=social_id,
                company=company,
                district_code=code,
                province=province,
                city=city,
                county=county,
                page=request_params.get('page', ''),
                url=request_params.get('url', ''),
            )
            SaveCompanyInformation(item, self.sign)

    def generate_request_tasks(self):
        """Yield one (url, region, district_code, page) task per result page."""
        results = []
        url = crawl_params('general')
        for district_code, data in self.district_mapping.items():
            region = data.get('region')
            years = data.get('years')
            for year, max_page_num in years:
                for page in range(1, max_page_num + 1):
                    link = url.format(
                        page=page,
                        district_code=district_code,
                        year=year,
                        select_page=page
                    )
                    results.append((link, region, district_code, page))
        yield from results

    def crawl_spider(self, task: tuple):
        """Fetch a single result page and hand the parsed HTML to the extractor."""
        url, region, district_code, page = task
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        element = fromstring(response.text)
        self.extract_text_and_save(element, region, district_code, url=url, page=page)

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
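

# A minimal usage sketch, assuming the crawler.defaults and crawler.fields
# helpers are importable in the running environment; the proxy flag and
# worker count below are illustrative values, not taken from the source.
if __name__ == '__main__':
    spider = BJSpider()
    spider.run(enable_proxy=False, max_workers=4)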