TJSpider.py

from concurrent.futures import ThreadPoolExecutor

from lxml.html import fromstring, HtmlElement

from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params
from crawler.fields import BulletinBasicFields, SaveCompanyInformation


class TJSpider:
    def __init__(self):
        self.sign = 'tj'
        self.enable_proxy = None
        # Maps each Tianjin district code to its (province, city, county) names and,
        # per year, the number of result pages to crawl.
        self.district_mapping = {
            '120101': {
                'region': ('天津', '天津市', '和平区'),
                'years': [('2020', 8), ('2019', 7), ('2018', 6), ('2017', 6), ('2016', 6), ('2015', 6)]
            },
            '120102': {
                'region': ('天津', '天津市', '河东区'),
                'years': [('2020', 11), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 6)]
            },
            '120103': {
                'region': ('天津', '天津市', '河西区'),
                'years': [('2020', 13), ('2019', 12), ('2018', 9), ('2017', 9), ('2016', 9), ('2015', 9)]
            },
            '120104': {
                'region': ('天津', '天津市', '南开区'),
                'years': [('2020', 12), ('2019', 9), ('2018', 9), ('2017', 9), ('2016', 9), ('2015', 9)]
            },
            '120105': {
                'region': ('天津', '天津市', '河北区'),
                'years': [('2020', 9), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 7)]
            },
            '120106': {
                'region': ('天津', '天津市', '红桥区'),
                'years': [('2020', 8), ('2019', 6), ('2018', 6), ('2017', 5), ('2016', 5), ('2015', 5)]
            },
            '120110': {
                'region': ('天津', '天津市', '东丽区'),
                'years': [('2020', 11), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 7)]
            },
            '120111': {
                'region': ('天津', '天津市', '西青区'),
                'years': [('2020', 11), ('2019', 10), ('2018', 8), ('2017', 7), ('2016', 6), ('2015', 6)]
            },
            '120112': {
                'region': ('天津', '天津市', '津南区'),
                'years': [('2020', 11), ('2019', 8), ('2018', 7), ('2017', 7), ('2016', 6), ('2015', 6)]
            },
            '120113': {
                'region': ('天津', '天津市', '北辰区'),
                'years': [('2020', 11), ('2019', 9), ('2018', 8), ('2017', 8), ('2016', 8), ('2015', 8)]
            },
            '120116': {
                'region': ('天津', '天津市', '滨海新区'),
                'years': [('2020', 28), ('2019', 27), ('2018', 23), ('2017', 22), ('2016', 21), ('2015', 21)]
            },
            '120221': {
                'region': ('天津', '天津市', '宁河区'),
                'years': [('2020', 15), ('2019', 13), ('2018', 11), ('2017', 10), ('2016', 9), ('2015', 9)]
            },
            '120222': {
                'region': ('天津', '天津市', '武清区'),
                'years': [('2020', 13), ('2019', 12), ('2018', 12), ('2017', 11), ('2016', 11), ('2015', 11)]
            },
            '120223': {
                'region': ('天津', '天津市', '静海区'),
                'years': [('2020', 17), ('2019', 16), ('2018', 16), ('2017', 16), ('2016', 14), ('2015', 13)]
            },
            '120224': {
                'region': ('天津', '天津市', '宝坻区'),
                'years': [('2020', 17), ('2019', 16), ('2018', 15), ('2017', 15), ('2016', 8), ('2015', 7)]
            },
            # '': {
            #     'region': ('天津', '天津市', '蓟州区'),
            #     'years': [('2020',), ('2019',), ('2018',), ('2017',), ('2016',), ('2015',)]
            # },
        }

    def extract_text_and_save(self, element: HtmlElement, region: tuple, code: str, **request_params):
        # Parse the result table on a listing page and persist one record per company row.
        nodes = element.xpath('//*[@class="zong1424"]/table//tr[last()]/td/table//tr[position()>1]')
        province, city, county = region
        for node in nodes:
            social_id = "".join("".join(node.xpath('./td[2]/a/text()')).split())
            company = "".join("".join(node.xpath('./td[3]/a/text()')).split())
            if not social_id and not company:
                continue
            item = BulletinBasicFields(
                social_id=social_id,
                company=company,
                district_code=code,
                province=province,
                city=city,
                county=county,
                url=request_params.get('url', ''),
                page=request_params.get('page', '')
            )
            SaveCompanyInformation(item, self.sign)

    def crawl_spider(self, task: tuple):
        # Fetch one listing page and hand the parsed HTML to the extractor.
        url, region, district_code, page = task
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        element = fromstring(response.text)
        self.extract_text_and_save(element, region, district_code, url=url, page=page)

    def generate_request_tasks(self):
        # Build one (url, region, district_code, page) task for every district, year and page.
        results = []
        url = crawl_params('general')
        for district_code, data in self.district_mapping.items():
            region = data.get('region')
            years = data.get('years')
            for year, max_page_num in years:
                for page in range(1, max_page_num + 1):
                    link = url.format(
                        page=page,
                        district_code=district_code,
                        year=year,
                        select_page=page
                    )
                    results.append((link, region, district_code, page))
        yield from results

    def run(self, enable_proxy=None, max_workers: int = 1):
        # Dispatch all generated tasks across a thread pool.
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
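

# Usage sketch (illustrative; the proxy flag and worker count below are example
# values, not project defaults): instantiate the spider and let run() fan the
# generated page-crawling tasks out over a thread pool.
if __name__ == '__main__':
    spider = TJSpider()
    spider.run(enable_proxy=False, max_workers=4)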