# SHSpider.py

from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from urllib.parse import urljoin

from lxml.html import fromstring, HtmlElement

from config.load import crawl_sites
from crawler.defaults import fetch_page_by_post, fetch_page_by_get, crawl_request
from crawler.fields import (
    SaveCompanyInformation,
    BulletinBasicFields,
)


class SHSpider:
    """Shanghai public-institution registration site (www.sydjsh.cn) - registration bulletins."""

    def __init__(self):
        self.sign = 'sh'
        self.enable_proxy = None
        self.site = 'http://www.sydjsh.cn/'

    def extract_text_and_save(self, url, yw_type):
        """Fetch one bulletin detail page and persist every table row according to its business type."""
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        element = fromstring(response.text)
        nodes = element.xpath('//table[@id="content"]//tr[position()>1]')
        for node in nodes:
            if yw_type in ['SL', 'BZ']:
                # Establishment bulletins: the table carries the full registration record.
                item = BulletinBasicFields(
                    company="".join(node.xpath('./td[3]/text()')),
                    legal_person="".join(node.xpath('./td[5]/text()')),
                    capital="".join(node.xpath('./td[7]/text()')) + '万元',
                    capital_origin="".join(node.xpath('./td[6]/text()')),
                    purpose_and_business="".join(node.xpath('./td[8]/text()')),
                    address="".join(node.xpath('./td[4]/text()')),
                    social_id="".join(node.xpath('./td[2]/text()')),
                    status='create',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)
            elif yw_type in ['BG', 'JGQTXS', 'JGQTBG', 'JGQTCX', 'JGQTBL', 'JGQTGQ']:
                # Modification bulletins: only the credit code and the name are listed.
                item = BulletinBasicFields(
                    social_id="".join(node.xpath('./td[2]/text()')),
                    company="".join(node.xpath('./td[3]/text()')),
                    status='modify',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)
            elif yw_type == 'ZX':
                # Cancellation bulletins: the credit code and the name sit one column to the right.
                item = BulletinBasicFields(
                    social_id="".join(node.xpath('./td[3]/text()')),
                    company="".join(node.xpath('./td[4]/text()')),
                    status='cancellation',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)

    def generate_snapshot_links(self, url, data):
        """POST the list page and yield the detail-page links it contains."""
        list_links = []
        response = crawl_request(fetch_page_by_post, url, self.enable_proxy, data=data)
        element = fromstring(response.text)
        nodes = element.xpath('//div[@class="center1"]/ul/li')
        for node in nodes:
            href = "".join(node.xpath('./a/@href'))
            if data['yw_type'] == 'JGQTCX':
                # The official site joins the URLs on the agency cancellation bulletin
                # list page incorrectly, so the path segment has to be patched here.
                href = href.replace('jgqtXc', 'jgqtCx')
            elif data['yw_type'] == 'JGQTXS':
                href = href.replace('JgqtCl', 'jgqtCl')
            list_links.append(urljoin(self.site, href))
        yield from list_links

    def crawl_spider(self, task: tuple):
        url, data = task  # (list-page URL, POST payload for that page)
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for link in self.generate_snapshot_links(url, data):
                futures.append(executor.submit(self.extract_text_and_save, link, data['yw_type']))
            wait(futures, return_when=ALL_COMPLETED)

    def task_list(self):
        """Yield one (url, form data) pair per page of every configured list."""
        for spider in crawl_sites.get(self.sign):
            url = "".join(spider.keys())  # each config entry maps a single list URL to its parameters
            data: dict = spider.get(url)
            total_page = int(data.get('pageTotal'))
            for page in range(1, total_page + 1):
                item = {
                    "pageIndex": str(page),
                    "yw_type": data.get('yw_type'),
                    "vl": "item",
                    "type": data.get('type'),
                    "pageSize": data.get('pageSize')
                }
                yield url, item

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.task_list())
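
# Illustration only (not part of the original module): SHSpider.task_list() yields
# (list-page URL, form data) tuples shaped roughly like the one below. The concrete
# path and field values here are assumptions; the real ones come from
# config.load.crawl_sites.
#
#     ('http://www.sydjsh.cn/some_list.do',
#      {'pageIndex': '1', 'yw_type': 'SL', 'vl': 'item', 'type': '1', 'pageSize': '10'})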


class SHNDSpider:
    """Shanghai public-institution registration site (上海事业单位编制网) - annual reports."""

    def __init__(self):
        self.enable_proxy = None
        self.sign = 'sh'
        # District code -> (province, city, district) plus the number of result pages to crawl.
        self.district_mapping = {
            '310000': {'region': ('上海', '上海市', '市属'), 'max_page_number': 622},
            '310106': {'region': ('上海', '上海市', '静安区'), 'max_page_number': 281},
            '310104': {'region': ('上海', '上海市', '徐汇区'), 'max_page_number': 208},
            '310113': {'region': ('上海', '上海市', '宝山区'), 'max_page_number': 361},
            '310109': {'region': ('上海', '上海市', '虹口区'), 'max_page_number': 186},
            '310112': {'region': ('上海', '上海市', '闵行区'), 'max_page_number': 361},
            '310230': {'region': ('上海', '上海市', '崇明区'), 'max_page_number': 317},
            '310105': {'region': ('上海', '上海市', '长宁区'), 'max_page_number': 170},
            '310107': {'region': ('上海', '上海市', '普陀区'), 'max_page_number': 231},
            '310117': {'region': ('上海', '上海市', '松江区'), 'max_page_number': 314},
            '310115': {'region': ('上海', '上海市', '浦东新区'), 'max_page_number': 741},
            '310101': {'region': ('上海', '上海市', '黄浦区'), 'max_page_number': 225},
            '310110': {'region': ('上海', '上海市', '杨浦区'), 'max_page_number': 210},
            '310114': {'region': ('上海', '上海市', '嘉定区'), 'max_page_number': 284},
            '310116': {'region': ('上海', '上海市', '金山区'), 'max_page_number': 265},
            '310226': {'region': ('上海', '上海市', '奉贤区'), 'max_page_number': 265},
            '310118': {'region': ('上海', '上海市', '青浦区'), 'max_page_number': 273}
        }
        self.url = 'http://www.sydjsh.cn/ndbg.do'

    def extract_text_and_save(self, element: HtmlElement, code: str, **request_params):
        province, city, county = self.district_mapping.get(code).get('region')
        nodes = element.xpath('//*[@class="cursor"]')
        for node in nodes:
            social_id = "".join(node.xpath('./td[1]/text()'))
            company = "".join(node.xpath('./td[2]/text()'))
            if len(social_id) == 0 and len(company) == 0:
                # Skip placeholder rows that carry neither a credit code nor a name.
                continue
            item = BulletinBasicFields(
                social_id=social_id,
                company=company,
                district_code=code,
                province=province,
                city=city,
                county=county,
                url=request_params.get('url'),
                request_data=request_params.get('request_data'),
                page=request_params.get('page')
            )
            SaveCompanyInformation(item, self.sign)

    def generate_request_tasks(self):
        """Yield one POST payload per (district, page) combination."""
        for geo_code, data in self.district_mapping.items():
            max_page_number = data.get('max_page_number')
            for page in range(1, max_page_number + 1):
                yield {
                    "pageIndex": str(page),
                    "keyword": "",
                    "type": "4",
                    "year": "",
                    "geo_code": geo_code
                }

    def crawl_spider(self, data: dict):
        geo_code = data.get('geo_code')
        page = data.get('pageIndex')
        response = crawl_request(fetch_page_by_post, self.url, self.enable_proxy, data=data)
        element = fromstring(response.text)
        self.extract_text_and_save(element, geo_code, page=page, url=self.url, request_data=data)

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
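

# Minimal usage sketch (not in the original file; the proxy and worker settings below
# are assumptions): run the bulletin spider first, then the annual-report spider.
if __name__ == '__main__':
    SHSpider().run(enable_proxy=False, max_workers=4)
    SHNDSpider().run(enable_proxy=False, max_workers=4)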