HuSpider.py

import re
from concurrent.futures import ThreadPoolExecutor

from crawler.defaults import fetch_page_by_get, crawl_request, crawl_params
from crawler.fields import SaveCompanyInformation, BulletinBasicFields


class HuSpider:
    """湖南省机构编制网 (Hunan provincial organization and staffing website)"""

    def __init__(self):
        self.sign = 'hunan'
        self.enable_proxy = None
        self.nd_max_page_number = 10  # max pages of annual-report disclosures for public institutions
        self.yw_max_page_number = 20  # max pages of registration disclosures for public institutions
        self.years = ['2020', '2019', '2018', '2017', '2016', '2015', '2014']

    def extract_detail_page(self, json_data: dict, **request_params):
        """Map each listing row onto the bulletin fields and persist it."""
        rows = json_data.get('rows', [])
        for row in rows:
            unify_code = row.get('unify_code', '')
            item = {
                'company': row.get('sydwmc', ''),
                'legal_person': row.get('fddbr', ''),
                'capital': row.get('kbzj', ''),
                'capital_origin': row.get('jfly', ''),
                'purpose_and_business': row.get('zzhywfw', ''),
                'address': row.get('address', ''),
                'social_id': unify_code,
                # characters 3-8 of the unified social credit code carry the administrative district code
                'district_code': unify_code[2:8] if unify_code else '',
                'province': '湖南省',
                'url': request_params.get('url', ''),
                'page': request_params.get('page', '')
            }
            SaveCompanyInformation(BulletinBasicFields(**item), self.sign)

    def crawl_spider(self, task: tuple):
        url, page = task
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        match = re.search('builtTable(.*?)$', response.text)
        if not match:
            return
        # the payload that follows "builtTable" is a dict literal; eval() turns it into a dict
        result = eval(match.group(1))
        self.extract_detail_page(result, url=url, page=page)

    def generate_request_tasks(self):
        """Build (url, page) tasks for the annual-report and registration listings."""
        results = []
        nd_url = str(crawl_params(self.sign)[0])
        yw_url = str(crawl_params(self.sign)[1])
        for page in range(1, self.nd_max_page_number + 1):
            for year in self.years:
                url = nd_url.format(page, year)
                results.append((url, page))
        for page in range(1, self.yw_max_page_number + 1):
            url = yw_url.format(page)
            results.append((url, page))
        yield from results

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
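

# Usage sketch (an assumption, not part of the original module): instantiate the
# spider and run it with a small thread pool. The arguments mirror run()'s signature
# above; whether a proxy is needed depends on the crawler.defaults configuration.
if __name__ == '__main__':
    spider = HuSpider()
    # max_workers > 1 fans page fetches out across threads via ThreadPoolExecutor
    spider.run(enable_proxy=False, max_workers=4)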