import re
from concurrent.futures import ThreadPoolExecutor

from crawler.defaults import fetch_page_by_get, crawl_request, crawl_params
from crawler.fields import SaveCompanyInformation, BulletinBasicFields


class HuSpider:
    """湖南省机构编制网 (Hunan Province staffing and organization website) spider."""

    def __init__(self):
        self.sign = 'hunan'
        self.enable_proxy = None
        self.nd_max_page_number = 10  # Max pages of annual-report disclosures for public institutions
        self.yw_max_page_number = 20  # Max pages of registration disclosures for public institutions
        self.years = ['2020', '2019', '2018', '2017', '2016', '2015', '2014']

    def extract_detail_page(self, json_data: dict, **request_params):
        """Map each row of the listing JSON onto the bulletin fields and save it."""
        rows = json_data.get('rows', [])
        for row in rows:
            unify_code = row.get('unify_code', '')
            item = {
                'company': row.get('sydwmc', ''),
                'legal_person': row.get('fddbr', ''),
                'capital': row.get('kbzj', ''),
                'capital_origin': row.get('jfly', ''),
                'purpose_and_business': row.get('zzhywfw', ''),
                'address': row.get('address', ''),
                'social_id': unify_code,
                # The 6-digit administrative district code sits at positions 2-8
                # of the unified social credit code.
                'district_code': unify_code[2:8] if unify_code else '',
                'province': '湖南省',
                'url': request_params.get('url', ''),
                'page': request_params.get('page', '')
            }
            SaveCompanyInformation(BulletinBasicFields(**item), self.sign)

    def crawl_spider(self, task: tuple):
        url, page = task
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        # The page embeds its data in a JavaScript call after the 'builtTable'
        # marker; capture everything from that marker to the end of the line
        # and evaluate it as a Python literal (json.loads would be safer if
        # the payload is strict JSON).
        json_str = re.search('builtTable(.*?)$', response.text).group(1)
        result = eval(json_str)
        self.extract_detail_page(result, url=url, page=page)

    def generate_request_tasks(self):
        """Yield (url, page) tasks for both listing types."""
        results = []
        # crawl_params(sign) provides the two URL templates for this site:
        # index 0 is the annual-report listing (formatted with page and year),
        # index 1 is the registration listing (formatted with page only).
        nd_url = str(crawl_params(self.sign)[0])
        yw_url = str(crawl_params(self.sign)[1])
        for page in range(1, self.nd_max_page_number + 1):
            for year in self.years:
                url = nd_url.format(page, year)
                results.append((url, page))
        for page in range(1, self.yw_max_page_number + 1):
            url = yw_url.format(page)
            results.append((url, page))
        yield from results

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
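
# A minimal usage sketch (not part of the original listing): assuming the
# crawler package's crawl_params('hunan') entries are configured with the two
# URL templates the spider expects, the crawl can be started like this.
if __name__ == '__main__':
    spider = HuSpider()
    # Crawl without a proxy, using four worker threads.
    spider.run(enable_proxy=False, max_workers=4)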