from concurrent.futures import ThreadPoolExecutor

from lxml.html import fromstring, HtmlElement

from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params
from crawler.fields import BulletinBasicFields, SaveCompanyInformation


class BJSpider:

    def __init__(self):
        self.sign = 'bj'
        self.enable_proxy = None
        # Maps each Beijing district code to its region names and the
        # (year, number of result pages) pairs to crawl.
        self.district_mapping = {
            '110101': {
                'region': ('北京', '北京市', '东城区'),
                'years': [('2020', 25), ('2019', 22), ('2018', 25), ('2017', 26), ('2016', 26), ('2015', 26)]
            },
            '110102': {
                'region': ('北京', '北京市', '西城区'),
                'years': [('2020', 22), ('2019', 19), ('2018', 20), ('2017', 22), ('2016', 1), ('2015', 1)]
            },
            '110105': {
                'region': ('北京', '北京市', '朝阳区'),
                'years': [('2020', 36), ('2019', 37), ('2018', 37), ('2017', 37), ('2016', 37), ('2015', 37)]
            },
            '110108': {
                'region': ('北京', '北京市', '海淀区'),
                'years': [('2020', 42), ('2019', 42), ('2018', 42), ('2017', 39), ('2016', 39), ('2015', 1)]
            },
            '110106': {
                'region': ('北京', '北京市', '丰台区'),
                'years': [('2020', 21), ('2019', 24), ('2018', 23), ('2017', 28), ('2016', 28), ('2015', 1)]
            },
            '110107': {
                'region': ('北京', '北京市', '石景山区'),
                'years': [('2020', 15), ('2019', 15), ('2018', 15), ('2017', 15), ('2016', 15), ('2015', 14)]
            },
            '110109': {
                'region': ('北京', '北京市', '门头沟区'),
                'years': [('2020', 1), ('2019', 15), ('2018', 15), ('2017', 14), ('2016', 1), ('2015', 1)]
            },
            '110111': {
                'region': ('北京', '北京市', '房山区'),
                'years': [('2020', 26), ('2019', 35), ('2018', 35), ('2017', 35), ('2016', 34), ('2015', 1)]
            },
            '110112': {
                'region': ('北京', '北京市', '通州区'),
                'years': [('2020', 19), ('2019', 24), ('2018', 24), ('2017', 24), ('2016', 24), ('2015', 1)]
            },
            '110110': {
                'region': ('北京', '北京市', '顺义区'),
                'years': [('2020', 1), ('2019', 1), ('2018', 1), ('2017', 30), ('2016', 1), ('2015', 1)]
            },
            '110221': {
                'region': ('北京', '北京市', '昌平区'),
                'years': [('2020', 28), ('2019', 35), ('2018', 35), ('2017', 35), ('2016', 35), ('2015', 34)]
            },
            '110224': {
                'region': ('北京', '北京市', '大兴区'),
                'years': [('2020', 29), ('2019', 36), ('2018', 35), ('2017', 34), ('2016', 34), ('2015', 1)]
            },
            '110227': {
                'region': ('北京', '北京市', '怀柔区'),
                'years': [('2020', 13), ('2019', 14), ('2018', 14), ('2017', 14), ('2016', 13), ('2015', 1)]
            },
            '110226': {
                'region': ('北京', '北京市', '平谷区'),
                'years': [('2020', 12), ('2019', 12), ('2018', 12), ('2017', 12), ('2016', 1), ('2015', 1)]
            },
            '110228': {
                'region': ('北京', '北京市', '密云区'),
                'years': [('2020', 15), ('2019', 15), ('2018', 15), ('2017', 15), ('2016', 14), ('2015', 14)]
            },
            '110229': {
                'region': ('北京', '北京市', '延庆区'),
                'years': [('2020', 11), ('2019', 13), ('2018', 13), ('2017', 13), ('2016', 13), ('2015', 1)]
            }
        }
    def extract_text_and_save(
            self,
            element: HtmlElement,
            region: tuple,
            code: str,
            **request_params
    ):
        """
        Extract the company rows from the page and save each record.
        @param element: parsed HTML element of the result page
        @param region: (province, city, county) tuple
        @param code: administrative district code
        @param request_params: extra context such as the source url and page number
        """
        nodes = element.xpath('//*[@class="zong1424"]/table//tr[last()]/td/table//tr[position()>1]')
        province, city, county = region
        for node in nodes:
            # Collapse any whitespace inside the cell text before storing it.
            social_id = "".join("".join(node.xpath('./td[2]/a/text()')).split())
            company = "".join("".join(node.xpath('./td[3]/a/text()')).split())
            if not social_id and not company:
                continue
            item = BulletinBasicFields(
                social_id=social_id,
                company=company,
                district_code=code,
                province=province,
                city=city,
                county=county,
                page=request_params.get('page', ''),
                url=request_params.get('url', ''),
            )
            SaveCompanyInformation(item, self.sign)
    def generate_request_tasks(self):
        # Yield one (link, region, district_code, page) task for every
        # district / year / page combination in district_mapping.
        url = crawl_params('general')
        for district_code, data in self.district_mapping.items():
            region = data.get('region')
            years = data.get('years')
            for year, max_page_num in years:
                for page in range(1, max_page_num + 1):
                    link = url.format(
                        page=page,
                        district_code=district_code,
                        year=year,
                        select_page=page
                    )
                    yield link, region, district_code, page
    def crawl_spider(self, task: tuple):
        # Fetch one result page, parse it, and persist the extracted rows.
        url, region, district_code, page = task
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        element = fromstring(response.text)
        self.extract_text_and_save(element, region, district_code, url=url, page=page)
    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
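
# Example entry point (a minimal sketch): the proxy flag and worker count
# below are illustrative values, not part of the original module.
if __name__ == '__main__':
    spider = BJSpider()
    # max_workers sets the size of the ThreadPoolExecutor used by run();
    # enable_proxy=False keeps the default direct-connection behaviour.
    spider.run(enable_proxy=False, max_workers=4)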