TJSpider.py

from concurrent.futures import ThreadPoolExecutor

from lxml.html import fromstring, HtmlElement

from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params
from crawler.fields import BulletinBasicFields, SaveCompanyInformation


class TJSpider:
    def __init__(self):
        self.sign = 'tj'
        self.enable_proxy = None
        # Maps each Tianjin district code to its (province, city, county) names and,
        # per year, the number of result pages to crawl.
        self.district_mapping = {
            '120101': {
                'region': ('天津', '天津市', '和平区'),
                'years': [('2020', 8), ('2019', 7), ('2018', 6), ('2017', 6), ('2016', 6), ('2015', 6)]
            },
            '120102': {
                'region': ('天津', '天津市', '河东区'),
                'years': [('2020', 11), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 6)]
            },
            '120103': {
                'region': ('天津', '天津市', '河西区'),
                'years': [('2020', 13), ('2019', 12), ('2018', 9), ('2017', 9), ('2016', 9), ('2015', 9)]
            },
            '120104': {
                'region': ('天津', '天津市', '南开区'),
                'years': [('2020', 12), ('2019', 9), ('2018', 9), ('2017', 9), ('2016', 9), ('2015', 9)]
            },
            '120105': {
                'region': ('天津', '天津市', '河北区'),
                'years': [('2020', 9), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 7)]
            },
            '120106': {
                'region': ('天津', '天津市', '红桥区'),
                'years': [('2020', 8), ('2019', 6), ('2018', 6), ('2017', 5), ('2016', 5), ('2015', 5)]
            },
            '120110': {
                'region': ('天津', '天津市', '东丽区'),
                'years': [('2020', 11), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 7)]
            },
            '120111': {
                'region': ('天津', '天津市', '西青区'),
                'years': [('2020', 11), ('2019', 10), ('2018', 8), ('2017', 7), ('2016', 6), ('2015', 6)]
            },
            '120112': {
                'region': ('天津', '天津市', '津南区'),
                'years': [('2020', 11), ('2019', 8), ('2018', 7), ('2017', 7), ('2016', 6), ('2015', 6)]
            },
            '120113': {
                'region': ('天津', '天津市', '北辰区'),
                'years': [('2020', 11), ('2019', 9), ('2018', 8), ('2017', 8), ('2016', 8), ('2015', 8)]
            },
            '120116': {
                'region': ('天津', '天津市', '滨海新区'),
                'years': [('2020', 28), ('2019', 27), ('2018', 23), ('2017', 22), ('2016', 21), ('2015', 21)]
            },
            '120221': {
                'region': ('天津', '天津市', '宁河区'),
                'years': [('2020', 15), ('2019', 13), ('2018', 11), ('2017', 10), ('2016', 9), ('2015', 9)]
            },
            '120222': {
                'region': ('天津', '天津市', '武清区'),
                'years': [('2020', 13), ('2019', 12), ('2018', 12), ('2017', 11), ('2016', 11), ('2015', 11)]
            },
            '120223': {
                'region': ('天津', '天津市', '静海区'),
                'years': [('2020', 17), ('2019', 16), ('2018', 16), ('2017', 16), ('2016', 14), ('2015', 13)]
            },
            '120224': {
                'region': ('天津', '天津市', '宝坻区'),
                'years': [('2020', 17), ('2019', 16), ('2018', 15), ('2017', 15), ('2016', 8), ('2015', 7)]
            },
            # '': {
            #     'region': ('天津', '天津市', '蓟州区'),
            #     'years': [('2020',), ('2019',), ('2018',), ('2017',), ('2016',), ('2015',)]
            # },
        }

    def extract_text_and_save(self, element: HtmlElement, region: tuple, code: str, **request_params):
        # Parse the result table on a listing page and persist one record per company row.
        nodes = element.xpath('//*[@class="zong1424"]/table//tr[last()]/td/table//tr[position()>1]')
        province, city, county = region
        for node in nodes:
            social_id = "".join("".join(node.xpath('./td[2]/a/text()')).split())
            company = "".join("".join(node.xpath('./td[3]/a/text()')).split())
            if not social_id and not company:
                continue
            item = BulletinBasicFields(
                social_id=social_id,
                company=company,
                district_code=code,
                province=province,
                city=city,
                county=county,
                url=request_params.get('url', ''),
                page=request_params.get('page', '')
            )
            SaveCompanyInformation(item, self.sign)

    def crawl_spider(self, task: tuple):
        # Fetch one listing page and hand the parsed HTML to the extractor.
        url, region, district_code, page = task
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        element = fromstring(response.text)
        self.extract_text_and_save(element, region, district_code, url=url, page=page)

    def generate_request_tasks(self):
        # Build one (url, region, district_code, page) task for every district, year and page.
        results = []
        url = crawl_params('general')
        for district_code, data in self.district_mapping.items():
            region = data.get('region')
            years = data.get('years')
            for year, max_page_num in years:
                for page in range(1, max_page_num + 1):
                    link = url.format(
                        page=page,
                        district_code=district_code,
                        year=year,
                        select_page=page
                    )
                    results.append((link, region, district_code, page))
        yield from results

    def run(self, enable_proxy=None, max_workers: int = 1):
        # Dispatch all generated tasks across a thread pool.
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
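

# Usage sketch (illustrative; the proxy flag and worker count below are example
# values, not project defaults): instantiate the spider and let run() fan the
# generated page-crawling tasks out over a thread pool.
if __name__ == '__main__':
    spider = TJSpider()
    spider.run(enable_proxy=False, max_workers=4)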