# SHSpider.py

from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from urllib.parse import urljoin

from lxml.html import fromstring, HtmlElement

from config.load import crawl_sites
from crawler.defaults import fetch_page_by_post, fetch_page_by_get, crawl_request
from crawler.fields import (
    SaveCompanyInformation,
    BulletinBasicFields,
)


class SHSpider:
    """Shanghai public-institution registration site (www.sydjsh.cn) - registration bulletins."""

    def __init__(self):
        self.sign = 'sh'
        self.enable_proxy = None
        self.site = 'http://www.sydjsh.cn/'

    def extract_text_and_save(self, url, yw_type):
        """Fetch one bulletin detail page and persist every table row according to its business type."""
        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
        element = fromstring(response.text)
        nodes = element.xpath('//table[@id="content"]//tr[position()>1]')
        for node in nodes:
            if yw_type in ['SL', 'BZ']:
                # Establishment bulletins: the table carries the full registration record.
                item = BulletinBasicFields(
                    company="".join(node.xpath('./td[3]/text()')),
                    legal_person="".join(node.xpath('./td[5]/text()')),
                    capital="".join(node.xpath('./td[7]/text()')) + '万元',
                    capital_origin="".join(node.xpath('./td[6]/text()')),
                    purpose_and_business="".join(node.xpath('./td[8]/text()')),
                    address="".join(node.xpath('./td[4]/text()')),
                    social_id="".join(node.xpath('./td[2]/text()')),
                    status='create',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)
            elif yw_type in ['BG', 'JGQTXS', 'JGQTBG', 'JGQTCX', 'JGQTBL', 'JGQTGQ']:
                # Modification bulletins: only the credit code and the name are listed.
                item = BulletinBasicFields(
                    social_id="".join(node.xpath('./td[2]/text()')),
                    company="".join(node.xpath('./td[3]/text()')),
                    status='modify',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)
            elif yw_type == 'ZX':
                # Cancellation bulletins: the credit code and the name sit one column to the right.
                item = BulletinBasicFields(
                    social_id="".join(node.xpath('./td[3]/text()')),
                    company="".join(node.xpath('./td[4]/text()')),
                    status='cancellation',
                    province='上海'
                )
                SaveCompanyInformation(item, self.sign)

    def generate_snapshot_links(self, url, data):
        """POST the list page and yield the detail-page links it contains."""
        list_links = []
        response = crawl_request(fetch_page_by_post, url, self.enable_proxy, data=data)
        element = fromstring(response.text)
        nodes = element.xpath('//div[@class="center1"]/ul/li')
        for node in nodes:
            href = "".join(node.xpath('./a/@href'))
            if data['yw_type'] == 'JGQTCX':
                # The official site joins the URLs on the agency cancellation bulletin
                # list page incorrectly, so the path segment has to be patched here.
                href = href.replace('jgqtXc', 'jgqtCx')
            elif data['yw_type'] == 'JGQTXS':
                href = href.replace('JgqtCl', 'jgqtCl')
            list_links.append(urljoin(self.site, href))
        yield from list_links

    def crawl_spider(self, task: tuple):
        url, data = task  # (list-page URL, POST payload for that page)
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for link in self.generate_snapshot_links(url, data):
                futures.append(executor.submit(self.extract_text_and_save, link, data['yw_type']))
            wait(futures, return_when=ALL_COMPLETED)

    def task_list(self):
        """Yield one (url, form data) pair per page of every configured list."""
        for spider in crawl_sites.get(self.sign):
            url = "".join(spider.keys())  # each config entry maps a single list URL to its parameters
            data: dict = spider.get(url)
            total_page = int(data.get('pageTotal'))
            for page in range(1, total_page + 1):
                item = {
                    "pageIndex": str(page),
                    "yw_type": data.get('yw_type'),
                    "vl": "item",
                    "type": data.get('type'),
                    "pageSize": data.get('pageSize')
                }
                yield url, item

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.task_list())
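
# Illustration only (not part of the original module): SHSpider.task_list() yields
# (list-page URL, form data) tuples shaped roughly like the one below. The concrete
# path and field values here are assumptions; the real ones come from
# config.load.crawl_sites.
#
#     ('http://www.sydjsh.cn/some_list.do',
#      {'pageIndex': '1', 'yw_type': 'SL', 'vl': 'item', 'type': '1', 'pageSize': '10'})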


class SHNDSpider:
    """Shanghai public-institution registration site (上海事业单位编制网) - annual reports."""

    def __init__(self):
        self.enable_proxy = None
        self.sign = 'sh'
        # District code -> (province, city, district) plus the number of result pages to crawl.
        self.district_mapping = {
            '310000': {'region': ('上海', '上海市', '市属'), 'max_page_number': 622},
            '310106': {'region': ('上海', '上海市', '静安区'), 'max_page_number': 281},
            '310104': {'region': ('上海', '上海市', '徐汇区'), 'max_page_number': 208},
            '310113': {'region': ('上海', '上海市', '宝山区'), 'max_page_number': 361},
            '310109': {'region': ('上海', '上海市', '虹口区'), 'max_page_number': 186},
            '310112': {'region': ('上海', '上海市', '闵行区'), 'max_page_number': 361},
            '310230': {'region': ('上海', '上海市', '崇明区'), 'max_page_number': 317},
            '310105': {'region': ('上海', '上海市', '长宁区'), 'max_page_number': 170},
            '310107': {'region': ('上海', '上海市', '普陀区'), 'max_page_number': 231},
            '310117': {'region': ('上海', '上海市', '松江区'), 'max_page_number': 314},
            '310115': {'region': ('上海', '上海市', '浦东新区'), 'max_page_number': 741},
            '310101': {'region': ('上海', '上海市', '黄浦区'), 'max_page_number': 225},
            '310110': {'region': ('上海', '上海市', '杨浦区'), 'max_page_number': 210},
            '310114': {'region': ('上海', '上海市', '嘉定区'), 'max_page_number': 284},
            '310116': {'region': ('上海', '上海市', '金山区'), 'max_page_number': 265},
            '310226': {'region': ('上海', '上海市', '奉贤区'), 'max_page_number': 265},
            '310118': {'region': ('上海', '上海市', '青浦区'), 'max_page_number': 273}
        }
        self.url = 'http://www.sydjsh.cn/ndbg.do'

    def extract_text_and_save(self, element: HtmlElement, code: str, **request_params):
        province, city, county = self.district_mapping.get(code).get('region')
        nodes = element.xpath('//*[@class="cursor"]')
        for node in nodes:
            social_id = "".join(node.xpath('./td[1]/text()'))
            company = "".join(node.xpath('./td[2]/text()'))
            if len(social_id) == 0 and len(company) == 0:
                # Skip placeholder rows that carry neither a credit code nor a name.
                continue
            item = BulletinBasicFields(
                social_id=social_id,
                company=company,
                district_code=code,
                province=province,
                city=city,
                county=county,
                url=request_params.get('url'),
                request_data=request_params.get('request_data'),
                page=request_params.get('page')
            )
            SaveCompanyInformation(item, self.sign)

    def generate_request_tasks(self):
        """Yield one POST payload per (district, page) combination."""
        for geo_code, data in self.district_mapping.items():
            max_page_number = data.get('max_page_number')
            for page in range(1, max_page_number + 1):
                yield {
                    "pageIndex": str(page),
                    "keyword": "",
                    "type": "4",
                    "year": "",
                    "geo_code": geo_code
                }

    def crawl_spider(self, data: dict):
        geo_code = data.get('geo_code')
        page = data.get('pageIndex')
        response = crawl_request(fetch_page_by_post, self.url, self.enable_proxy, data=data)
        element = fromstring(response.text)
        self.extract_text_and_save(element, geo_code, page=page, url=self.url, request_data=data)

    def run(self, enable_proxy=None, max_workers: int = 1):
        self.enable_proxy = enable_proxy or False
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(self.crawl_spider, self.generate_request_tasks())
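

# Minimal usage sketch (not in the original file; the proxy and worker settings below
# are assumptions): run the bulletin spider first, then the annual-report spider.
if __name__ == '__main__':
    SHSpider().run(enable_proxy=False, max_workers=4)
    SHNDSpider().run(enable_proxy=False, max_workers=4)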