dongzhaorui@topnet.net.cn 3 years ago
Commit
c0b9f76f04

+ 0 - 0
company/config/__init__.py


+ 26 - 0
company/config/conf.yaml

@@ -0,0 +1,26 @@
+# Request headers
+headers:
+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36
+
+
+# socks5 proxy
+proxy:
+  socks5:
+    url: http://socks.spdata.jianyu360.com/socks/getips?limit=10
+    decrypt: ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/
+
+
+# mongo
+mongo:
+  host: 172.17.4.87
+  port: !!int 27080
+#  host: 127.0.0.1
+#  port: !!int 27017
+
+
+# redis
+redis:
+  host: 127.0.0.1
+  port: !!int 6379
+  pwd: ""
+  db: !!int 10

+ 24 - 0
company/config/load.py

@@ -0,0 +1,24 @@
+from pathlib import Path
+
+import yaml
+
+__all__ = [
+    'mongo_conf', 'redis_conf',
+    'headers', 'jy_proxy',
+    'crawl_sites'
+]
+
+base_path = Path(__file__).parent
+yaml_conf = (base_path / 'conf.yaml').resolve()
+yaml_sites = (base_path / 'sites.yaml').resolve()
+
+with open(yaml_conf, encoding="utf-8") as f:
+    conf = yaml.safe_load(f)
+    mongo_conf = conf['mongo']
+    redis_conf = conf['redis']
+    headers: dict = conf['headers']
+    jy_proxy: dict = conf['proxy']
+
+with open(yaml_sites, encoding="utf-8") as fp:
+    sites = yaml.safe_load(fp)
+    crawl_sites = sites['crawl_sites']
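
A sketch of how the objects parsed above are consumed by the rest of this commit; the printed values come from conf.yaml/sites.yaml and are only illustrative:

from config.load import mongo_conf, redis_conf, headers, jy_proxy, crawl_sites

# conf.yaml and sites.yaml are parsed once, at import time
print(mongo_conf['host'], mongo_conf['port'])   # 172.17.4.87 27080
print(jy_proxy['socks5']['url'])                # proxy service endpoint
print(sorted(crawl_sites))                      # ['gd', 'general', 'hb', 'hunan', 'sh', 'sx']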

+ 202 - 0
company/config/sites.yaml

@@ -0,0 +1,202 @@
+# List of target site URLs for the crawlers
+crawl_sites:
+  general: http://search.gjsy.gov.cn:9090/queryAll/searchFrame2?page={page}&districtCode={district_code}&checkYear={year}&sydwName=&selectPage={select_page}
+  sh:
+#    - http://www.sydjsh.cn/djgg.do?vl=all&type=5&pageSize=6
+    # Establishment announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'SL'
+        type: '51'
+        pageSize: '100'
+        pageTotal: '2'
+    # Change announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'BG'
+        type: '52'
+        pageSize: '100'
+        pageTotal: '2'
+    # Deregistration announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'ZX'
+        type: '53'
+        pageSize: '100'
+        pageTotal: '2'
+    # Certificate reissue announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'BZ'
+        type: '54'
+        pageSize: '100'
+        pageTotal: '2'
+    # Government organ initial certificate issuance announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'JGQTXS'
+        type: '55'
+        pageSize: '15'
+        pageTotal: '1'
+    # Government organ change announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'JGQTBG'
+        type: '56'
+        pageSize: '15'
+        pageTotal: '1'
+    # Government organ deregistration announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'JGQTCX'
+        type: '57'
+        pageSize: '15'
+        pageTotal: '1'
+    # Government organ certificate reissue announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'JGQTBL'
+        type: '58'
+        pageSize: '15'
+        pageTotal: '1'
+    # Government organ expired-certificate announcements
+    - http://www.sydjsh.cn/djgg.do:
+        pageIndex: '1'
+        yw_type: 'JGQTGQ'
+        type: '59'
+        pageSize: '15'
+        pageTotal: '1'
+  hb:
+    - http://111.47.11.106:81/content/262.html
+    - http://111.47.11.106:81/content/4751.html
+  hunan:
+    # Public institution annual report disclosures
+    - http://www.hunanbb.gov.cn/sbb/front/report/page.do?page={}&rows=100&jsonpCallback=builtTable&sydwmc=&nbnd={}&unify_code=
+    # Public institution registration disclosures
+    - http://www.hunanbb.gov.cn/sbb/front/exchange/page.do?page={}&rows=200&jsonpCallback=builtTable&sydwmc=&ywlx=&unify_code=
+  gd:
+    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2020&owner=44:
+        searchTxt: ''
+        pageInfo.switchingPage: 'false'
+        pageInfo.pageIndex: '0'
+        pageInfo.pageTotal: '449'
+        pageInfo.recordCount: '44886'
+        pageInfo.pageSize: '100'
+
+#    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2019&owner=44:
+#        searchTxt: ''
+#        pageInfo.switchingPage: 'false'
+#        pageInfo.pageIndex: '0'
+#        pageInfo.pageTotal: '929'
+#        pageInfo.recordCount: '46426'
+#        pageInfo.pageSize: '50'
+#
+#    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2018&owner=44:
+#        searchTxt: ''
+#        pageInfo.switchingPage: 'false'
+#        pageInfo.pageIndex: '0'
+#        pageInfo.pageTotal: '944'
+#        pageInfo.recordCount: '47175'
+#        pageInfo.pageSize: '50'
+#
+#    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2017&owner=44:
+#        searchTxt: ''
+#        pageInfo.switchingPage: 'false'
+#        pageInfo.pageIndex: '0'
+#        pageInfo.pageTotal: '918'
+#        pageInfo.recordCount: '45883'
+#        pageInfo.pageSize: '50'
+#
+#    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2016&owner=44:
+#        searchTxt: ''
+#        pageInfo.switchingPage: 'false'
+#        pageInfo.pageIndex: '0'
+#        pageInfo.pageTotal: '788'
+#        pageInfo.recordCount: '39395'
+#        pageInfo.pageSize: '50'
+#
+#    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2015&owner=44:
+#        searchTxt: ''
+#        pageInfo.switchingPage: 'false'
+#        pageInfo.pageIndex: '0'
+#        pageInfo.pageTotal: '802'
+#        pageInfo.recordCount: '40075'
+#        pageInfo.pageSize: '50'
+#
+#    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2014&owner=44:
+#        searchTxt: ''
+#        pageInfo.switchingPage: 'false'
+#        pageInfo.pageIndex: '0'
+#        pageInfo.pageTotal: '736'
+#        pageInfo.recordCount: '36789'
+#        pageInfo.pageSize: '50'
+#
+#    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2013&owner=44:
+#        searchTxt: ''
+#        pageInfo.switchingPage: 'false'
+#        pageInfo.pageIndex: '0'
+#        pageInfo.pageTotal: '672'
+#        pageInfo.recordCount: '33590'
+#        pageInfo.pageSize: '50'
+#
+#    - http://www.gdsy.gov.cn/findNianDuLst2.do?checkYear=2012&owner=44:
+#        searchTxt: ''
+#        pageInfo.switchingPage: 'false'
+#        pageInfo.pageIndex: '0'
+#        pageInfo.pageTotal: '509'
+#        pageInfo.recordCount: '25441'
+#        pageInfo.pageSize: '50'
+  sx:
+    - http://www.sxdjgl.gov.cn/html/0/14/19.html:
+        code: ''
+        year: '2020'
+        pageSize: '600'
+        pageTotal: '1'
+
+    - http://www.sxdjgl.gov.cn/html/0/14/20.html:
+        code: ''
+        year: '2020'
+        pageSize: '600'
+        pageTotal: '7'
+
+    - http://www.sxdjgl.gov.cn/html/0/14/21.html:
+        code: ''
+        year: '2020'
+        pageSize: '600'
+        pageTotal: '33'
+
+    - http://www.sxdjgl.gov.cn/html/0/14/19.html:
+        code: ''
+        year: '2019'
+        pageSize: '600'
+        pageTotal: '1'
+
+    - http://www.sxdjgl.gov.cn/html/0/14/20.html:
+        code: ''
+        year: '2019'
+        pageSize: '600'
+        pageTotal: '7'
+
+    - http://www.sxdjgl.gov.cn/html/0/14/21.html:
+        code: ''
+        year: '2019'
+        pageSize: '600'
+        pageTotal: '34'
+
+    - http://www.sxdjgl.gov.cn/html/0/14/19.html:
+        code: ''
+        year: '2018'
+        pageSize: '600'
+        pageTotal: '1'
+
+    - http://www.sxdjgl.gov.cn/html/0/14/20.html:
+        code: ''
+        year: '2018'
+        pageSize: '600'
+        pageTotal: '7'
+
+    - http://www.sxdjgl.gov.cn/html/0/14/21.html:
+        code: ''
+        year: '2018'
+        pageSize: '600'
+        pageTotal: '34'
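
Each POST-style entry above is a single-key mapping from the request URL to its form parameters; a minimal sketch of how such an entry can be unpacked, mirroring the generate_request_tasks methods added later in this commit:

import yaml

sample = """
sx:
  - http://www.sxdjgl.gov.cn/html/0/14/19.html:
      code: ''
      year: '2020'
      pageSize: '600'
      pageTotal: '1'
"""

sites = yaml.safe_load(sample)
for entry in sites['sx']:
    url = "".join(entry.keys())   # the single key is the request URL
    params = entry[url]           # the nested mapping is the POST form data
    print(url, params['year'], params['pageTotal'])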

+ 60 - 0
company/crawler/__init__.py

@@ -0,0 +1,60 @@
+from concurrent.futures import ThreadPoolExecutor, ALL_COMPLETED, wait
+
+from crawler.spiders import (
+    SXSpider,
+    GDSpider,
+    BJSpider,
+    TJSpider,
+    HuSpider,
+    SHSpider,
+    HBSpider,
+    SHNDSpider
+)
+from utils.log import logger
+
+
+def bj_spider(max_workers):
+    return BJSpider().run(True, max_workers)
+
+
+def tj_spider(max_workers):
+    return TJSpider().run(True, max_workers)
+
+
+def hu_spider(max_workers):
+    return HuSpider().run(True, max_workers)
+
+
+def gd_spider(max_workers):
+    return GDSpider().run(True, max_workers)
+
+
+def sx_spider(max_workers):
+    return SXSpider().run(True, max_workers)
+
+
+def sh_spider(max_workers):
+    return SHSpider().run(True, max_workers)
+
+
+def hb_spider(max_workers):
+    return HBSpider().run(True, max_workers)
+
+
+def sh_nd_spider(max_workers):
+    return SHNDSpider().run(True, max_workers)
+
+
+def activate_spider(max_workers: int = 1):
+    futures = []
+    with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+        # futures.append(Executor.submit(bj_spider, 2))
+        # futures.append(Executor.submit(tj_spider, 2))
+        # futures.append(Executor.submit(hb_spider, 2))
+        futures.append(Executor.submit(hu_spider, 2))
+        # futures.append(Executor.submit(gd_spider, 2))
+        # futures.append(Executor.submit(sx_spider, 2))
+        # futures.append(Executor.submit(sh_spider, 2))
+        # futures.append(Executor.submit(sh_nd_spider, 2))
+        wait(futures, return_when=ALL_COMPLETED)
+    logger.info('[采集]采集完成')
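
Only the Hunan spider is enabled in activate_spider above; the others are commented out. A spider can also be run on its own (a sketch, assuming the company package directory is on PYTHONPATH):

from crawler.spiders import GDSpider

# run(enable_proxy, max_workers): route requests through the socks5 pool, two worker threads
GDSpider().run(True, 2)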

+ 100 - 0
company/crawler/defaults.py

@@ -0,0 +1,100 @@
+import requests
+from requests.models import Response
+
+from config.load import headers, crawl_sites
+from crawler.exceptions import InvalidResponseException
+from crawler.socks5 import Proxy
+from utils.log import logger
+
+
+def validate_response(response: Response):
+    """验证响应,确保代理正常访问到资源"""
+    page_source = response.text
+    if 'Please try again!' in page_source:
+        # The site has banned the IP: 'Please try again!' means the current IP has been blacklisted
+        raise InvalidResponseException('ip被封')
+    elif page_source == '':
+        raise InvalidResponseException('空字符串')
+    else:
+        pass
+
+
+def fetch_page_by_get(url: str, **kwargs):
+    allow_use_proxy = kwargs.get('enable_proxy')
+    proxy = None
+    if allow_use_proxy:
+        proxy = Proxy(True)
+        kwargs.update({'proxies': proxy.proxies})
+
+    while True:
+        try:
+            response = requests.get(
+                url,
+                headers=kwargs.get('headers') or headers,
+                params=kwargs.get('params'),
+                proxies=kwargs.get('proxies'),
+                timeout=60
+            )
+            logger.info(f'[采集] {response.status_code} {url}')
+            response.encoding = response.apparent_encoding
+            validate_response(response)
+            return response
+        except requests.RequestException as e:
+            logger.error(f'{e.__class__.__name__}')
+            # traceback.print_exc()
+            if allow_use_proxy:
+                proxy.switch()
+                kwargs.update({'proxies': proxy.proxies})
+            else:
+                break
+        except InvalidResponseException:
+            if allow_use_proxy:
+                proxy.switch()
+                kwargs.update({'proxies': proxy.proxies})
+            else:
+                break
+
+
+def fetch_page_by_post(url: str, **kwargs):
+    allow_use_proxy = kwargs.get('enable_proxy')
+    proxy = None
+    if allow_use_proxy:
+        proxy = Proxy(True)
+        kwargs.update({'proxies': proxy.proxies})
+
+    while True:
+        try:
+            r = requests.post(
+                url,
+                headers=kwargs.get('headers') or headers,
+                data=kwargs.get('data'),
+                proxies=kwargs.get('proxies'),
+                timeout=60
+            )
+            logger.info(f'[采集] {r.status_code} {url}')
+            r.encoding = r.apparent_encoding
+            return r
+        except requests.RequestException as e:
+            logger.error(f'{e.__class__.__name__}')
+            # traceback.print_exc()
+            if allow_use_proxy:
+                proxy.switch()
+                kwargs.update({'proxies': proxy.proxies})
+            else:
+                break
+
+
+def crawl_request(fn, url, enable_proxy: bool = False, **kwargs):
+    kwargs.update({'enable_proxy': enable_proxy})
+    response = fn(url, **kwargs)
+    return response
+
+
+def crawl_params(region_sign: str):
+    """
+    采集参数
+
+    @param region_sign: 地区标识
+    @return: 采集请求相关内容
+    """
+    return crawl_sites.get(region_sign)
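
A sketch of how these helpers are combined by the spiders below; the district code and year are example values taken from BJSpider:

from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params

# crawl_params('general') returns the URL template defined in sites.yaml
template = crawl_params('general')
url = template.format(page=1, district_code='110101', year='2020', select_page=1)

# with enable_proxy=False the fetcher gives up after the first failed attempt and returns None
response = crawl_request(fetch_page_by_get, url, enable_proxy=False)
if response is not None:
    print(response.status_code)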

+ 7 - 0
company/crawler/exceptions.py

@@ -0,0 +1,7 @@
+
+
+class InvalidResponseException(Exception):
+    """Raised when a crawl request returns an invalid response (validation failed)."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args)

+ 56 - 0
company/crawler/fields.py

@@ -0,0 +1,56 @@
+import json
+import time
+
+import bson
+
+from utils.databases import MongoDBS
+from utils.log import logger
+
+
+class BulletinBasicFields(dict):
+    """ 事业单位登记公告基础字段 """
+
+    def __init__(self, **kw):
+        self.__items = {
+            'company': '',  # organization name
+            'legal_person': '',  # legal representative
+            'capital': '',  # registered (start-up) capital
+            'capital_origin': '',  # funding source
+            'purpose_and_business': '',  # purpose and business scope
+            'address': '',  # registered address
+            'social_id': '',  # unified social credit code
+            'province': '',  # province
+            'city': '',  # city
+            'county': '',  # county / district
+            'district_code': '',  # administrative division code
+            'page': '',  # page number the record was found on
+            'url': '',  # source URL
+            'request_data': '',  # request parameters
+            'status': '',  # registration status: create=established  modify=changed  cancellation=deregistered  lost_invalid=lost/void
+            'create_time': bson.int64.Int64(int(time.time()))
+        }
+        for key, value in kw.items():
+            if key in self.__items:
+                if isinstance(kw.get(key), dict):
+                    self.__items[key] = json.dumps(value, ensure_ascii=False)
+                else:
+                    self.__items[key] = value
+        super(BulletinBasicFields, self).__init__(self.__items)
+
+
+class SaveCompanyInformation:
+
+    def __init__(self, item: dict, region_name: str = None):
+        if isinstance(item, BulletinBasicFields):
+            collection = 'company_basic_information'
+        else:
+            raise TypeError('item must be a BulletinBasicFields instance')
+
+        if region_name is not None:
+            collection = f'{collection}_{region_name}'
+        else:
+            collection = f'{collection}_other'
+
+        with MongoDBS('py_spider', collection) as coll:
+            result = coll.insert_one(item)
+            logger.info(f"[Mongo数据库:{collection}-{result.inserted_id}] {item['social_id']} {item['company']} 保存成功")

+ 151 - 0
company/crawler/socks5.py

@@ -0,0 +1,151 @@
+import copy
+import time
+
+import requests
+
+from config.load import jy_proxy, headers
+from utils.log import logger
+
+__all__ = ['Proxy']
+
+
+def decrypt(input_str: str) -> str:
+    """
+    定义base64解密函数
+
+    :param input_str:
+    :return:
+    """
+    # 对前面不是“=”的字节取索引,然后转换为2进制
+    key = jy_proxy['socks5']['decrypt']
+    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
+    output_str = ''
+    # 补齐“=”的个数
+    equal_num = input_str.count('=')
+    while ascii_list:
+        temp_list = ascii_list[:4]
+        # 转换成2进制字符串
+        temp_str = ''.join(temp_list)
+        # 对没有8位2进制的字符串补够8位2进制
+        if len(temp_str) % 8 != 0:
+            temp_str = temp_str[0:-1 * equal_num * 2]
+        # 4个6字节的二进制  转换  为三个8字节的二进制
+        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
+        # 二进制转为10进制
+        temp_str_list = [int(x, 2) for x in temp_str_list if x]
+        # 连接成字符串
+        output_str += ''.join([chr(x) for x in temp_str_list])
+        ascii_list = ascii_list[4:]
+    return output_str
+
+
+class Socks5Proxy:
+
+    __instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if cls.__instance is None:
+            cls.__instance = super().__new__(cls)
+        return cls.__instance
+
+    def __init__(self):
+        self.__proxies = {}
+
+    def _init(self):
+        self.url = jy_proxy['socks5']['url']
+        self.pool = []
+        self.index = 0  # position of the current proxy in the pool
+        self.counter = {}
+        self.seconds = 60
+
+        while not self.__proxies:
+            if len(self.pool) > 0 and not self.__proxies:
+                self.__proxies = copy.deepcopy(self.pool[self.index])
+            else:
+                self.generate_pool()
+
+    @property
+    def proxies(self):
+        return self.__proxies
+
+    def switch(self, reset=False):
+        """切换代理"""
+        if reset is True:
+            self.counter.clear()
+            self.flush_pool()
+
+        elif len(self.counter) > 0:
+            end_time = self.counter[str(self.__proxies)]
+            current_time = int(time.time())
+            if end_time - current_time < self.seconds:
+                self.pool.remove(self.__proxies)
+                logger.info(f"[代理]移除:{self.__proxies}")
+                del self.counter[str(self.__proxies)]
+                logger.info(f"[代理]剩余个数:{len(self.pool)}")
+
+        self.__proxies = {}  # reset the current proxy
+
+        while not self.proxies:
+            if len(self.pool) > 0:
+                self.index += 1
+                if self.index >= len(self.pool):
+                    self.index = 0
+                self.__proxies = copy.deepcopy(self.pool[self.index])
+                logger.info(f"[代理]切换 - {self.index}")
+            else:
+                logger.info("[代理]无可用代理")
+                self.flush_pool()
+
+    def generate_pool(self):
+        """初始化代理池"""
+        self.__socks5()
+        self.__check_proxies()
+
+    def flush_pool(self):
+        logger.info(f"[代理]刷新代理池")
+        self.pool.clear()
+        self.generate_pool()
+
+    def __socks5(self):
+        logger.info(f"[代理]请求服务:{self.url}")
+        try:
+            response = requests.get(self.url, timeout=10)
+            self.__extract_ip(response)
+        except requests.RequestException:
+            pass
+
+    def __extract_ip(self, response):
+        for proxy in response.json():
+            host = decrypt(proxy['host'])
+            port = int(proxy['port'])
+            end_time = proxy['EndTime']
+            items = {
+                'http': 'socks5://{}:{}'.format(host, port),
+                'https': 'socks5://{}:{}'.format(host, port)
+            }
+            self.pool.append(items)
+            self.counter.setdefault(str(items), end_time)
+
+    def __check_proxies(self):
+        check_ip = 'https://myip.ipip.net'
+        logger.info(f"[代理]通信检查:{check_ip}")
+        for proxies in self.pool[::-1]:
+            try:
+                requests_param = {
+                    "headers": headers,
+                    "proxies": proxies,
+                    "timeout": 10
+                }
+                requests.get(check_ip, **requests_param)
+            except requests.RequestException:
+                self.pool.remove(proxies)
+                del self.counter[str(proxies)]
+        logger.info(f"[代理]可用个数:{len(self.pool)}")
+
+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
+        if enable_proxy:
+            self._init()
+        return self
+
+
+Proxy = Socks5Proxy()
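
Proxy is a module-level singleton; calling it with enable_proxy=True builds the pool, and the fetchers in defaults.py then use .proxies and .switch(). A minimal sketch (requires the proxy service above to be reachable):

from crawler.socks5 import Proxy

proxy = Proxy(True)       # __call__ runs _init() and fills the pool
print(proxy.proxies)      # e.g. {'http': 'socks5://1.2.3.4:1080', 'https': 'socks5://1.2.3.4:1080'}
proxy.switch()            # rotate to the next proxy, dropping entries that are about to expire
proxy.switch(reset=True)  # clear the counters and rebuild the whole pool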

+ 141 - 0
company/crawler/spiders/BJSpider.py

@@ -0,0 +1,141 @@
+from concurrent.futures import ThreadPoolExecutor
+
+from lxml.html import fromstring, HtmlElement
+
+from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params
+from crawler.fields import BulletinBasicFields, SaveCompanyInformation
+
+
+class BJSpider:
+
+    def __init__(self):
+        self.sign = 'bj'
+        self.enable_proxy = None
+        self.district_mapping = {
+            '110101': {
+                'region': ('北京', '北京市', '东城区'),
+                'years': [('2020', 25), ('2019', 22), ('2018', 25), ('2017', 26), ('2016', 26), ('2015', 26)]
+            },
+            '110102': {
+                'region': ('北京', '北京市', '西城区'),
+                'years': [('2020', 22), ('2019', 19), ('2018', 20), ('2017', 22), ('2016', 1), ('2015', 1)]
+            },
+            '110105': {
+                'region': ('北京', '北京市', '朝阳区'),
+                'years': [('2020', 36), ('2019', 37), ('2018', 37), ('2017', 37), ('2016', 37), ('2015', 37)]
+            },
+            '110108': {
+                'region': ('北京', '北京市', '海淀区'),
+                'years': [('2020', 42), ('2019', 42), ('2018', 42), ('2017', 39), ('2016', 39), ('2015', 1)]
+            },
+            '110106': {
+                'region': ('北京', '北京市', '丰台区'),
+                'years': [('2020', 21), ('2019', 24), ('2018', 23), ('2017', 28), ('2016', 28), ('2015', 1)]
+            },
+            '110107': {
+                'region': ('北京', '北京市', '石景山区'),
+                'years': [('2020', 15), ('2019', 15), ('2018', 15), ('2017', 15), ('2016', 15), ('2015', 14)]
+            },
+            '110109': {
+                'region': ('北京', '北京市', '门头沟区'),
+                'years': [('2020', 1), ('2019', 15), ('2018', 15), ('2017', 14), ('2016', 1), ('2015', 1)]
+            },
+            '110111': {
+                'region': ('北京', '北京市', '房山区'),
+                'years': [('2020', 26), ('2019', 35), ('2018', 35), ('2017', 35), ('2016', 34), ('2015', 1)]
+            },
+            '110112': {
+                'region': ('北京', '北京市', '通州区'),
+                'years': [('2020', 19), ('2019', 24), ('2018', 24), ('2017', 24), ('2016', 24), ('2015', 1)]
+            },
+            '110110': {
+                'region': ('北京', '北京市', '顺义区'),
+                'years': [('2020', 1), ('2019', 1), ('2018', 1), ('2017', 30), ('2016', 1), ('2015', 1)]
+            },
+            '110221': {
+                'region': ('北京', '北京市', '昌平区'),
+                'years': [('2020', 28), ('2019', 35), ('2018', 35), ('2017', 35), ('2016', 35), ('2015', 34)]
+            },
+            '110224': {
+                'region': ('北京', '北京市', '大兴区'),
+                'years': [('2020', 29), ('2019', 36), ('2018', 35), ('2017', 34), ('2016', 34), ('2015', 1)]
+            },
+            '110227': {
+                'region': ('北京', '北京市', '怀柔区'),
+                'years': [('2020', 13), ('2019', 14), ('2018', 14), ('2017', 14), ('2016', 13), ('2015', 1)]
+            },
+            '110226': {
+                'region': ('北京', '北京市', '平谷区'),
+                'years': [('2020', 12), ('2019', 12), ('2018', 12), ('2017', 12), ('2016', 1), ('2015', 1)]
+            },
+            '110228': {
+                'region': ('北京', '北京市', '密云区'),
+                'years': [('2020', 15), ('2019', 15), ('2018', 15), ('2017', 15), ('2016', 14), ('2015', 14)]
+            },
+            '110229': {
+                'region': ('北京', '北京市', '延庆区'),
+                'years': [('2020', 11), ('2019', 13), ('2018', 13), ('2017', 13), ('2016', 13), ('2015', 1)]
+            }
+        }
+
+    def extract_text_and_save(
+            self,
+            element: HtmlElement,
+            region: tuple,
+            code: str,
+            **request_params
+    ):
+        """
+        提取文本并保存
+
+        @param element: 元素对象
+        @param region: 地区元组
+        @param code: 行政区划代码
+        """
+        nodes = element.xpath('//*[@class="zong1424"]/table//tr[last()]/td/table//tr[position()>1]')
+        province, city, county = region
+        for node in nodes:
+            social_id = "".join("".join(node.xpath('./td[2]/a/text()')).split())
+            company = "".join("".join(node.xpath('./td[3]/a/text()')).split())
+            if len(social_id) == 0 and len(company) == 0:
+                continue
+
+            item = BulletinBasicFields(
+                social_id=social_id,
+                company=company,
+                district_code=code,
+                province=province,
+                city=city,
+                county=county,
+                page=request_params.get('page', ''),
+                url=request_params.get('url', ''),
+            )
+            SaveCompanyInformation(item, self.sign)
+
+    def generate_request_tasks(self):
+        results = []
+        url = crawl_params('general')
+        for district_code, data in self.district_mapping.items():
+            region = data.get('region')
+            years = data.get('years')
+            for year, max_page_num in years:
+                for page in range(1, max_page_num + 1):
+                    link = url.format(
+                        page=page,
+                        district_code=district_code,
+                        year=year,
+                        select_page=page
+                    )
+                    results.append((link, region, district_code, page))
+        yield from results
+
+    def crawl_spider(self, task: tuple):
+        url, region, district_code, page = task
+        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
+        element = fromstring(response.text)
+        self.extract_text_and_save(element, region, district_code, url=url, page=page)
+
+    def run(self, enable_proxy=None, max_workers: int = 1):
+        self.enable_proxy = enable_proxy or False
+        with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+            Executor.map(self.crawl_spider, self.generate_request_tasks())

+ 58 - 0
company/crawler/spiders/GDSpider.py

@@ -0,0 +1,58 @@
+from concurrent.futures import ThreadPoolExecutor
+
+from lxml.html import fromstring, HtmlElement
+
+from crawler.defaults import fetch_page_by_post, crawl_request, crawl_params
+from crawler.fields import BulletinBasicFields, SaveCompanyInformation
+
+
+class GDSpider:
+
+    def __init__(self):
+        self.sign = 'gd'
+        self.enable_proxy = None
+
+    def extract_text_and_save(self, element: HtmlElement, **request_params):
+        nodes = element.xpath('//*[@name="frm"]/div/table[2]//tr[position()>1]')
+        for node in nodes:
+            social_id = "".join(node.xpath('./td[2]/text()')).strip()
+            item = BulletinBasicFields(
+                social_id=social_id,
+                company="".join(node.xpath('./td[3]//text()')).strip(),
+                district_code=social_id[2:8],
+                province='广东省',
+                url=request_params.get('url'),
+                request_data=request_params.get('request_data'),
+                page=request_params.get('page')
+            )
+            SaveCompanyInformation(item, self.sign)
+
+    def crawl_spider(self, task: tuple):
+        url, data, page = task
+        headers = {
+            'Content-Type': 'application/x-www-form-urlencoded',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
+        }
+        response = crawl_request(fetch_page_by_post, url, self.enable_proxy, headers=headers, data=data)
+        element = fromstring(response.text)
+        self.extract_text_and_save(element, url=url, page=page, request_data=data)
+
+    def generate_request_tasks(self):
+        results = []
+        for spider in crawl_params(self.sign):
+            url = "".join(spider.keys())
+            params: dict = spider.get(url)
+            total_page = int(params.get('pageInfo.pageTotal'))
+            for page in range(1, total_page + 1):
+                item = {**params}
+                item.update({
+                    'pageInfo.switchingPage': 'true',
+                    'pageInfo.pageIndex': str(page)
+                })
+                results.append((url, item, page))
+        yield from results
+
+    def run(self, enable_proxy=None, max_workers: int = 1):
+        self.enable_proxy = enable_proxy or False
+        with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+            Executor.map(self.crawl_spider, self.generate_request_tasks())

+ 186 - 0
company/crawler/spiders/HBSpider.py

@@ -0,0 +1,186 @@
+from concurrent.futures import ThreadPoolExecutor
+
+from lxml.html import fromstring, HtmlElement
+
+from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params
+from crawler.fields import BulletinBasicFields, SaveCompanyInformation
+
+
+class HBSpider:
+    """湖北 - 事业单位编制网"""
+
+    def __init__(self):
+        self.sign = 'hb'
+        self.enable_proxy = None
+        self.max_page_number = 40
+        self.years = ['2020', '2019', '2018', '2017', '2016', '2015']
+        self.district_mapping = {
+            '420000': {'region': ('湖北省', '', '')},
+            '420100': {'region': ('湖北省', '武汉市', '')},
+            '420102': {'region': ('湖北省', '', '江岸区')},
+            '420103': {'region': ('湖北省', '', '江汉区')},
+            '420104': {'region': ('湖北省', '', '硚口区')},
+            '420105': {'region': ('湖北省', '', '汉阳区')},
+            '420106': {'region': ('湖北省', '', '武昌区')},
+            '420107': {'region': ('湖北省', '', '青山区')},
+            '420111': {'region': ('湖北省', '', '洪山区')},
+            '420112': {'region': ('湖北省', '', '东西湖区')},
+            '420113': {'region': ('湖北省', '', '武汉市汉南区')},
+            '420114': {'region': ('湖北省', '', '蔡甸区')},
+            '420115': {'region': ('湖北省', '', '江夏区')},
+            '420116': {'region': ('湖北省', '', '黄陂区')},
+            '420117': {'region': ('湖北省', '', '新洲区')},
+            '420118': {'region': ('湖北省', '', '武汉经济技术开发区')},
+            '420119': {'region': ('湖北省', '', '武汉东湖新技术开发区')},
+            '420200': {'region': ('湖北省', '黄石市', '')},
+            '420202': {'region': ('湖北省', '', '黄石港区')},
+            '420203': {'region': ('湖北省', '', '西塞山区(石灰窑区)')},
+            '420204': {'region': ('湖北省', '', '下陆区')},
+            '420205': {'region': ('湖北省', '', '铁山区')},
+            '420222': {'region': ('湖北省', '', '阳新县')},
+            '420281': {'region': ('湖北省', '', '大冶市')},
+            '420300': {'region': ('湖北省', '十堰市', '')},
+            '420302': {'region': ('湖北省', '', '茅箭区')},
+            '420303': {'region': ('湖北省', '', '张湾区')},
+            '420321': {'region': ('湖北省', '', '十堰市郧阳区')},
+            '420322': {'region': ('湖北省', '', '郧西县')},
+            '420323': {'region': ('湖北省', '', '竹山县')},
+            '420324': {'region': ('湖北省', '', '竹溪县')},
+            '420325': {'region': ('湖北省', '', '房县')},
+            '420381': {'region': ('湖北省', '', '丹江口市')},
+            '420500': {'region': ('湖北省', '宜昌市', '')},
+            '420502': {'region': ('湖北省', '', '西陵区')},
+            '420503': {'region': ('湖北省', '', '伍家岗区')},
+            '420504': {'region': ('湖北省', '', '点军区')},
+            '420505': {'region': ('湖北省', '', '猇亭区')},
+            '420521': {'region': ('湖北省', '', '夷陵区')},
+            '420525': {'region': ('湖北省', '', '远安县')},
+            '420526': {'region': ('湖北省', '', '兴山县')},
+            '420527': {'region': ('湖北省', '', '秭归县')},
+            '420528': {'region': ('湖北省', '', '长阳土家族自治县')},
+            '420529': {'region': ('湖北省', '', '五峰土家族自治县')},
+            '420581': {'region': ('湖北省', '', '宜都市')},
+            '420582': {'region': ('湖北省', '', '当阳市')},
+            '420583': {'region': ('湖北省', '', '枝江市')},
+            '420600': {'region': ('湖北省', '襄阳市', '')},
+            '420602': {'region': ('湖北省', '', '襄城区')},
+            '420606': {'region': ('湖北省', '', '樊城区')},
+            '420621': {'region': ('湖北省', '', '襄州区')},
+            '420624': {'region': ('湖北省', '', '南漳县')},
+            '420625': {'region': ('湖北省', '', '谷城县')},
+            '420626': {'region': ('湖北省', '', '保康县')},
+            '420682': {'region': ('湖北省', '', '老河口市')},
+            '420683': {'region': ('湖北省', '', '枣阳市')},
+            '420684': {'region': ('湖北省', '', '宜城市')},
+            '420700': {'region': ('湖北省', '鄂州市', '')},
+            '420702': {'region': ('湖北省', '', '梁子湖区')},
+            '420703': {'region': ('湖北省', '', '华容区')},
+            '420704': {'region': ('湖北省', '', '鄂城区')},
+            '420800': {'region': ('湖北省', '荆门市', '')},
+            '420802': {'region': ('湖北省', '', '荆门市东宝区')},
+            '420803': {'region': ('湖北省', '', '荆门市掇刀区')},
+            '420804': {'region': ('湖北省', '', '荆门市屈家岭管理区')},
+            '420821': {'region': ('湖北省', '', '京山市')},
+            '420822': {'region': ('湖北省', '', '沙洋县')},
+            '420881': {'region': ('湖北省', '', '钟祥市')},
+            '420900': {'region': ('湖北省', '孝感市', '')},
+            '420902': {'region': ('湖北省', '', '孝南区')},
+            '420921': {'region': ('湖北省', '', '孝昌县')},
+            '420922': {'region': ('湖北省', '', '大悟县')},
+            '420923': {'region': ('湖北省', '', '云梦县')},
+            '420981': {'region': ('湖北省', '', '应城市')},
+            '420982': {'region': ('湖北省', '', '安陆市')},
+            '420984': {'region': ('湖北省', '', '汉川市')},
+            '421000': {'region': ('湖北省', '荆州市', '')},
+            '421002': {'region': ('湖北省', '', '沙市区')},
+            '421003': {'region': ('湖北省', '', '荆州区')},
+            '421022': {'region': ('湖北省', '', '公安县')},
+            '421023': {'region': ('湖北省', '', '监利县')},
+            '421024': {'region': ('湖北省', '', '江陵县')},
+            '421081': {'region': ('湖北省', '', '石首市')},
+            '421083': {'region': ('湖北省', '', '洪湖市')},
+            '421087': {'region': ('湖北省', '', '松滋市')},
+            '421100': {'region': ('湖北省', '黄冈市', '')},
+            '421102': {'region': ('湖北省', '', '黄州区')},
+            '421121': {'region': ('湖北省', '', '团风县')},
+            '421122': {'region': ('湖北省', '', '红安县')},
+            '421123': {'region': ('湖北省', '', '罗田县')},
+            '421124': {'region': ('湖北省', '', '英山县')},
+            '421125': {'region': ('湖北省', '', '浠水县')},
+            '421126': {'region': ('湖北省', '', '蕲春县')},
+            '421127': {'region': ('湖北省', '', '黄梅县')},
+            '421181': {'region': ('湖北省', '', '麻城市')},
+            '421182': {'region': ('湖北省', '', '武穴市')},
+            '421200': {'region': ('湖北省', '咸宁市', '')},
+            '421202': {'region': ('湖北省', '', '咸安区')},
+            '421221': {'region': ('湖北省', '', '嘉鱼县')},
+            '421222': {'region': ('湖北省', '', '通城县')},
+            '421223': {'region': ('湖北省', '', '崇阳县')},
+            '421224': {'region': ('湖北省', '', '通山县')},
+            '421281': {'region': ('湖北省', '', '赤壁市')},
+            '421300': {'region': ('湖北省', '随州市', '')},
+            '421302': {'region': ('湖北省', '', '曾都区')},
+            '421304': {'region': ('湖北省', '', '随县')},
+            '421381': {'region': ('湖北省', '', '广水市')},
+            '422800': {'region': ('湖北省', '恩施土家族苗族自治州', '')},
+            '422801': {'region': ('湖北省', '', '恩施市')},
+            '422802': {'region': ('湖北省', '', '利川市')},
+            '422822': {'region': ('湖北省', '', '建始县')},
+            '422823': {'region': ('湖北省', '', '巴东县')},
+            '422825': {'region': ('湖北省', '', '宣恩县')},
+            '422826': {'region': ('湖北省', '', '咸丰县')},
+            '422827': {'region': ('湖北省', '', '来凤县')},
+            '422828': {'region': ('湖北省', '', '鹤峰县')},
+            '429004': {'region': ('湖北省', '', '仙桃市')},
+            '429005': {'region': ('湖北省', '', '潜江市')},
+            '429006': {'region': ('湖北省', '', '天门市')},
+            '429021': {'region': ('湖北省', '', '神农架林区')}
+        }
+
+    def extract_text_and_save(self, element: HtmlElement, region: tuple, code: str, **request_params):
+        nodes = element.xpath('//*[@class="zong1424"]/table//tr[last()]/td/table//tr[position()>1]')
+        province, city, county = region
+        for node in nodes:
+            social_id = "".join("".join(node.xpath('./td[2]/a/text()')).split())
+            company = "".join("".join(node.xpath('./td[3]/a/text()')).split())
+            if len(social_id) == 0 and len(company) == 0:
+                continue
+
+            item = BulletinBasicFields(
+                social_id=social_id,
+                company=company,
+                district_code=code,
+                province=province,
+                city=city,
+                county=county,
+                page=request_params.get('page', ''),
+                url=request_params.get('url', '')
+            )
+            SaveCompanyInformation(item, self.sign)
+
+    def generate_request_tasks(self):
+        results = []
+        url = crawl_params('general')
+        for district_code, data in self.district_mapping.items():
+            region = data.get('region')
+            for year in self.years:
+                for page in range(1, self.max_page_number + 1):
+                    link = url.format(
+                        page=page,
+                        district_code=district_code,
+                        year=year,
+                        select_page=page
+                    )
+                    results.append((link, region, district_code, page))
+        yield from results
+
+    def crawl_spider(self, task: tuple):
+        url, region, district_code, page = task
+        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
+        element = fromstring(response.text)
+        self.extract_text_and_save(element, region, district_code, page=page, url=url)
+
+    def run(self, enable_proxy=None, max_workers: int = 1):
+        self.enable_proxy = enable_proxy or False
+        with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+            Executor.map(self.crawl_spider, self.generate_request_tasks())

+ 60 - 0
company/crawler/spiders/HuSpider.py

@@ -0,0 +1,60 @@
+import json
+import re
+from concurrent.futures import ThreadPoolExecutor
+
+from crawler.defaults import fetch_page_by_get, crawl_request, crawl_params
+from crawler.fields import SaveCompanyInformation, BulletinBasicFields
+
+
+class HuSpider:
+    """湖南省机构编制网"""
+
+    def __init__(self):
+        self.sign = 'hunan'
+        self.enable_proxy = None
+        self.nd_max_page_number = 10  # max pages of the annual-report disclosure listing
+        self.yw_max_page_number = 20  # max pages of the registration disclosure listing
+        self.years = ['2020', '2019', '2018', '2017', '2016', '2015', '2014']
+
+    def extract_detail_page(self, json_data: dict, **request_params):
+        rows = json_data.get('rows', [])
+        for row in rows:
+            item = {
+                'company': row.get('sydwmc', ''),
+                'legal_person': row.get('fddbr', ''),
+                'capital': row.get('kbzj', ''),
+                'capital_origin': row.get('jfly', ''),
+                'purpose_and_business': row.get('zzhywfw', ''),
+                'address': row.get('address', ''),
+                'social_id': row.get('unify_code', ''),
+                'district_code': row.get('unify_code', '')[2:8] if row.get('unify_code', '') else '',
+                'province': '湖南省',
+                'url': request_params.get('url', ''),
+                'page': request_params.get('page', '')
+            }
+            SaveCompanyInformation(BulletinBasicFields(**item), self.sign)
+
+    def crawl_spider(self, task: tuple):
+        url, page = task
+        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
+        # Strip the JSONP wrapper ("builtTable(...)") and parse the payload as JSON instead of eval-ing it
+        json_str = re.search(r'builtTable\((.*)\)', response.text, re.S).group(1)
+        result = json.loads(json_str)
+        self.extract_detail_page(result, url=url, page=page)
+
+    def generate_request_tasks(self):
+        results = []
+        nd_url = str(crawl_params(self.sign)[0])
+        yw_url = str(crawl_params(self.sign)[1])
+        for page in range(1, self.nd_max_page_number + 1):
+            for year in self.years:
+                url = nd_url.format(page, year)
+                results.append((url, page))
+
+        for page in range(1, self.yw_max_page_number + 1):
+            url = yw_url.format(page)
+            results.append((url, page))
+        yield from results
+
+    def run(self, enable_proxy=None, max_workers: int = 1):
+        self.enable_proxy = enable_proxy or False
+        with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+            Executor.map(self.crawl_spider, self.generate_request_tasks())

+ 178 - 0
company/crawler/spiders/SHSpider.py

@@ -0,0 +1,178 @@
+from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
+from urllib.parse import urljoin
+
+from lxml.html import fromstring, HtmlElement
+
+from config.load import crawl_sites
+from crawler.defaults import fetch_page_by_post, fetch_page_by_get, crawl_request
+from crawler.fields import (
+    SaveCompanyInformation,
+    BulletinBasicFields,
+)
+
+
+class SHSpider:
+
+    def __init__(self):
+        self.sign = 'sh'
+        self.enable_proxy = None
+        self.site = 'http://www.sydjsh.cn/'
+
+    def extract_text_and_save(self, url, yw_type):
+        # print(url, yw_type)
+        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
+        element = fromstring(response.text)
+        nodes = element.xpath('//table[@id="content"]//tr[position()>1]')
+        for node in nodes:
+            if yw_type in ['SL', 'BZ']:
+                item = BulletinBasicFields(
+                    company="".join(node.xpath('./td[3]/text()')),
+                    legal_person="".join(node.xpath('./td[5]/text()')),
+                    capital="".join(node.xpath('./td[7]/text()')) + '万元',
+                    capital_origin="".join(node.xpath('./td[6]/text()')),
+                    purpose_and_business="".join(node.xpath('./td[8]/text()')),
+                    address="".join(node.xpath('./td[4]/text()')),
+                    social_id="".join(node.xpath('./td[2]/text()')),
+                    status='create',
+                    province='上海'
+                )
+                SaveCompanyInformation(item, self.sign)
+
+            elif yw_type in ['BG', 'JGQTXS', 'JGQTBG', 'JGQTCX', 'JGQTBL', 'JGQTGQ']:
+                item = BulletinBasicFields(
+                    social_id="".join(node.xpath('./td[2]/text()')),
+                    company="".join(node.xpath('./td[3]/text()')),
+                    status='modify',
+                    province='上海'
+                )
+                SaveCompanyInformation(item, self.sign)
+
+            elif yw_type == 'ZX':
+                item = BulletinBasicFields(
+                    social_id="".join(node.xpath('./td[3]/text()')),
+                    company="".join(node.xpath('./td[4]/text()')),
+                    status='cancellation',
+                    province='上海'
+                )
+                SaveCompanyInformation(item, self.sign)
+
+    def generate_snapshot_links(self, url, data):
+        list_links = []
+        response = crawl_request(fetch_page_by_post, url, self.enable_proxy, data=data)
+        # print(url, data)
+        element = fromstring(response.text)
+        nodes = element.xpath('//div[@class="center1"]/ul/li')
+        for node in nodes:
+            href = "".join(node.xpath('./a/@href'))
+            if data['yw_type'] == 'JGQTCX':
+                # The official organ-deregistration listing builds this URL incorrectly, so patch it here
+                href = href.replace('jgqtXc', 'jgqtCx')
+            elif data['yw_type'] == 'JGQTXS':
+                href = href.replace('JgqtCl', 'jgqtCl')
+            url = urljoin(self.site, href)
+            list_links.append(url)
+        yield from list_links
+
+    def crawl_spider(self, task: tuple):
+        url, data = task  # listing-page info
+        with ThreadPoolExecutor(max_workers=5) as Executor:
+            futures = []
+            for link in self.generate_snapshot_links(url, data):
+                futures.append(Executor.submit(self.extract_text_and_save, link, data['yw_type']))
+            wait(futures, return_when=ALL_COMPLETED)
+
+    def task_list(self):
+        for spider in crawl_sites.get(self.sign):
+            url = "".join(spider.keys())
+            data: dict = spider.get(url)
+            total_page = int(data.get('pageTotal'))
+            for page in range(1, total_page + 1):
+                item = {
+                    "pageIndex": str(page),
+                    "yw_type": data.get('yw_type'),
+                    "vl": "item",
+                    "type": data.get('type'),
+                    "pageSize": data.get('pageSize')
+                }
+                yield url, item
+
+    def run(self, enable_proxy=None, max_workers: int = 1):
+        self.enable_proxy = enable_proxy or False
+        with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+            Executor.map(self.crawl_spider, self.task_list())
+
+
+class SHNDSpider:
+    """上海事业单位编制网 - 年度报告"""
+
+    def __init__(self):
+        self.enable_proxy = None
+        self.sign = 'sh'
+        self.district_mapping = {
+            '310000': {'region': ('上海', '上海市', '市属'), 'max_page_number': 622},
+            '310106': {'region': ('上海', '上海市', '静安区'), 'max_page_number': 281},
+            '310104': {'region': ('上海', '上海市', '徐汇区'), 'max_page_number': 208},
+            '310113': {'region': ('上海', '上海市', '宝山区'), 'max_page_number': 361},
+            '310109': {'region': ('上海', '上海市', '虹口区'), 'max_page_number': 186},
+            '310112': {'region': ('上海', '上海市', '闵行区'), 'max_page_number': 361},
+            '310230': {'region': ('上海', '上海市', '崇明区'), 'max_page_number': 317},
+            '310105': {'region': ('上海', '上海市', '长宁区'), 'max_page_number': 170},
+            '310107': {'region': ('上海', '上海市', '普陀区'), 'max_page_number': 231},
+            '310117': {'region': ('上海', '上海市', '松江区'), 'max_page_number': 314},
+            '310115': {'region': ('上海', '上海市', '浦东新区'), 'max_page_number': 741},
+            '310101': {'region': ('上海', '上海市', '黄浦区'), 'max_page_number': 225},
+            '310110': {'region': ('上海', '上海市', '杨浦区'), 'max_page_number': 210},
+            '310114': {'region': ('上海', '上海市', '嘉定区'), 'max_page_number': 284},
+            '310116': {'region': ('上海', '上海市', '金山区'), 'max_page_number': 265},
+            '310226': {'region': ('上海', '上海市', '奉贤区'), 'max_page_number': 265},
+            '310118': {'region': ('上海', '上海市', '青浦区'), 'max_page_number': 273}
+        }
+        self.url = 'http://www.sydjsh.cn/ndbg.do'
+
+    def extract_text_and_save(self, element: HtmlElement, code: str, **request_params):
+        province, city, county = self.district_mapping.get(code).get('region')
+        nodes = element.xpath('//*[@class="cursor"]')
+        for node in nodes:
+            social_id = "".join(node.xpath('./td[1]/text()'))
+            company = "".join(node.xpath('./td[2]/text()'))
+            if len(social_id) == 0 and len(company) == 0:
+                continue
+
+            item = BulletinBasicFields(
+                social_id=social_id,
+                company=company,
+                district_code=code,
+                province=province,
+                city=city,
+                county=county,
+                url=request_params.get('url'),
+                request_data=request_params.get('request_data'),
+                page=request_params.get('page')
+            )
+            SaveCompanyInformation(item, self.sign)
+
+    def generate_request_tasks(self):
+        results = []
+        for geo_code, data in self.district_mapping.items():
+            max_page_number = data.get('max_page_number') + 1
+            for page in range(1, max_page_number):
+                results.append({
+                    "pageIndex": str(page),
+                    "keyword": "",
+                    "type": "4",
+                    "year": "",
+                    "geo_code": geo_code
+                })
+        yield from results
+
+    def crawl_spider(self, data: dict):
+        geo_code = data.get('geo_code')
+        page = data.get('pageIndex')
+        response = crawl_request(fetch_page_by_post, self.url, self.enable_proxy, data=data)
+        element = fromstring(response.text)
+        self.extract_text_and_save(element, geo_code, page=page, url=self.url, request_data=data)
+
+    def run(self, enable_proxy=None, max_workers: int = 1):
+        self.enable_proxy = enable_proxy or False
+        with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+            Executor.map(self.crawl_spider, self.generate_request_tasks())

+ 56 - 0
company/crawler/spiders/SXSpider.py

@@ -0,0 +1,56 @@
+from concurrent.futures import ThreadPoolExecutor
+
+from lxml.html import fromstring, HtmlElement
+
+from crawler.defaults import fetch_page_by_post, crawl_request, crawl_params
+from crawler.fields import (
+    SaveCompanyInformation,
+    BulletinBasicFields
+)
+
+
+class SXSpider:
+
+    def __init__(self):
+        self.sign = 'sx'
+        self.enable_proxy = None
+
+    def extract_text_and_save(self, element: HtmlElement, **request_params):
+        nodes = element.xpath('//ul[@class="listLeft-item"]/li')
+        for node in nodes:
+            name = "".join(node.xpath('./a/text()')).strip()
+            item = BulletinBasicFields(
+                company=name,
+                province='陕西省',
+                url=request_params.get('url'),
+                request_data=request_params.get('request_data'),
+                page=request_params.get('page')
+            )
+            SaveCompanyInformation(item, self.sign)
+
+    def crawl_spider(self, task: tuple):
+        url, data, page = task
+        response = crawl_request(fetch_page_by_post, url, self.enable_proxy, data=data)
+        element = fromstring(response.text)
+        self.extract_text_and_save(element, url=url, page=page, request_data=data)
+
+    def generate_request_tasks(self):
+        results = []
+        for spider in crawl_params(self.sign):
+            url = "".join(spider.keys())
+            data: dict = spider.get(url)
+            total_page = int(data.get('pageTotal'))
+            for page in range(1, total_page + 1):
+                item = {
+                    'code': '',
+                    'year': data.get('year'),
+                    'contentUrlPage.pageSize': data.get('pageSize'),
+                    'contentUrlPage.currentPage': str(page)
+                }
+                results.append((url, item, page))
+        yield from results
+
+    def run(self, enable_proxy=None, max_workers: int = 1):
+        self.enable_proxy = enable_proxy or False
+        with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+            Executor.map(self.crawl_spider, self.generate_request_tasks())

+ 129 - 0
company/crawler/spiders/TJSpider.py

@@ -0,0 +1,129 @@
+from concurrent.futures import ThreadPoolExecutor
+
+from lxml.html import fromstring, HtmlElement
+
+from crawler.defaults import crawl_request, fetch_page_by_get, crawl_params
+from crawler.fields import BulletinBasicFields, SaveCompanyInformation
+
+
+class TJSpider:
+
+    def __init__(self):
+        self.sign = 'tj'
+        self.enable_proxy = None
+        self.district_mapping = {
+            '120101': {
+                'region': ('天津', '天津市', '和平区'),
+                'years': [('2020', 8), ('2019', 7), ('2018', 6), ('2017', 6), ('2016', 6), ('2015', 6)]
+            },
+            '120102': {
+                'region': ('天津', '天津市', '河东区'),
+                'years': [('2020', 11), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 6)]
+            },
+            '120103': {
+                'region': ('天津', '天津市', '河西区'),
+                'years': [('2020', 13), ('2019', 12), ('2018', 9), ('2017', 9), ('2016', 9), ('2015', 9)]
+            },
+            '120104': {
+                'region': ('天津', '天津市', '南开区'),
+                'years': [('2020', 12), ('2019', 9), ('2018', 9), ('2017', 9), ('2016', 9), ('2015', 9)]
+            },
+            '120105': {
+                'region': ('天津', '天津市', '河北区'),
+                'years': [('2020', 9), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 7)]
+            },
+            '120106': {
+                'region': ('天津', '天津市', '红桥区'),
+                'years': [('2020', 8), ('2019', 6), ('2018', 6), ('2017', 5), ('2016', 5), ('2015', 5)]
+            },
+            '120110': {
+                'region': ('天津', '天津市', '东丽区'),
+                'years': [('2020', 11), ('2019', 9), ('2018', 7), ('2017', 7), ('2016', 7), ('2015', 7)]
+            },
+            '120111': {
+                'region': ('天津', '天津市', '西青区'),
+                'years': [('2020', 11), ('2019', 10), ('2018', 8), ('2017', 7), ('2016', 6), ('2015', 6)]
+            },
+            '120112': {
+                'region': ('天津', '天津市', '津南区'),
+                'years': [('2020', 11), ('2019', 8), ('2018', 7), ('2017', 7), ('2016', 6), ('2015', 6)]
+            },
+            '120113': {
+                'region': ('天津', '天津市', '北辰区'),
+                'years': [('2020', 11), ('2019', 9), ('2018', 8), ('2017', 8), ('2016', 8), ('2015', 8)]
+            },
+            '120116': {
+                'region': ('天津', '天津市', '滨海新区'),
+                'years': [('2020', 28), ('2019', 27), ('2018', 23), ('2017', 22), ('2016', 21), ('2015', 21)]
+            },
+            '120221': {
+                'region': ('天津', '天津市', '宁河区'),
+                'years': [('2020', 15), ('2019', 13), ('2018', 11), ('2017', 10), ('2016', 9), ('2015', 9)]
+            },
+            '120222': {
+                'region': ('天津', '天津市', '武清区'),
+                'years': [('2020', 13), ('2019', 12), ('2018', 12), ('2017', 11), ('2016', 11), ('2015', 11)]
+            },
+            '120223': {
+                'region': ('天津', '天津市', '静海区'),
+                'years': [('2020', 17), ('2019', 16), ('2018', 16), ('2017', 16), ('2016', 14), ('2015', 13)]
+            },
+            '120224': {
+                'region': ('天津', '天津市', '宝坻区'),
+                'years': [('2020', 17), ('2019', 16), ('2018', 15), ('2017', 15), ('2016', 8), ('2015', 7)]
+            },
+            # '': {
+            #     'region': ('天津', '天津市', '蓟州区'),
+            #     'years': [('2020',), ('2019',), ('2018',), ('2017',), ('2016',),
+            #               ('2015',)]
+            # },
+        }
+
+    def extract_text_and_save(self, element: HtmlElement, region: tuple, code: str, **request_params):
+        nodes = element.xpath('//*[@class="zong1424"]/table//tr[last()]/td/table//tr[position()>1]')
+        province, city, county = region
+        for node in nodes:
+            social_id = "".join("".join(node.xpath('./td[2]/a/text()')).split())
+            company = "".join("".join(node.xpath('./td[3]/a/text()')).split())
+            if len(social_id) == 0 and len(company) == 0:
+                continue
+
+            item = BulletinBasicFields(
+                social_id=social_id,
+                company=company,
+                district_code=code,
+                province=province,
+                city=city,
+                county=county,
+                url=request_params.get('url', ''),
+                page=request_params.get('page', '')
+            )
+            SaveCompanyInformation(item, self.sign)
+
+    def crawl_spider(self, task: tuple):
+        url, region, district_code, page = task
+        response = crawl_request(fetch_page_by_get, url, self.enable_proxy)
+        element = fromstring(response.text)
+        self.extract_text_and_save(element, region, district_code, url=url, page=page)
+
+    def generate_request_tasks(self):
+        results = []
+        url = crawl_params('general')
+        for district_code, data in self.district_mapping.items():
+            region = data.get('region')
+            years = data.get('years')
+            for year, max_page_num in years:
+                for page in range(1, max_page_num + 1):
+                    link = url.format(
+                        page=page,
+                        district_code=district_code,
+                        year=year,
+                        select_page=page
+                    )
+                    results.append((link, region, district_code, page))
+        yield from results
+
+    def run(self, enable_proxy=None, max_workers: int = 1):
+        self.enable_proxy = enable_proxy or False
+        with ThreadPoolExecutor(max_workers=max_workers) as Executor:
+            Executor.map(self.crawl_spider, self.generate_request_tasks())

+ 7 - 0
company/crawler/spiders/__init__.py

@@ -0,0 +1,7 @@
+from .BJSpider import BJSpider
+from .GDSpider import GDSpider
+from .HuSpider import HuSpider
+from .SHSpider import SHSpider, SHNDSpider
+from .SXSpider import SXSpider
+from .TJSpider import TJSpider
+from .HBSpider import HBSpider

+ 9 - 0
company/main.py

@@ -0,0 +1,9 @@
+from crawler import activate_spider
+
+
+def main():
+    activate_spider(max_workers=4)
+
+
+if __name__ == '__main__':
+    main()

+ 0 - 0
company/utils/__init__.py


+ 41 - 0
company/utils/databases.py

@@ -0,0 +1,41 @@
+from typing import Optional
+
+import redis
+from pymongo import MongoClient
+from pymongo.database import Database
+from pymongo.collection import Collection
+from config.load import mongo_conf, redis_conf
+
+__all__ = ['MongoDBS', 'RedisDBS']
+
+
+class MongoDBS:
+    """ Mongo """
+
+    def __init__(self, db: str, coll: str, cfg: Optional[dict] = mongo_conf):
+        self.client = MongoClient(host=cfg['host'], port=cfg['port'])
+        self.db: Database = self.client[db]
+        self.collection: Collection = self.db[coll]
+
+    def __enter__(self):
+        return self.collection
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.client.close()
+
+
+class RedisDBS:
+    """ redis """
+
+    def __init__(self, cfg: Optional[dict] = redis_conf):
+        # decode_responses must be set on the ConnectionPool: redis.Redis ignores
+        # connection kwargs when an explicit connection_pool is passed in
+        pool = redis.ConnectionPool(
+            host=cfg['host'],
+            port=cfg['port'],
+            password=cfg['pwd'],
+            db=cfg['db'],
+            decode_responses=True
+        )
+        self.__r = redis.Redis(connection_pool=pool)
+
+    @property
+    def redis(self):
+        return self.__r

+ 40 - 0
company/utils/log.py

@@ -0,0 +1,40 @@
+from loguru import logger
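+
+# Each sink below keeps only records whose message contains its marker
+# ('[采集]', '[代理]', '数据库'), so one shared logger fans out to topic-specific files.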
+
+logger.add(
+    'logs/crawl_{time:YYYY-MM-DD}.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {message}',
+    level='INFO',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+    filter=lambda x: '[采集]' in x['message']
+)
+
+logger.add(
+    'logs/proxy.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {message}',
+    level='INFO',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+    filter=lambda x: '[代理]' in x['message']
+)
+
+logger.add(
+    'logs/databases.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {message}',
+    level='INFO',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+    filter=lambda x: '数据库' in x['message']
+)
+
+logger.add(
+    'logs/error.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {file}:{line} - {level} - {message}',
+    level='ERROR',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+)

+ 0 - 0
credit_china/__init__.py


+ 0 - 0
credit_china/config/__init__.py


+ 26 - 0
credit_china/config/conf.yaml

@@ -0,0 +1,26 @@
+headers:
+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36
+  Accept: '*/*'
+
+
+# sockes5 代理
+proxy:
+  socks5:
+    url: http://socks.spdata.jianyu360.com/socks/getips?limit=10
+    decrypt: ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/
+
+
+# mongo
+mongo:
+  host: 172.17.4.87
+  port: !!int 27080
+#  host: 127.0.0.1
+#  port: !!int 27017
+
+
+# redis
+redis:
+  host: 127.0.0.1
+  port: !!int 6379
+  pwd: ""
+  db: !!int 10

+ 16 - 0
credit_china/config/load.py

@@ -0,0 +1,16 @@
+from pathlib import Path
+
+import yaml
+
+__all__ = ['mongo_conf', 'redis_conf', 'jy_proxy', 'headers']
+
+base_path = Path(__file__).parent
+yaml_conf = (base_path / 'conf.yaml').resolve()
+yaml_sites = (base_path / 'sites.yaml').resolve()
+
+with open(yaml_conf, encoding="utf-8") as f:
+    conf = yaml.safe_load(f)
+    mongo_conf = conf['mongo']
+    redis_conf = conf['redis']
+    jy_proxy: dict = conf['proxy']
+    headers: dict = conf['headers']

+ 237 - 0
credit_china/crawl_spiders.py

@@ -0,0 +1,237 @@
+import threading
+
+import requests
+
+from exceptions import InvalidProxiesException
+from utils.log import logger
+
+__all__ = ['QueryList', 'QueryDetail', 'crawl_spider']
+Lock = threading.Lock()
+
+
+class CreditChinaListSpider:
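+    """Keyword search against the public Credit China catalogSearchHome API."""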
+
+    def __init__(self, keyword: str = '', proxies: dict = None):
+        self.proxies = proxies
+        self.url = "https://public.creditchina.gov.cn/private-api/catalogSearchHome"
+        self.headers = {
+            "Host": "public.creditchina.gov.cn",
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
+            "Origin": "https://www.creditchina.gov.cn",
+            "Referer": "https://www.creditchina.gov.cn/",
+            "Accept-Language": "zh-CN,zh;q=0.9"
+        }
+        self.keyword = keyword
+        self.params = {
+            "keyword": self.keyword,
+            "scenes": "defaultScenario",
+            "tableName": "credit_xyzx_tyshxydm",
+            "searchState": "2",
+            "entityType": "1,2,4,5,6,7,8",
+            "templateId": "",
+            "page": "1",
+            "pageSize": "10"
+        }
+        self.results = []
+
+    def set_results(self, val: list):
+        self.results = val
+
+    def get_results(self):
+        return self.results
+
+    def crawl_request(self):
+        request_params = {
+            'headers': self.headers,
+            'proxies': self.proxies,
+            'timeout': 60
+        }
+        try:
+            r = requests.get(self.url, params=self.params, **request_params)
+            logger.info(f"[采集]{self.keyword} 列表查询状态:{r.status_code}")
+            return r
+        except requests.exceptions.ReadTimeout:
+            raise InvalidProxiesException()
+        except requests.exceptions.ConnectTimeout:
+            raise InvalidProxiesException()
+        except requests.RequestException as e:
+            logger.error(e.__class__.__name__)
+
+    def crawl_response(self, response):
+        results = []
+        data_json = response.json()
+        if len(data_json) > 0:
+            data_list = data_json.get('data').get('list')
+            logger.info('[采集]列表查询:{} 结果:{}条'.format(self.keyword, len(data_list)))
+            for item in data_list:
+                results.append({
+                    'entity_uuid': item['uuid'],
+                    'entity_name': item['accurate_entity_name'],
+                    'entity_code': item['accurate_entity_code'],
+                    'entity_type': item['entityType'],
+                    'entity_name_query': item['accurate_entity_name_query'],
+                    'recid': item['recid'],
+                })
+            return results
+
+    def crawl_spider(self):
+        response = self.crawl_request()
+        if response is None:  # request failed and was already logged
+            return
+        results = self.crawl_response(response) or []
+        self.set_results(results)
+
+    def start(self):
+        self.crawl_spider()
+
+    def __iter__(self):
+        return iter(self.get_results())
+
+    def __call__(self, *args, **kwargs):
+        self.proxies = kwargs.get('proxies')
+        if 'keyword' in kwargs and kwargs.get('keyword') is not None:
+            self.keyword = kwargs.get('keyword')
+
+        self.params.update({'keyword': self.keyword})
+        if len(self.keyword) > 0:
+            self.start()
+        return self
+
+
+class CreditChinaDetailSpider:
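+    """Fetch registration details for a single entity via the getTyshxydmDetailsContent API."""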
+
+    def __init__(
+            self,
+            entity_uuid: str = '',
+            entity_code: str = '',
+            entity_name: str = '',
+            entity_type: str = '',
+            proxies: dict = None):
+        self.uuid = entity_uuid
+        self.social_id = entity_code
+        self.keyword = entity_name
+        self.entity_type = entity_type
+        self.proxies = proxies
+        self.url = "https://public.creditchina.gov.cn/private-api/getTyshxydmDetailsContent"
+        self.headers = {
+            "Host": "public.creditchina.gov.cn",
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
+            "Origin": "https://www.creditchina.gov.cn",
+            "Referer": "https://www.creditchina.gov.cn/",
+            "Accept-Language": "zh-CN,zh;q=0.9"
+        }
+        self.params = {
+            "keyword": self.keyword,
+            "scenes": "defaultscenario",
+            "entityType": self.entity_type,
+            "searchState": "1",
+            "uuid": self.uuid,
+            "tyshxydm": self.social_id
+        }
+        self.results = {}
+
+    def crawl_request(self):
+        request_params = {
+            'headers': self.headers,
+            'proxies': self.proxies,
+            'timeout': 60
+        }
+        try:
+            r = requests.get(self.url, params=self.params, **request_params)
+            # logger.info(f"[采集]{self.keyword} 详情查询状态:{r.status_code}")
+            return r
+        except requests.exceptions.ReadTimeout:
+            raise InvalidProxiesException()
+        except requests.exceptions.ConnectTimeout:
+            raise InvalidProxiesException()
+        except requests.RequestException as e:
+            logger.error(e.__class__.__name__)
+
+    def crawl_response(self, response):
+        data_json = response.json()
+        if len(data_json) > 0:
+            # message = data_json.get('message')
+            # logger.info('[采集]详情查询:{} 结果:{}'.format(self.keyword, message))
+            try:
+                data = data_json.get('data').get('data')
+                entity = data.get('entity')
+                head_entity = data_json.get('data').get('headEntity')
+                results = {
+                    'entity_type': data.get('data_catalog', ''),  # 主体类型
+                    'social_id': head_entity.get('tyshxydm') or self.social_id,  # 统一社会信用代码
+                    'status': head_entity.get('status'),  # 运行状态
+                    'entity': head_entity.get('dymc') or entity.get('dymc', ''),  # 第一名称
+                    'entity_1': entity.get('demc', ''),  # 第二名称
+                    'entity_2': entity.get('dsmc', ''),  # 第三名称
+                    'entity_other': entity.get('qtmc', ''),  # 其他名称
+                    'legal_person': entity.get('fddbr', ''),  # 法人
+                    'capital_origin': entity.get('jfly', ''),  # 经费来源
+                    'capital': (entity.get('kbzj', '') + '万元') if entity.get('kbzj') else '',  # 开办资金
+                    'jbdw': entity.get('jbdw', ''),  # 举办单位
+                    'spjg': entity.get('spjg', ''),  # 审批机关
+                    'zsyxqz1': entity.get('zsyxqz1', ''),  # 证书有效期自
+                    'zsyxqz2': entity.get('zsyxqz2', ''),  # 证书有效期至
+                    'address': entity.get('dz', ''),  # 地址
+                    'purpose_and_business': entity.get('zzhywfw', ''),  # 宗旨
+                }
+                return results
+            except Exception as e:
+                logger.error(e.__class__.__name__)
+
+    def set_result(self, val: dict):
+        self.results = val
+
+    def get_result(self):
+        return self.results
+
+    def crawl_spider(self):
+        response = self.crawl_request()
+        if response is None:  # request failed and was already logged
+            return
+        results = self.crawl_response(response) or {}
+        self.set_result(results)
+
+    def start(self):
+        self.crawl_spider()
+
+    def __call__(self, *args, **kwargs):
+        self.proxies = kwargs.get('proxies')
+        for key, value in kwargs.items():
+            if key == 'entity_uuid' and kwargs.get('entity_uuid') is not None:
+                self.uuid = kwargs.get(key)
+            elif key == 'entity_code' and kwargs.get('entity_code') is not None:
+                self.social_id = kwargs.get('entity_code')
+            elif key == 'entity_name' and kwargs.get('entity_name') is not None:
+                self.keyword = kwargs.get('entity_name')
+            elif key == 'entity_type' and kwargs.get('entity_type') is not None:
+                self.entity_type = kwargs.get('entity_type')
+
+        self.params.update({
+            'keyword': self.keyword,
+            "entityType": self.entity_type,
+            "uuid": self.uuid,
+            "tyshxydm": self.social_id
+        })
+        conditions = [
+            len(self.uuid) > 0,
+            len(self.social_id) > 0,
+            len(self.keyword) > 0,
+            len(self.entity_type) > 0
+        ]
+        if all(conditions):
+            self.start()
+        return self
+
+
+QueryList = CreditChinaListSpider()
+QueryDetail = CreditChinaDetailSpider()
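+# Module-level singletons shared by all worker threads; crawl_spider() below
+# serialises access with Lock because each call mutates their request state.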
+
+
+def crawl_spider(keyword: str, proxies: dict = None):
+    results = []
+    # hold the lock via a context manager so it is released even when an
+    # InvalidProxiesException bubbles up; QueryList/QueryDetail are shared singletons
+    with Lock:
+        for items in QueryList(keyword=keyword, proxies=proxies):
+            detail = QueryDetail(**items)
+            if detail.results:
+                results.append(detail.results)
+    return results

+ 7 - 0
credit_china/exceptions.py

@@ -0,0 +1,7 @@
+
+
+class InvalidProxiesException(Exception):
+    """无效的代理,抛出该异常"""
+
+    def __init__(self, *args, **kwargs):
+        pass

+ 69 - 0
credit_china/main.py

@@ -0,0 +1,69 @@
+from concurrent.futures import ThreadPoolExecutor
+
+from crawl_spiders import crawl_spider
+from exceptions import InvalidProxiesException
+from utils.databases import MongoDBS
+from utils.log import logger
+from utils.socks5 import Proxy
+
+proxy = Proxy(True)
+
+
+# def save_data_by_mongo(collection: str, results: list, item: dict):
+#     success = MongoDBS('py_spider', collection).collection.update_one(
+#         {"_id": item["_id"]},
+#         {'$set': {'basic_info': results}}
+#     )
+#     msg = "[Mongo数据库]{} 查询结果:{}条 更新:{}条 ".format(
+#         item['_id'],
+#         len(results),
+#         success.modified_count,
+#     )
+#     logger.info(msg)
+
+def save_data_by_mongo(collection: str, results: list, item: dict):
+    success = MongoDBS('py_spider', collection).collection.insert_many(results)
+    msg = "[Mongo数据库]{} 查询结果:{}条 成功添加:{}条 ".format(
+        item['company_name'],
+        len(results),
+        len(success.inserted_ids)
+    )
+    logger.info(msg)
+
+
+def crawl_spiders(item: dict):
+    global proxy
+    while True:
+        try:
+            # results = crawl_spider(item['name'], proxy.proxies)
+            results = crawl_spider(item['company_name'], proxy.proxies)
+            if len(results) > 0:
+                # save_data_by_mongo('buyer_err', results, item)
+                save_data_by_mongo('company_basic_info_all', results, item)
+            break
+        except InvalidProxiesException:
+            proxy.switch()
+
+
+# def main():
+#     query = {'basic_info': {'$exists': False}}
+#     with ThreadPoolExecutor(max_workers=10) as Executor:
+#         with MongoDBS('py_spider', 'buyer_err') as coll:
+#             with coll.find(query, no_cursor_timeout=True, batch_size=10) as cursor:
+#                 # task = []
+#                 # for item in cursor.limit(10):
+#                 #     task.append(Executor.submit(crawl_spiders, item))
+#                 # wait(task, return_when=ALL_COMPLETED)
+#                 Executor.map(crawl_spiders, cursor)
+
+def main():
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        # with MongoDBS('py_spider', 'company_basic_info_all') as coll:
+        with MongoDBS('py_spider', 'company_name') as coll:
+            with coll.find(no_cursor_timeout=True, batch_size=10) as cursor:
+                executor.map(crawl_spiders, cursor)
+
+
+if __name__ == '__main__':
+    main()

+ 0 - 0
credit_china/utils/__init__.py


+ 43 - 0
credit_china/utils/databases.py

@@ -0,0 +1,43 @@
+from typing import Optional
+
+import redis
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from pymongo.database import Database
+
+from config.load import mongo_conf, redis_conf
+
+__all__ = ['MongoDBS', 'RedisDBS']
+
+
+class MongoDBS:
+    """ Mongo """
+
+    def __init__(self, db: str, coll: str, cfg: Optional[dict] = mongo_conf):
+        self.client = MongoClient(host=cfg['host'], port=cfg['port'])
+        self.db: Database = self.client[db]
+        self.collection: Collection = self.db[coll]
+
+    def __enter__(self):
+        return self.collection
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.client.close()
+
+
+class RedisDBS:
+    """ redis """
+
+    def __init__(self, cfg: Optional[dict] = redis_conf):
+        # decode_responses must be set on the ConnectionPool: redis.Redis ignores
+        # connection kwargs when an explicit connection_pool is passed in
+        pool = redis.ConnectionPool(
+            host=cfg['host'],
+            port=cfg['port'],
+            password=cfg['pwd'],
+            db=cfg['db'],
+            decode_responses=True
+        )
+        self.__r = redis.Redis(connection_pool=pool)
+
+    @property
+    def redis(self):
+        return self.__r
+

+ 50 - 0
credit_china/utils/log.py

@@ -0,0 +1,50 @@
+from loguru import logger
+
+logger.add(
+    'logs/crawl_{time:YYYY-MM-DD}.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {message}',
+    level='INFO',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+    filter=lambda x: '[采集]' in x['message']
+)
+
+logger.add(
+    'logs/crawl_{time:YYYY-MM-DD}.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {message}',
+    level='INFO',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+    filter=lambda x: '[浏览器]' in x['message']
+)
+
+logger.add(
+    'logs/proxy.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {message}',
+    level='INFO',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+    filter=lambda x: '[代理]' in x['message']
+)
+
+logger.add(
+    'logs/databases.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {message}',
+    level='INFO',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+    filter=lambda x: '数据库' in x['message']
+)
+
+logger.add(
+    'logs/error.log',
+    format='{time:YYYY-MM-DD HH:mm:ss} - {file}:{line} - {level} - {message}',
+    level='ERROR',
+    rotation='500 MB',
+    retention='1 week',
+    encoding='utf-8',
+)

+ 151 - 0
credit_china/utils/socks5.py

@@ -0,0 +1,151 @@
+import copy
+import time
+
+import requests
+
+from config.load import jy_proxy, headers
+from utils.log import logger
+
+__all__ = ['Proxy']
+
+
+def decrypt(input_str: str) -> str:
+    """
+    定义base64解密函数
+
+    :param input_str:
+    :return:
+    """
+    # 对前面不是“=”的字节取索引,然后转换为2进制
+    key = jy_proxy['socks5']['decrypt']
+    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
+    output_str = ''
+    # 补齐“=”的个数
+    equal_num = input_str.count('=')
+    while ascii_list:
+        temp_list = ascii_list[:4]
+        # 转换成2进制字符串
+        temp_str = ''.join(temp_list)
+        # 对没有8位2进制的字符串补够8位2进制
+        if len(temp_str) % 8 != 0:
+            temp_str = temp_str[0:-1 * equal_num * 2]
+        # 4个6字节的二进制  转换  为三个8字节的二进制
+        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
+        # 二进制转为10进制
+        temp_str_list = [int(x, 2) for x in temp_str_list if x]
+        # 连接成字符串
+        output_str += ''.join([chr(x) for x in temp_str_list])
+        ascii_list = ascii_list[4:]
+    return output_str
+
+
+class Socks5Proxy:
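+    """Singleton pool of SOCKS5 proxies fetched from the service configured under proxy.socks5."""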
+
+    __instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if cls.__instance is None:
+            cls.__instance = super().__new__(cls)
+        return cls.__instance
+
+    def __init__(self):
+        self.__proxies = {}
+
+    def _init(self):
+        self.url = jy_proxy['socks5']['url']
+        self.pool = []
+        self.index = 0  # 当前代理在代理池的位置
+        self.counter = {}
+        self.seconds = 60
+
+        while not self.__proxies:
+            if len(self.pool) > 0 and not self.__proxies:
+                self.__proxies = copy.deepcopy(self.pool[self.index])
+            else:
+                self.generate_pool()
+
+    @property
+    def proxies(self):
+        return self.__proxies
+
+    def switch(self, reset=False):
+        """切换代理"""
+        if reset is True:
+            self.counter.clear()
+            self.flush_pool()
+
+        elif len(self.counter) > 0:
+            end_time = self.counter[str(self.__proxies)]
+            current_time = int(time.time())
+            if end_time - current_time < self.seconds:
+                self.pool.remove(self.__proxies)
+                logger.info(f"[代理]移除:{self.__proxies}")
+                del self.counter[str(self.__proxies)]
+                logger.info(f"[代理]剩余个数:{len(self.pool)}")
+
+        self.__proxies = {}  # 重置代理
+
+        while not self.proxies:
+            if len(self.pool) > 0:
+                self.index += 1
+                if self.index >= len(self.pool):
+                    self.index = 0
+                self.__proxies = copy.deepcopy(self.pool[self.index])
+                logger.info(f"[代理]切换 - {self.index}")
+            else:
+                logger.info("[代理]无可用代理")
+                self.flush_pool()
+
+    def generate_pool(self):
+        """初始化代理池"""
+        self.__socks5()
+        self.__check_proxies()
+
+    def flush_pool(self):
+        logger.info(f"[代理]刷新代理池")
+        self.pool.clear()
+        self.generate_pool()
+
+    def __socks5(self):
+        logger.info(f"[代理]请求服务:{self.url}")
+        try:
+            response = requests.get(self.url, timeout=10)
+            self.__extract_ip(response)
+        except requests.RequestException:
+            pass
+
+    def __extract_ip(self, response):
+        for proxy in response.json():
+            host = decrypt(proxy['host'])
+            port = int(proxy['port'])
+            end_time = proxy['EndTime']
+            items = {
+                'http': 'socks5://{}:{}'.format(host, port),
+                'https': 'socks5://{}:{}'.format(host, port)
+            }
+            self.pool.append(items)
+            self.counter.setdefault(str(items), end_time)
+
+    def __check_proxies(self):
+        check_ip = 'https://myip.ipip.net'
+        logger.info(f"[代理]通信检查:{check_ip}")
+        for proxies in self.pool[::-1]:
+            try:
+                requests_param = {
+                    "headers": headers,
+                    "proxies": proxies,
+                    "timeout": 10
+                }
+                requests.get(check_ip, **requests_param)
+            except requests.RequestException:
+                self.pool.remove(proxies)
+                del self.counter[str(proxies)]
+        logger.info(f"[代理]可用个数:{len(self.pool)}")
+
+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
+        if enable_proxy:
+            self._init()
+        return self
+
+
+Proxy = Socks5Proxy()
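+# Usage (as in main.py): ``Proxy(True)`` builds and health-checks the pool once;
+# ``.proxies`` is a requests-style {'http': 'socks5://host:port', ...} mapping and
+# ``.switch()`` rotates to the next proxy, discarding entries that are about to expire.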

+ 20 - 0
credit_china/utils/tools.py

@@ -0,0 +1,20 @@
+import hashlib
+
+import bson
+
+
+def hex_sign(content: str) -> str:
+    """
+    十六进制数字字符串形式摘要值
+
+    @param content: 字符串文本
+    @return: 摘要值
+    """
+    sha1 = hashlib.sha1()
+    sha1.update(content.encode("utf-8"))
+    return sha1.hexdigest()
+
+
+def int2long(param: int):
+    """int 转换成 long """
+    return bson.int64.Int64(param)

+ 121 - 0
hospital/crawl_detail_page.py

@@ -0,0 +1,121 @@
+import requests
+
+from default import (
+    crawl_tab,
+    html2element,
+    save_tab,
+    hospital_name,
+    hospital_alias, hospital_main_department
+)
+
+headers = {
+    "authority": "www.yixue.com",
+    "cache-control": "max-age=0",
+    "sec-ch-ua": "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"97\", \"Chromium\";v=\"97\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"Windows\"",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "sec-fetch-site": "same-origin",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-user": "?1",
+    "sec-fetch-dest": "document",
+    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "referer": "https://www.yixue.com/%E6%B5%B7%E8%A5%BF%E8%92%99%E5%8F%A4%E6%97%8F%E8%97%8F%E6%97%8F%E8%87%AA%E6%B2%BB%E5%B7%9E%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8",
+}
+
+
+def start():
+    count = 0
+    q = {'finished': False}
+    with crawl_tab.find(q, no_cursor_timeout=True, batch_size=5) as cursor:
+        for item in cursor:
+            count += 1
+            url = item['url']
+            headers.update({'referer': item['refer']})
+            try:
+                response = requests.get(url, headers=headers, timeout=60)
+                element = html2element(response.text)
+
+                table = element.xpath('//div[@id="mw-content-text"]/div/table[1]/@class')
+                # hospital details laid out in a table on the page
+                if len(table) > 0 and 'navbox' not in table:
+                    node_table = element.xpath('//div[@id="mw-content-text"]/div/table[1]')[0]
+                    name = "".join(node_table.xpath('.//tr[1]/th/span/text()')).strip()
+                    hospital = {
+                        'origin_name': name,
+                        'origin_url': url,
+                        'name': hospital_name(name),  # 医院名称
+                        'level': '',  # 医院等级
+                        'type': '',  # 医院类型
+                        'address': "".join(node_table.xpath('.//tr[2]/td/text()')).strip(),  # 医院地址
+                        'main_depart': '',  # 重点科室
+                        'business_type': '',  # 经营方式
+                        'tel': "".join(node_table.xpath('.//tr[3]/td/text()')).strip(),  # 联系电话
+                        'fax_number': "".join(node_table.xpath('.//tr[4]/td/text()')).strip().replace('&nbsp;', ''),  # 传真号码
+                        'e_mail': "".join(node_table.xpath('.//tr[5]/td/text()')).strip().replace('&nbsp;', ''),  # 电子邮箱
+                        'postcode': "".join(node_table.xpath('.//tr[6]/td/text()')).strip().replace('&nbsp;', ''), # 邮政编码
+                        'website': "".join(node_table.xpath('.//tr[7]/td/a/@href')).strip(),  # 医院网站
+                        'alias': hospital_alias(name),  # 其他名称
+                        'area': item.get('province', ''),  # 省份
+                        'city': item.get('city', ''),  # 城市
+                        'district': item.get('district', '')  # 区县
+                    }
+                else:
+                    # 重点科室
+                    line1 = "、".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[5]/text()')).strip()
+                    line2 = "、".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[5]/a/text()')).strip()
+                    line = "{}、{}".format(line1, line2)
+                    main_department = hospital_main_department(line)
+                    # 发布的医院名称
+
+                    name_xpath = [
+                        '//div[@id="mw-content-text"]/div/p[1]/b/text()',
+                        '//*[@id="firstHeading"]/text()'
+                    ]
+                    for _xpath in name_xpath:
+                        name = "".join(element.xpath(_xpath)).strip()
+                        if len(name) > 0:
+                            break
+                    else:
+                        name = ''
+
+                    hospital = {
+                        'origin_name': name,
+                        'origin_url': url,
+                        'name': hospital_name(name),  # 医院名称
+                        'level': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[3]/a/text()')).strip(),  # 医院等级
+                        'type': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[4]/a/text()')).strip(),  # 医院类型
+                        'address': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[1]/text()')).strip().replace(":", ""),  # 医院地址
+                        'main_depart': main_department,  # 重点科室
+                        'business_type': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[6]/a/text()')).strip(),  # 经营方式
+                        'tel': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[2]/text()')).strip().replace(":", ""),  # 联系电话
+                        'fax_number': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[7]/text()')).strip().replace(":", ""),  # 传真号码
+                        'e_mail': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[9]/text()')).strip().replace(":", ""),  # 电子邮箱
+                        'postcode': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[8]/text()')).strip().replace(":", ""), # 邮政编码
+                        'website': "".join(element.xpath('//div[@id="mw-content-text"]/div/ul/li[10]/a/text()')).strip(),  # 医院网站
+                        'alias': hospital_alias(name),  # 其他名称
+                        'area': item.get('province', ''),  # 省份
+                        'city': item.get('city', ''),  # 城市
+                        'district': item.get('district', '')  # 区县
+                    }
+                save_tab.insert_one(hospital)
+                crawl_tab.update_one(
+                    {'_id': item['_id']},
+                    {'$set': {'finished': True}}
+                )
+                print(f"[采集成功] {item['name']}")
+            except Exception:
+                crawl_tab.update_one(
+                    {'_id': item['_id']},
+                    {'$set': {'finished': False}}
+                )
+                print(f"[采集失败] {item['name']}")
+
+            if count % 500 == 0:
+                print(f"已采集 {count} 条")
+
+
+if __name__ == '__main__':
+    start()

+ 50 - 0
hospital/crawl_list_page.py

@@ -0,0 +1,50 @@
+import time
+from urllib.parse import urljoin
+
+import requests
+
+from default import (
+    crawl_tab,
+    headers,
+    html2element,
+    area_tab,
+    unknown_element
+)
+
+
+def start():
+    with area_tab.find() as cursor:
+        for hospital in cursor:
+            url = hospital['url']
+            response = requests.get(url, headers=headers, timeout=60)
+            # print(response)
+            element = html2element(response.text)
+            nodes = element.xpath('//div[@id="mw-content-text"]/div/ul')
+            if len(nodes) > 0:
+                ul = nodes[-2]
+                items = []
+                for li in ul:
+                    try:
+                        a = li.xpath('./b/a')[-1]
+                    except IndexError:
+                        unknown_element(li, hospital)
+                        continue
+
+                    title = a.attrib.get('title')
+                    href = a.attrib.get('href')
+                    link = urljoin(url, href)
+                    # print(title, link)
+                    items.append({
+                        'name': title,
+                        'url': link,
+                        'refer': url,
+                        'province': hospital.get('province', ''),
+                        'city': hospital.get('city', ''),
+                        'district': hospital.get('district', '')
+                    })
+                result = crawl_tab.insert_many(items)
+                print(f"{hospital['name']} 共有医院 {len(result.inserted_ids)} 家")
+
+
+if __name__ == '__main__':
+    start()

+ 51 - 0
hospital/crawl_region.py

@@ -0,0 +1,51 @@
+import re
+from urllib.parse import urljoin
+
+import requests
+
+from default import (
+    html2element,
+    headers,
+    query_address,
+    query_region,
+    area_tab
+)
+
+Address = query_address()
+
+
+def start():
+
+    url = "https://www.yixue.com/%E5%85%A8%E5%9B%BD%E5%8C%BB%E9%99%A2%E5%88%97%E8%A1%A8"
+    response = requests.get(url, headers=headers, timeout=60)
+    element = html2element(response.text)
+    nodes = element.xpath('//div[@id="mw-content-text"]/div/p')
+    p_nodes = nodes[: len(nodes) - 5]
+    for node in p_nodes:
+        a_nodes = node.xpath('./a')
+        if len(a_nodes) > 1:
+            items = []
+            for a in a_nodes:
+                href = a.attrib.get('href')
+                title = a.attrib.get('title')
+                link = urljoin(url, href)
+
+                if '页面不存在' in title:
+                    continue
+
+                print(title, link)
+                result = re.search('(.*)医院列表', title).group(1)
+                result = re.split('省|自治区', result)
+                # print(result)
+                region = result[-1]
+                # print(region)
+                item = query_region(region, Address)
+                # print(f'>>> ', item)
+                if item is not None:
+                    items.append({'name': title, 'url': link, **item})
+            result = area_tab.insert_many(items)
+            print(f"医院区域列表 新增 {len(result.inserted_ids)} 条")
+
+
+if __name__ == '__main__':
+    start()

+ 185 - 0
hospital/default.py

@@ -0,0 +1,185 @@
+import io
+import re
+
+from lxml.html import fromstring
+from pymongo import MongoClient
+from pymongo.collection import Collection
+
+from lxml.html import tostring
+
+hospital = MongoClient('127.0.0.1:27017').hospital
+area_tab: Collection = hospital.area
+crawl_tab: Collection = hospital.list_item
+save_tab: Collection = hospital.data_info
+err_tab: Collection = hospital.crawl_error
+
+region = MongoClient('127.0.0.1:27017').region
+address_tab: Collection = region.address
+
+headers = {
+    "authority": "www.yixue.com",
+    "cache-control": "max-age=0",
+    "sec-ch-ua": "^\\^",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "^\\^Windows^^",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "sec-fetch-site": "none",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-user": "?1",
+    "sec-fetch-dest": "document",
+    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8"
+}
+
+
+def html2element(html):
+    return fromstring(html)
+
+
+def element2html(element):
+    return tostring(element, encoding='utf-8').decode('utf-8')
+
+
+def unknown_element(element, item):
+    page_source = element2html(element)
+    data = {
+        **item,
+        'page_source': page_source
+    }
+    err_tab.insert_one(data)
+
+
+def query_address(query=None, projection=None):
+    results = []
+    if query is None:
+        query = {}
+    if projection is None:
+        projection = {'province': 1, 'city': 1, 'district': 1}
+    with address_tab.find(query, projection=projection) as cursor:
+        for item in cursor:
+            del item['_id']
+            results.append(item)
+    return results
+
+
+def remove_suffix(text):
+    if text is None:
+        return None
+    return re.sub('省|自治区|地区', '', text)
+
+
+def _query_region(text, items):
+    _find_result = []
+    for k in ['district', 'city', 'province']:
+        for item in items:
+            _val = item.get(k)
+            if text is not None and _val is not None and text == _val:
+                if k == 'province':
+                    _find_result.append({'province': _val})
+                elif k == 'city':
+                    _find_result.append({'province': item.get('province'), 'city': _val})
+                else:
+                    _find_result.append(item)
+
+    if len(_find_result) > 0:
+        return _find_result
+    return None
+
+
+def query_region(text, address):
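+    """Map a place-name fragment (e.g. 'XX省YY市') to a {province, city, district} record from ``address``."""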
+    result = re.match('(.*(?:省|自治区|市)){0,1}(.*(?:市|区|县|州|盟)){0,1}', text)
+    prov, city = result.groups()  # 抽取省 市信息
+    prov = remove_suffix(prov)
+    city = remove_suffix(city)
+    if prov is None and city is None:
+        return {}
+    '''查询省市信息'''
+    item = (_query_region(city, address) or _query_region(prov, address))
+    # print(item)
+    if item is None:
+        return None
+    elif len(item) > 1 and prov is not None:
+        '''查询结果大于1,通过精准对比市级或者省级名称取出数据'''
+        for _item in item:
+            if prov == _item.get('city') or prov == _item['province']:
+                return _item
+    elif len(item) > 1 and prov is None:
+        '''查询结果大于1,直接给出一个省市信息'''
+        _item = item[0]
+        return {'province': _item.get('province'), 'city': _item.get('city')}
+    else:
+        return item[0]
+
+
+res_name = re.compile('(.*(?:院|区|部|所|馆|科|局|诊|病|场|康|站|点|社|字|室|会|瘤|大|矿|腔|堂|岗|合|〗|校|办|)|号|坊|医|房|贵|光|吾|门诊|体检|中心|公司|机构|集团|美容|整形|部队|保健|基地|服务)){0,1}((.*)){0,1}$')
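+# Group 1 captures the hospital name proper; group 2 captures an optional
+# parenthesised alias list, which hospital_alias() below splits on '、'.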
+
+
+def hospital_alias(text: str):
+    """医院别名"""
+    res = res_name.match(text)
+    _, _other = res.groups()
+    if _other is not None:
+        _other = _other[1:-1]
+        _other = ",".join(_other.split('、'))
+        # print(_other)
+    return _other if _other is not None else ''
+
+
+def hospital_name(text: str):
+    res = res_name.match(text)
+    _name, _ = res.groups()
+    return _name
+
+
+def hospital_main_department(text: str):
+    res = re.match(':(.*){0,1}(、){0,1}$', text)
+    if res is None:
+        return ''
+    _department, _ = res.groups()
+    if _department is not None:
+        # print(_department)
+        _departments = _department.split('、')
+        _stream = io.StringIO()
+        for val in _departments:
+            if len(val) == 0:
+                continue
+            else:
+                _stream.write(val + '、')
+
+        _department = _stream.getvalue()
+        # print(_department[:-1])
+    return _department[:-1] if _department is not None else ''
+
+
+if __name__ == '__main__':
+    # ma = ':特需门诊、银屑病、白癜风科、、痤疮门诊、灰指甲专科'
+    # ma = ':、、、、、、、、消化内科、心血管内科、眼科、产科'
+    # ma = ':、心脏科、神经外科'
+    # hospital_main_department(ma)
+    # name = '北京鸿慈童康'
+    # name = '上海精神卫生康复医院二部'
+    # name = '海湾镇燎原卫生院'
+    # name = '张家港时代港口医院有限公司'
+    # name = '北京玉之光医疗整形美容国际连锁机构(玉之光(北京)国际医疗美容整形机构)'
+    # name = '中国人民解放军第306医院(三零六医院、三0六医院)'
+    # name = '上海浦东新区迎博社区卫生服务站'
+    # name = '上海徐剑炜整形美容'
+    # print(hospital_name(name))
+    # print(hospital_alias(name))
+
+    # name = '上海市宝山区医院列表'
+    # name = '北京市宣武区医院列表'
+    # name = '北京市延庆县医院列表'
+    # name = '甘孜藏族自治州医院列表'
+    # name = '湖北省神农架林区医院列表'
+    # name = '永州市医院列表'
+    # name = '德宏傣族景颇族自治州医院列表'
+    # name = '云南省丽江地区医院列表' # 1
+    # name = '延边朝鲜族自治州医院列表'
+    # name = '兴安盟医院列表'
+    # name = '新疆维吾尔自治区喀什地区医院列表'
+    name = '石河子市医院列表'
+    address = query_address()
+    result = query_region(name, address)
+    print(result)

+ 18 - 0
hospital/mgo_handler_district.py

@@ -0,0 +1,18 @@
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from urllib.parse import urljoin
+
+client = MongoClient('127.0.0.1:27017')
+district: Collection = client['py_spider']['district']
+
+
+with district.find() as cursor:
+    for item in cursor:
+        city = item['city']
+        district_name = item['district']
+        if city == district_name:
+            search_key = '{}医院列表'.format(district_name)
+        else:
+            search_key = '{}{}医院列表'.format(city, district_name)
+        url = urljoin('https://www.yixue.com/', search_key)
+        print(url)