Explorar o código

新增 - 全国统一组织查询

dongzhaorui hai 3 anos
pai
achega
190606c48f

+ 0 - 0
codes_hospital/config/__init__.py


+ 2 - 0
codes_hospital/config/constants.yaml

@@ -0,0 +1,2 @@
+headers:
+  User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36

+ 39 - 0
codes_hospital/config/dev.yaml

@@ -0,0 +1,39 @@
+mongo:
+  host: 172.17.4.87
+  port: !!int 27080
+
+
+redis:
+  host: 127.0.0.1
+  port: !!int 6379
+  pwd: ""
+  db: !!int 10
+
+
+redis_cluster:
+  - host: 172.17.4.239
+    port: 2479
+  - host: 172.17.4.240
+    port: 2579
+  - host: 172.17.4.84
+    port: 2379
+
+
+es:
+  host: 172.17.4.184
+  port: !!int 19800
+  db: biddingall # alias of the ES index (database)
+
+
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+  endpoint: oss-cn-beijing-internal.aliyuncs.com    # for use on the internal network
+  bucket_name: jy-datafile
+
+
+proxy:
+  socks5:
+    url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch
+    auth:
+      Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB

+ 39 - 0
codes_hospital/config/load.py

@@ -0,0 +1,39 @@
+import platform
+from pathlib import Path
+
+import yaml
+
+# Names exported by ``from config.load import *``.
+# NOTE(review): 'node_module_path' and 'headers' are assigned further down
+# in this module, inside the ``with`` blocks.
+__all__ = [
+    'mongo_conf',
+    'redis_conf',
+    'redis_startup_nodes',
+    'oss_conf',
+    'es_conf',
+    'jy_proxy',
+    'node_module_path',
+    'headers',
+]
+# Choose the settings file from the host OS: anything other than macOS
+# ('Darwin') or Windows loads dev.yaml; those two load test.yaml.
+if platform.system() not in ['Darwin', 'Windows']:
+    ENV = 'dev.yaml'
+else:
+    ENV = 'test.yaml'
+
+# Every path is resolved relative to the directory that holds this module.
+_base_path = Path(__file__).parent
+_yaml_conf = (_base_path / ENV).resolve()
+_yaml_constants = (_base_path / 'constants.yaml').resolve()
+_node_modules = (_base_path.parent / 'node_modules').resolve()
+
+# Environment-specific settings. This runs at import time, so a missing file
+# or a missing top-level key fails the import of this module.
+with open(_yaml_conf, encoding="utf-8") as f:
+    _conf = yaml.safe_load(f)
+    mongo_conf = _conf['mongo']
+    redis_conf = _conf['redis']
+    redis_startup_nodes = _conf['redis_cluster']
+    oss_conf: dict = _conf['ali_oss']
+    es_conf: dict = _conf['es']
+    jy_proxy: dict = _conf['proxy']
+    node_module_path = _node_modules
+
+
+# Environment-independent constants (currently only the default headers).
+with open(_yaml_constants, encoding="utf-8") as fp:
+    _constants = yaml.safe_load(fp)
+    headers: dict = _constants['headers']

+ 41 - 0
codes_hospital/config/test.yaml

@@ -0,0 +1,41 @@
+#mongo:
+#  host: 127.0.0.1
+#  port: !!int 27017
+mongo:
+  host: 127.0.0.1
+  port: !!int 27001
+
+redis:
+  host: 127.0.0.1
+  port: !!int 6379
+  pwd: ""
+  db: !!int 10
+
+
+redis_cluster:
+  - host: 192.168.3.207
+    port: 2179
+  - host: 192.168.3.166
+    port: 2379
+  - host: 192.168.3.207
+    port: 2279
+
+
+ali_oss:
+  key_id: LTAI4G5x9aoZx8dDamQ7vfZi
+  key_secret: Bk98FsbPYXcJe72n1bG3Ssf73acuNh
+  endpoint: oss-cn-beijing.aliyuncs.com   # for use over the public network
+  bucket_name: jy-datafile
+
+
+es:
+  host: 192.168.3.206
+  port: !!int 9800
+  db: biddingall # alias of the ES index (database)
+
+
+proxy:
+  socks5:
+    url: http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch
+    auth:
+      Authorization: Basic amlhbnl1MDAxOjEyM3F3ZSFB

+ 289 - 0
codes_hospital/crawl_hospital.py

@@ -0,0 +1,289 @@
+import json
+import time
+
+import execjs
+import requests
+import urllib3
+
+from utils.databases import mongo_table
+from utils.log import logger
+from utils.socks5 import Socks5Proxy
+
+urllib3.disable_warnings()
+
+zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
+f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
+
+
+def md5_hex(val):
+    """Return the salted MD5 hex digest used as the request ``sign``.
+
+    The previous implementation compiled a JavaScript MD5 routine through
+    execjs on every call, which starts an external JS runtime just to
+    compute a standard MD5. This version produces the same digest with
+    ``hashlib``.
+
+    :param val: string to sign (in this module, the percent-encoded JSON
+        payload returned by ``quote``).
+    :return: 32-character lowercase hexadecimal MD5 of ``val`` + salt.
+    """
+    import hashlib
+
+    salt = "A523B4A5C52203AA9C2D97F6CB45CB35"
+    salted = val + salt
+    # The JS routine hashed ``255 & charCodeAt(i)`` for every UTF-16 code
+    # unit. Taking every second byte of the little-endian UTF-16 encoding
+    # yields exactly those low bytes (and is plain ASCII for the
+    # percent-encoded payloads produced by ``quote``).
+    low_bytes = salted.encode("utf-16-le", "surrogatepass")[::2]
+    return hashlib.md5(low_bytes).hexdigest()
+
+
+def quote(data):
+    """Serialize ``data`` to compact JSON and percent-encode it.
+
+    Pure-Python equivalent of the JavaScript expression
+    ``encodeURIComponent(JSON.stringify(data))`` that was previously run
+    through execjs (one external JS runtime start per call).
+
+    :param data: JSON-serializable payload (dict/list/str/int/bool/None).
+    :return: percent-encoded compact JSON string.
+
+    NOTE(review): for the payloads built in this module (string keys,
+    str/int/bool values) the output matches the JS version. Floats
+    (``1.0``) and integer-like dict keys (which JS objects reorder) may
+    serialize differently.
+    """
+    import json
+    from urllib.parse import quote as _percent_encode
+
+    # JSON.stringify emits no whitespace and leaves non-ASCII characters
+    # unescaped.
+    compact = json.dumps(data, ensure_ascii=False, separators=(",", ":"))
+    # encodeURIComponent leaves A-Z a-z 0-9 - _ . ! ~ * ' ( ) untouched;
+    # urllib already keeps letters, digits and "-_.~", so only "!*'()" need
+    # to be added (and "/" is no longer treated as safe).
+    return _percent_encode(compact, safe="!*'()")
+
+
+def get_jgdm(query, proxies):
+    """Search the cods.org.cn mini-program API and return organisation codes.
+
+    Posts a signed search request for ``query`` and collects the ``encJgdm``
+    value of every returned document whose ``jyzt`` is not '注销'.
+
+    :param query: organisation name to search for.
+    :param proxies: ``requests``-style proxies mapping, or ``None``.
+    :return: list of ``encJgdm`` strings.
+    :raises requests.RequestException: when the request fails (a proxy
+        failure is re-raised with the query in the message).
+    :raises AssertionError: when the response has no ``resultType`` or
+        reports ``ipError``.
+    """
+    url = "https://ss.cods.org.cn/MiniProService/search/searchRMini"
+    headers = {
+        "Host": "ss.cods.org.cn",
+        "content-type": "application/x-www-form-urlencoded",
+        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
+        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html"
+    }
+    val = {
+        "q": query,
+        "t": "common",
+        "currentPage": 1,
+        "xzqh": "",
+        "jglx": "B",  # institution type filter
+        "zczj": "",
+        "clrq": "",
+        "mobile": "",
+        "isDeepSearch": False,
+        "platform": "weixin",
+        "openid": "o0VVO5Wjhblu4tgm4OkMaJecvsO4"
+    }
+    json_str = quote(val)
+    data = {
+        "jsonString": json_str,
+        "sign": md5_hex(json_str)
+    }
+    request_params = dict(
+        headers=headers,
+        data=data,
+        verify=False,
+        timeout=60,
+        proxies=proxies
+    )
+    print('proxies >>> ', proxies)
+    try:
+        response = requests.post(url, **request_params)
+    except requests.exceptions.ProxyError as exc:
+        # Keep the original failure attached as the cause for debugging.
+        raise requests.RequestException(f"'{query}'jgdm请求失败") from exc
+
+    resp_json = response.json()
+    # Raised explicitly (not via ``assert``) so the check survives
+    # ``python -O`` and the retry loop gets a message worth logging.
+    if 'resultType' not in resp_json or resp_json['resultType'] == 'ipError':
+        raise AssertionError(f"'{query}' jgdm response rejected: {resp_json}")
+
+    documents = resp_json['jginfoList']["documents"]
+    # Skip organisations whose status is '注销' (deregistered).
+    results = [item['encJgdm'] for item in documents if item['jyzt'] != '注销']
+    logger.info(f"[列表查询成功]{proxies} - {query}")
+    return results
+
+
+def get_hospital(query, jgdm, proxies):
+    """Fetch one organisation's detail record and store it in MongoDB.
+
+    Posts a signed detail request for ``jgdm``, maps the returned document
+    to the hospital schema below and inserts it into ``f_hospital_codes``.
+
+    :param query: the search keyword that produced ``jgdm``.
+    :param jgdm: encrypted organisation code returned by ``get_jgdm``.
+    :param proxies: ``requests``-style proxies mapping, or ``None``.
+    :return: the inserted hospital dict.
+    :raises requests.RequestException: when the request fails (a proxy
+        failure is re-raised with ``jgdm`` in the message).
+    :raises AssertionError: when the response ``code`` is missing or not '0'.
+    :raises KeyError: when the returned document lacks an expected field.
+    """
+    url = "https://ss.cods.org.cn/MiniProService/detailPage/detail.base"
+    headers = {
+        "Host": "ss.cods.org.cn",
+        # NOTE(review): fixed value that does not depend on the body built
+        # below; requests is expected to recompute Content-Length - confirm.
+        "Content-Length": "531",
+        "content-type": "application/x-www-form-urlencoded",
+        "Accept-Encoding": "gzip,compress,br,deflate",
+        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
+        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html",
+        "Connection": "keep-alive"
+    }
+    val = {
+        "jgdm": jgdm,
+        "keyword": query,
+        "platform": "weixin",
+        "openid": "o0VVO5QnhbdQfl4fkZWw8faTGkZM"
+    }
+    json_str = quote(val)
+    data = {
+        'jsonString': json_str,
+        'sign': md5_hex(json_str)
+    }
+    request_params = dict(
+        headers=headers,
+        data=data,
+        proxies=proxies,
+        verify=False,
+        timeout=60
+    )
+    print("proxies >>> ", proxies)
+    try:
+        response = requests.post(url, **request_params)
+    except requests.exceptions.ProxyError as exc:
+        # Keep the original failure attached as the cause for debugging.
+        raise requests.RequestException(f"'{jgdm}'医院请求失败") from exc
+
+    resp_json = response.json()
+    # Raised explicitly (not via ``assert``) so the check survives
+    # ``python -O`` and the retry loop gets a message worth logging.
+    if "code" not in resp_json or resp_json["code"] != '0':
+        raise AssertionError(f"'{jgdm}' detail response rejected: {resp_json}")
+
+    item = resp_json['document']
+    # Former names are optional; store them as one comma-separated string.
+    alias_name = ",".join(item["jgmchis"]) if "jgmchis" in item else ""
+    hospital = {
+        "search_name": query,  # search keyword
+        "hospital_name": item["jgmc"],  # hospital name
+        "alias_name": alias_name,  # former names ("xxx,xxx")
+        "credit_no": item["tydm"],  # unified credit code
+        "legal_person": item["fddbr"],  # legal representative
+        "capital": item["newZczj"],  # registered capital
+        "establish_date": item["clrq"],  # date of establishment
+        "company_type": "事业单位",  # company type (institution type)
+        "operation_startdate": item["jyqxz"],  # operating period start
+        "operation_enddate": item["jyqxe"],  # operating period end
+        "business_scope": item["jyfw"],  # business scope
+        "authority": item["djbmmc"],  # registration authority (approving body)
+        "company_address": item["zcdz"],  # contact address (registered address)
+        "company_code": item["djh"],  # registration number
+        "organization": item["jjlxdm"],  # economic type
+        "industry": item["jjhydm"],  # economic industry
+    }
+    f_hospital_codes.insert_one(hospital)
+    logger.info(f'[详情查询成功]{proxies} - {hospital["hospital_name"]}')
+    return hospital
+
+
+def callback_requests(func, *args, **kwargs):
+    """Call ``func`` until it succeeds, rotating the proxy after each failure.
+
+    :param func: callable accepting a ``proxies`` keyword argument.
+    :param args: positional arguments forwarded to ``func``.
+    :param kwargs: keyword arguments forwarded to ``func``. The optional
+        ``proxy`` entry (an object exposing ``.proxies`` and ``.switch()``)
+        is consumed here and not forwarded. An explicit ``proxies`` entry,
+        if given, is always used as-is.
+    :return: whatever ``func`` returns on its first successful call.
+
+    Only ``requests.RequestException`` and ``AssertionError`` trigger a
+    retry (after a 3 second pause); any other exception propagates.
+    """
+    proxy = kwargs.pop('proxy', None)
+    # Previously ``kwargs.setdefault('proxies', ...)`` pinned the first proxy
+    # forever, so retries after ``proxy.switch()`` kept using the stale one.
+    caller_fixed_proxies = 'proxies' in kwargs
+    while True:
+        if not caller_fixed_proxies:
+            # Re-read on every attempt so a switched proxy takes effect.
+            kwargs['proxies'] = proxy.proxies if proxy is not None else None
+        try:
+            return func(*args, **kwargs)
+        except (requests.RequestException, AssertionError) as e:
+            logger.error(e)
+            time.sleep(3)
+            if proxy is not None:
+                proxy.switch()
+
+
+def query_hospital(tasks, proxy):
+    """Process every task in ``tasks``, consuming the list from the front.
+
+    For each task: look up the organisation codes for its name, fetch and
+    store the detail record of each code, then mark the task as crawled with
+    the number of codes found.
+
+    :param tasks: list of task documents with ``_id`` and ``name``; emptied
+        in place.
+    :param proxy: proxy manager forwarded to ``callback_requests``.
+    """
+    while tasks:
+        current = tasks.pop(0)
+        keyword = current['name']
+        logger.info(f"[开始查询]{keyword}")
+        codes = callback_requests(get_jgdm, keyword, proxy=proxy)
+        time.sleep(3)
+        for code in codes:
+            callback_requests(get_hospital, keyword, code, proxy=proxy)
+            time.sleep(5)
+
+        # Number of institutions found for this name.
+        found = len(codes)
+        task_filter = {'_id': current['_id']}
+        task_update = {'$set': {'is_crawl': True, 'count': found}}
+        zktest_unexists_name.update_one(task_filter, task_update)
+        logger.info(f"[查询成功]获取{found}条'{keyword}'相关信息")
+        time.sleep(20)
+
+
+def crawl_spider():
+    """Crawl pending names in batches of five until none are left.
+
+    A task is pending when it has a ``count`` greater than zero and no
+    ``is_crawl`` field. The proxy is switched after every batch.
+    """
+    logger.info('开始任务')
+    proxy = Socks5Proxy(True)
+    while True:
+        wanted_fields = {'_id': 1, 'name': 1}
+        pending = {
+            "$and": [
+                {"count": {"$exists": True}},
+                {"count": {"$gt": 0}}
+            ],
+            'is_crawl': {'$exists': False}
+        }
+        batch = list(
+            zktest_unexists_name.find(pending, projection=wanted_fields).limit(5)
+        )
+        if not batch:
+            logger.info('任务结束')
+            break
+
+        logger.info(f'获取{len(batch)}条新任务')
+        query_hospital(batch, proxy)
+        proxy.switch()
+
+
+# Script entry point.
+if __name__ == '__main__':
+    crawl_spider()

+ 97 - 0
codes_hospital/retrieval.py

@@ -0,0 +1,97 @@
+import time
+from concurrent.futures import ThreadPoolExecutor, wait
+from urllib.parse import quote
+
+import requests
+
+from utils.databases import mongo_table
+from utils.log import logger
+from utils.tools import err_details
+
+zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
+f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
+
+
+jsessionid = ''
+kefu_cookie = 'dd7281f417074bb1bd0a5903c56951e0'
+
+
+def retrieval_hospital_total(session, name):
+    """Return how many name suggestions the site offers for ``name``.
+
+    :param session: ``requests`` session used to send the POST.
+    :param name: organisation name to look up.
+    :return: length of the JSON payload returned by ``jgmcSuggest``.
+    """
+    current_page = 1
+    search_type = 'common'
+    token = 'e22bf59ff783e31c6b72b38e89641ae6,20220804105514323!20220804105532255!20220804105658624!20220804105746669!2022080410575422'
+    history = [{
+        "title": name,
+        "link": f"wx_searchPro.action?keyword={name}",
+        "other": ""
+    }]
+    # The name is percent-encoded twice in the referer URL.
+    referer_template = "https://ss.cods.org.cn/latest/searchR?q={}&currentPage={}&t={}&searchToken={}"
+    referer = referer_template.format(
+        quote(quote(name)), current_page, search_type, token
+    )
+    request_headers = {
+        "Accept": "application/json, text/javascript, */*; q=0.01",
+        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Content-Length": "0",
+        "Origin": "https://ss.cods.org.cn",
+        "Pragma": "no-cache",
+        # NOTE(review): sent under the literal name "$Referer", not "Referer".
+        "$Referer": referer,
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
+        "X-Requested-With": "XMLHttpRequest",
+    }
+    request_cookies = {
+        "JSESSIONID": jsessionid,
+        "IAM_SID": "IAM_SID%3DADB9609D3F02477985A90E53DBB95BC2",
+        "Hm_lvt_f4e96f98fa73da7d450a46f37fffbf56": "1659581210",
+        "Hm_lpvt_f4e96f98fa73da7d450a46f37fffbf56": "1659581210",
+        "userCookie": "5e2b9289-eabb-4efe-2b9c-6435835e87fb",
+        "key": quote(str(history))
+    }
+    response = session.post(
+        "https://ss.cods.org.cn/latest/jgmcSuggest",
+        headers=request_headers,
+        cookies=request_cookies,
+        data={"q": quote(name)},
+        timeout=60,
+    )
+    return len(response.json())
+
+
+def retrieval_hospital(session, task):
+    """Look up the suggestion count for one task and save it on the task.
+
+    :param session: ``requests`` session passed to
+        ``retrieval_hospital_total``.
+    :param task: task document providing ``name`` and ``_id``.
+    """
+    hospital_name = task['name']
+    hits = retrieval_hospital_total(session, hospital_name)
+    logger.info(f'[检索医院-{hospital_name}]存在{hits}条')
+    time.sleep(0.1)
+    task_filter = {'_id': task['_id']}
+    zktest_unexists_name.update_one(task_filter, {'$set': {'count': hits}})
+
+
+def retrieval():
+    """Count suggestions for every task that has no ``count`` yet.
+
+    Work is spread over eight worker threads that share one ``requests``
+    session; worker exceptions are reported through ``err_details``.
+    """
+    uncounted = {'count': {'$exists': False}}
+    cursor = zktest_unexists_name.find(uncounted, projection={'name': 1})
+    with ThreadPoolExecutor(max_workers=8) as pool, requests.session() as session:
+        pending = []
+        for task in cursor:
+            future = pool.submit(retrieval_hospital, session, task)
+            future.add_done_callback(err_details)
+            pending.append(future)
+        wait(pending)
+
+
+# Running the module directly performs a single sample lookup; it does not
+# call retrieval().
+if __name__ == '__main__':
+    with requests.session() as session:
+        retrieval_hospital_total(session, '宣城市妇幼保健所')

+ 0 - 0
codes_hospital/utils/__init__.py


+ 106 - 0
codes_hospital/utils/databases.py

@@ -0,0 +1,106 @@
+import bson
+import pymongo
+import redis
+from elasticsearch import Elasticsearch
+from rediscluster import RedisCluster
+
+from config.load import mongo_conf, redis_conf, es_conf, redis_startup_nodes
+
+# ---------------------------------- mongo ----------------------------------
+MONGO_URI_CLIENTS = {}    # a dictionary hold all client with uri as key
+
+
+def mongo_client(cfg=None, host=None, port=None, fork=False, **kwargs):
+    """Return a ``pymongo.MongoClient``, cached per connection URI.
+
+    :param cfg: mapping with ``host``/``port``; defaults to ``mongo_conf``.
+        Used only when ``host`` or ``port`` is missing.
+    :param host: explicit host (takes effect only together with ``port``).
+    :param port: explicit port (takes effect only together with ``host``).
+    :param fork: when true, always build a fresh, uncached client.
+    :param kwargs: forwarded to ``MongoClient`` when a client is created;
+        ignored when a cached client is returned.
+    """
+    if host is None or port is None:
+        source = cfg or mongo_conf
+        uri = f'mongodb://{source["host"]}:{source["port"]}'
+    else:
+        uri = f'mongodb://{host}:{port}'
+
+    if fork:
+        return pymongo.MongoClient(uri, **kwargs)
+
+    cached = MONGO_URI_CLIENTS.get(uri)
+    if cached is not None:
+        return cached
+
+    created = pymongo.MongoClient(uri, **kwargs)
+    if created is not None:
+        MONGO_URI_CLIENTS[uri] = created
+    return created
+
+
+def mongo_database(name: str, **kw):
+    """Return database ``name`` from the client selected by ``kw``."""
+    return mongo_client(**kw).get_database(name)
+
+
+def mongo_table(db: str, name: str, **kw):
+    """Return collection ``name`` of database ``db``; ``kw`` selects the client."""
+    return mongo_database(db, **kw).get_collection(name)
+
+
+def int2long(param: int):
+    """Convert an int to a BSON 64-bit integer (``Int64``)."""
+    as_int64 = bson.int64.Int64(param)
+    return as_int64
+
+
+def object_id(_id: str):
+    """Build a BSON ``ObjectId`` from ``_id``."""
+    oid = bson.objectid.ObjectId(_id)
+    return oid
+
+
+# ---------------------------------- es ----------------------------------
+def es_client(cfg=None):
+    """Create an Elasticsearch client from ``cfg`` (defaults to ``es_conf``)."""
+    settings = es_conf if cfg is None else cfg
+    node = {"host": settings['host'], "port": settings['port']}
+    return Elasticsearch([node])
+
+
+def es_query(title: str, publish_time: int):
+    """
+    Count ES documents matching a title around a publish time.
+
+    :param title: title, matched as a phrase against the ``title`` field
+    :param publish_time: publish time; the search window extends 432000
+        (5 days, if the value is in seconds) before and after it
+    :return: the ``hits.total`` value of the search result, as an int
+    """
+    window = 432000  # five days either side of publish_time
+    title_clause = {
+        "multi_match": {
+            "query": title,
+            "type": "phrase",
+            "fields": ["title"]
+        }
+    }
+    time_clause = {
+        "range": {
+            'publishtime': {
+                "from": publish_time - window,
+                "to": publish_time + window
+            }
+        }
+    }
+    # Search by title phrase within the publish-time range.
+    body = {"query": {"bool": {"must": [title_clause, time_clause]}}}
+    response = es_client().search(
+        index=es_conf['db'], body=body, request_timeout=100
+    )
+    return int(response['hits']['total'])
+
+
+# ---------------------------------- redis ----------------------------------
+def redis_client(cfg=None):
+    """Create a Redis client backed by its own connection pool.
+
+    :param cfg: mapping with ``host``, ``port``, ``pwd`` and ``db``;
+        defaults to ``redis_conf``.
+    """
+    settings = redis_conf if cfg is None else cfg
+    pool = redis.ConnectionPool(
+        host=settings['host'],
+        port=settings['port'],
+        password=settings['pwd'],
+        db=settings['db']
+    )
+    # NOTE(review): redis-py takes connection options from the pool when
+    # ``connection_pool`` is given, so ``decode_responses`` here likely has
+    # no effect - confirm, and move it to the pool if decoding is wanted.
+    return redis.Redis(connection_pool=pool, decode_responses=True)
+
+
+def redis_cluster():
+    """Create a ``RedisCluster`` client from the configured startup nodes."""
+    return RedisCluster(
+        startup_nodes=redis_startup_nodes,
+        decode_responses=True,
+    )

+ 14 - 0
codes_hospital/utils/log.py

@@ -0,0 +1,14 @@
+from pathlib import Path
+
+from loguru import logger
+
+# Two directory levels above this file (the parent of utils/).
+_absolute = Path(__file__).absolute().parent.parent
+# Log file under <root>/logs; the '{time:YYYY-MM-DD}' placeholder is left in
+# the path for loguru to fill in.
+_log_path = (_absolute / 'logs/crawl-{time:YYYY-MM-DD}.log').resolve()
+# Register the file sink on import: INFO and above, rotation set to '00:00',
+# retention set to one week.
+logger.add(
+    _log_path,
+    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
+    level='INFO',
+    rotation='00:00',
+    retention='1 week',
+    encoding='utf-8',
+)

+ 48 - 0
codes_hospital/utils/socks5.py

@@ -0,0 +1,48 @@
+import threading
+
+import requests
+
+from config.load import jy_proxy
+
+__all__ = ['Socks5Proxy']
+
+
+class Socks5Proxy:
+    """Holder for the current socks5 proxy obtained from the proxy service.
+
+    The service URL and auth headers come from ``jy_proxy['socks5']``.
+    Constructing ``Socks5Proxy(True)`` enables proxying and fetches an
+    initial proxy; with proxying disabled ``proxies`` stays ``None`` and
+    ``switch()`` does nothing.
+    """
+
+    def __init__(self, *args, **kwargs):
+        self._lock = threading.RLock()
+        self._url = jy_proxy['socks5']['url']
+        self._auth = jy_proxy['socks5']['auth']
+        self._enable_proxy = False
+        self._proxies = None
+        # Constructor arguments are forwarded to __call__ (enable_proxy).
+        self(*args, **kwargs)
+
+    @property
+    def get_thread_name(self):
+        """Name of the thread reading this property."""
+        return threading.current_thread().name
+
+    @property
+    def proxies(self):
+        """Current proxies value, or ``None`` when no proxy is held."""
+        return self._proxies
+
+    def switch(self):
+        """Replace the current proxy with a different one from the service.
+
+        Blocks (holding the lock) until the service returns a proxy that
+        differs from the current one. A failed fetch is retried, never
+        accepted: previously a failure (``None``) counted as "different"
+        and silently turned proxying off. A one second pause between
+        attempts stops the loop from hammering the service.
+        """
+        import time
+
+        with self._lock:
+            if not self._enable_proxy:
+                return
+            while True:
+                proxies = self._fetch_proxies()
+                if proxies is not None and proxies != self._proxies:
+                    self._proxies = proxies
+                    return
+                time.sleep(1)
+
+    def _fetch_proxies(self):
+        """Ask the proxy service for a proxy.
+
+        :return: the ``data`` field of the JSON response, or ``None`` when
+            the request fails or the body is not a JSON object.
+        """
+        # Only request and decoding failures are treated as "no proxy". The
+        # former ``return`` inside ``finally`` also swallowed
+        # KeyboardInterrupt and SystemExit.
+        try:
+            response = requests.get(self._url, headers=self._auth, timeout=10)
+            payload = response.json()
+        except (requests.RequestException, ValueError):
+            return None
+        if not isinstance(payload, dict):
+            return None
+        return payload.get("data")
+
+    def __call__(self, enable_proxy: bool = False, *args, **kwargs):
+        """Enable or disable proxying; fetch a proxy when enabling.
+
+        :return: ``self``, so the call can be chained.
+        """
+        self._enable_proxy = enable_proxy
+        if self._enable_proxy:
+            self._proxies = self._fetch_proxies()
+        return self

+ 20 - 0
codes_hospital/utils/tools.py

@@ -0,0 +1,20 @@
+import socket
+
+from loguru import logger
+
+
+def get_host_ip():
+    """Return the local IP address of a UDP socket connected to 8.8.8.8:80.
+
+    The socket is closed before returning, whether or not connect succeeds.
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
+        probe.connect(('8.8.8.8', 80))
+        local_address = probe.getsockname()
+    return local_address[0]
+
+
+def err_details(worker):
+    """Done-callback that logs the exception raised by a finished future.
+
+    :param worker: a completed ``concurrent.futures.Future``.
+    :return: the same future, unchanged.
+    """
+    # ``Future.exception()`` raises CancelledError for a cancelled future;
+    # there is nothing to report in that case.
+    if worker.cancelled():
+        return worker
+    worker_exception = worker.exception()
+    if worker_exception:
+        # This runs outside any ``except`` block, so ``logger.exception``
+        # had no active traceback to attach. Pass the exception explicitly.
+        logger.opt(exception=worker_exception).error(
+            "Worker return exception: {}".format(worker_exception)
+        )
+    return worker

+ 32 - 9
find_source/config/conf.yaml

@@ -1,27 +1,50 @@
-# mongo
+#mongo:
+#  host: 172.17.4.87
+#  port: !!int 27080
+
 mongo:
-  host: 172.17.4.87
-  port: !!int 27080
+  host: 127.0.0.1
+  port: !!int 27017
+
+#mongo:
 #  host: 127.0.0.1
+#  port: !!int 27001
+
+#mongo:
+#  host: 192.168.20.144
 #  port: !!int 27017
 
 
-# redis
+#redis:
+#  host: 172.17.4.232
+#  port: !!int 7361
+#  pwd: ""
+#  db: !!int 3
+
+
 redis:
   host: 127.0.0.1
   port: !!int 6379
   pwd: ""
   db: !!int 10
 
+#redis:
+#  host: 192.168.20.144
+#  port: !!int 6379
+#  pwd: ""
+#  db: !!int 10
+
 
-# es
 es:
-  host: 172.17.145.170
-#  host: 192.168.3.206
-#  host: 127.0.0.1
-  port: !!int 9800
+  host: 172.17.4.184
+  port: !!int 19800
   db: biddingall # es库别名
 
+#es:
+#  host: 127.0.0.1
+#  port: !!int 9800
+#  db: biddingall # es库别名
+
 
 # 代理
 proxy:

+ 1 - 1
find_source/crawler/bloom_filter/utils.py

@@ -4,7 +4,7 @@
 # @File    : utils.py
 # @Software: PyCharm
 # @Python3.6
-import uuid
+import web_uuid
 import math
 import time
 import redis

+ 5 - 1
find_source/crawler/download.py

@@ -1,6 +1,10 @@
 import threading
 
-import chardet
+try:
+    import chardet
+except ImportError:
+    import charset_normalizer as chardet
+
 import requests
 import urllib3
 from loguru import logger

+ 1 - 1
zgzb/common/attachment.py

@@ -2,7 +2,7 @@ import hashlib
 import os
 import re
 import traceback
-import uuid
+import web_uuid
 from urllib.parse import urlparse, unquote
 
 import requests