dongzhaorui 3 gadi atpakaļ
vecāks
revīzija
b949badc0f

+ 4 - 307
codes_hospital/crawl_hospital.py

@@ -1,21 +1,10 @@
-import datetime
-import json
 import time
 from collections import deque
-from functools import wraps
 
-import execjs
-import requests
-import urllib3
-
-from utils.databases import mongo_table
+from defaults import get_cursor, query_hospital, TimerError
 from utils.log import logger
 from utils.socks5 import Socks5Proxy
 
-urllib3.disable_warnings()
-
-zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
-f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
 OPENID_DEQUE = deque([
     "o0VVO5ck5WeNXrXGjscVGc74bXok",
     "o0VVO5QnhbdQfl4fkZWw8faTGkZM",
@@ -28,304 +17,12 @@ OPENID_DEQUE = deque([
 ])
 
 
-class TimerError(IOError):
-
-    def __init__(self, *args, **kwargs):
-        self.msg = args[0]
-
-
-def crawl_timer(func):
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        if all([
-            0 <= datetime.datetime.now().weekday() <= 4,  # 周一到周五
-            9 <= datetime.datetime.now().hour <= 17  # 早9点到晚5点
-        ]):
-            # print("进入++++++++++++++++ ", func.__name__)
-            result = func(*args, **kwargs)
-            # print("执行完毕------------------- ", func.__name__)
-            return result
-        raise TimerError('小程序接口停止运营')
-    return wrapper
-
-
-def get_openid():
-    global OPENID_DEQUE
-    openid = OPENID_DEQUE.popleft()
-    OPENID_DEQUE.append(openid)
-    return openid
-
-
-def md5_hex(val):
-    salt = "A523B4A5C52203AA9C2D97F6CB45CB35"
-    val = val + salt
-    script = """
-        function n(t, e, n, r, o, i) {
-            return u(function(t, e) {
-                return t << e | t >>> 32 - e;
-            }(u(u(e, t), u(r, i)), o), n);
-        }
-        function r(t, e, r, o, i, a, u) {
-            return n(e & r | ~e & o, t, e, i, a, u);
-        }
-        function o(t, e, r, o, i, a, u) {
-            return n(e & o | r & ~o, t, e, i, a, u);
-        }
-        function i(t, e, r, o, i, a, u) {
-            return n(e ^ r ^ o, t, e, i, a, u);
-        }
-        function a(t, e, r, o, i, a, u) {
-            return n(r ^ (e | ~o), t, e, i, a, u);
-        }
-        function u(t, e) {
-            var n = (65535 & t) + (65535 & e);
-            return (t >> 16) + (e >> 16) + (n >> 16) << 16 | 65535 & n;
-        }
-        hex_md5= function(t) {
-                return function(t) {
-                    for (var e = "0123456789abcdef", n = "", r = 0; r < 4 * t.length; r++) n += e.charAt(t[r >> 2] >> r % 4 * 8 + 4 & 15) + e.charAt(t[r >> 2] >> r % 4 * 8 & 15);
-                    return n;
-                }(function(t, e) {
-                    t[e >> 5] |= 128 << e % 32, t[14 + (e + 64 >>> 9 << 4)] = e;
-                    for (var n = 1732584193, c = -271733879, s = -1732584194, f = 271733878, l = 0; l < t.length; l += 16) {
-                        var p = n, h = c, d = s, v = f;
-                        n = r(n, c, s, f, t[l + 0], 7, -680876936), f = r(f, n, c, s, t[l + 1], 12, -389564586), 
-                        s = r(s, f, n, c, t[l + 2], 17, 606105819), c = r(c, s, f, n, t[l + 3], 22, -1044525330), 
-                        n = r(n, c, s, f, t[l + 4], 7, -176418897), f = r(f, n, c, s, t[l + 5], 12, 1200080426), 
-                        s = r(s, f, n, c, t[l + 6], 17, -1473231341), c = r(c, s, f, n, t[l + 7], 22, -45705983), 
-                        n = r(n, c, s, f, t[l + 8], 7, 1770035416), f = r(f, n, c, s, t[l + 9], 12, -1958414417), 
-                        s = r(s, f, n, c, t[l + 10], 17, -42063), c = r(c, s, f, n, t[l + 11], 22, -1990404162), 
-                        n = r(n, c, s, f, t[l + 12], 7, 1804603682), f = r(f, n, c, s, t[l + 13], 12, -40341101), 
-                        s = r(s, f, n, c, t[l + 14], 17, -1502002290), c = r(c, s, f, n, t[l + 15], 22, 1236535329), 
-                        n = o(n, c, s, f, t[l + 1], 5, -165796510), f = o(f, n, c, s, t[l + 6], 9, -1069501632), 
-                        s = o(s, f, n, c, t[l + 11], 14, 643717713), c = o(c, s, f, n, t[l + 0], 20, -373897302), 
-                        n = o(n, c, s, f, t[l + 5], 5, -701558691), f = o(f, n, c, s, t[l + 10], 9, 38016083), 
-                        s = o(s, f, n, c, t[l + 15], 14, -660478335), c = o(c, s, f, n, t[l + 4], 20, -405537848), 
-                        n = o(n, c, s, f, t[l + 9], 5, 568446438), f = o(f, n, c, s, t[l + 14], 9, -1019803690), 
-                        s = o(s, f, n, c, t[l + 3], 14, -187363961), c = o(c, s, f, n, t[l + 8], 20, 1163531501), 
-                        n = o(n, c, s, f, t[l + 13], 5, -1444681467), f = o(f, n, c, s, t[l + 2], 9, -51403784), 
-                        s = o(s, f, n, c, t[l + 7], 14, 1735328473), c = o(c, s, f, n, t[l + 12], 20, -1926607734), 
-                        n = i(n, c, s, f, t[l + 5], 4, -378558), f = i(f, n, c, s, t[l + 8], 11, -2022574463), 
-                        s = i(s, f, n, c, t[l + 11], 16, 1839030562), c = i(c, s, f, n, t[l + 14], 23, -35309556), 
-                        n = i(n, c, s, f, t[l + 1], 4, -1530992060), f = i(f, n, c, s, t[l + 4], 11, 1272893353), 
-                        s = i(s, f, n, c, t[l + 7], 16, -155497632), c = i(c, s, f, n, t[l + 10], 23, -1094730640), 
-                        n = i(n, c, s, f, t[l + 13], 4, 681279174), f = i(f, n, c, s, t[l + 0], 11, -358537222), 
-                        s = i(s, f, n, c, t[l + 3], 16, -722521979), c = i(c, s, f, n, t[l + 6], 23, 76029189), 
-                        n = i(n, c, s, f, t[l + 9], 4, -640364487), f = i(f, n, c, s, t[l + 12], 11, -421815835), 
-                        s = i(s, f, n, c, t[l + 15], 16, 530742520), c = i(c, s, f, n, t[l + 2], 23, -995338651), 
-                        n = a(n, c, s, f, t[l + 0], 6, -198630844), f = a(f, n, c, s, t[l + 7], 10, 1126891415), 
-                        s = a(s, f, n, c, t[l + 14], 15, -1416354905), c = a(c, s, f, n, t[l + 5], 21, -57434055), 
-                        n = a(n, c, s, f, t[l + 12], 6, 1700485571), f = a(f, n, c, s, t[l + 3], 10, -1894986606), 
-                        s = a(s, f, n, c, t[l + 10], 15, -1051523), c = a(c, s, f, n, t[l + 1], 21, -2054922799), 
-                        n = a(n, c, s, f, t[l + 8], 6, 1873313359), f = a(f, n, c, s, t[l + 15], 10, -30611744), 
-                        s = a(s, f, n, c, t[l + 6], 15, -1560198380), c = a(c, s, f, n, t[l + 13], 21, 1309151649), 
-                        n = a(n, c, s, f, t[l + 4], 6, -145523070), f = a(f, n, c, s, t[l + 11], 10, -1120210379), 
-                        s = a(s, f, n, c, t[l + 2], 15, 718787259), c = a(c, s, f, n, t[l + 9], 21, -343485551), 
-                        n = u(n, p), c = u(c, h), s = u(s, d), f = u(f, v);
-                    }
-                    return Array(n, c, s, f);
-                }(function(t) {
-                    for (var e = Array(), n = 0; n < 8 * t.length; n += 8) e[n >> 5] |= (255 & t.charCodeAt(n / 8)) << n % 32;
-                    return e;
-                }(t), 8 * t.length));
-            }
-
-    """
-    ctx = execjs.compile(script)
-    result = ctx.call('hex_md5', val)
-    # print(result)
-    return result
-
-
-def quote(data):
-    script = """
-    getQuote = function(data){return encodeURIComponent(JSON.stringify(data))}
-    """
-    ctx = execjs.compile(script)
-    quote_str = ctx.call('getQuote', data)
-    # print(quote_str)
-    return quote_str
-
-
-@crawl_timer
-def callback_requests(func, *args, **kwargs):
-    proxy = kwargs.pop('proxy', None)
-    openid = kwargs.pop('openid')
-    while True:
-        kwargs['openid'] = openid
-        logger.debug(f"[当前openid]:{openid}")
-        proxies = proxy.proxies if proxy is not None else None
-        kwargs['proxies'] = proxies
-        logger.debug(f"[当前代理]:{proxies}")
-        try:
-            return func(*args, **kwargs)
-        except (IOError, AssertionError) as e:
-            if not isinstance(e, AssertionError):
-                logger.error(f"[访问异常]:{e}")
-            time.sleep(3)
-            openid = get_openid()
-            if proxy is not None:
-                proxy.switch()
-
-
-def get_jgdm(query, proxies, openid):
-    results = []
-    url = "https://ss.cods.org.cn/MiniProService/search/searchRMini"
-    headers = {
-        "Host": "ss.cods.org.cn",
-        "content-type": "application/x-www-form-urlencoded",
-        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
-        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html"
-    }
-    val = {
-        "q": query,
-        "t": "common",
-        "currentPage": 1,
-        "xzqh": "",
-        "jglx": "B",  # 事业类型
-        "zczj": "",
-        "clrq": "",
-        "mobile": "",
-        "isDeepSearch": False,
-        "platform": "weixin",
-        "openid": openid
-    }
-    json_str = quote(val)
-    sign = md5_hex(json_str)
-    data = {
-        "jsonString": json_str,
-        "sign": sign
-    }
-    # print(data)
-    request_params = dict(
-        headers=headers,
-        data=data,
-        verify=False,
-        timeout=60,
-        proxies=proxies
-    )
-    try:
-        response = requests.post(url, **request_params)
-    except requests.exceptions.ProxyError:
-        raise requests.RequestException(f"'{query}'jgdm请求失败")
-
-    # print(response)
-    resp_json = response.json()
-    print(json.dumps(resp_json, indent=4, ensure_ascii=False))
-    assert 'resultType' in resp_json and resp_json['resultType'] != 'ipError'
-    documents = resp_json['jginfoList']["documents"]
-    for item in documents:
-        if item['jyzt'] != '注销':
-            # results.append({'query': query, 'encJgdm': item['encJgdm']})
-            results.append(item['encJgdm'])
-    logger.info(f"[列表查询成功]{query}")
-    return results
-
-
-def get_hospital(query, jgdm, proxies, openid):
-    url = "https://ss.cods.org.cn/MiniProService/detailPage/detail.base"
-    headers = {
-        "Host": "ss.cods.org.cn",
-        "Content-Length": "531",
-        "content-type": "application/x-www-form-urlencoded",
-        "Accept-Encoding": "gzip,compress,br,deflate",
-        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
-        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html",
-        "Connection": "keep-alive"
-    }
-    val = {
-        "jgdm": jgdm,
-        "keyword": query,
-        "platform": "weixin",
-        "openid": openid
-    }
-    json_str = quote(val)
-    data = {
-        'jsonString': json_str,
-        'sign': md5_hex(json_str)
-    }
-    # print(data)
-    request_params = dict(
-        headers=headers,
-        data=data,
-        proxies=proxies,
-        verify=False,
-        timeout=60
-    )
-    try:
-        response = requests.post(url, **request_params)
-    except requests.exceptions.ProxyError:
-        raise requests.RequestException(f"'{jgdm}'医院请求失败")
-
-    resp_json = response.json()
-    # print(json.dumps(resp_json, indent=4, ensure_ascii=False))
-    assert "code" in resp_json and resp_json["code"] == '0'
-    item = resp_json['document']
-    alias_name = ""
-    if "jgmchis" in item:
-        alias_name = ",".join(item["jgmchis"])
-    hospital = {
-        "search_name": query,  # 搜索名称
-        "hospital_name": item["jgmc"],  # 医院名称
-        "alias_name": alias_name,  # 曾用名 ("xxx,xxx")
-        "credit_no": item["tydm"],  # 统一信用代码
-        "legal_person": item["fddbr"],  # 法定代表人
-        "capital": item["newZczj"],  # 注册资本
-        "establish_date": item["clrq"],  # 成立日期
-        "company_type": "事业单位",  # 企业类型 (机构类型)
-        "operation_startdate": item["jyqxz"],  # 营业期限自
-        "operation_enddate": item["jyqxe"],  # 营业期限至
-        "business_scope": item["jyfw"],  # 经营范围
-        "authority": item["djbmmc"],  # 登记机关(批准机构名称)
-        "company_address": item["zcdz"],  # 联系地址(注册地址)
-        "company_code": item["djh"],  # 注册号(登记号)
-        "organization": item["jjlxdm"],  # (经济类型)
-        "industry": item["jjhydm"],  # (经济行业)
-    }
-    f_hospital_codes.update_one(
-        {'search_name': query, 'credit_no': item["tydm"]},
-        {'$set': hospital},
-        upsert=True
-    )
-    # print(json.dumps(hospital, indent=4, ensure_ascii=False))
-    logger.info(f'[详情查询成功]{hospital["hospital_name"]}')
-    return hospital
-
-
-def query_hospital(tasks, proxy):
-    while len(tasks) > 0:
-        task = tasks.pop(0)
-        query = task['name']
-        openid = get_openid()
-        logger.info(f"[开始查询]{query}")
-        jgdm_lst = callback_requests(get_jgdm, query, proxy=proxy, openid=openid)
-        total = len(jgdm_lst)  # 事业单位的数量
-        logger.info(f"[查询成功]获取{total}条'{query}'相关信息")
-        time.sleep(3)
-        for jgdm in jgdm_lst:
-            callback_requests(get_hospital, query, jgdm, proxy=proxy, openid=openid)
-            logger.info(f"[保存数据]jgdm:{jgdm}")
-            time.sleep(30)
-        zktest_unexists_name.update_one(
-            {'_id': task['_id']},
-            {'$set': {'is_crawl': True, 'count': total}}
-        )
-        time.sleep(20)
-
-
 def crawl_spider():
     logger.info('开始任务')
     proxy = Socks5Proxy(True)
+    global OPENID_DEQUE
     while True:
-        projection = {'_id': 1, 'name': 1}
-        q = {
-            "$and": [
-                {"count": {"$exists": True}},
-                {"count": {"$gt": 0}}
-            ],
-            'is_crawl': {'$exists': False}
-        }
-        cursor = zktest_unexists_name.find(q, projection=projection)
+        cursor = get_cursor()
         tasks = [item for item in cursor.limit(5)]
         if len(tasks) == 0:
             logger.info('任务结束')
@@ -333,7 +30,7 @@ def crawl_spider():
 
         logger.info(f'获取{len(tasks)}条新任务')
         try:
-            query_hospital(tasks, proxy)
+            query_hospital(tasks, proxy, OPENID_DEQUE)
         except TimerError as e:
             logger.info(f'[消息通知]{e.msg}')
             time.sleep(3600)

+ 4 - 307
codes_hospital/crawl_hospital_2.py

@@ -1,21 +1,10 @@
-import datetime
-import json
 import time
 from collections import deque
-from functools import wraps
 
-import execjs
-import requests
-import urllib3
-
-from utils.databases import mongo_table
+from defaults import get_cursor, query_hospital, TimerError
 from utils.log import logger
 from utils.socks5 import Socks5Proxy
 
-urllib3.disable_warnings()
-
-zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
-f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
 OPENID_DEQUE = deque([
     "o0VVO5XuDAgYqBZwQtW0RGN-o1_k",
     "o0VVO5cwIDRTxku-kYohTryqA_i8",
@@ -28,304 +17,12 @@ OPENID_DEQUE = deque([
 ])
 
 
-class TimerError(IOError):
-
-    def __init__(self, *args, **kwargs):
-        self.msg = args[0]
-
-
-def crawl_timer(func):
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        if all([
-            0 <= datetime.datetime.now().weekday() <= 4,  # 周一到周五
-            9 <= datetime.datetime.now().hour <= 17  # 早9点到晚5点
-        ]):
-            # print("进入++++++++++++++++ ", func.__name__)
-            result = func(*args, **kwargs)
-            # print("执行完毕------------------- ", func.__name__)
-            return result
-        raise TimerError('小程序接口停止运营')
-    return wrapper
-
-
-def get_openid():
-    global OPENID_DEQUE
-    openid = OPENID_DEQUE.popleft()
-    OPENID_DEQUE.append(openid)
-    return openid
-
-
-def md5_hex(val):
-    salt = "A523B4A5C52203AA9C2D97F6CB45CB35"
-    val = val + salt
-    script = """
-        function n(t, e, n, r, o, i) {
-            return u(function(t, e) {
-                return t << e | t >>> 32 - e;
-            }(u(u(e, t), u(r, i)), o), n);
-        }
-        function r(t, e, r, o, i, a, u) {
-            return n(e & r | ~e & o, t, e, i, a, u);
-        }
-        function o(t, e, r, o, i, a, u) {
-            return n(e & o | r & ~o, t, e, i, a, u);
-        }
-        function i(t, e, r, o, i, a, u) {
-            return n(e ^ r ^ o, t, e, i, a, u);
-        }
-        function a(t, e, r, o, i, a, u) {
-            return n(r ^ (e | ~o), t, e, i, a, u);
-        }
-        function u(t, e) {
-            var n = (65535 & t) + (65535 & e);
-            return (t >> 16) + (e >> 16) + (n >> 16) << 16 | 65535 & n;
-        }
-        hex_md5= function(t) {
-                return function(t) {
-                    for (var e = "0123456789abcdef", n = "", r = 0; r < 4 * t.length; r++) n += e.charAt(t[r >> 2] >> r % 4 * 8 + 4 & 15) + e.charAt(t[r >> 2] >> r % 4 * 8 & 15);
-                    return n;
-                }(function(t, e) {
-                    t[e >> 5] |= 128 << e % 32, t[14 + (e + 64 >>> 9 << 4)] = e;
-                    for (var n = 1732584193, c = -271733879, s = -1732584194, f = 271733878, l = 0; l < t.length; l += 16) {
-                        var p = n, h = c, d = s, v = f;
-                        n = r(n, c, s, f, t[l + 0], 7, -680876936), f = r(f, n, c, s, t[l + 1], 12, -389564586), 
-                        s = r(s, f, n, c, t[l + 2], 17, 606105819), c = r(c, s, f, n, t[l + 3], 22, -1044525330), 
-                        n = r(n, c, s, f, t[l + 4], 7, -176418897), f = r(f, n, c, s, t[l + 5], 12, 1200080426), 
-                        s = r(s, f, n, c, t[l + 6], 17, -1473231341), c = r(c, s, f, n, t[l + 7], 22, -45705983), 
-                        n = r(n, c, s, f, t[l + 8], 7, 1770035416), f = r(f, n, c, s, t[l + 9], 12, -1958414417), 
-                        s = r(s, f, n, c, t[l + 10], 17, -42063), c = r(c, s, f, n, t[l + 11], 22, -1990404162), 
-                        n = r(n, c, s, f, t[l + 12], 7, 1804603682), f = r(f, n, c, s, t[l + 13], 12, -40341101), 
-                        s = r(s, f, n, c, t[l + 14], 17, -1502002290), c = r(c, s, f, n, t[l + 15], 22, 1236535329), 
-                        n = o(n, c, s, f, t[l + 1], 5, -165796510), f = o(f, n, c, s, t[l + 6], 9, -1069501632), 
-                        s = o(s, f, n, c, t[l + 11], 14, 643717713), c = o(c, s, f, n, t[l + 0], 20, -373897302), 
-                        n = o(n, c, s, f, t[l + 5], 5, -701558691), f = o(f, n, c, s, t[l + 10], 9, 38016083), 
-                        s = o(s, f, n, c, t[l + 15], 14, -660478335), c = o(c, s, f, n, t[l + 4], 20, -405537848), 
-                        n = o(n, c, s, f, t[l + 9], 5, 568446438), f = o(f, n, c, s, t[l + 14], 9, -1019803690), 
-                        s = o(s, f, n, c, t[l + 3], 14, -187363961), c = o(c, s, f, n, t[l + 8], 20, 1163531501), 
-                        n = o(n, c, s, f, t[l + 13], 5, -1444681467), f = o(f, n, c, s, t[l + 2], 9, -51403784), 
-                        s = o(s, f, n, c, t[l + 7], 14, 1735328473), c = o(c, s, f, n, t[l + 12], 20, -1926607734), 
-                        n = i(n, c, s, f, t[l + 5], 4, -378558), f = i(f, n, c, s, t[l + 8], 11, -2022574463), 
-                        s = i(s, f, n, c, t[l + 11], 16, 1839030562), c = i(c, s, f, n, t[l + 14], 23, -35309556), 
-                        n = i(n, c, s, f, t[l + 1], 4, -1530992060), f = i(f, n, c, s, t[l + 4], 11, 1272893353), 
-                        s = i(s, f, n, c, t[l + 7], 16, -155497632), c = i(c, s, f, n, t[l + 10], 23, -1094730640), 
-                        n = i(n, c, s, f, t[l + 13], 4, 681279174), f = i(f, n, c, s, t[l + 0], 11, -358537222), 
-                        s = i(s, f, n, c, t[l + 3], 16, -722521979), c = i(c, s, f, n, t[l + 6], 23, 76029189), 
-                        n = i(n, c, s, f, t[l + 9], 4, -640364487), f = i(f, n, c, s, t[l + 12], 11, -421815835), 
-                        s = i(s, f, n, c, t[l + 15], 16, 530742520), c = i(c, s, f, n, t[l + 2], 23, -995338651), 
-                        n = a(n, c, s, f, t[l + 0], 6, -198630844), f = a(f, n, c, s, t[l + 7], 10, 1126891415), 
-                        s = a(s, f, n, c, t[l + 14], 15, -1416354905), c = a(c, s, f, n, t[l + 5], 21, -57434055), 
-                        n = a(n, c, s, f, t[l + 12], 6, 1700485571), f = a(f, n, c, s, t[l + 3], 10, -1894986606), 
-                        s = a(s, f, n, c, t[l + 10], 15, -1051523), c = a(c, s, f, n, t[l + 1], 21, -2054922799), 
-                        n = a(n, c, s, f, t[l + 8], 6, 1873313359), f = a(f, n, c, s, t[l + 15], 10, -30611744), 
-                        s = a(s, f, n, c, t[l + 6], 15, -1560198380), c = a(c, s, f, n, t[l + 13], 21, 1309151649), 
-                        n = a(n, c, s, f, t[l + 4], 6, -145523070), f = a(f, n, c, s, t[l + 11], 10, -1120210379), 
-                        s = a(s, f, n, c, t[l + 2], 15, 718787259), c = a(c, s, f, n, t[l + 9], 21, -343485551), 
-                        n = u(n, p), c = u(c, h), s = u(s, d), f = u(f, v);
-                    }
-                    return Array(n, c, s, f);
-                }(function(t) {
-                    for (var e = Array(), n = 0; n < 8 * t.length; n += 8) e[n >> 5] |= (255 & t.charCodeAt(n / 8)) << n % 32;
-                    return e;
-                }(t), 8 * t.length));
-            }
-
-    """
-    ctx = execjs.compile(script)
-    result = ctx.call('hex_md5', val)
-    # print(result)
-    return result
-
-
-def quote(data):
-    script = """
-    getQuote = function(data){return encodeURIComponent(JSON.stringify(data))}
-    """
-    ctx = execjs.compile(script)
-    quote_str = ctx.call('getQuote', data)
-    # print(quote_str)
-    return quote_str
-
-
-@crawl_timer
-def callback_requests(func, *args, **kwargs):
-    proxy = kwargs.pop('proxy', None)
-    openid = kwargs.pop('openid')
-    while True:
-        kwargs['openid'] = openid
-        logger.debug(f"[当前openid]:{openid}")
-        proxies = proxy.proxies if proxy is not None else None
-        kwargs['proxies'] = proxies
-        logger.debug(f"[当前代理]:{proxies}")
-        try:
-            return func(*args, **kwargs)
-        except (IOError, AssertionError) as e:
-            if not isinstance(e, AssertionError):
-                logger.error(f"[访问异常]:{e}")
-            time.sleep(3)
-            openid = get_openid()
-            if proxy is not None:
-                proxy.switch()
-
-
-def get_jgdm(query, proxies, openid):
-    results = []
-    url = "https://ss.cods.org.cn/MiniProService/search/searchRMini"
-    headers = {
-        "Host": "ss.cods.org.cn",
-        "content-type": "application/x-www-form-urlencoded",
-        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
-        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html"
-    }
-    val = {
-        "q": query,
-        "t": "common",
-        "currentPage": 1,
-        "xzqh": "",
-        "jglx": "B",  # 事业类型
-        "zczj": "",
-        "clrq": "",
-        "mobile": "",
-        "isDeepSearch": False,
-        "platform": "weixin",
-        "openid": openid
-    }
-    json_str = quote(val)
-    sign = md5_hex(json_str)
-    data = {
-        "jsonString": json_str,
-        "sign": sign
-    }
-    # print(data)
-    request_params = dict(
-        headers=headers,
-        data=data,
-        verify=False,
-        timeout=60,
-        proxies=proxies
-    )
-    try:
-        response = requests.post(url, **request_params)
-    except requests.exceptions.ProxyError:
-        raise requests.RequestException(f"'{query}'jgdm请求失败")
-
-    # print(response)
-    resp_json = response.json()
-    print(json.dumps(resp_json, indent=4, ensure_ascii=False))
-    assert 'resultType' in resp_json and resp_json['resultType'] != 'ipError'
-    documents = resp_json['jginfoList']["documents"]
-    for item in documents:
-        if item['jyzt'] != '注销':
-            # results.append({'query': query, 'encJgdm': item['encJgdm']})
-            results.append(item['encJgdm'])
-    logger.info(f"[列表查询成功]{query}")
-    return results
-
-
-def get_hospital(query, jgdm, proxies, openid):
-    url = "https://ss.cods.org.cn/MiniProService/detailPage/detail.base"
-    headers = {
-        "Host": "ss.cods.org.cn",
-        "Content-Length": "531",
-        "content-type": "application/x-www-form-urlencoded",
-        "Accept-Encoding": "gzip,compress,br,deflate",
-        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
-        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html",
-        "Connection": "keep-alive"
-    }
-    val = {
-        "jgdm": jgdm,
-        "keyword": query,
-        "platform": "weixin",
-        "openid": openid
-    }
-    json_str = quote(val)
-    data = {
-        'jsonString': json_str,
-        'sign': md5_hex(json_str)
-    }
-    # print(data)
-    request_params = dict(
-        headers=headers,
-        data=data,
-        proxies=proxies,
-        verify=False,
-        timeout=60
-    )
-    try:
-        response = requests.post(url, **request_params)
-    except requests.exceptions.ProxyError:
-        raise requests.RequestException(f"'{jgdm}'医院请求失败")
-
-    resp_json = response.json()
-    # print(json.dumps(resp_json, indent=4, ensure_ascii=False))
-    assert "code" in resp_json and resp_json["code"] == '0'
-    item = resp_json['document']
-    alias_name = ""
-    if "jgmchis" in item:
-        alias_name = ",".join(item["jgmchis"])
-    hospital = {
-        "search_name": query,  # 搜索名称
-        "hospital_name": item["jgmc"],  # 医院名称
-        "alias_name": alias_name,  # 曾用名 ("xxx,xxx")
-        "credit_no": item["tydm"],  # 统一信用代码
-        "legal_person": item["fddbr"],  # 法定代表人
-        "capital": item["newZczj"],  # 注册资本
-        "establish_date": item["clrq"],  # 成立日期
-        "company_type": "事业单位",  # 企业类型 (机构类型)
-        "operation_startdate": item["jyqxz"],  # 营业期限自
-        "operation_enddate": item["jyqxe"],  # 营业期限至
-        "business_scope": item["jyfw"],  # 经营范围
-        "authority": item["djbmmc"],  # 登记机关(批准机构名称)
-        "company_address": item["zcdz"],  # 联系地址(注册地址)
-        "company_code": item["djh"],  # 注册号(登记号)
-        "organization": item["jjlxdm"],  # (经济类型)
-        "industry": item["jjhydm"],  # (经济行业)
-    }
-    f_hospital_codes.update_one(
-        {'search_name': query, 'credit_no': item["tydm"]},
-        {'$set': hospital},
-        upsert=True
-    )
-    # print(json.dumps(hospital, indent=4, ensure_ascii=False))
-    logger.info(f'[详情查询成功]{hospital["hospital_name"]}')
-    return hospital
-
-
-def query_hospital(tasks, proxy):
-    while len(tasks) > 0:
-        task = tasks.pop(0)
-        query = task['name']
-        openid = get_openid()
-        logger.info(f"[开始查询]{query}")
-        jgdm_lst = callback_requests(get_jgdm, query, proxy=proxy, openid=openid)
-        total = len(jgdm_lst)  # 事业单位的数量
-        logger.info(f"[查询成功]获取{total}条'{query}'相关信息")
-        time.sleep(3)
-        for jgdm in jgdm_lst:
-            callback_requests(get_hospital, query, jgdm, proxy=proxy, openid=openid)
-            logger.info(f"[保存数据]jgdm:{jgdm}")
-            time.sleep(30)
-        zktest_unexists_name.update_one(
-            {'_id': task['_id']},
-            {'$set': {'is_crawl': True, 'count': total}}
-        )
-        time.sleep(20)
-
-
 def crawl_spider():
     logger.info('开始任务')
     proxy = Socks5Proxy(True)
+    global OPENID_DEQUE
     while True:
-        projection = {'_id': 1, 'name': 1}
-        q = {
-            "$and": [
-                {"count": {"$exists": True}},
-                {"count": {"$gt": 0}}
-            ],
-            'is_crawl': {'$exists': False}
-        }
-        cursor = zktest_unexists_name.find(q, projection=projection)
+        cursor = get_cursor()
         tasks = [item for item in cursor.limit(5).sort([('_id', -1)])]
         if len(tasks) == 0:
             logger.info('任务结束')
@@ -333,7 +30,7 @@ def crawl_spider():
 
         logger.info(f'获取{len(tasks)}条新任务')
         try:
-            query_hospital(tasks, proxy)
+            query_hospital(tasks, proxy, OPENID_DEQUE)
         except TimerError as e:
             logger.info(f'[消息通知]{e.msg}')
             time.sleep(3600)

+ 4 - 307
codes_hospital/crawl_hospital_3.py

@@ -1,21 +1,10 @@
-import datetime
-import json
 import time
 from collections import deque
-from functools import wraps
 
-import execjs
-import requests
-import urllib3
-
-from utils.databases import mongo_table
+from defaults import get_cursor, query_hospital, TimerError
 from utils.log import logger
 from utils.socks5 import Socks5Proxy
 
-urllib3.disable_warnings()
-
-zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
-f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
 OPENID_DEQUE = deque([
     "o0VVO5R05eei1I4CxIqNUyJIMplA",
     "o0VVO5aRpkS8mkVZOoG3h8YGzGFE",
@@ -27,304 +16,12 @@ OPENID_DEQUE = deque([
 ])
 
 
-class TimerError(IOError):
-
-    def __init__(self, *args, **kwargs):
-        self.msg = args[0]
-
-
-def crawl_timer(func):
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        if all([
-            0 <= datetime.datetime.now().weekday() <= 4,  # 周一到周五
-            9 <= datetime.datetime.now().hour <= 17  # 早9点到晚5点
-        ]):
-            # print("进入++++++++++++++++ ", func.__name__)
-            result = func(*args, **kwargs)
-            # print("执行完毕------------------- ", func.__name__)
-            return result
-        raise TimerError('小程序接口停止运营')
-    return wrapper
-
-
-def get_openid():
-    global OPENID_DEQUE
-    openid = OPENID_DEQUE.popleft()
-    OPENID_DEQUE.append(openid)
-    return openid
-
-
-def md5_hex(val):
-    salt = "A523B4A5C52203AA9C2D97F6CB45CB35"
-    val = val + salt
-    script = """
-        function n(t, e, n, r, o, i) {
-            return u(function(t, e) {
-                return t << e | t >>> 32 - e;
-            }(u(u(e, t), u(r, i)), o), n);
-        }
-        function r(t, e, r, o, i, a, u) {
-            return n(e & r | ~e & o, t, e, i, a, u);
-        }
-        function o(t, e, r, o, i, a, u) {
-            return n(e & o | r & ~o, t, e, i, a, u);
-        }
-        function i(t, e, r, o, i, a, u) {
-            return n(e ^ r ^ o, t, e, i, a, u);
-        }
-        function a(t, e, r, o, i, a, u) {
-            return n(r ^ (e | ~o), t, e, i, a, u);
-        }
-        function u(t, e) {
-            var n = (65535 & t) + (65535 & e);
-            return (t >> 16) + (e >> 16) + (n >> 16) << 16 | 65535 & n;
-        }
-        hex_md5= function(t) {
-                return function(t) {
-                    for (var e = "0123456789abcdef", n = "", r = 0; r < 4 * t.length; r++) n += e.charAt(t[r >> 2] >> r % 4 * 8 + 4 & 15) + e.charAt(t[r >> 2] >> r % 4 * 8 & 15);
-                    return n;
-                }(function(t, e) {
-                    t[e >> 5] |= 128 << e % 32, t[14 + (e + 64 >>> 9 << 4)] = e;
-                    for (var n = 1732584193, c = -271733879, s = -1732584194, f = 271733878, l = 0; l < t.length; l += 16) {
-                        var p = n, h = c, d = s, v = f;
-                        n = r(n, c, s, f, t[l + 0], 7, -680876936), f = r(f, n, c, s, t[l + 1], 12, -389564586), 
-                        s = r(s, f, n, c, t[l + 2], 17, 606105819), c = r(c, s, f, n, t[l + 3], 22, -1044525330), 
-                        n = r(n, c, s, f, t[l + 4], 7, -176418897), f = r(f, n, c, s, t[l + 5], 12, 1200080426), 
-                        s = r(s, f, n, c, t[l + 6], 17, -1473231341), c = r(c, s, f, n, t[l + 7], 22, -45705983), 
-                        n = r(n, c, s, f, t[l + 8], 7, 1770035416), f = r(f, n, c, s, t[l + 9], 12, -1958414417), 
-                        s = r(s, f, n, c, t[l + 10], 17, -42063), c = r(c, s, f, n, t[l + 11], 22, -1990404162), 
-                        n = r(n, c, s, f, t[l + 12], 7, 1804603682), f = r(f, n, c, s, t[l + 13], 12, -40341101), 
-                        s = r(s, f, n, c, t[l + 14], 17, -1502002290), c = r(c, s, f, n, t[l + 15], 22, 1236535329), 
-                        n = o(n, c, s, f, t[l + 1], 5, -165796510), f = o(f, n, c, s, t[l + 6], 9, -1069501632), 
-                        s = o(s, f, n, c, t[l + 11], 14, 643717713), c = o(c, s, f, n, t[l + 0], 20, -373897302), 
-                        n = o(n, c, s, f, t[l + 5], 5, -701558691), f = o(f, n, c, s, t[l + 10], 9, 38016083), 
-                        s = o(s, f, n, c, t[l + 15], 14, -660478335), c = o(c, s, f, n, t[l + 4], 20, -405537848), 
-                        n = o(n, c, s, f, t[l + 9], 5, 568446438), f = o(f, n, c, s, t[l + 14], 9, -1019803690), 
-                        s = o(s, f, n, c, t[l + 3], 14, -187363961), c = o(c, s, f, n, t[l + 8], 20, 1163531501), 
-                        n = o(n, c, s, f, t[l + 13], 5, -1444681467), f = o(f, n, c, s, t[l + 2], 9, -51403784), 
-                        s = o(s, f, n, c, t[l + 7], 14, 1735328473), c = o(c, s, f, n, t[l + 12], 20, -1926607734), 
-                        n = i(n, c, s, f, t[l + 5], 4, -378558), f = i(f, n, c, s, t[l + 8], 11, -2022574463), 
-                        s = i(s, f, n, c, t[l + 11], 16, 1839030562), c = i(c, s, f, n, t[l + 14], 23, -35309556), 
-                        n = i(n, c, s, f, t[l + 1], 4, -1530992060), f = i(f, n, c, s, t[l + 4], 11, 1272893353), 
-                        s = i(s, f, n, c, t[l + 7], 16, -155497632), c = i(c, s, f, n, t[l + 10], 23, -1094730640), 
-                        n = i(n, c, s, f, t[l + 13], 4, 681279174), f = i(f, n, c, s, t[l + 0], 11, -358537222), 
-                        s = i(s, f, n, c, t[l + 3], 16, -722521979), c = i(c, s, f, n, t[l + 6], 23, 76029189), 
-                        n = i(n, c, s, f, t[l + 9], 4, -640364487), f = i(f, n, c, s, t[l + 12], 11, -421815835), 
-                        s = i(s, f, n, c, t[l + 15], 16, 530742520), c = i(c, s, f, n, t[l + 2], 23, -995338651), 
-                        n = a(n, c, s, f, t[l + 0], 6, -198630844), f = a(f, n, c, s, t[l + 7], 10, 1126891415), 
-                        s = a(s, f, n, c, t[l + 14], 15, -1416354905), c = a(c, s, f, n, t[l + 5], 21, -57434055), 
-                        n = a(n, c, s, f, t[l + 12], 6, 1700485571), f = a(f, n, c, s, t[l + 3], 10, -1894986606), 
-                        s = a(s, f, n, c, t[l + 10], 15, -1051523), c = a(c, s, f, n, t[l + 1], 21, -2054922799), 
-                        n = a(n, c, s, f, t[l + 8], 6, 1873313359), f = a(f, n, c, s, t[l + 15], 10, -30611744), 
-                        s = a(s, f, n, c, t[l + 6], 15, -1560198380), c = a(c, s, f, n, t[l + 13], 21, 1309151649), 
-                        n = a(n, c, s, f, t[l + 4], 6, -145523070), f = a(f, n, c, s, t[l + 11], 10, -1120210379), 
-                        s = a(s, f, n, c, t[l + 2], 15, 718787259), c = a(c, s, f, n, t[l + 9], 21, -343485551), 
-                        n = u(n, p), c = u(c, h), s = u(s, d), f = u(f, v);
-                    }
-                    return Array(n, c, s, f);
-                }(function(t) {
-                    for (var e = Array(), n = 0; n < 8 * t.length; n += 8) e[n >> 5] |= (255 & t.charCodeAt(n / 8)) << n % 32;
-                    return e;
-                }(t), 8 * t.length));
-            }
-
-    """
-    ctx = execjs.compile(script)
-    result = ctx.call('hex_md5', val)
-    # print(result)
-    return result
-
-
-def quote(data):
-    script = """
-    getQuote = function(data){return encodeURIComponent(JSON.stringify(data))}
-    """
-    ctx = execjs.compile(script)
-    quote_str = ctx.call('getQuote', data)
-    # print(quote_str)
-    return quote_str
-
-
-@crawl_timer
-def callback_requests(func, *args, **kwargs):
-    proxy = kwargs.pop('proxy', None)
-    openid = kwargs.pop('openid')
-    while True:
-        kwargs['openid'] = openid
-        logger.debug(f"[当前openid]:{openid}")
-        proxies = proxy.proxies if proxy is not None else None
-        kwargs['proxies'] = proxies
-        logger.debug(f"[当前代理]:{proxies}")
-        try:
-            return func(*args, **kwargs)
-        except (IOError, AssertionError) as e:
-            if not isinstance(e, AssertionError):
-                logger.error(f"[访问异常]:{e}")
-            time.sleep(3)
-            openid = get_openid()
-            if proxy is not None:
-                proxy.switch()
-
-
-def get_jgdm(query, proxies, openid):
-    results = []
-    url = "https://ss.cods.org.cn/MiniProService/search/searchRMini"
-    headers = {
-        "Host": "ss.cods.org.cn",
-        "content-type": "application/x-www-form-urlencoded",
-        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
-        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html"
-    }
-    val = {
-        "q": query,
-        "t": "common",
-        "currentPage": 1,
-        "xzqh": "",
-        "jglx": "B",  # 事业类型
-        "zczj": "",
-        "clrq": "",
-        "mobile": "",
-        "isDeepSearch": False,
-        "platform": "weixin",
-        "openid": openid
-    }
-    json_str = quote(val)
-    sign = md5_hex(json_str)
-    data = {
-        "jsonString": json_str,
-        "sign": sign
-    }
-    # print(data)
-    request_params = dict(
-        headers=headers,
-        data=data,
-        verify=False,
-        timeout=60,
-        proxies=proxies
-    )
-    try:
-        response = requests.post(url, **request_params)
-    except requests.exceptions.ProxyError:
-        raise requests.RequestException(f"'{query}'jgdm请求失败")
-
-    # print(response)
-    resp_json = response.json()
-    print(json.dumps(resp_json, indent=4, ensure_ascii=False))
-    assert 'resultType' in resp_json and resp_json['resultType'] != 'ipError'
-    documents = resp_json['jginfoList']["documents"]
-    for item in documents:
-        if item['jyzt'] != '注销':
-            # results.append({'query': query, 'encJgdm': item['encJgdm']})
-            results.append(item['encJgdm'])
-    logger.info(f"[列表查询成功]{query}")
-    return results
-
-
-def get_hospital(query, jgdm, proxies, openid):
-    url = "https://ss.cods.org.cn/MiniProService/detailPage/detail.base"
-    headers = {
-        "Host": "ss.cods.org.cn",
-        "Content-Length": "531",
-        "content-type": "application/x-www-form-urlencoded",
-        "Accept-Encoding": "gzip,compress,br,deflate",
-        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
-        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html",
-        "Connection": "keep-alive"
-    }
-    val = {
-        "jgdm": jgdm,
-        "keyword": query,
-        "platform": "weixin",
-        "openid": openid
-    }
-    json_str = quote(val)
-    data = {
-        'jsonString': json_str,
-        'sign': md5_hex(json_str)
-    }
-    # print(data)
-    request_params = dict(
-        headers=headers,
-        data=data,
-        proxies=proxies,
-        verify=False,
-        timeout=60
-    )
-    try:
-        response = requests.post(url, **request_params)
-    except requests.exceptions.ProxyError:
-        raise requests.RequestException(f"'{jgdm}'医院请求失败")
-
-    resp_json = response.json()
-    # print(json.dumps(resp_json, indent=4, ensure_ascii=False))
-    assert "code" in resp_json and resp_json["code"] == '0'
-    item = resp_json['document']
-    alias_name = ""
-    if "jgmchis" in item:
-        alias_name = ",".join(item["jgmchis"])
-    hospital = {
-        "search_name": query,  # 搜索名称
-        "hospital_name": item["jgmc"],  # 医院名称
-        "alias_name": alias_name,  # 曾用名 ("xxx,xxx")
-        "credit_no": item["tydm"],  # 统一信用代码
-        "legal_person": item["fddbr"],  # 法定代表人
-        "capital": item["newZczj"],  # 注册资本
-        "establish_date": item["clrq"],  # 成立日期
-        "company_type": "事业单位",  # 企业类型 (机构类型)
-        "operation_startdate": item["jyqxz"],  # 营业期限自
-        "operation_enddate": item["jyqxe"],  # 营业期限至
-        "business_scope": item["jyfw"],  # 经营范围
-        "authority": item["djbmmc"],  # 登记机关(批准机构名称)
-        "company_address": item["zcdz"],  # 联系地址(注册地址)
-        "company_code": item["djh"],  # 注册号(登记号)
-        "organization": item["jjlxdm"],  # (经济类型)
-        "industry": item["jjhydm"],  # (经济行业)
-    }
-    f_hospital_codes.update_one(
-        {'search_name': query, 'credit_no': item["tydm"]},
-        {'$set': hospital},
-        upsert=True
-    )
-    # print(json.dumps(hospital, indent=4, ensure_ascii=False))
-    logger.info(f'[详情查询成功]{hospital["hospital_name"]}')
-    return hospital
-
-
-def query_hospital(tasks, proxy):
-    while len(tasks) > 0:
-        task = tasks.pop(0)
-        query = task['name']
-        openid = get_openid()
-        logger.info(f"[开始查询]{query}")
-        jgdm_lst = callback_requests(get_jgdm, query, proxy=proxy, openid=openid)
-        total = len(jgdm_lst)  # 事业单位的数量
-        logger.info(f"[查询成功]获取{total}条'{query}'相关信息")
-        time.sleep(3)
-        for jgdm in jgdm_lst:
-            callback_requests(get_hospital, query, jgdm, proxy=proxy, openid=openid)
-            logger.info(f"[保存数据]jgdm:{jgdm}")
-            time.sleep(30)
-        zktest_unexists_name.update_one(
-            {'_id': task['_id']},
-            {'$set': {'is_crawl': True, 'count': total}}
-        )
-        time.sleep(20)
-
-
 def crawl_spider():
     logger.info('开始任务')
     proxy = Socks5Proxy(True)
+    global OPENID_DEQUE
     while True:
-        projection = {'_id': 1, 'name': 1}
-        q = {
-            "$and": [
-                {"count": {"$exists": True}},
-                {"count": {"$gt": 0}}
-            ],
-            'is_crawl': {'$exists': False}
-        }
-        cursor = zktest_unexists_name.find(q, projection=projection)
+        cursor = get_cursor()
         tasks = [item for item in cursor.skip(100).limit(5)]
         if len(tasks) == 0:
             logger.info('任务结束')
@@ -332,7 +29,7 @@ def crawl_spider():
 
         logger.info(f'获取{len(tasks)}条新任务')
         try:
-            query_hospital(tasks, proxy)
+            query_hospital(tasks, proxy, OPENID_DEQUE)
         except TimerError as e:
             logger.info(f'[消息通知]{e.msg}')
             time.sleep(3600)