|
@@ -1,21 +1,10 @@
|
|
|
-import datetime
|
|
|
-import json
|
|
|
import time
|
|
|
from collections import deque
|
|
|
-from functools import wraps
|
|
|
|
|
|
-import execjs
|
|
|
-import requests
|
|
|
-import urllib3
|
|
|
-
|
|
|
-from utils.databases import mongo_table
|
|
|
+from defaults import get_cursor, query_hospital, TimerError
|
|
|
from utils.log import logger
|
|
|
from utils.socks5 import Socks5Proxy
|
|
|
|
|
|
-urllib3.disable_warnings()
|
|
|
-
|
|
|
-zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
|
|
|
-f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
|
|
|
OPENID_DEQUE = deque([
|
|
|
"o0VVO5XuDAgYqBZwQtW0RGN-o1_k",
|
|
|
"o0VVO5cwIDRTxku-kYohTryqA_i8",
|
|
@@ -28,304 +17,12 @@ OPENID_DEQUE = deque([
|
|
|
])
|
|
|
|
|
|
|
|
|
class TimerError(IOError):
    """Raised when a crawl is attempted outside the allowed time window.

    The first positional argument is kept on ``msg`` for callers that log
    ``e.msg``. Extra keyword arguments are accepted and ignored.
    """

    def __init__(self, *args, **kwargs):
        # An OSError subclass that overrides __init__ must forward to the base
        # initialiser itself; otherwise ``args`` stays empty and ``str(e)``
        # renders as an empty string.
        super().__init__(*args)
        # Tolerate a bare ``TimerError()`` instead of failing with IndexError.
        self.msg = args[0] if args else ''
|
|
|
-
|
|
|
-
|
|
|
def crawl_timer(func):
    """Decorator that gates *func* behind a weekday / daytime check.

    The wrapped callable runs only when the local clock reads Monday to
    Friday and the hour is between 9 and 17 inclusive. Otherwise
    ``TimerError`` is raised and *func* is not called.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        on_weekday = 0 <= datetime.datetime.now().weekday() <= 4  # Monday to Friday
        in_hours = 9 <= datetime.datetime.now().hour <= 17  # 09:00 through the 17:xx hour
        if not (on_weekday and in_hours):
            raise TimerError('小程序接口停止运营')
        return func(*args, **kwargs)

    return wrapper
|
|
|
-
|
|
|
-
|
|
|
def get_openid():
    """Return the next openid from ``OPENID_DEQUE``, cycling round-robin.

    The id at the left end of the deque is moved to the right end and
    returned, so repeated calls walk through every id in turn.
    """
    # rotate(-1) moves the leftmost element to the right end in one step.
    OPENID_DEQUE.rotate(-1)
    return OPENID_DEQUE[-1]
|
|
|
-
|
|
|
-
|
|
|
def md5_hex(val):
    """Return the lowercase hex MD5 digest of *val* followed by a fixed salt.

    The digest is used as the ``sign`` field of the request payloads built
    in this module.

    :param val: text to sign (the URL-encoded JSON string).
    :return: 32-character hexadecimal MD5 digest.
    """
    # Local import: hashlib is not among this module's top-level imports.
    import hashlib

    salt = "A523B4A5C52203AA9C2D97F6CB45CB35"
    # The digest has to match a JavaScript-style MD5 that feeds one byte per
    # UTF-16 code unit (its low 8 bits) into the hash. Encoding as UTF-16-LE
    # and keeping every second byte yields exactly those low bytes; for the
    # ASCII input this module produces it is the same as a plain ASCII encode.
    payload = (val + salt).encode("utf-16-le", "surrogatepass")[::2]
    return hashlib.md5(payload).hexdigest()
|
|
|
-
|
|
|
-
|
|
|
def quote(data):
    """Serialise *data* to compact JSON and percent-encode the result.

    Produces the same text as JavaScript's
    ``encodeURIComponent(JSON.stringify(data))`` for payloads made of
    strings, integers, booleans, ``None``, lists and dicts.

    NOTE(review): non-integer-looking numbers are out of scope; a float such
    as ``1.0`` is rendered ``1.0`` here but ``1`` by JavaScript. The payloads
    built in this module contain only strings, ints and booleans.

    :param data: JSON-serialisable object.
    :return: percent-encoded JSON string.
    """
    # Local imports keep the block self-contained; the alias avoids shadowing
    # this function's own name.
    import json
    from urllib.parse import quote as _percent_encode

    # JSON.stringify emits no whitespace between tokens and leaves non-ASCII
    # characters unescaped.
    compact = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
    # encodeURIComponent leaves A-Z a-z 0-9 - _ . ! ~ * ' ( ) untouched;
    # urllib already keeps letters, digits and "_.-~", so only "!*'()" must be
    # added, and "/" must not be treated as safe.
    return _percent_encode(compact, safe="!*'()")
|
|
|
-
|
|
|
-
|
|
|
@crawl_timer
def callback_requests(func, *args, **kwargs):
    """Call *func* repeatedly until it returns without a handled error.

    ``proxy`` (optional) and ``openid`` (required) are taken from *kwargs*.
    On every attempt *func* receives ``openid`` and ``proxies`` keyword
    arguments. When *func* raises ``IOError`` or ``AssertionError`` the call
    is retried after a 3 second pause with the next openid from
    ``get_openid()`` and, when a proxy object was given, after
    ``proxy.switch()``. There is no upper bound on the number of attempts.

    :return: whatever *func* returns on the first successful attempt.
    """
    proxy = kwargs.pop('proxy', None)
    current_openid = kwargs.pop('openid')
    while True:
        kwargs['openid'] = current_openid
        logger.debug(f"[当前openid]:{current_openid}")
        current_proxies = None if proxy is None else proxy.proxies
        kwargs['proxies'] = current_proxies
        logger.debug(f"[当前代理]:{current_proxies}")
        try:
            return func(*args, **kwargs)
        except AssertionError:
            # Response-validation failures are retried without logging.
            pass
        except IOError as e:
            logger.error(f"[访问异常]:{e}")

        # Reached only after a handled failure: back off, rotate identity.
        time.sleep(3)
        current_openid = get_openid()
        if proxy is not None:
            proxy.switch()
|
|
|
-
|
|
|
-
|
|
|
def get_jgdm(query, proxies, openid):
    """Search the mini-program list endpoint and return matching ``encJgdm`` codes.

    :param query: organisation name to search for.
    :param proxies: proxies mapping handed to ``requests.post`` (may be None).
    :param openid: openid sent in the request payload.
    :return: list of ``encJgdm`` values for every hit whose ``jyzt`` is not
        '注销'.
    :raises requests.RequestException: when the proxy connection fails.
    :raises AssertionError: when the response lacks ``resultType`` or reports
        ``ipError``.
    """
    url = "https://ss.cods.org.cn/MiniProService/search/searchRMini"
    headers = {
        "Host": "ss.cods.org.cn",
        "content-type": "application/x-www-form-urlencoded",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html",
    }
    val = {
        "q": query,
        "t": "common",
        "currentPage": 1,
        "xzqh": "",
        "jglx": "B",  # institution type (original note: public institution)
        "zczj": "",
        "clrq": "",
        "mobile": "",
        "isDeepSearch": False,
        "platform": "weixin",
        "openid": openid,
    }
    json_str = quote(val)
    data = {
        "jsonString": json_str,
        "sign": md5_hex(json_str),
    }
    try:
        response = requests.post(
            url,
            headers=headers,
            data=data,
            verify=False,
            timeout=60,
            proxies=proxies,
        )
    except requests.exceptions.ProxyError as exc:
        # Keep the original failure attached as the cause for debugging.
        raise requests.RequestException(f"'{query}'jgdm请求失败") from exc

    resp_json = response.json()
    # Full response dump goes to the debug log rather than stdout.
    logger.debug(json.dumps(resp_json, indent=4, ensure_ascii=False))
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type is unchanged for callers that
    # catch AssertionError to trigger a retry.
    if 'resultType' not in resp_json or resp_json['resultType'] == 'ipError':
        raise AssertionError(f"unexpected search response for '{query}'")

    documents = resp_json['jginfoList']["documents"]
    results = [item['encJgdm'] for item in documents if item['jyzt'] != '注销']
    logger.info(f"[列表查询成功]{query}")
    return results
|
|
|
-
|
|
|
-
|
|
|
def get_hospital(query, jgdm, proxies, openid):
    """Fetch one organisation's detail record, upsert it, and return it.

    :param query: the search name that produced *jgdm*.
    :param jgdm: encrypted organisation code from the list search.
    :param proxies: proxies mapping handed to ``requests.post`` (may be None).
    :param openid: openid sent in the request payload.
    :return: dict of the fields stored in ``f_hospital_codes``.
    :raises requests.RequestException: when the proxy connection fails.
    :raises AssertionError: when the response ``code`` is missing or not '0'.
    :raises KeyError: when the detail document lacks an expected field.
    """
    url = "https://ss.cods.org.cn/MiniProService/detailPage/detail.base"
    headers = {
        "Host": "ss.cods.org.cn",
        # No fixed Content-Length: the body length varies with the payload,
        # so it is left to requests to compute from the encoded form data.
        "content-type": "application/x-www-form-urlencoded",
        "Accept-Encoding": "gzip,compress,br,deflate",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html",
        "Connection": "keep-alive",
    }
    val = {
        "jgdm": jgdm,
        "keyword": query,
        "platform": "weixin",
        "openid": openid,
    }
    json_str = quote(val)
    data = {
        'jsonString': json_str,
        'sign': md5_hex(json_str),
    }
    try:
        response = requests.post(
            url,
            headers=headers,
            data=data,
            proxies=proxies,
            verify=False,
            timeout=60,
        )
    except requests.exceptions.ProxyError as exc:
        # Keep the original failure attached as the cause for debugging.
        raise requests.RequestException(f"'{jgdm}'医院请求失败") from exc

    resp_json = response.json()
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type is unchanged for callers that
    # catch AssertionError to trigger a retry.
    if "code" not in resp_json or resp_json["code"] != '0':
        raise AssertionError(f"unexpected detail response for '{jgdm}'")

    item = resp_json['document']
    # Former names are optional; a missing or null list yields "".
    alias_name = ",".join(item.get("jgmchis") or [])
    hospital = {
        "search_name": query,  # search name
        "hospital_name": item["jgmc"],  # hospital name
        "alias_name": alias_name,  # former names ("xxx,xxx")
        "credit_no": item["tydm"],  # unified credit code
        "legal_person": item["fddbr"],  # legal representative
        "capital": item["newZczj"],  # registered capital
        "establish_date": item["clrq"],  # date of establishment
        "company_type": "事业单位",  # company type (institution type)
        "operation_startdate": item["jyqxz"],  # operating period start
        "operation_enddate": item["jyqxe"],  # operating period end
        "business_scope": item["jyfw"],  # business scope
        "authority": item["djbmmc"],  # registration authority (approving body name)
        "company_address": item["zcdz"],  # contact address (registered address)
        "company_code": item["djh"],  # registration number
        "organization": item["jjlxdm"],  # economic type
        "industry": item["jjhydm"],  # economic industry
    }
    # One stored record per (search name, credit code) pair.
    f_hospital_codes.update_one(
        {'search_name': query, 'credit_no': item["tydm"]},
        {'$set': hospital},
        upsert=True,
    )
    logger.info(f'[详情查询成功]{hospital["hospital_name"]}')
    return hospital
|
|
|
-
|
|
|
-
|
|
|
def query_hospital(tasks, proxy):
    """Process every task in *tasks*, consuming the list from the front.

    For each task the organisation codes for ``task['name']`` are looked up,
    each code's detail record is fetched via ``get_hospital``, and the task
    document in ``zktest_unexists_name`` is then marked ``is_crawl`` with the
    number of codes found. Fixed pauses (3 s, 30 s, 20 s) separate the
    requests.

    :param tasks: list of task dicts with ``_id`` and ``name``; emptied in place.
    :param proxy: proxy object forwarded to ``callback_requests``.
    """
    while tasks:
        task = tasks.pop(0)
        query = task['name']
        openid = get_openid()
        logger.info(f"[开始查询]{query}")

        jgdm_lst = callback_requests(get_jgdm, query, proxy=proxy, openid=openid)
        total = len(jgdm_lst)  # number of organisations found
        logger.info(f"[查询成功]获取{total}条'{query}'相关信息")
        time.sleep(3)

        for jgdm in jgdm_lst:
            callback_requests(get_hospital, query, jgdm, proxy=proxy, openid=openid)
            logger.info(f"[保存数据]jgdm:{jgdm}")
            time.sleep(30)

        task_filter = {'_id': task['_id']}
        task_update = {'$set': {'is_crawl': True, 'count': total}}
        zktest_unexists_name.update_one(task_filter, task_update)
        time.sleep(20)
|
|
|
-
|
|
|
-
|
|
|
def crawl_spider():
|
|
|
logger.info('开始任务')
|
|
|
proxy = Socks5Proxy(True)
|
|
|
+ global OPENID_DEQUE
|
|
|
while True:
|
|
|
- projection = {'_id': 1, 'name': 1}
|
|
|
- q = {
|
|
|
- "$and": [
|
|
|
- {"count": {"$exists": True}},
|
|
|
- {"count": {"$gt": 0}}
|
|
|
- ],
|
|
|
- 'is_crawl': {'$exists': False}
|
|
|
- }
|
|
|
- cursor = zktest_unexists_name.find(q, projection=projection)
|
|
|
+ cursor = get_cursor()
|
|
|
tasks = [item for item in cursor.limit(5).sort([('_id', -1)])]
|
|
|
if len(tasks) == 0:
|
|
|
logger.info('任务结束')
|
|
@@ -333,7 +30,7 @@ def crawl_spider():
|
|
|
|
|
|
logger.info(f'获取{len(tasks)}条新任务')
|
|
|
try:
|
|
|
- query_hospital(tasks, proxy)
|
|
|
+ query_hospital(tasks, proxy, OPENID_DEQUE)
|
|
|
except TimerError as e:
|
|
|
logger.info(f'[消息通知]{e.msg}')
|
|
|
time.sleep(3600)
|