|
@@ -0,0 +1,289 @@
|
|
|
+import json
|
|
|
+import time
|
|
|
+
|
|
|
+import execjs
|
|
|
+import requests
|
|
|
+import urllib3
|
|
|
+
|
|
|
+from utils.databases import mongo_table
|
|
|
+from utils.log import logger
|
|
|
+from utils.socks5 import Socks5Proxy
|
|
|
+
|
|
|
# The HTTP calls in this module are made with verify=False, so silence the
# warnings urllib3 would otherwise emit for them.
urllib3.disable_warnings()

# Table handles used by the crawler (arguments are presumably database name
# and collection name -- confirm against utils.databases.mongo_table).
# zktest_unexists_name: task records that are queried and then updated.
zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
# f_hospital_codes: destination for the scraped hospital records.
f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
|
|
|
+
|
|
|
+
|
|
|
def md5_hex(val):
    """Return the salted MD5 signature of ``val`` as 32 lowercase hex digits.

    A fixed salt is appended to ``val`` and the result is hashed with MD5.
    Earlier revisions evaluated a JavaScript MD5 routine through ``execjs``
    on every call; that routine is the standard MD5 algorithm applied to the
    low byte of each UTF-16 code unit of the string. ``hashlib`` produces the
    same digest without requiring a JavaScript runtime.

    Args:
        val: Text to sign. In this module it is the percent-encoded JSON
            payload, which is pure ASCII.

    Returns:
        The hexadecimal MD5 digest (lowercase) of ``val`` plus the salt.
    """
    import hashlib  # function-scope import keeps this block self-contained

    salt = "A523B4A5C52203AA9C2D97F6CB45CB35"
    salted = val + salt
    # The JS code hashed `charCodeAt(i) & 255` for every UTF-16 code unit.
    # In little-endian UTF-16 that low byte is every second byte, so slicing
    # with [::2] yields the identical byte stream (plain bytes for ASCII).
    payload = salted.encode("utf-16-le", "surrogatepass")[::2]
    return hashlib.md5(payload).hexdigest()
|
|
|
+
|
|
|
+
|
|
|
def quote(data):
    """Serialize ``data`` to compact JSON and percent-encode the result.

    Produces the same text as JavaScript's
    ``encodeURIComponent(JSON.stringify(data))``, which earlier revisions
    evaluated through ``execjs``, for the payloads built in this module
    (dicts of strings, integers and booleans).

    Args:
        data: A JSON-serializable value.

    Returns:
        The percent-encoded JSON string. Non-ASCII characters are encoded as
        UTF-8 with uppercase hex escapes.
    """
    from urllib.parse import quote as percent_encode

    # JSON.stringify emits no whitespace after ',' / ':' and leaves non-ASCII
    # characters unescaped, hence the separators and ensure_ascii settings.
    compact_json = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
    # encodeURIComponent leaves letters, digits and -_.!~*'() untouched and
    # escapes everything else, including '/'.
    return percent_encode(compact_json, safe="-_.!~*'()")
|
|
|
+
|
|
|
+
|
|
|
def get_jgdm(query, proxies):
    """Search the code service for ``query`` and return the matching codes.

    Posts a signed search request and collects the ``encJgdm`` value of every
    returned document whose ``jyzt`` field is not '注销'.

    Args:
        query: Name to search for.
        proxies: Proxy mapping handed to ``requests.post`` (may be ``None``).

    Returns:
        A list of ``encJgdm`` values, possibly empty.

    Raises:
        requests.RequestException: The request failed; a proxy failure is
            re-raised with a message naming ``query``.
        AssertionError: The response has no ``resultType`` or reports
            ``ipError``. Raised explicitly (rather than via ``assert``) so
            the check survives ``python -O`` and carries a log message.
    """
    url = "https://ss.cods.org.cn/MiniProService/search/searchRMini"
    headers = {
        "Host": "ss.cods.org.cn",
        "content-type": "application/x-www-form-urlencoded",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html"
    }
    val = {
        "q": query,
        "t": "common",
        "currentPage": 1,
        "xzqh": "",
        "jglx": "B",  # institution-type filter (original note: public institution)
        "zczj": "",
        "clrq": "",
        "mobile": "",
        "isDeepSearch": False,
        "platform": "weixin",
        "openid": "o0VVO5Wjhblu4tgm4OkMaJecvsO4"
    }
    # The endpoint takes the encoded JSON together with its salted MD5.
    json_str = quote(val)
    data = {
        "jsonString": json_str,
        "sign": md5_hex(json_str)
    }
    logger.debug(f"proxies >>> {proxies}")
    try:
        response = requests.post(
            url,
            headers=headers,
            data=data,
            verify=False,
            timeout=60,
            proxies=proxies,
        )
    except requests.exceptions.ProxyError as exc:
        # Keep the original failure attached for debugging.
        raise requests.RequestException(f"'{query}'jgdm请求失败") from exc

    resp_json = response.json()
    has_type = 'resultType' in resp_json
    if not has_type or resp_json['resultType'] == 'ipError':
        reason = resp_json['resultType'] if has_type else 'missing resultType'
        raise AssertionError(f"'{query}' search response rejected: {reason}")

    documents = resp_json['jginfoList']["documents"]
    # Skip entries whose status is '注销' (deregistered).
    results = [item['encJgdm'] for item in documents if item['jyzt'] != '注销']
    logger.info(f"[列表查询成功]{proxies} - {query}")
    return results
|
|
|
+
|
|
|
+
|
|
|
def get_hospital(query, jgdm, proxies):
    """Fetch the detail record for ``jgdm``, store it and return it.

    Posts a signed detail request, maps the returned ``document`` onto the
    hospital record layout and inserts that record into ``f_hospital_codes``.

    Args:
        query: Search term that produced ``jgdm``; saved as ``search_name``.
        jgdm: Code identifying the organisation (an ``encJgdm`` value).
        proxies: Proxy mapping handed to ``requests.post`` (may be ``None``).

    Returns:
        The hospital record dict that was inserted.

    Raises:
        requests.RequestException: The request failed; a proxy failure is
            re-raised with a message naming ``jgdm``.
        AssertionError: The response ``code`` is missing or not ``'0'``.
            Raised explicitly (rather than via ``assert``) so the check
            survives ``python -O`` and carries a log message.
        KeyError: The response lacks one of the expected document fields.
    """
    url = "https://ss.cods.org.cn/MiniProService/detailPage/detail.base"
    # No Content-Length here: requests derives it from the encoded body, and
    # a fixed value would be wrong for any payload of a different size.
    headers = {
        "Host": "ss.cods.org.cn",
        "content-type": "application/x-www-form-urlencoded",
        "Accept-Encoding": "gzip,compress,br,deflate",
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
        "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html",
        "Connection": "keep-alive"
    }
    val = {
        "jgdm": jgdm,
        "keyword": query,
        "platform": "weixin",
        "openid": "o0VVO5QnhbdQfl4fkZWw8faTGkZM"
    }
    # The endpoint takes the encoded JSON together with its salted MD5.
    json_str = quote(val)
    data = {
        'jsonString': json_str,
        'sign': md5_hex(json_str)
    }
    logger.debug(f"proxies >>> {proxies}")
    try:
        response = requests.post(
            url,
            headers=headers,
            data=data,
            proxies=proxies,
            verify=False,
            timeout=60,
        )
    except requests.exceptions.ProxyError as exc:
        # Keep the original failure attached for debugging.
        raise requests.RequestException(f"'{jgdm}'医院请求失败") from exc

    resp_json = response.json()
    if "code" not in resp_json or resp_json["code"] != '0':
        reason = resp_json["code"] if "code" in resp_json else "missing code"
        raise AssertionError(f"'{jgdm}' detail response rejected: {reason}")

    item = resp_json['document']
    # Former names are optional; a missing or null list gives an empty string.
    alias_name = ",".join(item.get("jgmchis") or [])
    hospital = {
        "search_name": query,  # search term
        "hospital_name": item["jgmc"],  # hospital name
        "alias_name": alias_name,  # former names ("xxx,xxx")
        "credit_no": item["tydm"],  # unified social credit code
        "legal_person": item["fddbr"],  # legal representative
        "capital": item["newZczj"],  # registered capital
        "establish_date": item["clrq"],  # date of establishment
        "company_type": "事业单位",  # enterprise (institution) type
        "operation_startdate": item["jyqxz"],  # operating period start
        "operation_enddate": item["jyqxe"],  # operating period end
        "business_scope": item["jyfw"],  # business scope
        "authority": item["djbmmc"],  # registration authority (approving body)
        "company_address": item["zcdz"],  # contact (registered) address
        "company_code": item["djh"],  # registration number
        "organization": item["jjlxdm"],  # economic type
        "industry": item["jjhydm"],  # economic sector
    }
    f_hospital_codes.insert_one(hospital)
    logger.info(f'[详情查询成功]{proxies} - {hospital["hospital_name"]}')
    return hospital
|
|
|
+
|
|
|
+
|
|
|
def callback_requests(func, *args, **kwargs):
    """Call ``func`` until it succeeds, switching proxy after each failure.

    ``func`` is invoked as ``func(*args, proxies=..., **kwargs)``. When it
    raises ``requests.RequestException`` or ``AssertionError`` the error is
    logged, the call pauses for three seconds, the proxy (if any) is switched
    and the call is retried. There is no retry limit.

    Args:
        func: Callable that accepts a ``proxies`` keyword argument.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``. The optional
            ``proxy`` keyword is consumed here: an object exposing a
            ``proxies`` attribute and a ``switch()`` method. An explicit
            ``proxies`` keyword takes precedence over ``proxy``.

    Returns:
        Whatever ``func`` returns on its first successful call.
    """
    proxy = kwargs.pop('proxy', None)
    # An explicit ``proxies`` argument from the caller always wins.
    caller_supplied = 'proxies' in kwargs
    while True:
        try:
            if not caller_supplied:
                # Re-read on every attempt: setdefault() would pin the value
                # from the first attempt, so retries after proxy.switch()
                # would keep using the proxy that just failed.
                kwargs['proxies'] = proxy.proxies if proxy is not None else None
            return func(*args, **kwargs)
        except (requests.RequestException, AssertionError) as e:
            logger.error(e)
            time.sleep(3)
            if proxy is not None:
                proxy.switch()
|
|
|
+
|
|
|
+
|
|
|
def query_hospital(tasks, proxy):
    """Process every task in ``tasks``, consuming the list from the front.

    For each task the name is searched, every returned code is fetched and
    stored through ``get_hospital``, and the task record is then updated with
    ``is_crawl`` set and ``count`` holding the number of codes found. Pauses
    are inserted between requests and between tasks.

    Args:
        tasks: List of task dicts with ``_id`` and ``name``; it is emptied.
        proxy: Proxy object forwarded to ``callback_requests``.
    """
    while tasks:
        current = tasks.pop(0)
        name = current['name']
        logger.info(f"[开始查询]{name}")

        codes = callback_requests(get_jgdm, name, proxy=proxy)
        time.sleep(3)
        for code in codes:
            callback_requests(get_hospital, name, code, proxy=proxy)
            time.sleep(5)

        # Number of codes (public institutions) found for this name.
        found = len(codes)
        task_filter = {'_id': current['_id']}
        task_update = {'$set': {'is_crawl': True, 'count': found}}
        zktest_unexists_name.update_one(task_filter, task_update)
        logger.info(f"[查询成功]获取{found}条'{name}'相关信息")
        time.sleep(20)
|
|
|
+
|
|
|
+
|
|
|
def crawl_spider():
    """Run the crawler until no matching tasks remain.

    Repeatedly loads up to five task records that have a positive ``count``
    and no ``is_crawl`` field, processes them with ``query_hospital`` and
    switches the proxy after each batch.
    """
    logger.info('开始任务')
    proxy = Socks5Proxy(True)

    # Only the fields query_hospital reads are fetched.
    wanted_fields = {'_id': 1, 'name': 1}
    pending_filter = {
        "$and": [
            {"count": {"$exists": True}},
            {"count": {"$gt": 0}}
        ],
        'is_crawl': {'$exists': False}
    }

    while True:
        cursor = zktest_unexists_name.find(pending_filter, projection=wanted_fields)
        tasks = list(cursor.limit(5))
        if not tasks:
            logger.info('任务结束')
            break

        logger.info(f'获取{len(tasks)}条新任务')
        query_hospital(tasks, proxy)
        proxy.switch()
|
|
|
+
|
|
|
+
|
|
|
# Start the crawler only when this file is executed as a script.
if __name__ == '__main__':
    crawl_spider()
|