# defaults.py
import datetime
import hashlib
import json
import time
import urllib.parse
from collections import deque
from functools import wraps

import execjs
import requests
import urllib3

from utils.databases import mongo_table
from utils.log import logger
  11. urllib3.disable_warnings()
  12. zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
  13. f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
  14. openid = None # 全局openid
  15. class TimerError(IOError):
  16. def __init__(self, *args, **kwargs):
  17. self.msg = args[0]
  18. class CrawlError(Exception):
  19. def __init__(self, *args, **kwargs):
  20. self.msg = args[0]
  21. class RequestError(Exception):
  22. def __init__(self, *args, **kwargs):
  23. self.msg = args[0]
  24. def spider_listener(func):
  25. @wraps(func)
  26. def wrapper(*args, **kwargs):
  27. if all([
  28. 0 <= datetime.datetime.now().weekday() <= 4, # 周一到周五
  29. 9 <= datetime.datetime.now().hour <= 18 # 早9点到晚19点
  30. ]):
  31. result = func(*args, **kwargs)
  32. return result
  33. raise TimerError('小程序接口停止运营')
  34. return wrapper
  35. def get_cursor():
  36. projection = {'_id': 1, 'name': 1}
  37. q = {
  38. "$and": [
  39. {"count": {"$exists": True}},
  40. {"count": {"$gt": 0}}
  41. ],
  42. 'is_crawl': {'$exists': False}
  43. }
  44. cursor = zktest_unexists_name.find(q, projection=projection)
  45. return cursor
  46. def get_openid(openid_deque: deque):
  47. openid = openid_deque.popleft()
  48. openid_deque.append(openid)
  49. return openid
  50. def md5_hex(val):
  51. salt = "A523B4A5C52203AA9C2D97F6CB45CB35"
  52. val = val + salt
  53. script = """
  54. function n(t, e, n, r, o, i) {
  55. return u(function(t, e) {
  56. return t << e | t >>> 32 - e;
  57. }(u(u(e, t), u(r, i)), o), n);
  58. }
  59. function r(t, e, r, o, i, a, u) {
  60. return n(e & r | ~e & o, t, e, i, a, u);
  61. }
  62. function o(t, e, r, o, i, a, u) {
  63. return n(e & o | r & ~o, t, e, i, a, u);
  64. }
  65. function i(t, e, r, o, i, a, u) {
  66. return n(e ^ r ^ o, t, e, i, a, u);
  67. }
  68. function a(t, e, r, o, i, a, u) {
  69. return n(r ^ (e | ~o), t, e, i, a, u);
  70. }
  71. function u(t, e) {
  72. var n = (65535 & t) + (65535 & e);
  73. return (t >> 16) + (e >> 16) + (n >> 16) << 16 | 65535 & n;
  74. }
  75. hex_md5= function(t) {
  76. return function(t) {
  77. for (var e = "0123456789abcdef", n = "", r = 0; r < 4 * t.length; r++) n += e.charAt(t[r >> 2] >> r % 4 * 8 + 4 & 15) + e.charAt(t[r >> 2] >> r % 4 * 8 & 15);
  78. return n;
  79. }(function(t, e) {
  80. t[e >> 5] |= 128 << e % 32, t[14 + (e + 64 >>> 9 << 4)] = e;
  81. for (var n = 1732584193, c = -271733879, s = -1732584194, f = 271733878, l = 0; l < t.length; l += 16) {
  82. var p = n, h = c, d = s, v = f;
  83. n = r(n, c, s, f, t[l + 0], 7, -680876936), f = r(f, n, c, s, t[l + 1], 12, -389564586),
  84. s = r(s, f, n, c, t[l + 2], 17, 606105819), c = r(c, s, f, n, t[l + 3], 22, -1044525330),
  85. n = r(n, c, s, f, t[l + 4], 7, -176418897), f = r(f, n, c, s, t[l + 5], 12, 1200080426),
  86. s = r(s, f, n, c, t[l + 6], 17, -1473231341), c = r(c, s, f, n, t[l + 7], 22, -45705983),
  87. n = r(n, c, s, f, t[l + 8], 7, 1770035416), f = r(f, n, c, s, t[l + 9], 12, -1958414417),
  88. s = r(s, f, n, c, t[l + 10], 17, -42063), c = r(c, s, f, n, t[l + 11], 22, -1990404162),
  89. n = r(n, c, s, f, t[l + 12], 7, 1804603682), f = r(f, n, c, s, t[l + 13], 12, -40341101),
  90. s = r(s, f, n, c, t[l + 14], 17, -1502002290), c = r(c, s, f, n, t[l + 15], 22, 1236535329),
  91. n = o(n, c, s, f, t[l + 1], 5, -165796510), f = o(f, n, c, s, t[l + 6], 9, -1069501632),
  92. s = o(s, f, n, c, t[l + 11], 14, 643717713), c = o(c, s, f, n, t[l + 0], 20, -373897302),
  93. n = o(n, c, s, f, t[l + 5], 5, -701558691), f = o(f, n, c, s, t[l + 10], 9, 38016083),
  94. s = o(s, f, n, c, t[l + 15], 14, -660478335), c = o(c, s, f, n, t[l + 4], 20, -405537848),
  95. n = o(n, c, s, f, t[l + 9], 5, 568446438), f = o(f, n, c, s, t[l + 14], 9, -1019803690),
  96. s = o(s, f, n, c, t[l + 3], 14, -187363961), c = o(c, s, f, n, t[l + 8], 20, 1163531501),
  97. n = o(n, c, s, f, t[l + 13], 5, -1444681467), f = o(f, n, c, s, t[l + 2], 9, -51403784),
  98. s = o(s, f, n, c, t[l + 7], 14, 1735328473), c = o(c, s, f, n, t[l + 12], 20, -1926607734),
  99. n = i(n, c, s, f, t[l + 5], 4, -378558), f = i(f, n, c, s, t[l + 8], 11, -2022574463),
  100. s = i(s, f, n, c, t[l + 11], 16, 1839030562), c = i(c, s, f, n, t[l + 14], 23, -35309556),
  101. n = i(n, c, s, f, t[l + 1], 4, -1530992060), f = i(f, n, c, s, t[l + 4], 11, 1272893353),
  102. s = i(s, f, n, c, t[l + 7], 16, -155497632), c = i(c, s, f, n, t[l + 10], 23, -1094730640),
  103. n = i(n, c, s, f, t[l + 13], 4, 681279174), f = i(f, n, c, s, t[l + 0], 11, -358537222),
  104. s = i(s, f, n, c, t[l + 3], 16, -722521979), c = i(c, s, f, n, t[l + 6], 23, 76029189),
  105. n = i(n, c, s, f, t[l + 9], 4, -640364487), f = i(f, n, c, s, t[l + 12], 11, -421815835),
  106. s = i(s, f, n, c, t[l + 15], 16, 530742520), c = i(c, s, f, n, t[l + 2], 23, -995338651),
  107. n = a(n, c, s, f, t[l + 0], 6, -198630844), f = a(f, n, c, s, t[l + 7], 10, 1126891415),
  108. s = a(s, f, n, c, t[l + 14], 15, -1416354905), c = a(c, s, f, n, t[l + 5], 21, -57434055),
  109. n = a(n, c, s, f, t[l + 12], 6, 1700485571), f = a(f, n, c, s, t[l + 3], 10, -1894986606),
  110. s = a(s, f, n, c, t[l + 10], 15, -1051523), c = a(c, s, f, n, t[l + 1], 21, -2054922799),
  111. n = a(n, c, s, f, t[l + 8], 6, 1873313359), f = a(f, n, c, s, t[l + 15], 10, -30611744),
  112. s = a(s, f, n, c, t[l + 6], 15, -1560198380), c = a(c, s, f, n, t[l + 13], 21, 1309151649),
  113. n = a(n, c, s, f, t[l + 4], 6, -145523070), f = a(f, n, c, s, t[l + 11], 10, -1120210379),
  114. s = a(s, f, n, c, t[l + 2], 15, 718787259), c = a(c, s, f, n, t[l + 9], 21, -343485551),
  115. n = u(n, p), c = u(c, h), s = u(s, d), f = u(f, v);
  116. }
  117. return Array(n, c, s, f);
  118. }(function(t) {
  119. for (var e = Array(), n = 0; n < 8 * t.length; n += 8) e[n >> 5] |= (255 & t.charCodeAt(n / 8)) << n % 32;
  120. return e;
  121. }(t), 8 * t.length));
  122. }
  123. """
  124. ctx = execjs.compile(script)
  125. result = ctx.call('hex_md5', val)
  126. # print(result)
  127. return result
  128. def quote(data):
  129. script = """
  130. getQuote = function(data){return encodeURIComponent(JSON.stringify(data))}
  131. """
  132. ctx = execjs.compile(script)
  133. quote_str = ctx.call('getQuote', data)
  134. # print(quote_str)
  135. return quote_str
  136. def check_response(response, *args):
  137. resp_json = response.json()
  138. logger.debug(json.dumps(resp_json, indent=4, ensure_ascii=False))
  139. open_id = f" OpenId:{args[0]}" if len(args) > 0 else None
  140. if 'resultType' in resp_json and resp_json['resultType'] == 'ipError':
  141. raise CrawlError(resp_json['resultTypeMemo'] + open_id)
  142. if 'code' in resp_json and resp_json["code"] != '0':
  143. raise CrawlError(resp_json['msg'] + open_id)
  144. if len(resp_json) == 0:
  145. raise CrawlError("详情页请求结果为空" + open_id)
  146. @spider_listener
  147. def callback_requests(func, *args, **kwargs):
  148. global openid
  149. proxy = kwargs.pop('proxy', None)
  150. openid = kwargs.pop('openid')
  151. openid_dq = kwargs.pop('openid_dq')
  152. while True:
  153. kwargs['openid'] = openid
  154. logger.debug(f"[当前openid]:{openid}")
  155. proxies = proxy.proxies if proxy is not None else None
  156. kwargs['proxies'] = proxies
  157. logger.debug(f"[当前代理]:{proxies}")
  158. try:
  159. return func(*args, **kwargs)
  160. except RequestError as e:
  161. logger.error(f"[请求异常]:{e}")
  162. time.sleep(3)
  163. if proxy is not None:
  164. proxy.switch()
  165. except (CrawlError, AssertionError) as e:
  166. logger.error(f"[查询异常]:{e}")
  167. openid = get_openid(openid_dq)
  168. time.sleep(3)
  169. def get_jgdm(query, proxies, openid):
  170. results = []
  171. url = "https://ss.cods.org.cn/MiniProService/search/searchRMini"
  172. headers = {
  173. "Host": "ss.cods.org.cn",
  174. "content-type": "application/x-www-form-urlencoded",
  175. "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
  176. "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html"
  177. }
  178. val = {
  179. "q": query,
  180. "t": "common",
  181. "currentPage": 1,
  182. "xzqh": "",
  183. "jglx": "B", # 事业类型
  184. "zczj": "",
  185. "clrq": "",
  186. "mobile": "",
  187. "isDeepSearch": False,
  188. "platform": "weixin",
  189. "openid": openid
  190. }
  191. json_str = quote(val)
  192. sign = md5_hex(json_str)
  193. data = {
  194. "jsonString": json_str,
  195. "sign": sign
  196. }
  197. # print(data)
  198. request_params = dict(
  199. headers=headers,
  200. data=data,
  201. verify=False,
  202. timeout=60,
  203. proxies=proxies
  204. )
  205. try:
  206. response = requests.post(url, **request_params)
  207. except requests.RequestException as e:
  208. raise RequestError(f"'{query}'jgdm请求失败, 原因:{e}")
  209. # print(response)
  210. check_response(response, openid)
  211. resp_json = response.json()
  212. assert 'resultType' in resp_json and resp_json['resultType'] != 'ipError'
  213. documents = resp_json['jginfoList']["documents"]
  214. for item in documents:
  215. if item['jyzt'] != '注销':
  216. results.append(item['encJgdm'])
  217. logger.info(f"[查询成功]获取{len(results)}条'{query}'相关信息")
  218. return results
  219. def get_hospital(query, jgdm, proxies, openid):
  220. url = "https://ss.cods.org.cn/MiniProService/detailPage/detail.base"
  221. headers = {
  222. "Host": "ss.cods.org.cn",
  223. "Content-Length": "531",
  224. "content-type": "application/x-www-form-urlencoded",
  225. "Accept-Encoding": "gzip,compress,br,deflate",
  226. "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.26(0x18001a2e) NetType/WIFI Language/zh_CN",
  227. "Referer": "https://servicewechat.com/wxa97584cd2e4d83ad/10/page-frame.html",
  228. "Connection": "keep-alive"
  229. }
  230. val = {
  231. "jgdm": jgdm,
  232. "keyword": query,
  233. "platform": "weixin",
  234. "openid": openid
  235. }
  236. json_str = quote(val)
  237. data = {
  238. 'jsonString': json_str,
  239. 'sign': md5_hex(json_str)
  240. }
  241. # print(data)
  242. request_params = dict(
  243. headers=headers,
  244. data=data,
  245. proxies=proxies,
  246. verify=False,
  247. timeout=60
  248. )
  249. try:
  250. response = requests.post(url, **request_params)
  251. except requests.RequestException as e:
  252. raise RequestError(f"'{jgdm}'医院详情请求失败, 原因:{e}")
  253. check_response(response, openid)
  254. resp_json = response.json()
  255. # print(json.dumps(resp_json, indent=4, ensure_ascii=False))
  256. assert "code" in resp_json and resp_json["code"] == '0'
  257. item = resp_json['document']
  258. alias_name = ""
  259. if "jgmchis" in item:
  260. alias_name = ",".join(item["jgmchis"])
  261. hospital = {
  262. "search_name": query, # 搜索名称
  263. "hospital_name": item["jgmc"], # 医院名称
  264. "alias_name": alias_name, # 曾用名 ("xxx,xxx")
  265. "credit_no": item["tydm"], # 统一信用代码
  266. "legal_person": item["fddbr"], # 法定代表人
  267. "capital": item["newZczj"], # 注册资本
  268. "establish_date": item["clrq"], # 成立日期
  269. "company_type": "事业单位", # 企业类型 (机构类型)
  270. "operation_startdate": item["jyqxz"], # 营业期限自
  271. "operation_enddate": item["jyqxe"], # 营业期限至
  272. "business_scope": item["jyfw"], # 经营范围
  273. "authority": item["djbmmc"], # 登记机关(批准机构名称)
  274. "company_address": item["zcdz"], # 联系地址(注册地址)
  275. "company_code": item["djh"], # 注册号(登记号)
  276. "organization": item["jjlxdm"], # (经济类型)
  277. "industry": item["jjhydm"], # (经济行业)
  278. }
  279. f_hospital_codes.update_one(
  280. {'search_name': query, 'credit_no': item["tydm"]},
  281. {'$set': hospital},
  282. upsert=True
  283. )
  284. # print(json.dumps(hospital, indent=4, ensure_ascii=False))
  285. logger.info(f"[查询成功]获取'{hospital['hospital_name']}'详情数据")
  286. return hospital
  287. def query_hospital(tasks, proxy, openid_deque):
  288. global openid
  289. while len(tasks) > 0:
  290. task = tasks.pop(0)
  291. query = task['name']
  292. openid = get_openid(openid_deque)
  293. logger.info(f"[开始查询]{query}")
  294. params = dict(proxy=proxy, openid=openid, openid_dq=openid_deque)
  295. # 列表页
  296. jgdm_lst = callback_requests(get_jgdm, query, **params)
  297. time.sleep(3)
  298. # 详情页
  299. for jgdm in jgdm_lst:
  300. params.update(dict(openid=openid))
  301. callback_requests(get_hospital, query, jgdm, **params)
  302. logger.info(f"[保存数据]jgdm:{jgdm}")
  303. time.sleep(15)
  304. # 更新采集任务状态
  305. zktest_unexists_name.update_one(
  306. {'_id': task['_id']},
  307. {
  308. '$set': {
  309. 'is_crawl': True,
  310. 'count': len(jgdm_lst) # 事业单位的数量
  311. }
  312. }
  313. )
  314. time.sleep(60)
  315. # if __name__ == '__main__':
  316. # get_hospital('沈阳市儿童医院', '1653fbc6f5c496974321f967286cba59', None, 'o0VVO5Qj5EZzjeaKjCQUhhiYprBw')
  317. # get_jgdm('沈阳市儿童医院', None, 'o0VVO5Qj5EZzjeaKjCQUhhiYprBw')