|
@@ -15,6 +15,7 @@ urllib3.disable_warnings()
|
|
|
|
|
|
zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
|
|
zktest_unexists_name = mongo_table('py_spider', 'zktest_unexists_name')
|
|
f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
|
|
f_hospital_codes = mongo_table('py_theme', 'f_hospital_codes')
|
|
|
|
+openid = None # 全局openid
|
|
|
|
|
|
|
|
|
|
class TimerError(IOError):
|
|
class TimerError(IOError):
|
|
@@ -40,11 +41,9 @@ def spider_listener(func):
|
|
def wrapper(*args, **kwargs):
    # Gate the wrapped callable on a time window: weekday index 0-4
    # (Monday to Friday) and hour of day between 9 and 18 inclusive.
    on_workday = 0 <= datetime.datetime.now().weekday() <= 4
    in_open_hours = 9 <= datetime.datetime.now().hour <= 18
    if not (on_workday and in_open_hours):
        # Outside the window the wrapped callable is never invoked.
        raise TimerError('小程序接口停止运营')
    return func(*args, **kwargs)
|
|
return wrapper
|
|
return wrapper
|
|
@@ -160,19 +159,21 @@ def quote(data):
|
|
return quote_str
|
|
return quote_str
|
|
|
|
|
|
|
|
|
|
def check_response(response, *args):
    """Inspect a JSON response and raise ``CrawlError`` for failure payloads.

    The decoded body is logged at debug level, then checked in this order:

    1. ``resultType`` equal to ``'ipError'``: raise with ``resultTypeMemo``.
    2. ``code`` present and not equal to the string ``'0'``: raise with ``msg``.
    3. An empty decoded body: raise with a fixed message.

    Args:
        response: Object exposing ``.json()`` that returns the decoded body.
        *args: Optional; when given, ``args[0]`` is appended to the error
            message as `` OpenId:<value>`` so the failing id shows in logs.

    Returns:
        None when none of the failure conditions match.

    Raises:
        CrawlError: When any of the three conditions above is met.
    """
    resp_json = response.json()
    logger.debug(json.dumps(resp_json, indent=4, ensure_ascii=False))

    # The suffix must be a string even when no openid is supplied: the
    # previous ``None`` fallback made every raise below fail with TypeError
    # for one-argument calls, hiding the real CrawlError from callers.
    open_id = f" OpenId:{args[0]}" if args else ""

    if 'resultType' in resp_json and resp_json['resultType'] == 'ipError':
        # f-string formatting also tolerates a non-string memo value.
        raise CrawlError(f"{resp_json['resultTypeMemo']}{open_id}")

    if 'code' in resp_json and resp_json["code"] != '0':
        raise CrawlError(f"{resp_json['msg']}{open_id}")

    if len(resp_json) == 0:
        raise CrawlError(f"详情页请求结果为空{open_id}")
|
|
|
|
|
|
|
|
|
|
@spider_listener
|
|
@spider_listener
|
|
def callback_requests(func, *args, **kwargs):
|
|
def callback_requests(func, *args, **kwargs):
|
|
|
|
+ global openid
|
|
proxy = kwargs.pop('proxy', None)
|
|
proxy = kwargs.pop('proxy', None)
|
|
openid = kwargs.pop('openid')
|
|
openid = kwargs.pop('openid')
|
|
openid_dq = kwargs.pop('openid_dq')
|
|
openid_dq = kwargs.pop('openid_dq')
|
|
@@ -190,7 +191,7 @@ def callback_requests(func, *args, **kwargs):
|
|
if proxy is not None:
|
|
if proxy is not None:
|
|
proxy.switch()
|
|
proxy.switch()
|
|
except (CrawlError, AssertionError) as e:
|
|
except (CrawlError, AssertionError) as e:
|
|
- logger.error(f"[OpenId异常]:{e}")
|
|
|
|
|
|
+ logger.error(f"[查询异常]:{e}")
|
|
openid = get_openid(openid_dq)
|
|
openid = get_openid(openid_dq)
|
|
time.sleep(3)
|
|
time.sleep(3)
|
|
|
|
|
|
@@ -237,7 +238,7 @@ def get_jgdm(query, proxies, openid):
|
|
raise RequestError(f"'{query}'jgdm请求失败, 原因:{e}")
|
|
raise RequestError(f"'{query}'jgdm请求失败, 原因:{e}")
|
|
|
|
|
|
# print(response)
|
|
# print(response)
|
|
- check_response(response)
|
|
|
|
|
|
+ check_response(response, openid)
|
|
resp_json = response.json()
|
|
resp_json = response.json()
|
|
assert 'resultType' in resp_json and resp_json['resultType'] != 'ipError'
|
|
assert 'resultType' in resp_json and resp_json['resultType'] != 'ipError'
|
|
documents = resp_json['jginfoList']["documents"]
|
|
documents = resp_json['jginfoList']["documents"]
|
|
@@ -283,7 +284,7 @@ def get_hospital(query, jgdm, proxies, openid):
|
|
except requests.RequestException as e:
|
|
except requests.RequestException as e:
|
|
raise RequestError(f"'{jgdm}'医院详情请求失败, 原因:{e}")
|
|
raise RequestError(f"'{jgdm}'医院详情请求失败, 原因:{e}")
|
|
|
|
|
|
- check_response(response)
|
|
|
|
|
|
+ check_response(response, openid)
|
|
resp_json = response.json()
|
|
resp_json = response.json()
|
|
# print(json.dumps(resp_json, indent=4, ensure_ascii=False))
|
|
# print(json.dumps(resp_json, indent=4, ensure_ascii=False))
|
|
assert "code" in resp_json and resp_json["code"] == '0'
|
|
assert "code" in resp_json and resp_json["code"] == '0'
|
|
@@ -320,22 +321,20 @@ def get_hospital(query, jgdm, proxies, openid):
|
|
|
|
|
|
|
|
|
|
def query_hospital(tasks, proxy, openid_deque):
|
|
def query_hospital(tasks, proxy, openid_deque):
|
|
|
|
+ global openid
|
|
while len(tasks) > 0:
|
|
while len(tasks) > 0:
|
|
task = tasks.pop(0)
|
|
task = tasks.pop(0)
|
|
query = task['name']
|
|
query = task['name']
|
|
- logger.info(f"[开始查询]{query}")
|
|
|
|
openid = get_openid(openid_deque)
|
|
openid = get_openid(openid_deque)
|
|
- request_params = dict(
|
|
|
|
- proxy=proxy,
|
|
|
|
- openid=openid,
|
|
|
|
- openid_dq=openid_deque
|
|
|
|
- )
|
|
|
|
|
|
+ logger.info(f"[开始查询]{query}")
|
|
|
|
+ params = dict(proxy=proxy, openid=openid, openid_dq=openid_deque)
|
|
# 列表页
|
|
# 列表页
|
|
- jgdm_lst = callback_requests(get_jgdm, query, **request_params)
|
|
|
|
|
|
+ jgdm_lst = callback_requests(get_jgdm, query, **params)
|
|
time.sleep(3)
|
|
time.sleep(3)
|
|
# 详情页
|
|
# 详情页
|
|
for jgdm in jgdm_lst:
|
|
for jgdm in jgdm_lst:
|
|
- callback_requests(get_hospital, query, jgdm, **request_params)
|
|
|
|
|
|
+ params.update(dict(openid=openid))
|
|
|
|
+ callback_requests(get_hospital, query, jgdm, **params)
|
|
logger.info(f"[保存数据]jgdm:{jgdm}")
|
|
logger.info(f"[保存数据]jgdm:{jgdm}")
|
|
time.sleep(15)
|
|
time.sleep(15)
|
|
# 更新采集任务状态
|
|
# 更新采集任务状态
|