|
@@ -168,7 +168,7 @@ def check_response(response):
|
|
|
if 'code' in resp_json and resp_json["code"] != '0':
|
|
|
raise CrawlError(resp_json['msg'])
|
|
|
if len(resp_json) == 0:
|
|
|
- raise CrawlError(f"响应结果:{resp_json}")
|
|
|
+ raise CrawlError(f"详情页请求结果为空")
|
|
|
|
|
|
|
|
|
@spider_listener
|
|
@@ -189,7 +189,8 @@ def callback_requests(func, *args, **kwargs):
|
|
|
time.sleep(3)
|
|
|
if proxy is not None:
|
|
|
proxy.switch()
|
|
|
- except (CrawlError, AssertionError):
|
|
|
+ except (CrawlError, AssertionError) as e:
|
|
|
+ logger.error(f"[OpenId异常]:{e}")
|
|
|
openid = get_openid(openid_dq)
|
|
|
time.sleep(3)
|
|
|
|
|
@@ -243,7 +244,7 @@ def get_jgdm(query, proxies, openid):
|
|
|
for item in documents:
|
|
|
if item['jyzt'] != '注销':
|
|
|
results.append(item['encJgdm'])
|
|
|
- logger.info(f"[查询成功]列表页 - {query}")
|
|
|
+ logger.info(f"[查询成功]获取{len(results)}条'{query}'相关信息")
|
|
|
return results
|
|
|
|
|
|
|
|
@@ -314,7 +315,7 @@ def get_hospital(query, jgdm, proxies, openid):
|
|
|
upsert=True
|
|
|
)
|
|
|
# print(json.dumps(hospital, indent=4, ensure_ascii=False))
|
|
|
- logger.info(f'[查询成功]详情页 - {hospital["hospital_name"]}')
|
|
|
+ logger.info(f"[查询成功]获取'{hospital['hospital_name']}'详情数据")
|
|
|
return hospital
|
|
|
|
|
|
|
|
@@ -331,8 +332,6 @@ def query_hospital(tasks, proxy, openid_deque):
|
|
|
)
|
|
|
# 列表页
|
|
|
jgdm_lst = callback_requests(get_jgdm, query, **request_params)
|
|
|
- total = len(jgdm_lst) # 事业单位的数量
|
|
|
- logger.info(f"[查询成功]获取{total}条'{query}'相关信息")
|
|
|
time.sleep(3)
|
|
|
# 详情页
|
|
|
for jgdm in jgdm_lst:
|
|
@@ -342,7 +341,12 @@ def query_hospital(tasks, proxy, openid_deque):
|
|
|
# 更新采集任务状态
|
|
|
zktest_unexists_name.update_one(
|
|
|
{'_id': task['_id']},
|
|
|
- {'$set': {'is_crawl': True, 'count': total}}
|
|
|
+ {
|
|
|
+ '$set': {
|
|
|
+ 'is_crawl': True,
|
|
|
+ 'count': len(jgdm_lst) # 事业单位的数量
|
|
|
+ }
|
|
|
+ }
|
|
|
)
|
|
|
time.sleep(60)
|
|
|
|