dongzhaorui před 3 roky
rodič
revize
f4d0feb45b

+ 1 - 1
codes_hospital/crawl_hospital.py

@@ -19,7 +19,7 @@ OPENID_DEQUE = deque([
 
 def crawl_spider():
     logger.info('开始任务')
-    proxy = Socks5Proxy(True)
+    proxy = Socks5Proxy(False)
     global OPENID_DEQUE
     while True:
         cursor = get_cursor()

+ 1 - 1
codes_hospital/crawl_hospital_2.py

@@ -19,7 +19,7 @@ OPENID_DEQUE = deque([
 
 def crawl_spider():
     logger.info('开始任务')
-    proxy = Socks5Proxy(True)
+    proxy = Socks5Proxy(False)
     global OPENID_DEQUE
     while True:
         cursor = get_cursor()

+ 1 - 1
codes_hospital/crawl_hospital_3.py

@@ -18,7 +18,7 @@ OPENID_DEQUE = deque([
 
 def crawl_spider():
     logger.info('开始任务')
-    proxy = Socks5Proxy(True)
+    proxy = Socks5Proxy(False)
     global OPENID_DEQUE
     while True:
         cursor = get_cursor()

+ 11 - 7
codes_hospital/defaults.py

@@ -168,7 +168,7 @@ def check_response(response):
     if 'code' in resp_json and resp_json["code"] != '0':
         raise CrawlError(resp_json['msg'])
     if len(resp_json) == 0:
-        raise CrawlError(f"响应结果:{resp_json}")
+        raise CrawlError(f"详情页请求结果为空")
 
 
 @spider_listener
@@ -189,7 +189,8 @@ def callback_requests(func, *args, **kwargs):
             time.sleep(3)
             if proxy is not None:
                 proxy.switch()
-        except (CrawlError, AssertionError):
+        except (CrawlError, AssertionError) as e:
+            logger.error(f"[OpenId异常]:{e}")
             openid = get_openid(openid_dq)
             time.sleep(3)
 
@@ -243,7 +244,7 @@ def get_jgdm(query, proxies, openid):
     for item in documents:
         if item['jyzt'] != '注销':
             results.append(item['encJgdm'])
-    logger.info(f"[查询成功]列表页 - {query}")
+    logger.info(f"[查询成功]获取{len(results)}条'{query}'相关信息")
     return results
 
 
@@ -314,7 +315,7 @@ def get_hospital(query, jgdm, proxies, openid):
         upsert=True
     )
     # print(json.dumps(hospital, indent=4, ensure_ascii=False))
-    logger.info(f'[查询成功]详情页 - {hospital["hospital_name"]}')
+    logger.info(f"[查询成功]获取'{hospital['hospital_name']}'详情数据")
     return hospital
 
 
@@ -331,8 +332,6 @@ def query_hospital(tasks, proxy, openid_deque):
         )
         # 列表页
         jgdm_lst = callback_requests(get_jgdm, query, **request_params)
-        total = len(jgdm_lst)  # 事业单位的数量
-        logger.info(f"[查询成功]获取{total}条'{query}'相关信息")
         time.sleep(3)
         # 详情页
         for jgdm in jgdm_lst:
@@ -342,7 +341,12 @@ def query_hospital(tasks, proxy, openid_deque):
         # 更新采集任务状态
         zktest_unexists_name.update_one(
             {'_id': task['_id']},
-            {'$set': {'is_crawl': True, 'count': total}}
+            {
+                '$set': {
+                    'is_crawl': True,
+                    'count': len(jgdm_lst)  # 事业单位的数量
+                }
+            }
         )
         time.sleep(60)