
update: filter special websites

dongzhaorui 2 years ago
parent commit d3ef68909f
1 changed file with 26 additions and 42 deletions

A数据处理/sync_data/monitor_summary.py  +26 -42

@@ -45,6 +45,9 @@ spider_monitor = mongodb1["spider_monitor"]
 # luaconfig table
 spider_lua_config = mongodb2["luaconfig"]
 
+# Special websites
+special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]
+
 
 def get_md5(*args, **kwargs):
     """
@@ -58,8 +61,8 @@ def get_md5(*args, **kwargs):
     conditions = ["site", "channel", "spidercode"]
     data_lst = list(filter(lambda x: x is not None, args))
     for k, v in kwargs.items():
-        if k in conditions and kwargs[k] and kwargs[k] not in data_lst:
-            data_lst.append(kwargs[k])
+        if k in conditions and (v and v not in data_lst):
+            data_lst.append(v)
 
     if not data_lst or len(data_lst) != 3:
         # raise AttributeError(f"missing {conditions} attributes")
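
The get_md5 change is a pure readability refactor: the loop already binds each keyword value to `v`, so the repeated `kwargs[k]` lookups are dropped; behavior is unchanged. For context, a self-contained sketch of how the helper reads after this commit; the hashing step itself is an assumption, since the diff does not show it:

```python
import hashlib


def get_md5(*args, **kwargs):
    """Fingerprint a crawler by its site, channel and spidercode."""
    conditions = ["site", "channel", "spidercode"]
    data_lst = list(filter(lambda x: x is not None, args))
    for k, v in kwargs.items():
        if k in conditions and (v and v not in data_lst):
            data_lst.append(v)

    if not data_lst or len(data_lst) != 3:
        return None  # incomplete identity; the caller skips the record

    # Assumed hashing step -- the real code may normalize ordering differently.
    return hashlib.md5("".join(map(str, data_lst)).encode("utf-8")).hexdigest()
```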
@@ -187,13 +190,11 @@ def aggregate_query_crawl_count(runtime):
             spidercode = items["spidercode"]
             business_type = items["business_type"]
 
-            if site == "湖北省住房和城乡建设厅":
-                print("123")
-
-            if len(spider_item) > 1:
-                logger.warning(f"{spidercode} -> {site}--存在风险, {len(spider_item)}")
+            if len(spider_item) > 1 and site not in special_sites:
+                logger.warning(f"[Monitor]{spidercode} -> {site}--存在风险, {len(spider_item)}")
 
             is_list = str(business_type).endswith("List")
+
             hash_key = get_md5(**items)  # prevent data overlap when multiple sites map to one spidercode
             if not hash_key:
                 # logger.error(f"[Monitor]{site}-{channel}-{spidercode}--monitoring anomaly")
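
Two things happen in this hunk: a leftover debug branch (a hard-coded site check that only printed "123") is deleted, and the risk warning, whose "存在风险" message means "risk present", now skips the special sites. The new condition behaves like this standalone predicate (an illustration, not code from the commit):

```python
special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]


def should_warn(site, spider_item):
    # Warn only when one spidercode has several monitor entries
    # and the site is not on the exclusion list.
    return len(spider_item) > 1 and site not in special_sites


assert should_warn("湖北省住房和城乡建设厅", [{}, {}])
assert not should_warn("云南省政府采购网", [{}, {}])
assert not should_warn("湖北省住房和城乡建设厅", [{}])
```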
@@ -241,23 +242,24 @@ def aggregate_query_crawl_count(runtime):
 
             # Monitor crawler tasks: spidercode_at_site_num > 1 means
             # the created crawler tasks are flawed; report the issue to the data-sourcing staff
-            label = f"{business_type}_{spidercode}"
-            if label not in label_dict:
-                aggregate_items[hash_key]["spidercode_at_site_num"] = 1
-                conditions = {"keys": [hash_key], "websites": [site]}
-                label_dict.setdefault(label, conditions)
-            else:
-                # count +1 for crawlers with the same spidercode but a different site
-                websites = label_dict[label]["websites"]
-                if site not in websites:
-                    keys = label_dict[label]["keys"]
-                    for key in keys:
-                        aggregate_items[key]["spidercode_at_site_num"] += 1
-                    # record the identity id - hash_key
-                    keys.append(hash_key)
-                    # record the site
-                    websites.append(site)
-                    aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)
+            if site not in special_sites:
+                label = f"{business_type}_{spidercode}"
+                if label not in label_dict:
+                    aggregate_items[hash_key]["spidercode_at_site_num"] = 1
+                    conditions = {"keys": [hash_key], "websites": [site]}
+                    label_dict.setdefault(label, conditions)
+                else:
+                    # count +1 for crawlers with the same spidercode but a different site
+                    websites = label_dict[label]["websites"]
+                    if site not in websites:
+                        keys = label_dict[label]["keys"]
+                        for key in keys:
+                            aggregate_items[key]["spidercode_at_site_num"] += 1
+                        # record the identity id - hash_key
+                        keys.append(hash_key)
+                        # record the site
+                        websites.append(site)
+                        aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)
 
     return aggregate_items
 
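The cross-site counting itself is unchanged; it is only wrapped in the special_sites guard, so excluded sites no longer inflate spidercode_at_site_num for every entry sharing their spidercode. The bookkeeping can be replayed in isolation (hypothetical records stand in for the aggregation output):

```python
# Self-contained replay of the counting logic with hypothetical input;
# hash_key stands in for get_md5(**items).
records = [
    {"hash_key": "k1", "site": "site-A", "business_type": "xList", "spidercode": "sc1"},
    {"hash_key": "k2", "site": "site-B", "business_type": "xList", "spidercode": "sc1"},
]
special_sites = []
aggregate_items = {r["hash_key"]: {} for r in records}
label_dict = {}

for r in records:
    site, hash_key = r["site"], r["hash_key"]
    if site in special_sites:
        continue  # excluded sites take no part in the count
    label = f"{r['business_type']}_{r['spidercode']}"
    if label not in label_dict:
        aggregate_items[hash_key]["spidercode_at_site_num"] = 1
        label_dict[label] = {"keys": [hash_key], "websites": [site]}
    else:
        websites = label_dict[label]["websites"]
        if site not in websites:
            keys = label_dict[label]["keys"]
            for key in keys:  # bump every earlier entry for this label
                aggregate_items[key]["spidercode_at_site_num"] += 1
            keys.append(hash_key)
            websites.append(site)
            aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)

# Two sites share spidercode "sc1", so both entries report 2.
assert aggregate_items["k1"]["spidercode_at_site_num"] == 2
assert aggregate_items["k2"]["spidercode_at_site_num"] == 2
```

One small nit the commit carries over: after the `label not in label_dict` check, `label_dict.setdefault(label, conditions)` is equivalent to a plain assignment (used in the replay above), since the key is known to be absent.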
@@ -395,24 +397,6 @@ def get_detail_downloadfailnum(**kwargs):
     return count
 
 
-def get_count(document, business_type: str):
-    if business_type.title() not in ["List", "Detail"]:
-        raise ValueError("business_type")
-
-    if str(document["business_type"]).endswith(business_type):
-        return document["count"]
-    return 0
-
-
-def get_rel_count(document, business_type: str):
-    if business_type.title() not in ["List", "Detail"]:
-        raise ValueError("business_type")
-
-    if str(document["business_type"]).endswith(business_type):
-        return document["rel_count"]
-    return 0
-
-
 def main():
     summary_queue = []
     crawlers = get_crawler_basic_information()