@@ -45,6 +45,9 @@ spider_monitor = mongodb1["spider_monitor"]
 # luaconfig table
 spider_lua_config = mongodb2["luaconfig"]
 
+# special-case sites
+special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]
+
 
 def get_md5(*args, **kwargs):
     """
@@ -58,8 +61,8 @@ def get_md5(*args, **kwargs):
     conditions = ["site", "channel", "spidercode"]
     data_lst = list(filter(lambda x: x is not None, args))
     for k, v in kwargs.items():
-        if k in conditions and kwargs[k] and kwargs[k] not in data_lst:
-            data_lst.append(kwargs[k])
+        if k in conditions and (v and v not in data_lst):
+            data_lst.append(v)
 
     if not data_lst or len(data_lst) != 3:
         # raise AttributeError(f"缺少{conditions}属性")
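
For context, a minimal runnable sketch of the refactored helper: everything above the hash step comes from the hunk, while the MD5 step, the separator, and the None fallback are assumptions about the elided remainder of get_md5.

import hashlib

def get_md5(*args, **kwargs):
    conditions = ["site", "channel", "spidercode"]
    data_lst = list(filter(lambda x: x is not None, args))
    for k, v in kwargs.items():
        # the refactor: reuse the loop value v instead of re-indexing kwargs[k]
        if k in conditions and (v and v not in data_lst):
            data_lst.append(v)
    if not data_lst or len(data_lst) != 3:
        return None  # assumption: mirrors the commented-out AttributeError path
    return hashlib.md5("_".join(map(str, data_lst)).encode("utf-8")).hexdigest()

# a (site, channel, spidercode) triple always maps to the same monitoring bucket
assert get_md5(site="s", channel="c", spidercode="sp") == get_md5(site="s", channel="c", spidercode="sp")
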
@@ -187,13 +190,11 @@ def aggregate_query_crawl_count(runtime):
         spidercode = items["spidercode"]
         business_type = items["business_type"]
 
-        if site == "湖北省住房和城乡建设厅":
-            print("123")
-
-        if len(spider_item) > 1:
-            logger.warning(f"{spidercode} -> {site}--存在风险, {len(spider_item)}")
+        if len(spider_item) > 1 and site not in special_sites:
+            logger.warning(f"[Monitor]{spidercode} -> {site}--存在风险, {len(spider_item)}")
 
         is_list = str(business_type).endswith("List")
+
         hash_key = get_md5(**items)  # guard against multiple sites sharing one spidercode and overwriting each other's data
         if not hash_key:
             # logger.error(f"[Monitor]{site}-{channel}-{spidercode}--监控异常")
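
The effect of the new guard, as a hedged illustration: sites in special_sites legitimately run many channels under one spidercode, so the duplicate-risk warning is suppressed for them. spider_item is assumed to be the list of aggregation rows grouped under one spidercode; the values below are hypothetical.

special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]

site = "湖南省政府采购电子卖场"  # allowlisted site
spider_item = [{"spidercode": "demo_code"}, {"spidercode": "demo_code"}]

if len(spider_item) > 1 and site not in special_sites:
    print("would warn: spidercode reused across sites")
else:
    print("allowlisted site, warning suppressed")  # this branch runs
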
@@ -241,23 +242,24 @@ def aggregate_query_crawl_count(runtime):
 
         # monitor crawler tasks: spidercode_at_site_num > 1 means the crawler
         # task was created incorrectly; report it to the data-sourcing staff
-        label = f"{business_type}_{spidercode}"
-        if label not in label_dict:
-            aggregate_items[hash_key]["spidercode_at_site_num"] = 1
-            conditions = {"keys": [hash_key], "websites": [site]}
-            label_dict.setdefault(label, conditions)
-        else:
-            # count +1 for crawlers sharing a spidercode across different sites
-            websites = label_dict[label]["websites"]
-            if site not in websites:
-                keys = label_dict[label]["keys"]
-                for key in keys:
-                    aggregate_items[key]["spidercode_at_site_num"] += 1
-                # record the identity id - hash_key
-                keys.append(hash_key)
-                # record the site
-                websites.append(site)
-                aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)
+        if site not in special_sites:
+            label = f"{business_type}_{spidercode}"
+            if label not in label_dict:
+                aggregate_items[hash_key]["spidercode_at_site_num"] = 1
+                conditions = {"keys": [hash_key], "websites": [site]}
+                label_dict.setdefault(label, conditions)
+            else:
+                # count +1 for crawlers sharing a spidercode across different sites
+                websites = label_dict[label]["websites"]
+                if site not in websites:
+                    keys = label_dict[label]["keys"]
+                    for key in keys:
+                        aggregate_items[key]["spidercode_at_site_num"] += 1
+                    # record the identity id - hash_key
+                    keys.append(hash_key)
+                    # record the site
+                    websites.append(site)
+                    aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)
 
     return aggregate_items
 
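
A condensed, self-contained sketch of the counting scheme this hunk re-indents under the special_sites guard: label_dict groups hash keys by business_type + spidercode, and every row in a group ends up with spidercode_at_site_num equal to the number of distinct sites seen for that spidercode. The input rows and their values are hypothetical.

aggregate_items, label_dict = {}, {}
special_sites = []  # empty here so the guard always passes

rows = [  # hypothetical aggregation rows: one spidercode on two sites
    {"hash_key": "k1", "site": "site-A", "business_type": "ZbList", "spidercode": "sp01"},
    {"hash_key": "k2", "site": "site-B", "business_type": "ZbList", "spidercode": "sp01"},
]
for row in rows:
    hash_key, site = row["hash_key"], row["site"]
    aggregate_items[hash_key] = {}
    if site not in special_sites:
        label = f"{row['business_type']}_{row['spidercode']}"
        if label not in label_dict:
            aggregate_items[hash_key]["spidercode_at_site_num"] = 1
            label_dict[label] = {"keys": [hash_key], "websites": [site]}
        else:
            group = label_dict[label]
            if site not in group["websites"]:
                # bump every previously seen key in the group, then join it
                for key in group["keys"]:
                    aggregate_items[key]["spidercode_at_site_num"] += 1
                group["keys"].append(hash_key)
                group["websites"].append(site)
                aggregate_items[hash_key]["spidercode_at_site_num"] = len(group["websites"])

print(aggregate_items)  # {'k1': {'spidercode_at_site_num': 2}, 'k2': {'spidercode_at_site_num': 2}}

With special_sites populated, rows for those sites never enter label_dict, so they neither trigger nor inflate the multi-site count, which is the point of the change.
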
@@ -395,24 +397,6 @@ def get_detail_downloadfailnum(**kwargs):
     return count
 
 
-def get_count(document, business_type: str):
-    if business_type.title() not in ["List", "Detail"]:
-        raise ValueError("business_type")
-
-    if str(document["business_type"]).endswith(business_type):
-        return document["count"]
-    return 0
-
-
-def get_rel_count(document, business_type: str):
-    if business_type.title() not in ["List", "Detail"]:
-        raise ValueError("business_type")
-
-    if str(document["business_type"]).endswith(business_type):
-        return document["rel_count"]
-    return 0
-
-
 def main():
     summary_queue = []
     crawlers = get_crawler_basic_information()