
update: filter special websites

dongzhaorui 2 years ago
parent commit d3ef68909f
1 changed file with 26 additions and 42 deletions

A数据处理/sync_data/monitor_summary.py  +26 -42

@@ -45,6 +45,9 @@ spider_monitor = mongodb1["spider_monitor"]
 # luaconfig table
 spider_lua_config = mongodb2["luaconfig"]
 
+# Special websites
+special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]
+
 
 def get_md5(*args, **kwargs):
     """
@@ -58,8 +61,8 @@ def get_md5(*args, **kwargs):
     conditions = ["site", "channel", "spidercode"]
     data_lst = list(filter(lambda x: x is not None, args))
     for k, v in kwargs.items():
-        if k in conditions and kwargs[k] and kwargs[k] not in data_lst:
-            data_lst.append(kwargs[k])
+        if k in conditions and (v and v not in data_lst):
+            data_lst.append(v)
 
     if not data_lst or len(data_lst) != 3:
         # raise AttributeError(f"missing {conditions} attributes")
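
The get_md5 change is a pure readability refactor: the loop already binds each keyword value to `v`, so the repeated `kwargs[k]` lookups are dropped; behavior is unchanged. For context, a self-contained sketch of how the helper reads after this commit; the hashing step itself is an assumption, since the diff does not show it:

```python
import hashlib


def get_md5(*args, **kwargs):
    """Fingerprint a crawler by its site, channel and spidercode."""
    conditions = ["site", "channel", "spidercode"]
    data_lst = list(filter(lambda x: x is not None, args))
    for k, v in kwargs.items():
        if k in conditions and (v and v not in data_lst):
            data_lst.append(v)

    if not data_lst or len(data_lst) != 3:
        return None  # incomplete identity; the caller skips the record

    # Assumed hashing step -- the real code may normalize ordering differently.
    return hashlib.md5("".join(map(str, data_lst)).encode("utf-8")).hexdigest()
```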
@@ -187,13 +190,11 @@ def aggregate_query_crawl_count(runtime):
             spidercode = items["spidercode"]
             business_type = items["business_type"]
 
-            if site == "湖北省住房和城乡建设厅":
-                print("123")
-
-            if len(spider_item) > 1:
-                logger.warning(f"{spidercode} -> {site}--存在风险, {len(spider_item)}")
+            if len(spider_item) > 1 and site not in special_sites:
+                logger.warning(f"[Monitor]{spidercode} -> {site}--存在风险, {len(spider_item)}")
 
             is_list = str(business_type).endswith("List")
+
             hash_key = get_md5(**items)  # prevent data overlap when multiple sites map to one spidercode
             if not hash_key:
                 # logger.error(f"[Monitor]{site}-{channel}-{spidercode}--monitoring anomaly")
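
Two things happen in this hunk: a leftover debug branch (a hard-coded site check that only printed "123") is deleted, and the risk warning, whose "存在风险" message means "risk present", now skips the special sites. The new condition behaves like this standalone predicate (an illustration, not code from the commit):

```python
special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]


def should_warn(site, spider_item):
    # Warn only when one spidercode has several monitor entries
    # and the site is not on the exclusion list.
    return len(spider_item) > 1 and site not in special_sites


assert should_warn("湖北省住房和城乡建设厅", [{}, {}])
assert not should_warn("云南省政府采购网", [{}, {}])
assert not should_warn("湖北省住房和城乡建设厅", [{}])
```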
@@ -241,23 +242,24 @@ def aggregate_query_crawl_count(runtime):
 
             # Monitor crawler tasks: spidercode_at_site_num > 1 means
             # the created crawler tasks are flawed; report the issue to the data-sourcing staff
-            label = f"{business_type}_{spidercode}"
-            if label not in label_dict:
-                aggregate_items[hash_key]["spidercode_at_site_num"] = 1
-                conditions = {"keys": [hash_key], "websites": [site]}
-                label_dict.setdefault(label, conditions)
-            else:
-                # count +1 for crawlers with the same spidercode but a different site
-                websites = label_dict[label]["websites"]
-                if site not in websites:
-                    keys = label_dict[label]["keys"]
-                    for key in keys:
-                        aggregate_items[key]["spidercode_at_site_num"] += 1
-                    # record the identity id - hash_key
-                    keys.append(hash_key)
-                    # record the site
-                    websites.append(site)
-                    aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)
+            if site not in special_sites:
+                label = f"{business_type}_{spidercode}"
+                if label not in label_dict:
+                    aggregate_items[hash_key]["spidercode_at_site_num"] = 1
+                    conditions = {"keys": [hash_key], "websites": [site]}
+                    label_dict.setdefault(label, conditions)
+                else:
+                    # count +1 for crawlers with the same spidercode but a different site
+                    websites = label_dict[label]["websites"]
+                    if site not in websites:
+                        keys = label_dict[label]["keys"]
+                        for key in keys:
+                            aggregate_items[key]["spidercode_at_site_num"] += 1
+                        # record the identity id - hash_key
+                        keys.append(hash_key)
+                        # record the site
+                        websites.append(site)
+                        aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)
 
     return aggregate_items
 
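The cross-site counting itself is unchanged; it is only wrapped in the special_sites guard, so excluded sites no longer inflate spidercode_at_site_num for every entry sharing their spidercode. The bookkeeping can be replayed in isolation (hypothetical records stand in for the aggregation output):

```python
# Self-contained replay of the counting logic with hypothetical input;
# hash_key stands in for get_md5(**items).
records = [
    {"hash_key": "k1", "site": "site-A", "business_type": "xList", "spidercode": "sc1"},
    {"hash_key": "k2", "site": "site-B", "business_type": "xList", "spidercode": "sc1"},
]
special_sites = []
aggregate_items = {r["hash_key"]: {} for r in records}
label_dict = {}

for r in records:
    site, hash_key = r["site"], r["hash_key"]
    if site in special_sites:
        continue  # excluded sites take no part in the count
    label = f"{r['business_type']}_{r['spidercode']}"
    if label not in label_dict:
        aggregate_items[hash_key]["spidercode_at_site_num"] = 1
        label_dict[label] = {"keys": [hash_key], "websites": [site]}
    else:
        websites = label_dict[label]["websites"]
        if site not in websites:
            keys = label_dict[label]["keys"]
            for key in keys:  # bump every earlier entry for this label
                aggregate_items[key]["spidercode_at_site_num"] += 1
            keys.append(hash_key)
            websites.append(site)
            aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)

# Two sites share spidercode "sc1", so both entries report 2.
assert aggregate_items["k1"]["spidercode_at_site_num"] == 2
assert aggregate_items["k2"]["spidercode_at_site_num"] == 2
```

One small nit the commit carries over: after the `label not in label_dict` check, `label_dict.setdefault(label, conditions)` is equivalent to a plain assignment (used in the replay above), since the key is known to be absent.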
@@ -395,24 +397,6 @@ def get_detail_downloadfailnum(**kwargs):
     return count
 
 
-def get_count(document, business_type: str):
-    if business_type.title() not in ["List", "Detail"]:
-        raise ValueError("business_type")
-
-    if str(document["business_type"]).endswith(business_type):
-        return document["count"]
-    return 0
-
-
-def get_rel_count(document, business_type: str):
-    if business_type.title() not in ["List", "Detail"]:
-        raise ValueError("business_type")
-
-    if str(document["business_type"]).endswith(business_type):
-        return document["rel_count"]
-    return 0
-
-
 def main():
     summary_queue = []
     crawlers = get_crawler_basic_information()