@@ -45,6 +45,9 @@ spider_monitor = mongodb1["spider_monitor"]
 # luaconfig table
 spider_lua_config = mongodb2["luaconfig"]
 
+# special-case sites
+special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]
+
 
 def get_md5(*args, **kwargs):
     """
@@ -58,8 +61,8 @@ def get_md5(*args, **kwargs):
     conditions = ["site", "channel", "spidercode"]
     data_lst = list(filter(lambda x: x is not None, args))
     for k, v in kwargs.items():
-        if k in conditions and kwargs[k] and kwargs[k] not in data_lst:
-            data_lst.append(kwargs[k])
+        if k in conditions and (v and v not in data_lst):
+            data_lst.append(v)
 
     if not data_lst or len(data_lst) != 3:
         # raise AttributeError(f"缺少{conditions}属性")
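
For context, a minimal runnable sketch of the refactored helper: everything above the hash step comes from the hunk, while the MD5 step, the separator, and the None fallback are assumptions about the elided remainder of get_md5.

import hashlib

def get_md5(*args, **kwargs):
    conditions = ["site", "channel", "spidercode"]
    data_lst = list(filter(lambda x: x is not None, args))
    for k, v in kwargs.items():
        # the refactor: reuse the loop value v instead of re-indexing kwargs[k]
        if k in conditions and (v and v not in data_lst):
            data_lst.append(v)
    if not data_lst or len(data_lst) != 3:
        return None  # assumption: mirrors the commented-out AttributeError path
    return hashlib.md5("_".join(map(str, data_lst)).encode("utf-8")).hexdigest()

# a (site, channel, spidercode) triple always maps to the same monitoring bucket
assert get_md5(site="s", channel="c", spidercode="sp") == get_md5(site="s", channel="c", spidercode="sp")
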
@@ -187,13 +190,11 @@ def aggregate_query_crawl_count(runtime):
         spidercode = items["spidercode"]
         business_type = items["business_type"]
 
-        if site == "湖北省住房和城乡建设厅":
-            print("123")
-
-        if len(spider_item) > 1:
-            logger.warning(f"{spidercode} -> {site}--存在风险, {len(spider_item)}")
+        if len(spider_item) > 1 and site not in special_sites:
+            logger.warning(f"[Monitor]{spidercode} -> {site}--存在风险, {len(spider_item)}")
 
         is_list = str(business_type).endswith("List")
+
         hash_key = get_md5(**items)  # guard against multiple sites sharing one spidercode and overwriting each other's data
         if not hash_key:
             # logger.error(f"[Monitor]{site}-{channel}-{spidercode}--监控异常")
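
The effect of the new guard, as a hedged illustration: sites in special_sites legitimately run many channels under one spidercode, so the duplicate-risk warning is suppressed for them. spider_item is assumed to be the list of aggregation rows grouped under one spidercode; the values below are hypothetical.

special_sites = ["云南省政府采购网", "湖南省政府采购电子卖场"]

site = "湖南省政府采购电子卖场"  # allowlisted site
spider_item = [{"spidercode": "demo_code"}, {"spidercode": "demo_code"}]

if len(spider_item) > 1 and site not in special_sites:
    print("would warn: spidercode reused across sites")
else:
    print("allowlisted site, warning suppressed")  # this branch runs
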
@@ -241,23 +242,24 @@ def aggregate_query_crawl_count(runtime):
 
         # monitor crawler tasks: spidercode_at_site_num > 1 means the crawler
         # task was created incorrectly; report it to the data-sourcing staff
-        label = f"{business_type}_{spidercode}"
-        if label not in label_dict:
-            aggregate_items[hash_key]["spidercode_at_site_num"] = 1
-            conditions = {"keys": [hash_key], "websites": [site]}
-            label_dict.setdefault(label, conditions)
-        else:
-            # count +1 for crawlers sharing a spidercode across different sites
-            websites = label_dict[label]["websites"]
-            if site not in websites:
-                keys = label_dict[label]["keys"]
-                for key in keys:
-                    aggregate_items[key]["spidercode_at_site_num"] += 1
-                # record the identity id - hash_key
-                keys.append(hash_key)
-                # record the site
-                websites.append(site)
-                aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)
+        if site not in special_sites:
+            label = f"{business_type}_{spidercode}"
+            if label not in label_dict:
+                aggregate_items[hash_key]["spidercode_at_site_num"] = 1
+                conditions = {"keys": [hash_key], "websites": [site]}
+                label_dict.setdefault(label, conditions)
+            else:
+                # count +1 for crawlers sharing a spidercode across different sites
+                websites = label_dict[label]["websites"]
+                if site not in websites:
+                    keys = label_dict[label]["keys"]
+                    for key in keys:
+                        aggregate_items[key]["spidercode_at_site_num"] += 1
+                    # record the identity id - hash_key
+                    keys.append(hash_key)
+                    # record the site
+                    websites.append(site)
+                    aggregate_items[hash_key]["spidercode_at_site_num"] = len(websites)
 
     return aggregate_items
 
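
A condensed, self-contained sketch of the counting scheme this hunk re-indents under the special_sites guard: label_dict groups hash keys by business_type + spidercode, and every row in a group ends up with spidercode_at_site_num equal to the number of distinct sites seen for that spidercode. The input rows and their values are hypothetical.

aggregate_items, label_dict = {}, {}
special_sites = []  # empty here so the guard always passes

rows = [  # hypothetical aggregation rows: one spidercode on two sites
    {"hash_key": "k1", "site": "site-A", "business_type": "ZbList", "spidercode": "sp01"},
    {"hash_key": "k2", "site": "site-B", "business_type": "ZbList", "spidercode": "sp01"},
]
for row in rows:
    hash_key, site = row["hash_key"], row["site"]
    aggregate_items[hash_key] = {}
    if site not in special_sites:
        label = f"{row['business_type']}_{row['spidercode']}"
        if label not in label_dict:
            aggregate_items[hash_key]["spidercode_at_site_num"] = 1
            label_dict[label] = {"keys": [hash_key], "websites": [site]}
        else:
            group = label_dict[label]
            if site not in group["websites"]:
                # bump every previously seen key in the group, then join it
                for key in group["keys"]:
                    aggregate_items[key]["spidercode_at_site_num"] += 1
                group["keys"].append(hash_key)
                group["websites"].append(site)
                aggregate_items[hash_key]["spidercode_at_site_num"] = len(group["websites"])

print(aggregate_items)  # {'k1': {'spidercode_at_site_num': 2}, 'k2': {'spidercode_at_site_num': 2}}

With special_sites populated, rows for those sites never enter label_dict, so they neither trigger nor inflate the multi-site count, which is the point of the change.
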
@@ -395,24 +397,6 @@ def get_detail_downloadfailnum(**kwargs):
     return count
 
 
-def get_count(document, business_type: str):
-    if business_type.title() not in ["List", "Detail"]:
-        raise ValueError("business_type")
-
-    if str(document["business_type"]).endswith(business_type):
-        return document["count"]
-    return 0
-
-
-def get_rel_count(document, business_type: str):
-    if business_type.title() not in ["List", "Detail"]:
-        raise ValueError("business_type")
-
-    if str(document["business_type"]).endswith(business_type):
-        return document["rel_count"]
-    return 0
-
-
 def main():
     summary_queue = []
     crawlers = get_crawler_basic_information()