@@ -17,8 +17,7 @@ from crawler.utils import (
     iter_node,
     check_page_by_words,
     predict_bidding_model,
-    is_contains,
-    compress_str
+    is_contains
 )
 from settings import Dzr
 
@@ -78,22 +77,14 @@ class DataExcavate(BasicService):
             is_attachment_url,  # 判断是否附件下载地址
             is_login_url,  # 判断是否登录地址
             self.validator.data(url),  # 垃圾池 - 判重任务请求网址
-            self.validator.data(domain),  # 垃圾池 - 过滤域名(一级)
-            self.collector.data(domain),  # 收录池 - 判重域名(一级)
+            self.validator.data(domain),  # 垃圾池 - 过滤域名
+            curr_depth > 1 and self.collector.data(domain),  # 收录池 - 判重域名(一级)[种子不与已收录判重]
         ]):
             logger.debug(f'<{self.thread_name}> - 无效任务 - {curr_depth} - {url}')
             return True
 
         return False
 
-    def filter_data(self, lst):
-        """通过垃圾过滤器过滤数据"""
-        results = []
-        for val in lst:
-            if not self.validator.data(val):
-                results.append(val)
-        return results
-
     def same_origin_strategy(self, source, task: Task):
         """同源策略"""
         # 排查时间文本
@@ -105,6 +96,7 @@ class DataExcavate(BasicService):
             if pt and not node.getchildren() and node not in date_nodes:
                 date_nodes.append(node)
                 hit_date_total += 1
+
         # 全文排查检索词
         hit_text_total = 0
         all_text = ["".join(text.split()) for text in element.itertext()]
@@ -113,10 +105,16 @@ class DataExcavate(BasicService):
             # print(text)
             if check_page_by_words(text):
                 hit_text_total += 1
+
         # 寻源结果
         if all([3 < hit_date_total < 50, hit_text_total > 3]):
+            if hit_text_total < 5:
+                # 关键词计数为3-5网站 标记为2
+                task["hit5"] = 2
             self.push_domain(task)
         elif hit_text_total > 5:
+            # 关键词计数为5以上网站 标记为1
+            task["hit5"] = 1
             self.push_domain(task)
         else:
             self.push_remove(task)