1 年之前 · e00949aa4c
--- a/find_source/crawler/services/excavate.py
+++ b/find_source/crawler/services/excavate.py
@@ -17,8 +17,7 @@ from crawler.utils import (
 
				     iter_node,
			
 
				     check_page_by_words,
			
 
				     predict_bidding_model,
			
 
				-    is_contains,
			
 
				-    compress_str
			
 
				+    is_contains
			
 
				 )
			
 
				 from settings import Dzr
			
 
				 
			
@@ -78,22 +77,14 @@ class DataExcavate(BasicService):
 
				             is_attachment_url,  # 判断是否附件下载地址
			
 
				             is_login_url,  # 判断是否登录地址
			
 
				             self.validator.data(url),  # 垃圾池 - 判重任务请求网址
			
 
				-            self.validator.data(domain),  # 垃圾池 - 过滤域名（一级）
			
 
				-            self.collector.data(domain),  # 收录池 - 判重域名（一级）
			
 
				+            self.validator.data(domain),  # 垃圾池 - 过滤域名
			
 
				+            curr_depth > 1 and self.collector.data(domain),  # 收录池 - 判重域名（一级）[种子不与已收录判重]
			
 
				         ]):
			
 
				             logger.debug(f'<{self.thread_name}> - 无效任务 - {curr_depth} - {url}')
			
 
				             return True
			
 
				 
			
 
				         return False
			
 
				 
			
 
				-    def filter_data(self, lst):
			
 
				-        """通过垃圾过滤器过滤数据"""
			
 
				-        results = []
			
 
				-        for val in lst:
			
 
				-            if not self.validator.data(val):
			
 
				-                results.append(val)
			
 
				-        return results
			
 
				-
			
 
				     def same_origin_strategy(self, source, task: Task):
			
 
				         """同源策略"""
			
 
				         # 排查时间文本
			
@@ -105,6 +96,7 @@ class DataExcavate(BasicService):
 
				             if pt and not node.getchildren() and node not in date_nodes:
			
 
				                 date_nodes.append(node)
			
 
				                 hit_date_total += 1
			
 
				+
			
 
				         # 全文排查检索词
			
 
				         hit_text_total = 0
			
 
				         all_text = ["".join(text.split()) for text in element.itertext()]
			
@@ -113,10 +105,16 @@ class DataExcavate(BasicService):
 
				             # print(text)
			
 
				             if check_page_by_words(text):
			
 
				                 hit_text_total += 1
			
 
				+
			
 
				         # 寻源结果
			
 
				         if all([3 < hit_date_total < 50, hit_text_total > 3]):
			
 
				+            if hit_text_total < 5:
			
 
				+                # 关键词计数为3-5网站 标记为2
			
 
				+                task["hit5"] = 2
			
 
				             self.push_domain(task)
			
 
				         elif hit_text_total > 5:
			
 
				+            # 关键词计数为5以上网站   标记为1
			
 
				+            task["hit5"] = 1
			
 
				             self.push_domain(task)
			
 
				         else:
			
 
				             self.push_remove(task)
			
--- a/find_source/crawler/utils.py
+++ b/find_source/crawler/utils.py
@@ -54,8 +54,11 @@ def extract_domain(url):
 
				     # >>> extract_domain('http://forums.bbc.co.uk')
			
 
				     'bbc.co.uk'
			
 
				     """
			
 
				-    ext = tldextract.extract(url)
			
 
				-    return ext.registered_domain or ext.ipv4
			
 
				+    # ext = tldextract.extract(url)
			
 
				+    # return ext.registered_domain or ext.ipv4
			
 
				+
			
 
				+    _, h, p = get_host(url)
			
 
				+    return f"{h}:{p}" if p else h  # 域名判重时使用全称
			
 
				 
			
 
				 
			
 
				 def extract_fqdn(url):