@@ -17,8 +17,7 @@ from crawler.utils import (
     iter_node,
     check_page_by_words,
     predict_bidding_model,
-    is_contains,
-    compress_str
+    is_contains
 )
 from settings import Dzr
 
@@ -78,22 +77,14 @@ class DataExcavate(BasicService):
             is_attachment_url,  # 判断是否附件下载地址
             is_login_url,  # 判断是否登录地址
             self.validator.data(url),  # 垃圾池 - 判重任务请求网址
-            self.validator.data(domain),  # 垃圾池 - 过滤域名(一级)
-            self.collector.data(domain),  # 收录池 - 判重域名(一级)
+            self.validator.data(domain),  # 垃圾池 - 过滤域名
+            curr_depth > 1 and self.collector.data(domain),  # 收录池 - 判重域名(一级)[种子不与已收录判重]
         ]):
             logger.debug(f'<{self.thread_name}> - 无效任务 - {curr_depth} - {url}')
             return True
 
         return False
 
-    def filter_data(self, lst):
-        """通过垃圾过滤器过滤数据"""
-        results = []
-        for val in lst:
-            if not self.validator.data(val):
-                results.append(val)
-        return results
-
     def same_origin_strategy(self, source, task: Task):
         """同源策略"""
         # 排查时间文本
@@ -105,6 +96,7 @@ class DataExcavate(BasicService):
             if pt and not node.getchildren() and node not in date_nodes:
                 date_nodes.append(node)
                 hit_date_total += 1
+
         # 全文排查检索词
         hit_text_total = 0
         all_text = ["".join(text.split()) for text in element.itertext()]
@@ -113,10 +105,16 @@ class DataExcavate(BasicService):
             # print(text)
             if check_page_by_words(text):
                 hit_text_total += 1
+
         # 寻源结果
         if all([3 < hit_date_total < 50, hit_text_total > 3]):
+            if hit_text_total < 5:
+                # 关键词计数为3-5网站 标记为2
+                task["hit5"] = 2
             self.push_domain(task)
         elif hit_text_total > 5:
+            # 关键词计数为5以上网站 标记为1
+            task["hit5"] = 1
             self.push_domain(task)
         else:
             self.push_remove(task)