dongzhaorui 3 年之前
父节点
当前提交
95bbdb69f0
共有 1 个文件被更改,包括 23 次插入和 1 次删除
  1. 23 1
      find_source/crawler/services/data_excavate.py

+ 23 - 1
find_source/crawler/services/data_excavate.py

@@ -8,9 +8,12 @@ from crawler.utils import (
     extract_base_url,
     extract_page_title,
     extract_domain,
+    parser_domain,
     err_details,
 )
 
+TLDS = ['com', 'cn']
+
 
 class DataExcavate(BasicSearch):
 
@@ -19,6 +22,24 @@ class DataExcavate(BasicSearch):
         self._interval = loop_interval
         self._workers = workers
 
+    def is_rubbish(self, url: str):
+        if self.validator.data(url):
+            return True
+
+        domain = extract_domain(url)
+        if self.validator.data(domain):
+            return True
+
+        if domain.startswith('www.'):
+            domain = domain.replace('www.', '')
+
+        domain_lst = parser_domain(domain)
+        domain_lst = [d for d in domain_lst if d not in TLDS]
+        for val in domain_lst:
+            if self.validator.data(val):
+                return True
+        return False
+
     def retrieve_site(self, task: Task):
         t_name = threading.currentThread().getName()
         logger.info(f'[{t_name}]开始请求 - {task["url"]}')
@@ -67,7 +88,8 @@ class DataExcavate(BasicSearch):
                 continue
 
             task_key, task = tasks
-            if self.validator.data(task['url']):
+            if self.is_rubbish(task['url']):
+                logger.info(f'[{t_name}]过滤网址 - {task["url"]}')
                 continue
             '''挖掘站点'''
             success = self.retrieve_site(task)