dongzhaorui 3 years ago
parent commit 071ff03ba8
1 changed file with 24 additions and 11 deletions

+24 -11  find_source/crawler/services/excavate.py

@@ -1,3 +1,4 @@
+import re
 import threading
 from concurrent.futures import ThreadPoolExecutor, wait
 from typing import List
@@ -14,6 +15,8 @@ from crawler.utils import (
 )
 
 TLDS = ['com', 'cn', 'net', 'org']
+URL_SUFFIX = ['pdf', 'xls', 'xlsx', 'docx', 'doc', 'rar', 'zip']
+URL_SUFFIX_PATTERN = r'.*\.(' + '|'.join(URL_SUFFIX) + r')$'  # require a literal dot before the extension
 
 
 class DataExcavate(BasicSearch):
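
One caveat on the new suffix filter: a bare '.*(pdf|...)$' pattern matches any URL that merely ends with one of those letter sequences, so a page whose path happens to end in "doc" would be skipped too. The pattern above therefore escapes the dot. A standalone sanity check of the corrected pattern (plain re, nothing from the project):

import re

URL_SUFFIX = ['pdf', 'xls', 'xlsx', 'docx', 'doc', 'rar', 'zip']
URL_SUFFIX_PATTERN = r'.*\.(' + '|'.join(URL_SUFFIX) + r')$'

# file downloads are rejected ...
assert re.match(URL_SUFFIX_PATTERN, 'http://example.com/files/report.pdf')
assert re.match(URL_SUFFIX_PATTERN, 'http://example.com/notice.xlsx')
# ... while ordinary pages, even ones ending in "doc", are kept
assert not re.match(URL_SUFFIX_PATTERN, 'http://example.com/zhaobiaodoc')
assert not re.match(URL_SUFFIX_PATTERN, 'http://example.com/index.html')
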
@@ -87,26 +90,33 @@ class DataExcavate(BasicSearch):
         items = self.parser.site_items(page_source, task['base_url'])
         lst = []
         _c = 0  # counter for names that pass the word filter
+        sub_depth = task['depth'] + 1
+        sub_weight = task['weight'] + 1
         for item in items:
             name, url = item['name'], item['host']
             if self.validator.words(name):
                 lst.append(self.make_task(
                     url=url,
                     name=name,
-                    depth=task['depth'] + 1,
+                    depth=sub_depth,
                     origin=task['origin'],
                     groups=task['groups'],
                     classify=self.visit_classify,
-                    weight=task['weight'] + 1
+                    weight=sub_weight
                 ))
                 _c += 1
 
         if _c > 1:
-            self.push_domain(task)
+            save = self.push_domain(task)
         else:
-            self.push_remove(task)
+            save = self.push_remove(task)
+
+        msg = f'<{t_name}> - new url - {task["url"]}'
+        if not save:
+            msg = f'<{t_name}> - duplicate url - {task["url"]}'
+        logger.debug(msg)
         '''deeper levels are collected first'''
-        self.scheduler.add_excavate(lst, level=task['weight'] + 1)
+        self.scheduler.add_excavate(lst, level=sub_weight)
         return True
 
     def excavate(self):
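
Two behavioural changes land in this hunk: push_domain/push_remove now report (via their return value, assumed boolean here) whether the URL was newly stored, which selects the "new url" vs "duplicate url" debug line; and child tasks are queued at level sub_weight, so deeper pages drain first. A toy sketch of that priority rule, assuming the scheduler behaves like a max-priority queue keyed on level (heapq stand-in; class and names hypothetical):

import heapq

class ToyScheduler:
    '''Illustration only: a max-priority queue keyed on level,
    standing in for scheduler.add_excavate and its backend.'''

    def __init__(self):
        self._heap = []
        self._seq = 0  # tie-breaker keeps FIFO order within one level

    def add_excavate(self, tasks, level):
        for task in tasks:
            # heapq is a min-heap, so negate level to pop deepest first
            heapq.heappush(self._heap, (-level, self._seq, task))
            self._seq += 1

    def pop(self):
        return heapq.heappop(self._heap)[2]

q = ToyScheduler()
q.add_excavate(['shallow page, weight 1'], level=1)
q.add_excavate(['deep page, weight 3'], level=3)
assert q.pop() == 'deep page, weight 3'  # deeper task is collected first
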
@@ -128,14 +138,17 @@ class DataExcavate(BasicSearch):
             '''depth control'''
             if task['depth'] > self._max_depth:
                 logger.debug(f'<{t_name}> - depth limit exceeded - {task["url"]}')
+                self.push_records(task)
                 continue
 
-            try:
-                success = self.process(t_name, task)
-                if not success:
-                    self.validator.add_data(task['url'])
-            except Exception as e:
-                logger.exception(e)
+            dont_visit = re.match(URL_SUFFIX_PATTERN, task['url']) is not None  # skip file downloads
+            if not dont_visit:
+                try:
+                    success = self.process(t_name, task)
+                    if not success:
+                        self.validator.add_data(task['url'])
+                except Exception as e:
+                    logger.exception(e)
             # '''excavation record'''
             # self.push_records(task)
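
The reworked loop now refuses to fetch binary documents at all: URLs matching URL_SUFFIX_PATTERN are neither processed nor marked as failed in the validator. Two cases the anchored match on the raw URL still misses are upper-case extensions and trailing query strings; a possible hardening (not part of this commit; the dont_visit helper is hypothetical) matches on the lower-cased path instead:

import re
from urllib.parse import urlsplit

URL_SUFFIX = ['pdf', 'xls', 'xlsx', 'docx', 'doc', 'rar', 'zip']
URL_SUFFIX_PATTERN = r'.*\.(' + '|'.join(URL_SUFFIX) + r')$'

def dont_visit(url):
    '''True when the URL points at a file download rather than a page.
    Matching on the lower-cased path also catches "REPORT.PDF" and
    "file.pdf?download=1", which an anchored match on the raw URL misses.'''
    path = urlsplit(url).path.lower()
    return re.match(URL_SUFFIX_PATTERN, path) is not None

assert dont_visit('http://example.com/REPORT.PDF')
assert dont_visit('http://example.com/file.pdf?download=1')
assert not dont_visit('http://example.com/notice.html')
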