dongzhaorui 3 years ago
parent
commit
fd5f8e30d5
1 file changed with 17 additions and 14 deletions

+ 17 - 14
find_source/crawler/services/excavate.py

@@ -18,11 +18,11 @@ TLDS = ['com', 'cn', 'net', 'org']
 
 class DataExcavate(BasicSearch):
 
-    def __init__(self, workers=1, loop_interval=60, **kwargs):
+    def __init__(self, **kwargs):
+        self._workers = (kwargs.pop('workers', None) or 1)
+        self._max_depth = (kwargs.pop('excavate_depth', None) or 1)
+        self._interval = (kwargs.pop('excavate_interval', None) or 60)
         super(DataExcavate, self).__init__(**kwargs)
-        self._interval = loop_interval
-        self._workers = workers
-        self._max_depth = (kwargs.pop('excavate_depth', None) or 3)
         self._default_depth = 1
 
     def _init_depth(self, task: Task):
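
Note on the hunk above: the constructor now pulls every setting out of **kwargs with `kwargs.pop(key, None) or default` and does so before calling super(), so the consumed keys never reach BasicSearch (previously `excavate_depth` was popped after the super() call, and the default depth drops from 3 to 1). A minimal sketch of the pattern, with a stand-in base class since BasicSearch's signature is not shown in this diff:

    class BasicSearch:
        def __init__(self, **kwargs):
            # Stand-in base class for illustration only.
            self.settings = kwargs

    class DataExcavate(BasicSearch):
        def __init__(self, **kwargs):
            # Pop this class's own settings first so they never reach the
            # base class. Caveat of `pop(..., None) or default`: falsy
            # values are replaced too, e.g. workers=0 becomes 1.
            self._workers = kwargs.pop('workers', None) or 1
            self._max_depth = kwargs.pop('excavate_depth', None) or 1
            self._interval = kwargs.pop('excavate_interval', None) or 60
            super().__init__(**kwargs)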
@@ -68,10 +68,9 @@ class DataExcavate(BasicSearch):
                 return True
         return False
 
-    def process(self, task: Task):
-        t_name = threading.currentThread().getName()
+    def process(self, t_name: str, task: Task):
         logger.info(f'<{t_name}> - request - {task["url"]}')
-        response = self.downloader.get(task['url'])
+        response = self.downloader.get(task['url'], disable_debug_log=False)
         status_code = response.status_code
         page_source = response.text
         reason = response.reason
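
This hunk moves thread-name resolution out of process(): callers now pass t_name in, and the removed threading.currentThread().getName() is in any case a deprecated spelling of threading.current_thread().name. A sketch of the calling side under that assumption (the worker loop here is illustrative, not the project's actual code):

    import threading

    def run_worker(excavator):
        # Resolve the name once per worker thread and hand it down, so
        # process() itself no longer needs to import threading.
        t_name = threading.current_thread().name
        while True:
            task = excavator.scheduler.get_excavate()  # assumed accessor
            excavator.process(t_name, task)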
@@ -94,19 +93,20 @@ class DataExcavate(BasicSearch):
                 lst.append(self.make_task(
                     url=url,
                     name=name,
+                    depth=task['depth'] + 1,
                     origin=task['origin'],
                     groups=task['groups'],
                     classify=self.visit_classify,
-                    weight=task['weight']
+                    weight=task['weight'] + 1
                 ))
                 _c += 1
 
         if _c > 1:
-            if self.push_domain(task):
-                lst = self.check_depth(task, lst)
+            self.push_domain(task)
         else:
             self.push_remove(task)
-        self.scheduler.add_excavate(lst, level=task['weight'])
+        '''deeper levels are collected first'''
+        self.scheduler.add_excavate(lst, level=task['weight'] + 1)
         return True
 
     def excavate(self):
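
In the hunk above, each child task is created with depth=task['depth'] + 1 and weight=task['weight'] + 1, and the whole batch is queued at level=task['weight'] + 1, which is what the new comment means by collecting deeper levels first. A minimal sketch of a queue with that "larger level pops first" behaviour, assuming add_excavate is priority-based (names and internals here are illustrative, not the project's scheduler API):

    import heapq
    import itertools

    class PrioritySchedulerSketch:
        def __init__(self):
            self._heap = []
            self._tie = itertools.count()  # tiebreaker so task dicts never compare

        def add_excavate(self, tasks, level):
            # heapq is a min-heap, so negate the level: tasks queued with
            # a larger level (deeper in the crawl) are popped first.
            for task in tasks:
                heapq.heappush(self._heap, (-level, next(self._tie), task))

        def get_excavate(self):
            return heapq.heappop(self._heap)[-1]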
@@ -125,11 +125,14 @@ class DataExcavate(BasicSearch):
                 logger.debug(f'<{t_name}> - junk data - {task["url"]}')
                 continue
 
-            '''data mining'''
+            '''depth control'''
+            if task['depth'] > self._max_depth:
+                logger.debug(f'<{t_name}> - depth limit exceeded - {task["url"]}')
+                continue
+
             try:
-                success = self.process(task)
+                success = self.process(t_name, task)
                 if not success:
-                    '''url - add to filter'''
                     self.validator.add_data(task['url'])
             except Exception as e:
                 logger.exception(e)
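
The last hunk adds a depth gate ahead of the network request: any task whose depth exceeds _max_depth is logged and skipped, which bounds the height of the crawl tree. Since process() enqueues children at depth + 1, the new default of _max_depth = 1 means only seed pages get fetched. A tiny sketch of the check:

    def within_depth(task, max_depth):
        # Children are created with depth = parent depth + 1, so this
        # predicate caps how many levels of links are followed.
        return task['depth'] <= max_depth

    # e.g. seeds enter at depth 1; with max_depth=1 their children
    # (depth 2) are skipped before any download happens.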