dongzhaorui committed 3 years ago
parent commit 087fe1d3f4
1 changed file with 18 additions and 15 deletions

+ 18 - 15
find_source/crawler/services/excavate.py

@@ -78,7 +78,7 @@ class DataExcavate(BasicService):
 
     def process(self, t_name: str, task: Task):
        logger.info(f'<{t_name}> - requesting - {task["url"]}')
-        response = self.downloader.get(task['url'], disable_debug_log=False)
+        response = self.downloader.get(task['url'])
         status_code = response.status_code
         page_source = response.text
         reason = response.reason
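
The context lines read the response the way a requests.Response is read (status_code, text, reason). A minimal sketch of that fetch step, assuming the downloader wraps the requests library; fetch_page and its fallback return values are illustrative, not the project's API:

    import requests

    def fetch_page(url: str, timeout: float = 10.0):
        """Fetch a page and return (status_code, text, reason)."""
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Network failures are surfaced as a pseudo status code.
            return -1, '', str(exc)
        return response.status_code, response.text, response.reason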
@@ -92,23 +92,26 @@ class DataExcavate(BasicService):
         task['name'] = extract_page_title(page_source)
         task['base_url'] = extract_base_url(task['url'])
 
-        items = self.parser.site_items(page_source, task['base_url'])
         lst = []
+        _history = []
        _c = 0  # counter for link names that pass the word filter
         sub_depth = task['depth'] + 1
         sub_weight = task['weight'] + 1
+        items = self.parser.site_items(page_source, task['base_url'])
         for item in items:
             name, url = item['name'], item['host']
             if self.validator.words(name):
-                lst.append(self.make_task(
-                    url=url,
-                    name=name,
-                    depth=sub_depth,
-                    origin=task['origin'],
-                    groups=task['groups'],
-                    classify=self.visit_classify,
-                    weight=sub_weight
-                ))
+                if url not in _history:
+                    lst.append(self.make_task(
+                        url=url,
+                        name=name,
+                        depth=sub_depth,
+                        origin=task['origin'],
+                        groups=task['groups'],
+                        classify=self.visit_classify,
+                        weight=sub_weight
+                    ))
+                    _history.append(url)
                 _c += 1
 
         if _c > 1:
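
The added _history list suppresses duplicate URLs found on the same page, while _c still counts every name that passes the word filter, duplicates included. Membership tests on a Python list are O(n); a set gives the same behaviour in O(1). A self-contained sketch of the loop's logic, with passes_word_filter standing in for self.validator.words:

    def collect_unique(items, passes_word_filter):
        """Count every filter hit, but queue each URL only once."""
        seen = set()  # O(1) lookups; the patch itself uses a list
        tasks, hits = [], 0
        for item in items:
            name, url = item['name'], item['host']
            if passes_word_filter(name):
                hits += 1
                if url not in seen:
                    seen.add(url)
                    tasks.append({'name': name, 'url': url})
        return tasks, hits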
@@ -116,9 +119,9 @@ class DataExcavate(BasicService):
         else:
             save = self.push_remove(task)
 
-        msg = f'<{t_name}> - new URL - {task["url"]}'
+        msg = f'<{t_name}> - saved URL - {task["url"]}'
         if not save:
-            msg = f'<{t_name}> - duplicate URL - {task["url"]}'
+            msg = f'<{t_name}> - discarded URL - {task["url"]}'
         logger.debug(msg)
        '''deeper pages are collected first'''
         self.scheduler.add_excavate(lst, level=sub_weight)
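
Per the docstring above, tasks are queued with level=sub_weight so deeper pages are pulled first. A minimal sketch of weight-ordered scheduling backed by a heap; ExcavateQueue is illustrative and the project's scheduler may differ:

    import heapq

    class ExcavateQueue:
        """Pop highest-level (deepest) tasks first via a min-heap on -level."""

        def __init__(self):
            self._heap = []
            self._seq = 0  # tie-breaker keeps same-level tasks in insertion order

        def add(self, tasks, level):
            for task in tasks:
                heapq.heappush(self._heap, (-level, self._seq, task))
                self._seq += 1

        def pop(self):
            if not self._heap:
                return None
            return heapq.heappop(self._heap)[2]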
@@ -130,7 +133,7 @@ class DataExcavate(BasicService):
         while True:
             tasks = self.scheduler.get_excavate_task()
             if len(tasks) == 0:
-                self.loops_interval(self._interval)
+                self.loops_interval(self._interval, enable_debug_log=True)
                 continue
 
             task_key, task = tasks
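
The run loop polls the scheduler and idles for self._interval whenever no task is available. A stripped-down sketch of that pattern, assuming pop() returns None on an empty queue (names are illustrative):

    import time

    def run(queue, process, interval: float = 5.0):
        """Poll for tasks; sleep between polls while the queue is empty."""
        while True:
            task = queue.pop()
            if task is None:
                time.sleep(interval)  # mirrors self.loops_interval(self._interval)
                continue
            process(task)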
@@ -143,7 +146,7 @@ class DataExcavate(BasicService):
            '''depth control'''
             if task['depth'] > self._max_depth:
                logger.debug(f'<{t_name}> - depth limit exceeded - {task["url"]}')
-                self.push_records(task)
+                # self.push_records(task)
                 continue
 
             dont_visit = re.match(URL_SUFFIX_PATTERN, task['url']) is not None
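
The depth check and the URL_SUFFIX_PATTERN match together decide whether a task is visited at all. A self-contained sketch of both guards; the suffix pattern below is a guessed example for filtering static or binary resources, the project's actual pattern may differ:

    import re

    # Guessed example: skip URLs that point at static or binary resources.
    URL_SUFFIX_PATTERN = re.compile(
        r'.*\.(?:jpg|jpeg|png|gif|css|js|pdf|zip|rar|exe)(?:\?.*)?$',
        re.IGNORECASE,
    )

    def should_visit(task, max_depth: int) -> bool:
        if task['depth'] > max_depth:
            return False  # depth limit exceeded
        if re.match(URL_SUFFIX_PATTERN, task['url']) is not None:
            return False  # matches a non-page resource, skip it
        return True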