dongzhaorui, 3 years ago
parent commit fca3867638
1 changed file with 13 additions and 10 deletions
  1. find_source/crawler/services/excavate.py  +13 -10

find_source/crawler/services/excavate.py  (+13 -10)

@@ -7,7 +7,7 @@ from common.log import logger
 from crawler.Task import Task
 from crawler.services.basics import BasicService
 from crawler.utils import (
-    extract_base_url,
+    extract_host,
     extract_page_title,
     extract_domain,
     split_domain,
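
The only change in this first hunk is the import swap from extract_base_url to extract_host. The helper itself lives in crawler/utils.py and is not part of this diff; a minimal sketch of what it could look like, assuming it simply reduces a URL to its scheme and host:

    from urllib.parse import urlsplit

    def extract_host(url: str) -> str:
        # Assumed behaviour (not taken from this repository): keep only the
        # scheme and network location, e.g. 'https://example.com/a?b=1' -> 'https://example.com'
        parts = urlsplit(url)
        return f'{parts.scheme}://{parts.netloc}'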
@@ -90,16 +90,16 @@ class DataExcavate(BasicService):
 
         task['domain'] = extract_domain(task['url'])
         task['name'] = extract_page_title(page_source)
-        task['base_url'] = extract_base_url(task['url'])
+        task['base_url'] = extract_host(task['url'])
 
         lst = []
         _history = []
         _c = 0  # filter-word counter
         sub_depth = task['depth'] + 1
         sub_weight = task['weight'] + 1
-        items = self.parser.site_items(page_source, task['base_url'])
+        items = self.parser.non_origin(page_source, task['url'])
         for item in items:
-            name, url = item['name'], item['host']
+            name, url = item['title'], item['href']
             if self.validator.words(name):
                 if url not in _history:
                     lst.append(self.make_task(
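
In this hunk the parser call moves from site_items(page_source, task['base_url']) to non_origin(page_source, task['url']), and each returned item is now read through the keys title/href instead of name/host. The parser class itself is not shown in this diff; a sketch of the assumed contract, collecting only links whose host differs from the page being excavated:

    from urllib.parse import urljoin, urlsplit
    from lxml import html as lxml_html

    def non_origin(page_source: str, url: str) -> list:
        # Assumed contract (not taken from this repository): return
        # [{'title': ..., 'href': ...}] for <a> tags that point to a host other
        # than the one the page itself was fetched from.
        origin = urlsplit(url).netloc
        items = []
        for node in lxml_html.fromstring(page_source).xpath('//a[@href]'):
            href = urljoin(url, node.get('href'))
            if urlsplit(href).netloc and urlsplit(href).netloc != origin:
                items.append({'title': node.text_content().strip(), 'href': href})
        return items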
@@ -116,12 +116,15 @@ class DataExcavate(BasicService):
 
         if _c > 1:
             save = self.push_domain(task)
+            msg = f'<{t_name}> - 收录成功 - {task["url"]}'
+            if not save:
+                msg = f'<{t_name}> - 重复收录 - {task["url"]}'
         else:
-            save = self.push_remove(task)
+            remove = self.push_remove(task)
+            msg = f'<{t_name}> - 过滤丢弃 - {task["url"]}'
+            if not remove:
+                msg = f'<{t_name}> - 重复收录 - {task["url"]}'
 
-        msg = f'<{t_name}> - 保存网址 - {task["url"]}'
-        if not save:
-            msg = f'<{t_name}> - 丢弃网址 - {task["url"]}'
         logger.debug(msg)
         '''deeper levels are collected first'''
         self.scheduler.add_excavate(lst, level=sub_weight)
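
The rewritten branch now distinguishes three outcomes in the log message: 收录成功 (pushed to the domain collection), 重复收录 (already collected, i.e. push_domain or push_remove returned falsy) and 过滤丢弃 (the filter-word counter _c did not exceed 1), replacing the old generic 保存网址 ('URL saved') / 丢弃网址 ('URL discarded') pair. The collected sub-tasks are then pushed back with level=sub_weight so that deeper pages are excavated first. The scheduler is not part of this diff; a minimal sketch of a priority queue with that behaviour, assuming a higher level should be popped earlier:

    import heapq

    class ExcavateScheduler:
        # Sketch of the assumed scheduler contract (not taken from this repository):
        # tasks added with a higher level are handed out before lower-level ones.
        def __init__(self):
            self._heap = []
            self._counter = 0  # tie-breaker so equal levels keep insertion order

        def add_excavate(self, tasks, level):
            for task in tasks:
                heapq.heappush(self._heap, (-level, self._counter, task))
                self._counter += 1

        def get_excavate(self):
            return heapq.heappop(self._heap)[2] if self._heap else None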
@@ -137,13 +140,13 @@ class DataExcavate(BasicService):
                 continue
 
             task_key, task = tasks
-            '''initialise the site depth level'''
+            # initialise the site depth level
             self._init_depth(task)
             if self.is_rubbish(task['url']):
                 logger.debug(f'<{t_name}> - 垃圾数据 - {task["url"]}')
                 continue
 
-            '''depth level control'''
+            # depth level control
             if task['depth'] > self._max_depth:
                 logger.debug(f'<{t_name}> - 层级超限 - {task["url"]}')
                 # self.push_records(task)
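
The last hunk only switches the two inline comments from '''...''' strings to regular # comments around the depth initialisation and the depth guard (the log strings mean 垃圾数据 'junk data' and 层级超限 'depth level exceeded'). For orientation, a hypothetical stand-alone version of what _init_depth is assumed to do, not taken from this repository:

    def init_depth(task: dict, default_depth: int = 1) -> dict:
        # Hypothetical helper mirroring the assumed job of self._init_depth:
        # seed 'depth' and 'weight' the first time a task dict is seen, so the
        # guard `task['depth'] > self._max_depth` always has a value to compare.
        task.setdefault('depth', default_depth)
        task.setdefault('weight', task['depth'])
        return task

    task = init_depth({'url': 'https://example.com'})
    assert task['depth'] == 1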