dongzhaorui 3 years ago
parent revision f2b54782d6
1 changed file with 9 additions and 10 deletions

find_source/crawler/spiders.py (+9 -10)

@@ -429,19 +429,18 @@ class DataExcavate(BasicSearch):
         if self.validator.url(task['url']):
             return
 
-        logger.info(f'request host -> {task["url"]}')
+        logger.info(f'[data mining] starting request -> {task["url"]}')
         response = self.downloader.get(task['url'])
         if response.status_code != 200 or response.text in ['', None]:
+            logger.error(f'[data mining] abnormal response -> {task["url"]}')
             return
 
         task['domain'] = extract_domain(task['url'])
         page_source = response.text
-        title = extract_page_title(page_source)
-        task['name'] = title
-        base_url = extract_base_url(task['url'])
-        task['base_url'] = base_url
+        task['name'] = extract_page_title(page_source)
+        task['base_url'] = extract_base_url(task['url'])
 
-        items = self.parser.site_items(page_source, base_url)
+        items = self.parser.site_items(page_source, task['base_url'])
         lst = []
         _c = 0  # counter of keywords found on the page
         for item in items:
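
The refactor above writes the extract_page_title and extract_base_url results straight into the task dict instead of going through local variables. For readers without the helper module, here is a hedged sketch of what extract_domain and extract_base_url plausibly return, built on urllib.parse; the project's actual helpers are not shown in this diff and may differ in detail:

from urllib.parse import urlparse

# Simplified equivalents of the helpers used above, assumptions for
# illustration rather than the project's actual implementations.
def extract_domain(url: str) -> str:
    # 'https://www.example.com/news/1.html' -> 'www.example.com'
    return urlparse(url).netloc

def extract_base_url(url: str) -> str:
    # 'https://www.example.com/news/1.html' -> 'https://www.example.com/'
    parts = urlparse(url)
    return f'{parts.scheme}://{parts.netloc}/'
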
@@ -460,10 +459,6 @@ class DataExcavate(BasicSearch):
         if _c > 1:
             self.save(task)
         self.scheduler.add_excavate(lst, level=task['weight'])
-        '''domain - add to the filter'''
-        self.validator.add_url(task['domain'])
-        '''url - add to the filter'''
-        self.validator.add_url(task['url'])
 
     def excavate(self):
         t_name = threading.currentThread().getName()
@@ -477,6 +472,10 @@ class DataExcavate(BasicSearch):
             task_key, task = tasks
             task['update_at'] = int2long(int(time.time()))
             self.retrieve_site(task)
+            '''domain - add to the filter'''
+            self.validator.add_url(task['domain'])
+            '''url - add to the filter'''
+            self.validator.add_url(task['url'])
 
     def start(self):
         logger.info(f'[data mining] initializing')
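
The substantive change in this commit is moving the seen-filter updates out of retrieve_site and into the excavate loop. Because retrieve_site returns early on a filter hit or a failed request, updating the filter in the caller marks a URL as processed even when its fetch failed, so dead links are not re-queued indefinitely. A minimal, self-contained sketch of that flow, using a set-based stand-in for self.validator (everything here is hypothetical except the url/add_url names and the call order):

class SetValidator:
    """Stand-in for self.validator: an in-memory seen-set."""
    def __init__(self):
        self._seen = set()

    def url(self, url):
        # True means "already processed", so retrieve_site returns early.
        return url in self._seen

    def add_url(self, url):
        self._seen.add(url)


def retrieve_site(validator, task):
    if validator.url(task['url']):
        return
    # ... fetch and parse; fills task['domain'] on success, returns
    # early on a non-200 response, leaving task['domain'] unset ...


def excavate_step(validator, task):
    retrieve_site(validator, task)
    # After this commit the filter is updated unconditionally. Note that
    # task['domain'] is only filled in by retrieve_site on a successful
    # fetch, so this sketch uses a fallback.
    validator.add_url(task.get('domain', task['url']))
    validator.add_url(task['url'])


if __name__ == '__main__':
    v = SetValidator()
    excavate_step(v, {'url': 'https://example.com/page'})
    assert v.url('https://example.com/page')  # second pass is filtered out

One point worth noting about the committed code itself: the caller reads task['domain'] directly, which relies on retrieve_site having set that key; if retrieve_site returned early, the key may be absent unless scheduled tasks already carry a domain field.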