dongzhaorui 3 лет назад
Родитель
Коммит
ed6ed36337
1 изменённый файл: 7 добавлено и 2 удалено
  1. 7 2
      find_source/crawler/services/excavate.py

+ 7 - 2
find_source/crawler/services/excavate.py

@@ -5,13 +5,14 @@ from typing import List
 
 from common.log import logger
 from crawler.Task import Task
-from crawler.services.basics import BasicSearch
+from crawler.services.basics import BasicService
 from crawler.utils import (
     extract_base_url,
     extract_page_title,
     extract_domain,
     split_domain,
     err_details,
+    is_url,
 )
 
 TLDS = ['com', 'cn', 'net', 'org']
@@ -19,7 +20,8 @@ URL_SUFFIX = ['pdf', 'xls', 'xlsx', 'docx', 'doc', 'rar', 'zip']
 URL_SUFFIX_PATTERN = '.*(' + '|'.join(URL_SUFFIX) + ')$'
 
 
-class DataExcavate(BasicSearch):
+class DataExcavate(BasicService):
+    """数据挖掘服务"""
 
     def __init__(self, **kwargs):
         self._workers = (kwargs.pop('workers', None) or 1)
@@ -48,6 +50,9 @@ class DataExcavate(BasicSearch):
         :param url: 网址
         :return: bool
         """
+        if not is_url(url):
+            return True
+
         if self.validator.data(url):
             return True