|
@@ -5,13 +5,14 @@ from typing import List
|
|
|
|
|
|
from common.log import logger
|
|
|
from crawler.Task import Task
|
|
|
-from crawler.services.basics import BasicSearch
|
|
|
+from crawler.services.basics import BasicService
|
|
|
from crawler.utils import (
|
|
|
extract_base_url,
|
|
|
extract_page_title,
|
|
|
extract_domain,
|
|
|
split_domain,
|
|
|
err_details,
|
|
|
+ is_url,
|
|
|
)
|
|
|
|
|
|
TLDS = ['com', 'cn', 'net', 'org']
|
|
@@ -19,7 +20,8 @@ URL_SUFFIX = ['pdf', 'xls', 'xlsx', 'docx', 'doc', 'rar', 'zip']
|
|
|
URL_SUFFIX_PATTERN = '.*(' + '|'.join(URL_SUFFIX) + ')$'
|
|
|
|
|
|
|
|
|
-class DataExcavate(BasicSearch):
|
|
|
+class DataExcavate(BasicService):
|
|
|
+ """数据挖掘服务"""
|
|
|
|
|
|
def __init__(self, **kwargs):
|
|
|
self._workers = (kwargs.pop('workers', None) or 1)
|
|
@@ -48,6 +50,9 @@ class DataExcavate(BasicSearch):
|
|
|
:param url: 网址
|
|
|
:return: bool
|
|
|
"""
|
|
|
+ if not is_url(url):
|
|
|
+ return True
|
|
|
+
|
|
|
if self.validator.data(url):
|
|
|
return True
|
|
|
|