|
@@ -16,7 +16,7 @@ from crawler.utils import (
|
|
html2element,
|
|
html2element,
|
|
iter_node,
|
|
iter_node,
|
|
check_page_by_words,
|
|
check_page_by_words,
|
|
- predict_bidding_model
|
|
|
|
|
|
+ predict_bidding_model, is_contains
|
|
)
|
|
)
|
|
from settings import Dzr
|
|
from settings import Dzr
|
|
|
|
|
|
@@ -70,9 +70,11 @@ class DataExcavate(BasicService):
|
|
curr_depth = task['depth']
|
|
curr_depth = task['depth']
|
|
domain = extract_domain(url)
|
|
domain = extract_domain(url)
|
|
is_attachment_url = re.match(URL_SUFFIX_PATTERN, url) is not None
|
|
is_attachment_url = re.match(URL_SUFFIX_PATTERN, url) is not None
|
|
|
|
+ is_login_url = is_contains(url, 'login')
|
|
if any([
|
|
if any([
|
|
curr_depth > self._max_depth, # 检查任务层级
|
|
curr_depth > self._max_depth, # 检查任务层级
|
|
- is_attachment_url, # 检查网址是否附件下载地址
|
|
|
|
|
|
+ is_attachment_url, # 判断是否附件下载地址
|
|
|
|
+ is_login_url, # 判断是否登录地址
|
|
self.validator.data(url), # 垃圾池 - 判重任务请求网址
|
|
self.validator.data(url), # 垃圾池 - 判重任务请求网址
|
|
self.validator.data(domain), # 垃圾池 - 过滤域名(一级)
|
|
self.validator.data(domain), # 垃圾池 - 过滤域名(一级)
|
|
self.collector.data(domain), # 收录池 - 判重域名(一级)
|
|
self.collector.data(domain), # 收录池 - 判重域名(一级)
|