|
@@ -7,7 +7,7 @@ from common.log import logger
|
|
|
from crawler.Task import Task
|
|
|
from crawler.services.basics import BasicService
|
|
|
from crawler.utils import (
|
|
|
- extract_base_url,
|
|
|
+ extract_host,
|
|
|
extract_page_title,
|
|
|
extract_domain,
|
|
|
split_domain,
|
|
@@ -90,16 +90,16 @@ class DataExcavate(BasicService):
|
|
|
|
|
|
task['domain'] = extract_domain(task['url'])
|
|
|
task['name'] = extract_page_title(page_source)
|
|
|
- task['base_url'] = extract_base_url(task['url'])
|
|
|
+ task['base_url'] = extract_host(task['url'])
|
|
|
|
|
|
lst = []
|
|
|
_history = []
|
|
|
_c = 0 # 过滤词计数器
|
|
|
sub_depth = task['depth'] + 1
|
|
|
sub_weight = task['weight'] + 1
|
|
|
- items = self.parser.site_items(page_source, task['base_url'])
|
|
|
+ items = self.parser.non_origin(page_source, task['url'])
|
|
|
for item in items:
|
|
|
- name, url = item['name'], item['host']
|
|
|
+ name, url = item['title'], item['href']
|
|
|
if self.validator.words(name):
|
|
|
if url not in _history:
|
|
|
lst.append(self.make_task(
|
|
@@ -116,12 +116,15 @@ class DataExcavate(BasicService):
|
|
|
|
|
|
if _c > 1:
|
|
|
save = self.push_domain(task)
|
|
|
+ msg = f'<{t_name}> - 收录成功 - {task["url"]}'
|
|
|
+ if not save:
|
|
|
+ msg = f'<{t_name}> - 重复收录 - {task["url"]}'
|
|
|
else:
|
|
|
- save = self.push_remove(task)
|
|
|
+ remove = self.push_remove(task)
|
|
|
+ msg = f'<{t_name}> - 过滤丢弃 - {task["url"]}'
|
|
|
+ if not remove:
|
|
|
+ msg = f'<{t_name}> - 重复收录 - {task["url"]}'
|
|
|
|
|
|
- msg = f'<{t_name}> - 保存网址 - {task["url"]}'
|
|
|
- if not save:
|
|
|
- msg = f'<{t_name}> - 丢弃网址 - {task["url"]}'
|
|
|
logger.debug(msg)
|
|
|
'''层级深的,优先采集'''
|
|
|
self.scheduler.add_excavate(lst, level=sub_weight)
|
|
@@ -137,13 +140,13 @@ class DataExcavate(BasicService):
|
|
|
continue
|
|
|
|
|
|
task_key, task = tasks
|
|
|
- '''初始化网站层级'''
|
|
|
+ # 初始化网站层级
|
|
|
self._init_depth(task)
|
|
|
if self.is_rubbish(task['url']):
|
|
|
logger.debug(f'<{t_name}> - 垃圾数据 - {task["url"]}')
|
|
|
continue
|
|
|
|
|
|
- '''层级控制'''
|
|
|
+ # 层级控制
|
|
|
if task['depth'] > self._max_depth:
|
|
|
logger.debug(f'<{t_name}> - 层级超限 - {task["url"]}')
|
|
|
# self.push_records(task)
|