@@ -78,7 +78,7 @@ class DataExcavate(BasicService):
 
     def process(self, t_name: str, task: Task):
         logger.info(f'<{t_name}> - 请求 - {task["url"]}')
-        response = self.downloader.get(task['url'], disable_debug_log=False)
+        response = self.downloader.get(task['url'])
         status_code = response.status_code
         page_source = response.text
         reason = response.reason
@@ -92,23 +92,26 @@ class DataExcavate(BasicService):
         task['name'] = extract_page_title(page_source)
         task['base_url'] = extract_base_url(task['url'])
 
-        items = self.parser.site_items(page_source, task['base_url'])
         lst = []
+        _history = []
         _c = 0  # 过滤词计数器
         sub_depth = task['depth'] + 1
         sub_weight = task['weight'] + 1
+        items = self.parser.site_items(page_source, task['base_url'])
         for item in items:
             name, url = item['name'], item['host']
             if self.validator.words(name):
-                lst.append(self.make_task(
-                    url=url,
-                    name=name,
-                    depth=sub_depth,
-                    origin=task['origin'],
-                    groups=task['groups'],
-                    classify=self.visit_classify,
-                    weight=sub_weight
-                ))
+                if url not in _history:
+                    lst.append(self.make_task(
+                        url=url,
+                        name=name,
+                        depth=sub_depth,
+                        origin=task['origin'],
+                        groups=task['groups'],
+                        classify=self.visit_classify,
+                        weight=sub_weight
+                    ))
+                    _history.append(url)
                 _c += 1
 
         if _c > 1:
@@ -116,9 +119,9 @@ class DataExcavate(BasicService):
         else:
             save = self.push_remove(task)
-            msg = f'<{t_name}> - 新增网址 - {task["url"]}'
+            msg = f'<{t_name}> - 保存网址 - {task["url"]}'
             if not save:
-                msg = f'<{t_name}> - 重复网址 - {task["url"]}'
+                msg = f'<{t_name}> - 丢弃网址 - {task["url"]}'
             logger.debug(msg)
 
         '''层级深的,优先采集'''
         self.scheduler.add_excavate(lst, level=sub_weight)
@@ -130,7 +133,7 @@ class DataExcavate(BasicService):
         while True:
            tasks = self.scheduler.get_excavate_task()
            if len(tasks) == 0:
-                self.loops_interval(self._interval)
+                self.loops_interval(self._interval, enable_debug_log=True)
                continue

            task_key, task = tasks
@@ -143,7 +146,7 @@ class DataExcavate(BasicService):
            '''层级控制'''
            if task['depth'] > self._max_depth:
                logger.debug(f'<{t_name}> - 层级超限 - {task["url"]}')
-                self.push_records(task)
+                # self.push_records(task)
                continue

            dont_visit = re.match(URL_SUFFIX_PATTERN, task['url']) is not None