@@ -1,3 +1,4 @@
+import re
 import threading
 from concurrent.futures import ThreadPoolExecutor, wait
 from typing import List
@@ -14,6 +15,9 @@ from crawler.utils import (
 )
 
 TLDS = ['com', 'cn', 'net', 'org']
+URL_SUFFIX = ['pdf', 'xls', 'xlsx', 'docx', 'doc', 'rar', 'zip']
+URL_SUFFIX_PATTERN = r'.*\.(' + '|'.join(URL_SUFFIX) + ')$'
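+# e.g. 'http://example.com/files/annual.pdf' matches and is skipped; 'http://example.com/winzip' does not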
 
 
class DataExcavate(BasicSearch):
@@ -87,26 +90,35 @@
         items = self.parser.site_items(page_source, task['base_url'])
         lst = []
         _c = 0  # filter-word counter
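+        # sub-tasks spawned from this page sit one level deeper and one weight higher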
+        sub_depth = task['depth'] + 1
+        sub_weight = task['weight'] + 1
         for item in items:
             name, url = item['name'], item['host']
             if self.validator.words(name):
                 lst.append(self.make_task(
                     url=url,
                     name=name,
-                    depth=task['depth'] + 1,
+                    depth=sub_depth,
                     origin=task['origin'],
                     groups=task['groups'],
                     classify=self.visit_classify,
-                    weight=task['weight'] + 1
+                    weight=sub_weight
                 ))
                 _c += 1
 
         if _c > 1:
-            self.push_domain(task)
+            save = self.push_domain(task)
         else:
-            self.push_remove(task)
+            save = self.push_remove(task)
+
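+        # push_domain/push_remove are expected to return True only when the URL is newly saved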
+        msg = f'<{t_name}> - new URL - {task["url"]}'
+        if not save:
+            msg = f'<{t_name}> - duplicate URL - {task["url"]}'
+        logger.debug(msg)
         '''deeper pages are collected first'''
-        self.scheduler.add_excavate(lst, level=task['weight'] + 1)
+        self.scheduler.add_excavate(lst, level=sub_weight)
         return True
 
     def excavate(self):
@@ -128,14 +138,18 @@
             '''depth control'''
             if task['depth'] > self._max_depth:
                 logger.debug(f'<{t_name}> - depth limit exceeded - {task["url"]}')
+                self.push_records(task)
                 continue
 
-            try:
-                success = self.process(t_name, task)
-                if not success:
-                    self.validator.add_data(task['url'])
-            except Exception as e:
-                logger.exception(e)
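+            # URLs ending in a document/archive suffix are downloads, not crawlable pages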
+            dont_visit = re.match(URL_SUFFIX_PATTERN, task['url']) is not None
+            if not dont_visit:
+                try:
+                    success = self.process(t_name, task)
+                    if not success:
+                        self.validator.add_data(task['url'])
+                except Exception as e:
+                    logger.exception(e)
             # '''excavation record'''
             # self.push_records(task)
 