|
@@ -18,11 +18,11 @@ TLDS = ['com', 'cn', 'net', 'org']
|
|
|
|
|
|
class DataExcavate(BasicSearch):
|
|
|
|
|
|
- def __init__(self, workers=1, loop_interval=60, **kwargs):
|
|
|
+ def __init__(self, **kwargs):
|
|
|
+ self._workers = (kwargs.pop('workers', None) or 1)
|
|
|
+ self._max_depth = (kwargs.pop('excavate_depth', None) or 1)
|
|
|
+ self._interval = (kwargs.pop('excavate_interval', None) or 60)
|
|
|
super(DataExcavate, self).__init__(**kwargs)
|
|
|
- self._interval = loop_interval
|
|
|
- self._workers = workers
|
|
|
- self._max_depth = (kwargs.pop('excavate_depth', None) or 3)
|
|
|
self._default_depth = 1
|
|
|
|
|
|
def _init_depth(self, task: Task):
|
|
@@ -68,10 +68,9 @@ class DataExcavate(BasicSearch):
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
- def process(self, task: Task):
|
|
|
- t_name = threading.currentThread().getName()
|
|
|
+ def process(self, t_name: str, task: Task):
|
|
|
logger.info(f'<{t_name}> - 请求 - {task["url"]}')
|
|
|
- response = self.downloader.get(task['url'])
|
|
|
+ response = self.downloader.get(task['url'], disable_debug_log=False)
|
|
|
status_code = response.status_code
|
|
|
page_source = response.text
|
|
|
reason = response.reason
|
|
@@ -94,19 +93,20 @@ class DataExcavate(BasicSearch):
|
|
|
lst.append(self.make_task(
|
|
|
url=url,
|
|
|
name=name,
|
|
|
+ depth=task['depth'] + 1,
|
|
|
origin=task['origin'],
|
|
|
groups=task['groups'],
|
|
|
classify=self.visit_classify,
|
|
|
- weight=task['weight']
|
|
|
+ weight=task['weight'] + 1
|
|
|
))
|
|
|
_c += 1
|
|
|
|
|
|
if _c > 1:
|
|
|
- if self.push_domain(task):
|
|
|
- lst = self.check_depth(task, lst)
|
|
|
+ self.push_domain(task)
|
|
|
else:
|
|
|
self.push_remove(task)
|
|
|
- self.scheduler.add_excavate(lst, level=task['weight'])
|
|
|
+ '''层级深的,优先采集'''
|
|
|
+ self.scheduler.add_excavate(lst, level=task['weight'] + 1)
|
|
|
return True
|
|
|
|
|
|
def excavate(self):
|
|
@@ -125,11 +125,14 @@ class DataExcavate(BasicSearch):
|
|
|
logger.debug(f'<{t_name}> - 垃圾数据 - {task["url"]}')
|
|
|
continue
|
|
|
|
|
|
- '''数据挖掘'''
|
|
|
+ '''层级控制'''
|
|
|
+ if task['depth'] > self._max_depth:
|
|
|
+ logger.debug(f'<{t_name}> - 层级超限 - {task["url"]}')
|
|
|
+ continue
|
|
|
+
|
|
|
try:
|
|
|
- success = self.process(task)
|
|
|
+ success = self.process(t_name, task)
|
|
|
if not success:
|
|
|
- '''url - 添加过滤器'''
|
|
|
self.validator.add_data(task['url'])
|
|
|
except Exception as e:
|
|
|
logger.exception(e)
|