|
@@ -21,7 +21,7 @@ class QueryKeyWord(BasicSearch):
|
|
|
self._interval = (kwargs.pop('query_interval', None) or 60)
|
|
|
super(QueryKeyWord, self).__init__(**kwargs)
|
|
|
self.engine = (engine or BingSearchEngine())
|
|
|
- self._name = engine.__class__.__name__
|
|
|
+ self._name = self.engine.__class__.__name__
|
|
|
|
|
|
def query_keyword(self):
|
|
|
t_name = threading.currentThread().getName()
|
|
@@ -42,15 +42,14 @@ class QueryKeyWord(BasicSearch):
|
|
|
urls = self.engine.search(task['search'], cur_page)
|
|
|
for url in urls:
|
|
|
base_url = extract_base_url(url)
|
|
|
- if self.validator.data(base_url):
|
|
|
- continue
|
|
|
- lst.append(self.make_task(
|
|
|
- url=base_url,
|
|
|
- origin=task['origin'],
|
|
|
- groups=task['groups'],
|
|
|
- classify=self.visit_classify,
|
|
|
- weight=task['weight'],
|
|
|
- ))
|
|
|
+ if not self.validator.data(base_url):
|
|
|
+ lst.append(self.make_task(
|
|
|
+ url=base_url,
|
|
|
+ origin=task['origin'],
|
|
|
+ groups=task['groups'],
|
|
|
+ classify=self.visit_classify,
|
|
|
+ weight=task['weight'],
|
|
|
+ ))
|
|
|
'''推送数据挖掘队列'''
|
|
|
self.scheduler.add_excavate(lst, level=task['weight'])
|
|
|
logger.info(f'<{t_name}> - {self._name} - {task["search"]} - 第{cur_page}页,共{len(lst)}条')
|
|
@@ -74,7 +73,7 @@ class QueryOrganization(BasicSearch):
|
|
|
self._interval = (kwargs.pop('query_interval', None) or 60)
|
|
|
super(QueryOrganization, self).__init__(**kwargs)
|
|
|
self.engine = (engine or QccSearchEngine())
|
|
|
- self._name = engine.__class__.__name__
|
|
|
+ self._name = self.engine.__class__.__name__
|
|
|
|
|
|
def query_org(self):
|
|
|
t_name = threading.currentThread().getName()
|
|
@@ -97,7 +96,7 @@ class QueryOrganization(BasicSearch):
|
|
|
self.push_query(task)
|
|
|
if not is_url(url):
|
|
|
continue
|
|
|
- '''此处通过收录器判断是否是已收录网站,再决定是否推送数据挖掘队列'''
|
|
|
+ '''此处通过收录器判断是否已收录网站,再决定是否推送数据挖掘队列'''
|
|
|
if self.collector.data(task['domain']):
|
|
|
continue
|
|
|
'''设置任务为数据挖掘类型'''
|