|
@@ -71,7 +71,7 @@ class BasicSearch:
|
|
|
def loops_interval(interval):
    """Log when the current thread will run next, then sleep.

    :param interval: pause length in seconds. A falsy value (``None`` or
        ``0``) falls back to 300 seconds.
    """
    # Resolve the fallback once so the value handed to ``delay_by`` and the
    # value actually slept are the same. Previously ``time.sleep`` received
    # the raw argument and raised TypeError when it was ``None``.
    interval = interval or 300
    t_name = threading.current_thread().name
    next_run_time = delay_by(interval)
    logger.debug(f'运行结束:<{t_name}>,下次运行时间:{next_run_time}')
    time.sleep(interval)
|
|
|
|
|
|
@staticmethod
|
|
@@ -92,6 +92,20 @@ class BasicSearch:
|
|
|
}
|
|
|
return item
|
|
|
|
|
|
@staticmethod
def make_domain_item(task: Task):
    """Return a new dict holding only the domain-record fields of *task*.

    Each listed key is read with ``task[key]``, so a missing key raises
    whatever ``task.__getitem__`` raises (``KeyError`` for a plain dict).
    """
    wanted_keys = (
        'name',
        'url',
        'domain',
        'depth',
        'origin',
        'groups',
        'create_at',
        'update_at',
    )
    return {key: task[key] for key in wanted_keys}
|
|
|
+
|
|
|
@staticmethod
|
|
|
def make_duplicate_removal(task: Task):
|
|
|
item = {
|
|
@@ -102,41 +116,51 @@ class BasicSearch:
|
|
|
return item
|
|
|
|
|
|
def _push_data(self, purpose: str, task: Task, collection):
    """Insert *task* (or a document derived from it) into *collection*.

    :param purpose: selects how the stored document is built:
        ``'query'``  -> ``self.make_retrieve_item(task)``,
        ``'domain'`` -> ``self.make_domain_item(task)``,
        ``'remove'`` -> ``self.make_duplicate_removal(task)``;
        any other value stores *task* itself.
    :param task: the task being persisted.
    :param collection: target passed straight through to ``insert_one``.
    """
    t_name = threading.current_thread().name
    # purpose -> (document builder, label used in the log line)
    builders = {
        'query': (self.make_retrieve_item, '上传查询结果'),
        'domain': (self.make_domain_item, '上传挖掘结果'),
        'remove': (self.make_duplicate_removal, '上传去重特征'),
    }
    if purpose in builders:
        build, label = builders[purpose]
        item = build(task)
    else:
        item, label = task, '上传记录数据'

    insert_one(collection, item)
    # NOTE(review): "_id" is read only after the insert, so ``insert_one``
    # is presumably expected to set it on the document - confirm for the
    # project helper.
    logger.info(f'<{t_name}> - {label} - {item["_id"]}')
|
|
|
|
|
|
def push_remove(self, task: Task):
    """Store the task in the de-duplication table unless its URL is known.

    Returns ``False`` without writing when ``self.validator.data(url)`` is
    truthy; otherwise pushes the task, registers the URL with the
    validator and returns ``True``.
    """
    already_known = self.validator.data(task['url'])
    if already_known:
        return False

    self._push_data('remove', task, MGO_REMOVAL_DUPLICATE)
    self.validator.add_data(task['url'])
    return True
|
|
|
|
|
|
def push_domain(self, task: Task):
    """Save a mined-domain result unless its domain is already collected.

    Returns ``False`` without writing when ``self.collector.data(domain)``
    is truthy; otherwise pushes the task, registers the domain with the
    collector and returns ``True``.
    """
    already_collected = self.collector.data(task['domain'])
    if already_collected:
        return False

    self._push_data('domain', task, MGO_DOMAIN)
    self.collector.add_data(task['domain'])
    return True
|
|
|
|
|
|
def push_query(self, task: Task):
    """Save an organisation search result via the ``'query'`` push path."""
    target = MGO_QUERY
    self._push_data('query', task, target)
|
|
|
|
|
|
def push_records(self, task: Task):
    """Store a mining record, shortening an over-long name first.

    When ``task['name']`` is longer than 20 characters it is replaced in
    place (the caller's task is mutated) by its first 20 characters.
    """
    max_len = 20
    name = task['name']
    if len(name) > max_len:
        # The '.20s' precision truncates the string to 20 characters.
        task['name'] = format(name, '.20s')
    self._push_data('records', task, MGO_RECORDS)
|
|
|
|
|
|
def orgs_table(self) -> List[Mapping]:
|
|
|
- """组织|单位"""
|
|
|
+ """组织|单位表"""
|
|
|
search_orgs = []
|
|
|
cursor = MGO_ORGS.find(self.query, projection=self.projection)
|
|
|
for item in cursor.sort(self.sort):
|
|
@@ -144,7 +168,7 @@ class BasicSearch:
|
|
|
return search_orgs
|
|
|
|
|
|
def keywords_table(self):
|
|
|
- """关键词"""
|
|
|
+ """关键词表"""
|
|
|
search_keywords = []
|
|
|
cursor = MGO_KEYWORDS.find(projection=self.projection)
|
|
|
for item in cursor.sort(self.sort):
|
|
@@ -152,7 +176,7 @@ class BasicSearch:
|
|
|
return search_keywords
|
|
|
|
|
|
def seed_urls_table(self) -> List[Mapping]:
|
|
|
- """种子urls"""
|
|
|
+ """种子列表"""
|
|
|
search_urls = []
|
|
|
cursor = MGO_URLS.find(self.query, projection=self.projection)
|
|
|
for item in cursor.sort(self.sort):
|
|
@@ -160,7 +184,7 @@ class BasicSearch:
|
|
|
return search_urls
|
|
|
|
|
|
def competing_goods_table(self):
|
|
|
- """竞品urls"""
|
|
|
+ """竞品列表"""
|
|
|
competing_goods = []
|
|
|
cursor = MGO_COMPETING_GOODS.find(self.query, projection=self.projection)
|
|
|
for item in cursor.sort(self.sort):
|