|
@@ -176,17 +176,18 @@ class DataExcavate(BasicService):
|
|
|
status_code, page_source = self.fetch_page(task)
|
|
|
task['status_code'] = status_code
|
|
|
|
|
|
- predict_data(page_source, task)
|
|
|
-
|
|
|
if page_source is None:
|
|
|
# 访问失败的域名是否添加过滤器?
|
|
|
self.push_remove(task)
|
|
|
return False
|
|
|
- task['domain'] = extract_domain(task['url'])
|
|
|
- task['base_url'] = extract_host(task['url'])
|
|
|
- task['name'] = extract_page_title(page_source)
|
|
|
- self.same_origin_strategy(page_source, task)
|
|
|
- self.non_origin_strategy(page_source, task)
|
|
|
+
|
|
|
+ predict_res = predict_data(page_source, task) # 招投标预测结果
|
|
|
+ if predict_res['predict']:
|
|
|
+ task['domain'] = extract_domain(task['url'])
|
|
|
+ task['base_url'] = extract_host(task['url'])
|
|
|
+ task['name'] = extract_page_title(page_source)
|
|
|
+ self.same_origin_strategy(page_source, task)
|
|
|
+ self.non_origin_strategy(page_source, task)
|
|
|
return True
|
|
|
|
|
|
def excavate(self):
|
|
@@ -203,7 +204,7 @@ class DataExcavate(BasicService):
|
|
|
try:
|
|
|
self.process(task)
|
|
|
except Exception as e:
|
|
|
- logger.exception(e)
|
|
|
+ logger.exception(f'<{self.thread_name}> {e}')
|
|
|
# '''挖掘记录'''
|
|
|
# self.push_records(task)
|
|
|
|