Procházet zdrojové kódy

update:使用招投标预测模型检测站点是否招投标网站类型

dongzhaorui před 2 roky
rodič
revize
90ef450b2f
Změněn 1 soubor: 9 přidání a 8 odebrání
  1. +9 -8
      find_source/crawler/services/excavate.py

+ 9 - 8
find_source/crawler/services/excavate.py

@@ -176,17 +176,18 @@ class DataExcavate(BasicService):
         status_code, page_source = self.fetch_page(task)
         task['status_code'] = status_code
 
-        predict_data(page_source, task)
-
         if page_source is None:
             # 访问失败的域名是否添加过滤器?
             self.push_remove(task)
             return False
-        task['domain'] = extract_domain(task['url'])
-        task['base_url'] = extract_host(task['url'])
-        task['name'] = extract_page_title(page_source)
-        self.same_origin_strategy(page_source, task)
-        self.non_origin_strategy(page_source, task)
+
+        predict_res = predict_data(page_source, task)  # 招投标预测结果
+        if predict_res['predict']:
+            task['domain'] = extract_domain(task['url'])
+            task['base_url'] = extract_host(task['url'])
+            task['name'] = extract_page_title(page_source)
+            self.same_origin_strategy(page_source, task)
+            self.non_origin_strategy(page_source, task)
         return True
 
     def excavate(self):
@@ -203,7 +204,7 @@ class DataExcavate(BasicService):
                 try:
                     self.process(task)
                 except Exception as e:
-                    logger.exception(e)
+                    logger.exception(f'<{self.thread_name}> {e}')
             # '''挖掘记录'''
             # self.push_records(task)