dongzhaorui před 1 rokem
rodič
revize
e9595e5e51
1 změnil soubory, kde provedl 32 přidání a 7 odebrání
  1. 32 7
      find_source/crawler/utils.py

+ 32 - 7
find_source/crawler/utils.py

@@ -21,13 +21,13 @@ from crawler.defaults import (
     PAGE_TEXT_FILTER_WORDS
 )
 from predict_bidding_model import exists_ztb
+from predict_bidding_model_v2 import predict as v2_predict
 
 
-def err_details(worker):
-    worker_exception = worker.exception()
-    if worker_exception:
-        logger.exception("Worker return exception: {}".format(worker_exception))
-    return worker
+def err_details(future):
+    error = future.exception()
+    if error:
+        logger.exception("Worker return exception: {}".format(error))
 
 
 def split_domain(val: str):
@@ -123,15 +123,34 @@ def join_url(url: str, parameters: dict):
     return urljoin(url, _data)
 
 
-def extract_text(source: str):
-    soup = BeautifulSoup(source, "lxml")
+def extract_text_by_bs4(html: str):
+    if not html:
+        return ""
+
+    soup = BeautifulSoup(html, "lxml")
     return soup.get_text()
 
 
+def extract_text_by_lxml(html):
+    if not html:
+        return ""
+
+    selector = html2element(html)
+    return selector.text_content()
+
+
+def extract_text(html, parser="lxml"):
+    if parser == "bs4":
+        return extract_text_by_bs4(html)
+    else:
+        return extract_text_by_lxml(html)
+
+
 def verify_text(val: str, length=50):
     """检查数字、字母、中文的个数"""
     if val is None:
         return False
+
     sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
     for pattern in sub_pattern:
         val = re.sub(pattern, '', val)
@@ -294,5 +313,11 @@ def predict_bidding_model(item: dict):
     return result
 
 
+def predict_bidding_model_v2(items: list):
+    result = v2_predict(items)  # ndarray
+    sum_result_row = result.sum(axis=0)
+    return int(sum_result_row)
+
+
 def compress_str(content, level=9):
     return zlib.compress(content.encode(encoding='utf-8'), level=level)