@@ -21,13 +21,17 @@ from crawler.defaults import (
     PAGE_TEXT_FILTER_WORDS
 )
 from predict_bidding_model import exists_ztb
+from predict_bidding_model_v2 import predict as v2_predict


-def err_details(worker):
-    worker_exception = worker.exception()
-    if worker_exception:
-        logger.exception("Worker return exception: {}".format(worker_exception))
-    return worker
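+# Usage sketch, assuming err_details is registered as a concurrent.futures
+# done-callback (executor and task are illustrative names):
+#     future = executor.submit(task)
+#     future.add_done_callback(err_details)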
+def err_details(future):
+    error = future.exception()
+    if error:
+        logger.exception("Worker raised an exception: {}".format(error))


 def split_domain(val: str):
@@ -123,15 +127,37 @@ def join_url(url: str, parameters: dict):
     return urljoin(url, _data)


-def extract_text(source: str):
-    soup = BeautifulSoup(source, "lxml")
+def extract_text_by_bs4(html: str):
+    if not html:
+        return ""
+
+    soup = BeautifulSoup(html, "lxml")
     return soup.get_text()


+def extract_text_by_lxml(html: str):
+    if not html:
+        return ""
+
+    selector = html2element(html)
+    return selector.text_content()
+
+
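+# Usage sketch for the dispatcher below (inputs are illustrative):
+#     extract_text("<p>hi</p>")                 # lxml text_content()
+#     extract_text("<p>hi</p>", parser="bs4")   # BeautifulSoup get_text()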
+def extract_text(html, parser="lxml"):
+    if parser == "bs4":
+        return extract_text_by_bs4(html)
+    else:
+        return extract_text_by_lxml(html)
+
+
 def verify_text(val: str, length=50):
     """Check the number of digits, letters, and Chinese characters."""
     if val is None:
         return False
+
     sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
     for pattern in sub_pattern:
         val = re.sub(pattern, '', val)
@@ -294,5 +320,13 @@ def predict_bidding_model(item: dict):
     return result


+def predict_bidding_model_v2(items: list):
+    result = v2_predict(items)  # ndarray of per-item predictions
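+    # Assumption: v2_predict returns a 1-D 0/1 array, so the axis-0 sum is a
+    # scalar count of positive predictions (int() would fail on a 2-D result).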
+    sum_result_row = result.sum(axis=0)
+    return int(sum_result_row)
+
+
 def compress_str(content, level=9):
     return zlib.compress(content.encode(encoding='utf-8'), level=level)