Ver Fonte

update - 新增文本压缩方法

dongzhaorui há 2 anos atrás
pai
commit
fbdddf6580

+ 1 - 1
find_source/crawler/services/excavate.py

@@ -33,7 +33,7 @@ def predict_data(html, task: Task):
     data_json = predict_bidding_model(data)
     Dzr.insert_one({
         'site': task['origin'],
-        'html': html,
+        # 'html': compress_str(html),
         'url': task['url'],
         'predict': data_json['predict'],
         'comeintime': int2long(int(time.time()))

+ 5 - 0
find_source/crawler/utils.py

@@ -1,4 +1,5 @@
 import re
+import zlib
 from html import unescape
 from urllib.parse import urlencode, urljoin
 
@@ -275,3 +276,7 @@ def predict_bidding_model(item: dict):
     predict = any({v for _, v in predict_result.items()})
     result['predict'] = int(predict)
     return result
+
+
+def compress_str(content, level=9):
+    """Compress text with zlib and return the compressed bytes.
+
+    :param content: data to compress; a ``str`` is encoded as UTF-8 first,
+        while ``bytes``/``bytearray`` input is compressed as-is.
+    :param level: zlib compression level passed to ``zlib.compress``
+        (defaults to 9).
+    :return: the zlib-compressed ``bytes``; the original text can be
+        restored with ``zlib.decompress(data).decode('utf-8')``.
+    """
+    # Only text needs encoding; already-binary payloads have no ``encode``
+    # method and would otherwise raise AttributeError.
+    if isinstance(content, str):
+        content = content.encode(encoding='utf-8')
+    return zlib.compress(content, level)