3 年之前 · 2c284cd37c
--- a/ybw/config/load.py
+++ b/ybw/config/load.py
@@ -12,7 +12,6 @@ __all__ = [
 
															     'jy_proxy',
														
 
															     'crawler_url',
														
 
															     'region',
														
 
															-    'analyze_url',
														
 
															     'node_module_path'
														
 
															 ]
														
@@ -35,7 +34,6 @@ with open(_yaml_constants, encoding="utf-8") as fp:
 
															     constants = yaml.safe_load(fp)
														
 
															     headers: dict = constants['headers']
														
 
															     crawler_url: dict = constants['crawler_url']
														
 
															-    analyze_url = f'http://{es_conf["host"]}:{es_conf["port"]}/{es_conf["db"]}/_analyze'
														
 
															 with open(_yaml_areas, encoding="utf-8") as fr:
														
 
															     areas = yaml.safe_load(fr)
														
--- a/ybw/utils/databases.py
+++ b/ybw/utils/databases.py
@@ -1,10 +1,9 @@
 
															 import bson
														
 
															 import pymongo
														
 
															 import redis
														
 
															-import requests
														
 
															 from elasticsearch import Elasticsearch
														
 
															-from config.load import mongo_conf, redis_conf, es_conf, analyze_url
														
 
															+from config.load import mongo_conf, redis_conf, es_conf
														
 
															 # ---------------------------------- mongo ----------------------------------
														
 
															 MONGO_URI_CLIENTS = {}    # a dictionary hold all client with uri as key
														
@@ -55,25 +54,6 @@ def es_client(cfg=None):
 
															     return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
														
 
															-def es_participles_service(text: str):
														
 
															-    """
														
 
															-    获取文本的分词列表
														
 
															-
														
 
															-    :param text: 需要分词的文本
														
 
															-    :return: 分词列表
														
 
															-    """
														
 
															-    result = []
														
 
															-    params = {"text": text, "analyzer": "ik_smart"}
														
 
															-    res = requests.get(analyze_url, params=params, timeout=60)
														
 
															-    if res.status_code == 200:
														
 
															-        tokens = res.json().get('tokens', [])
														
 
															-        for x in tokens:
														
 
															-            if x["token"].encode('utf-8').isalpha():
														
 
															-                continue
														
 
															-            result.append(x["token"])
														
 
															-    return result
														
 
															-
														
 
															-
														
 
															 def es_query(title: str, publish_time: int):
														
 
															     """
														
 
															     查询es
														
@@ -85,28 +65,26 @@ def es_query(title: str, publish_time: int):
 
															     client = es_client()
														
 
															     stime = publish_time - 432000  # 往前推5天
														
 
															     etime = publish_time + 432000
														
 
															-    conditions = []
														
 
															-    participles = es_participles_service(title)
														
 
															-    for word in participles:
														
 
															-        conditions.append({
														
 
															-            "multi_match": {
														
 
															-                "query": word,
														
 
															-                "type": "phrase",
														
 
															-                "fields": ["title"]
														
 
															-            }
														
 
															-        })
														
 
															-    conditions.append({"range": {"publishtime": {"from": stime, "to": etime}}})
														
 
															+    # 通过发布标题和发布时间范围查询
														
 
															     query = {
														
 
															         "query": {
														
 
															             "bool": {
														
 
															-                "must": conditions,
														
 
															-                "minimum_should_match": 1
														
 
															+                "must": [
														
 
															+                    {
														
 
															+                        "multi_match": {
														
 
															+                            "query": title,
														
 
															+                            "type": "phrase",
														
 
															+                            "fields": ["title"]
														
 
															+                        }
														
 
															+                    },
														
 
															+                    {"range": {'publishtime': {"from": stime, "to": etime}}}
														
 
															+                ]
														
 
															             }
														
 
															         }
														
 
															     }
														
 
															     result = client.search(index=es_conf['db'], body=query, request_timeout=100)
														
 
															-    count = len(result['hits']['hits'])
														
 
															-    return count
														
 
															+    total = int(result['hits']['total'])
														
 
															+    return total
														
 
															 # ---------------------------------- redis ----------------------------------