|
@@ -1,10 +1,9 @@
|
|
import bson
|
|
import bson
|
|
import pymongo
|
|
import pymongo
|
|
import redis
|
|
import redis
|
|
-import requests
|
|
|
|
from elasticsearch import Elasticsearch
|
|
from elasticsearch import Elasticsearch
|
|
|
|
|
|
-from config.load import mongo_conf, redis_conf, es_conf, analyze_url
|
|
|
|
|
|
+from config.load import mongo_conf, redis_conf, es_conf
|
|
|
|
|
|
# ---------------------------------- mongo ----------------------------------
|
|
# ---------------------------------- mongo ----------------------------------
|
|
MONGO_URI_CLIENTS = {} # a dictionary holding all clients, keyed by URI
|
|
MONGO_URI_CLIENTS = {} # a dictionary holding all clients, keyed by URI
|
|
@@ -55,25 +54,6 @@ def es_client(cfg=None):
|
|
return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
|
|
return Elasticsearch([{"host": cfg['host'], "port": cfg['port']}])
|
|
|
|
|
|
|
|
|
|
-def es_participles_service(text: str):
|
|
|
|
- """
|
|
|
|
- 获取文本的分词列表
|
|
|
|
-
|
|
|
|
- :param text: 需要分词的文本
|
|
|
|
- :return: 分词列表
|
|
|
|
- """
|
|
|
|
- result = []
|
|
|
|
- params = {"text": text, "analyzer": "ik_smart"}
|
|
|
|
- res = requests.get(analyze_url, params=params, timeout=60)
|
|
|
|
- if res.status_code == 200:
|
|
|
|
- tokens = res.json().get('tokens', [])
|
|
|
|
- for x in tokens:
|
|
|
|
- if x["token"].encode('utf-8').isalpha():
|
|
|
|
- continue
|
|
|
|
- result.append(x["token"])
|
|
|
|
- return result
|
|
|
|
-
|
|
|
|
-
|
|
|
|
def es_query(title: str, publish_time: int):
|
|
def es_query(title: str, publish_time: int):
|
|
"""
|
|
"""
|
|
查询es
|
|
查询es
|
|
@@ -85,28 +65,26 @@ def es_query(title: str, publish_time: int):
|
|
client = es_client()
|
|
client = es_client()
|
|
stime = publish_time - 432000 # 往前推5天
|
|
stime = publish_time - 432000 # 往前推5天
|
|
etime = publish_time + 432000
|
|
etime = publish_time + 432000
|
|
- conditions = []
|
|
|
|
- participles = es_participles_service(title)
|
|
|
|
- for word in participles:
|
|
|
|
- conditions.append({
|
|
|
|
- "multi_match": {
|
|
|
|
- "query": word,
|
|
|
|
- "type": "phrase",
|
|
|
|
- "fields": ["title"]
|
|
|
|
- }
|
|
|
|
- })
|
|
|
|
- conditions.append({"range": {"publishtime": {"from": stime, "to": etime}}})
|
|
|
|
|
|
+ # 通过发布标题和发布时间范围查询
|
|
query = {
|
|
query = {
|
|
"query": {
|
|
"query": {
|
|
"bool": {
|
|
"bool": {
|
|
- "must": conditions,
|
|
|
|
- "minimum_should_match": 1
|
|
|
|
|
|
+ "must": [
|
|
|
|
+ {
|
|
|
|
+ "multi_match": {
|
|
|
|
+ "query": title,
|
|
|
|
+ "type": "phrase",
|
|
|
|
+ "fields": ["title"]
|
|
|
|
+ }
|
|
|
|
+ },
|
|
|
|
+ {"range": {'publishtime': {"from": stime, "to": etime}}}
|
|
|
|
+ ]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
result = client.search(index=es_conf['db'], body=query, request_timeout=100)
|
|
result = client.search(index=es_conf['db'], body=query, request_timeout=100)
|
|
- count = len(result['hits']['hits'])
|
|
|
|
- return count
|
|
|
|
|
|
+ total = int(result['hits']['total'])
|
|
|
|
+ return total
|
|
|
|
|
|
|
|
|
|
# ---------------------------------- redis ----------------------------------
|
|
# ---------------------------------- redis ----------------------------------
|