1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- import bson
- import pymongo
- from elasticsearch import Elasticsearch
- import setting
- from utils.title_participle import get_should
- # ---------------------------------- mongo ----------------------------------
MONGO_URI_CLIENTS = {}  # shared MongoClient instances, keyed by connection URI


def mongo_client(cfg=None, host=None, port=None, fork=False, **kwargs):
    """
    Return a ``pymongo.MongoClient`` for the requested server.

    Clients are cached per connection URI in ``MONGO_URI_CLIENTS`` so that
    repeated calls for the same server share one client.

    The address is resolved field by field: an explicit ``host`` / ``port``
    argument is used when given; any field left as ``None`` is read from
    ``cfg`` when ``cfg`` is supplied, otherwise from ``setting.MONGO_IP`` /
    ``setting.MONGO_PORT``.

    :param cfg: optional mapping with ``'host'`` and ``'port'`` keys
    :param host: server host; takes precedence over ``cfg`` and the settings
    :param port: server port; takes precedence over ``cfg`` and the settings
    :param fork: when true, always build a fresh client and bypass the cache
    :param kwargs: forwarded to ``pymongo.MongoClient``; they only take effect
        when a client is actually constructed, a cached client is returned as is
    :return: the client for the resolved URI
    """
    if host is None or port is None:
        # Fill in only the missing field(s), so a lone ``host`` or ``port``
        # argument is honoured instead of being silently discarded.
        fallback = cfg or {'host': setting.MONGO_IP, 'port': setting.MONGO_PORT}
        if host is None:
            host = fallback['host']
        if port is None:
            port = fallback['port']
    uri = f'mongodb://{host}:{port}'

    if fork:
        # Caller asked for a private client; never store it in the registry.
        return pymongo.MongoClient(uri, **kwargs)

    client = MONGO_URI_CLIENTS.get(uri)
    if client is None:
        client = pymongo.MongoClient(uri, **kwargs)
        MONGO_URI_CLIENTS[uri] = client
    return client
def mongo_database(name: str, **kwargs):
    """
    Return the database called ``name``.

    :param name: database name
    :param kwargs: forwarded unchanged to ``mongo_client``
    :return: result of ``get_database(name)`` on the resolved client
    """
    return mongo_client(**kwargs).get_database(name)
def mongo_table(db: str, name: str, **kwargs):
    """
    Return the collection ``name`` from database ``db``.

    :param db: database name
    :param name: collection name
    :param kwargs: forwarded unchanged to ``mongo_database``
    :return: result of ``get_collection(name)`` on the resolved database
    """
    return mongo_database(db, **kwargs).get_collection(name)
def int2long(param: int):
    """Convert an int to a long by wrapping it in ``bson.int64.Int64``."""
    long_value = bson.int64.Int64(param)
    return long_value
def object_id(_id: str):
    """Build a ``bson.objectid.ObjectId`` from the given ``_id`` value."""
    oid = bson.objectid.ObjectId(_id)
    return oid
- # ---------------------------------- es ----------------------------------
def es_client(cfg=None):
    """
    Create an ``Elasticsearch`` client.

    :param cfg: optional mapping with ``'host'``, ``'port'``, ``'username'``
        and ``'pwd'`` keys; when ``None`` the values are taken from
        ``setting.ES_IP``, ``setting.ES_PORT``, ``setting.ES_USERNAME`` and
        ``setting.ES_PASSWORD``
    :return: a new ``Elasticsearch`` instance
    """
    conf = cfg if cfg is not None else {
        'host': setting.ES_IP,
        'username': setting.ES_USERNAME,
        'pwd': setting.ES_PASSWORD,
        'port': setting.ES_PORT,
    }
    node = {'host': conf['host'], 'port': conf['port']}
    credentials = (conf['username'], conf['pwd'])
    return Elasticsearch([node], http_auth=credentials)
def es_query(title: str, publish_time: int):
    """
    Query Elasticsearch and count documents matching a title near a publish time.

    :param title: title text; handed to ``get_should`` to build the ``should`` clauses
    :param publish_time: publish time; documents are searched within this value +/- 432000
    :return: ``hits.total.value`` of the search response, converted to int
    """
    client = es_client()

    window = 432000  # five days either side (assuming publish_time is in seconds)
    publish_range = {
        "range": {
            'publishtime': {"from": publish_time - window, "to": publish_time + window},
        }
    }

    # Tokenised title clauses; combined with the mandatory time-range filter.
    title_clauses = get_should(title)
    body = {
        "query": {
            "bool": {
                "must": [publish_range],
                "should": title_clauses,
                "minimum_should_match": "10<90%",
            }
        }
    }

    response = client.search(index=setting.ES_INDEX, body=body, request_timeout=100)
    return int(response['hits']['total']['value'])
|