@@ -8,6 +8,7 @@ Created on 2023-02-21
 """
 import ast
+import re  # used by handle_big_document() below to strip <img> tags
 import time
 from concurrent.futures import ThreadPoolExecutor, wait
 from typing import Dict
@@ -24,13 +25,6 @@ from redis.exceptions import DataError
 
 from log import logger
 
-# mongo
-MONGO_HOST = "172.17.4.87"
-MONGO_PORT = 27080
-MONGO_DB = "py_spider"
-mcli = MongoClient(MONGO_HOST, MONGO_PORT)
-mongodb = mcli[MONGO_DB]
-
 
 # redis
 class Encoder(RedisEncoder):
@@ -79,10 +73,17 @@ pool = redis.ConnectionPool(
 )
 rcli = redis.StrictRedis(connection_pool=pool, decode_responses=True)
 
+# mongo
+MONGO_HOST = "172.17.4.87"
+MONGO_PORT = 27080
+MONGO_DB = "py_spider"
+mcli = MongoClient(MONGO_HOST, MONGO_PORT)
+mongodb = mcli[MONGO_DB]
+
 # es
-ES_HOST = '172.17.145.178'
+ES_HOST = "172.17.145.178"
 ES_PORT = 9800
-ES_INDEX = 'biddingall'
+ES_INDEX = "biddingall"
 ecli = Elasticsearch([{"host": ES_HOST, "port": ES_PORT}])
 
 # delay interval (seconds)
@@ -93,7 +94,7 @@ def literal_eval(node_or_string):
     try:
         return ast.literal_eval(node_or_string)
     except ValueError as e:
-        if 'malformed node or string' in e.args[0]:
+        if "malformed node or string" in e.args[0]:
             from bson import Code, ObjectId  # queued items may embed ObjectId(...)/Code(...), which ast.literal_eval rejects; the import puts those names in eval()'s scope
             return eval(node_or_string)
         else:
@@ -138,8 +139,7 @@ def es_query(title, publish_time):
         }
     }
     result = ecli.search(body=query, index=ES_INDEX, request_timeout=100)
-    # print(result['hits']['total'])
-    total = int(result['hits']['total'])
+    total = int(result["hits"]["total"])  # assumes hits.total is a bare count (pre-7.x ES response shape)
     return total
 
 
@@ -161,34 +161,47 @@ def rpush(name, values, is_redis_cluster=False):
         return rcli.rpush(name, values)
 
 
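+# Oversized items are typically bloated by inline base64 <img> payloads in
+# "contenthtml" (an assumption based on the fields this spider stores);
+# swapping them for <br> is lossy but gets the document back under the cap.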
+def handle_big_document(item):
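+    """Shrink an oversized item by replacing <img> tags in contenthtml with <br>."""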
+    if "contenthtml" in item:
+        item["contenthtml"] = re.sub("<img[^>]*>", "<br>", item["contenthtml"])
+
+
 def insert_one(table, item: Dict):
     """Insert a single record into MongoDB"""
     if item is not None:
-        item.pop('_id', '')
+        item.pop("_id", "")
         if item.get("comeintime"):
-            item['comeintime'] = int64.Int64(item['comeintime'])
+            item["comeintime"] = int64.Int64(item["comeintime"])
         try:
-            title = item.get('title')
+            title = item.get("title")
             result = mongodb[table].insert_one(item)
-            logger.info(f'{table}-{str(result.inserted_id)}-{title}--uploaded')
+            logger.info(f"[Send]{table}-{str(result.inserted_id)}-{title}--uploaded")
         except Exception as e:
+            if "BSON document too large" in "".join(e.args):
+                handle_big_document(item)  # MongoDB caps a single BSON document at 16 MB
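+                # the shrunken item falls through to rpush() below and is retried on a later sync pass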
+
             rpush(get_redis_key(table), item)
-            logger.error(table + f"--push failed: {''.join(e.args)}")
+            logger.error("[Send]" + table + f"--push failed: {''.join(e.args)}")
 
 
 def sync_data(table):
     redis_key = get_redis_key(table)
     total = rcli.llen(redis_key)
-    logger.info(f"sync table: {table}, total queued: {total}")
+    logger.info(f"[Send]sync table: {table}, total queued: {total}")
     for _ in range(total):
         obj = rcli.lpop(redis_key)
         if obj is None:
-            logger.warning(f'{table} bad data: {obj}')
+            logger.warning(f"[Send]{table} bad data: {obj}")
             continue
 
         try:
             item = literal_eval(obj)
-            if table != 'mgp_list':
+            if table != "mgp_list":
                 insert_one(table, item)
             else:
                 title = item.get("item").get("title")
@@ -198,7 +211,7 @@ def sync_data(table):
                 t_diff = int(time.time()) - item.get("comeintime")
                 if t_diff <= DELAY:
                     rpush(redis_key, item)
-                    logger.info(f"{site}-{title}-waited {t_diff}s--delayed insert")
+                    logger.info(f"[Send]{site}-{title}-waited {t_diff}s--delayed insert")
                 # ES lookup flow
                 elif item.get("if_es"):
                     pt = item.get("item").get("publishtime")
@@ -207,22 +215,24 @@ def sync_data(table):
|
|
|
else:
|
|
|
insert_one(table, item)
|
|
|
except Exception as e:
|
|
|
- # print(e)
|
|
|
- # print(f'{table} {type(obj)} >>>>> {repr(obj)}')
|
|
|
rpush(redis_key, obj)
|
|
|
+ logger.error("[Send]" + table + f"--推送失败,原因:{''.join(e.args)}")
|
|
|
|
|
|
|
|
|
def err_msg(worker):
|
|
|
err = worker.exception()
|
|
|
if err:
|
|
|
- logger.exception("worker err: {}".format(err))
|
|
|
+ logger.exception("[Send]worker err: {}".format(err))
|
|
|
return worker
|
|
|
|
|
|
|
|
|
@func_set_timeout(60 * 20)
|
|
|
def main():
|
|
|
- logger.info("数据同步开始")
|
|
|
- tables = ["mgp_list", "data_bak", "spider_heartbeat", "njpc_list", "data_njpc", "listdata_err"]
|
|
|
+ logger.info("[Send]数据同步开始")
|
|
|
+ tables = [
|
|
|
+ "mgp_list", "data_bak", "njpc_list", "data_njpc",
|
|
|
+ "listdata_err", "spider_heartbeat",
|
|
|
+ ]
|
|
|
with ThreadPoolExecutor() as pool:
|
|
|
futures = []
|
|
|
for table in tables:
|
|
@@ -230,11 +245,11 @@ def main():
             f.add_done_callback(err_msg)
             futures.append(f)
         wait(futures)
-        logger.info("data sync finished")
+        logger.info("[Send]data sync finished")
 
 
 if __name__ == '__main__':
     try:
         main()
     except FunctionTimedOut:
-        logger.warning("data sync timed out")
+        logger.warning("[Send]data sync timed out")