@@ -14,32 +14,21 @@ from elasticsearch import Elasticsearch
ReluClient = MongoDBInterface(ReluMongodb)

# Evaluation service configuration
-a2s_ip = "192.168.3.240:9090"
-# a2s_ip = "172.17.0.11:9090"
+a2s_ip = "172.20.100.235:9090"
topic = "quality_bid"
# Topic used for local testing
# topic = "test_quality_bid"
timeout = 180

-# Get the current time
-now = datetime.now()
-current_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
-# Get today's date
-today = datetime.today()
-# Get yesterday's date
-yesterday = today - timedelta(days=1)
-# Get 00:00 of yesterday
-yesterday_midnight = datetime(yesterday.year, yesterday.month, yesterday.day)
-# Get 00:00 of today
-today_midnight = datetime(today.year, today.month, today.day)
-# Convert to Unix timestamps
-start_date = int(yesterday_midnight.timestamp())
-end_date = int(today_midnight.timestamp())
-
-# ES connection settings
-es_host = "http://127.0.0.1:19800"
-es_username = "jianyuGr"
-es_password = "we3g8glKfe#"
+# # ES connection settings
+# es_host = "http://127.0.0.1:19800"
+# es_username = "jianyuGr"
+# es_password = "we3g8glKfe#"
+
+# Production ES
+es_host = "http://172.17.4.184:19908"
+es_username = "qyfw_es_2"
+es_password = "Khfdals33#"

# Initialize the Elasticsearch client
es_client = Elasticsearch(es_host,http_auth=(es_username, es_password),retry_on_timeout=True) # use basic authentication
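# Note: the es_client line above authenticates with http_auth. On elasticsearch-py 8.x
# that keyword is deprecated in favour of basic_auth, so a hedged, equivalent construction
# (assuming an 8.x client) would be:
# es_client = Elasticsearch(es_host, basic_auth=(es_username, es_password), retry_on_timeout=True)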
@@ -86,7 +75,7 @@ def insert_batch_data(conn, params):
"""
Perform the batch data insert
"""
- query = """INSERT IGNORE INTO bid_analysis (mongoid,toptype,subtype, site, spidercode, channel,comeintime, area, city, district, score, error_type, spider_modified_time)
+ query = """INSERT IGNORE INTO bid_analysis (mongoid,toptype,subtype, site, spidercode, channel,comeintime, area, city, district, score, error_type, create_time)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s,%s,%s)"""
MysqlUtil.insert_data(conn, query, params)

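# Note: INSERT IGNORE silently skips rows that would violate a unique or primary key
# (presumably on mongoid here, an assumption about the bid_analysis schema), so
# re-running a batch after an interruption does not duplicate rows already written.
# A hedged alternative with the same no-op-on-conflict effect would be:
# query = "INSERT INTO bid_analysis (...) VALUES (...) ON DUPLICATE KEY UPDATE mongoid = mongoid"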
@@ -112,7 +101,7 @@ def get_last_processed_id():
"""
# Here the checkpoint ID is assumed to be read from a file; it could also come from a database, Redis, or similar storage
try:
- with open('docs/last_processed_id.txt', 'r') as f:
+ with open('docs/last_processed_id_mysql.txt', 'r') as f:
last_id = f.read().strip()
if last_id:
return last_id
@@ -125,22 +114,31 @@ def save_last_processed_id(last_id):
"""
Save the largest ID processed so far, used for resuming
"""
- with open('docs/last_processed_id.txt', 'w') as f:
+ with open('docs/last_processed_id_mysql.txt', 'w') as f:
f.write(str(last_id))

+def clear_last_processed_id():
+ """
+ Clear the last_processed_id_mysql.txt checkpoint file
+ """
+ open('docs/last_processed_id_mysql.txt', 'w').close()
def batch_load_data():
"""
Batch data quality check
"""
+ # Get today's date as a string
+ today_date = datetime.now().strftime("%Y-%m-%d")
+ # Timestamp for 08:00:00 today
+ start_date = int(datetime.strptime(f"{today_date} 08:00:00", "%Y-%m-%d %H:%M:%S").timestamp())
+ # Timestamp for 12:00:00 today
+ end_date = int(datetime.strptime(f"{today_date} 12:00:00", "%Y-%m-%d %H:%M:%S").timestamp())
+
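# A hedged, equivalent way to build the same 08:00-12:00 window without the string
# round-trip (assuming local time is intended, as in the lines above):
# start_date = int(datetime.now().replace(hour=8, minute=0, second=0, microsecond=0).timestamp())
# end_date = int(datetime.now().replace(hour=12, minute=0, second=0, microsecond=0).timestamp())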
# Rule lookup, keyed by the required fields: company name (user ID) and version
rules_id = get_rule("北京剑鱼信息技术有限公司", "v1.2")
print(rules_id)

# Initialize MySQL
- # conn = MysqlUtil.connect_to_mysql(host='192.168.3.14', port='4000', user='DataScBi', password='DT#Sc20221123Ht',database='quality')
- conn = MysqlUtil.connect_to_mysql(host='192.168.3.217', port='4000', user='root', password='=PDT49#80Z!RVv52_z',database='quality')
- # Initialize the spider code repository
- collection = MongoClient(f'mongodb://{"127.0.0.1:27089"}/', unicode_decode_error_handler="ignore", directConnection=True)["editor"]["lua_logs_auditor"]
+ conn = MysqlUtil.connect_to_mysql(host='172.20.45.129', port='4000', user='root', password='=PDT49#80Z!RVv52_z',database='quality')

# Get the last processed ID; if there is none, start from the beginning
last_processed_id = get_last_processed_id()
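# Note on the checkpoint flow (a sketch of the intent, inferred from the functions above):
# get_last_processed_id() restores the _id last written by save_last_processed_id(), so an
# interrupted run resumes after the last committed batch, while clear_last_processed_id()
# (called once all hits are drained) resets the file so the next run starts from the beginning.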
@@ -174,7 +172,7 @@ def batch_load_data():
try:
# Use the scroll API to fetch the data in batches
- response = es_client.search(index="bidding", body=es_query, size=100)
+ response = es_client.search(index="bidding", body=es_query)
hits = response['hits']['hits']

while hits:
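# Note: despite the "scroll API" comment, the loop below pages with search_after. Dropping
# size=100 from es_client.search() means Elasticsearch falls back to its default of 10 hits
# per page; if 100-per-batch is still wanted, a hedged fix is to carry the size in the query
# body instead, e.g.:
# es_query["size"] = 100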
@@ -211,29 +209,11 @@ def batch_load_data():
district = item.get("district", "")
score = item.get("score", "")
error_type_data = json.dumps(data)
- spider_modified_time = current_datetime
-
- info = collection.find_one({"code": spidercode})
- if info:
- spider_modified_time = info.get("modifytime", "")
- spider_modified_time = datetime.fromtimestamp(spider_modified_time)
+ create_time = today_date

- params = (item["_id"], toptype, subtype, site, spidercode,channel, comeintime, area, city, district, score, error_type_data,spider_modified_time)
+ params = (item["_id"], toptype, subtype, site, spidercode,channel, comeintime, area, city, district, score, error_type_data,create_time)
insert_batch_data(conn, params)

- # Iterate over the error-reason dict and extract the values from non-empty sub-dicts
- for key, value in data.items():
- error_ids = []
- if isinstance(value, dict) and value:
- cleaned_key = key[:-3] if key.endswith('_qa') else key # strip the '_qa' suffix
- for sub_key, sub_value in value.items():
- error_id = find_error_id(conn, cleaned_key, sub_value)
- if error_id:
- error_ids.append(error_id)
- print(f" {sub_key}: {sub_value}")
-
- # Insert the error IDs into the cleaned_key_error field
- insert_dynamic_error_field(conn, cleaned_key, error_ids, item["_id"])
print("------一条数据结束------")
# Save the largest ID processed in the current batch
if max_id:
@@ -245,12 +225,13 @@ def batch_load_data():
# Fetch the next batch of data
search_after = hits[-1]["_id"] # use the _id of the last record in this batch as the starting point for the next batch
es_query["search_after"] = [search_after] # keep the _id type consistent
- response = es_client.search(index="bidding", body=es_query, size="100")
+ response = es_client.search(index="bidding", body=es_query)
hits = response['hits']['hits']

# If there is no more data, break out of the loop
if not hits:
print("没有更多数据,结束批次处理")
+ clear_last_processed_id()
break
print("数据处理完成")
except Exception as e: