liumiaomiao 1 month ago
parent
commit
f687535fe7
57 files changed with 1919 additions and 26 deletions
  1. BIN
      lib/__pycache__/monitor_tools.cpython-38.pyc
  2. + 3 - 1
      lib/learn
  3. + 231 - 0
      lib/monitor_tools_online.py
  4. + 0 - 0
      tools/临时/spidercodes
  5. + 0 - 0
      tools/周报/mongo,es断流监控/monitor_all.py
  6. + 263 - 0
      tools/周报/周报表格导出/weekly_data_store.py
  7. + 0 - 0
      tools/基于抽取表ai和规则对比/ai抽取和规则抽取对比.py
  8. + 0 - 0
      tools/基于抽取表ai和规则对比/new.py
  9. + 0 - 0
      tools/基于抽取表ai和规则对比/一致性对比.py
  10. + 0 - 0
      tools/数据抽样/ai_exchange_to_multipacket.py
  11. + 0 - 0
      tools/数据抽样/fix_site_data_export.py
  12. + 42 - 10
      tools/数据抽样/sample_data_export.py
  13. + 58 - 0
      tools/数据抽样/sample_data_export_new.py
  14. + 0 - 0
      tools/数据抽样/抽样方法最新.py
  15. BIN
      tools/数据质量监控平台/kb-数据问题统计/KB问题统计汇总.xlsx
  16. + 0 - 0
      tools/数据质量监控平台/kb-数据问题统计/execl_kb.py
  17. + 0 - 0
      tools/数据质量监控平台/kb-数据问题统计/task_kb.py
  18. + 0 - 0
      tools/数据质量监控平台/基于标准数据的字段分析结果.py
  19. + 0 - 0
      tools/数据质量监控平台/标讯基础信息分析结果入库.py
  20. + 0 - 0
      tools/标准样本数据入库/标准样本数据汇总.xlsx
  21. + 0 - 0
      tools/标讯数据附件为空数量统计/统计.py
  22. + 0 - 0
      tools/爬虫数据质量一期/1、bid_analysis表抽取数据到抽取表.py
  23. + 0 - 0
      tools/爬虫数据质量一期/2、bid_analysis表错误原因及数量统计输出,存入抽取表.py
  24. + 0 - 0
      tools/爬虫数据质量一期/3、抽取表完善爬虫字段.py
  25. + 12 - 8
      tools/爬虫数据质量一期/4、bid_extract计算爬虫增长率并发邮件提醒.py
  26. + 0 - 0
      tools/爬虫数据质量一期/5、根据抽取表生成分析表格.py
  27. + 0 - 0
      tools/爬虫数据质量一期/test.py
  28. + 0 - 0
      tools/爬虫数据质量一期/爬虫代码输出.py
  29. + 0 - 0
      tools/爬虫数据质量一期/爬虫数据动态.xlsx
  30. + 0 - 0
      tools/爬虫数据质量一期/爬虫数据动态1.xlsx
  31. + 0 - 0
      tools/爬虫数据质量二期/spider_quality.py
  32. + 0 - 0
      tools/生成标准样本库的分析数据/test.py
  33. + 0 - 0
      tools/生成标准样本库的分析数据/test2.py
  34. + 0 - 0
      tools/生成标准样本库的分析数据/test3.py
  35. + 9 - 7
      tools/生成标准样本库的分析数据/根据样本数据拉取正式数据生成分析表mongo.py
  36. + 87 - 0
      tools/生成标准样本库的分析数据/根据样本数据拉取正式数据生成分析表mysql.py
  37. + 0 - 0
      tools/生成标准样本库的分析数据/生成统计结果.py
  38. + 289 - 0
      tools/生成标准样本库的分析数据/生成统计结果_入库.py
  39. BIN
      tools/高质量站点第一版/111.xlsx
  40. + 51 - 0
      tools/高质量站点第一版/_id.csv
  41. + 67 - 0
      tools/高质量站点第一版/test.py
  42. + 37 - 0
      tools/高质量站点第一版/根据id找出爬虫代码.py
  43. + 43 - 0
      tools/高质量站点第一版/统计标讯数量.py
  44. + 95 - 0
      tools/高质量站点第一版/高质量站点-正文规则角度.py
  45. + 87 - 0
      tools/高质量站点第一版/高质量站点-脚本1.py
  46. + 97 - 0
      tools/高质量站点第一版/高质量站点-脚本2.py
  47. + 56 - 0
      tools/高质量站点第一版/高质量站点-脚本3.py
  48. + 77 - 0
      tools/高质量站点第一版/高质量站点-脚本41.py
  49. + 89 - 0
      tools/高质量站点第一版/高质量站点-脚本51.py
  50. + 98 - 0
      tools/高质量站点第一版/高质量站点-脚本61.py
  51. + 80 - 0
      tools/高质量站点第一版/高质量站点-脚本7.py
  52. + 0 - 0
      tools/高质量站点第二版/ai抽取和规则抽取对比结果.py
  53. + 0 - 0
      tools/高质量站点第二版/增加一致性对比-智昆.py
  54. + 47 - 0
      tools/高质量站点第二版/找出爬虫比例.py
  55. + 0 - 0
      tools/高质量站点第二版/找出爬虫比例2.py
  56. + 0 - 0
      tools/高质量站点第二版/统计三个大模型和规则一致性的比例.py
  57. + 1 - 0
      tools/高质量站点第二版/记录

BIN
lib/__pycache__/monitor_tools.cpython-38.pyc


+ 3 - 1
lib/learn

@@ -3,4 +3,6 @@ from pymongo import MongoClient
 collection_bid = MongoClient(f'mongodb://{"viewdata"}:{"viewdata"}@{"127.0.0.1:27088"}/',unicode_decode_error_handler="ignore", directConnection=True)["qfw"]["bidding"]
 #连接测试环境mongo
 db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
-coll_user = db["standard_sample_data_all"]
+coll_user = db["standard_sample_data_all"]
+#常用的查询方式
+for doc in final_results.find({"_id" :{"$gt": ObjectId("68023bce5f834436f09d7e9f")}}).sort('_id',1):
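
For reference, a minimal self-contained version of the query pattern added above might look like the sketch below. It assumes `ObjectId` comes from `bson` and that `final_results` is a collection handle in the same test-environment database; neither appears in this hunk, so both are assumptions.

from bson import ObjectId
from pymongo import MongoClient

# Test-environment Mongo, as in the snippet above
db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
# Hypothetical collection handle; the snippet iterates `final_results` without defining it
final_results = db["final_results"]

# Resume an _id-ordered scan from a known ObjectId onwards
for doc in final_results.find({"_id": {"$gt": ObjectId("68023bce5f834436f09d7e9f")}}).sort("_id", 1):
    print(doc["_id"])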

+ 231 - 0
lib/monitor_tools_online.py

@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+# author : liumiaomiao
+#从es库中导出数据到测试环境mongo库
+from lib.es_tools import esutil
+from datetime import datetime, timedelta
+from lib.mongo_tools import MongoUtil,Data_save,MongoSentence
+from lib.mysql_tools import MysqlUtil
+from lib.clickhouse_tools import ClickhouseUtil, logger
+
+class monitorTools:
+    # 定义一周的时间范围,转换为Unix时间戳格式
+    end_date = int(datetime.now().timestamp())
+    start_date = int((datetime.now() - timedelta(days=7)).timestamp())
+    print(f"开始时间:{start_date}--结束时间{end_date}")
+
+    #标准库bidding-es 每周统计入库数量
+    def es_bidding(self):
+        """
+        es链接
+        """
+        db_config = {
+            # es
+            'es_host': '172.17.4.184',
+            'es_port': 19908,
+            'es_http_auth': ('qyfw_es_2','Khfdals33#'),  # 重新申请
+            'timeout': 10000,
+            'index': "bidding"
+        }
+        query = {"query": {"bool": {"must": [{"range": {"comeintime": {"from": f"{self.start_date}", "to": f"{self.end_date}"}}}]}}}
+        # 传入查询语句query 以及配置信息
+        # es=esutil.get_es(db_config["es_host"], db_config["es_http_auth"], db_config["es_port"],db_config["index"])
+        counts=esutil.get_es_count(query,**db_config)
+        count = counts['count']
+        print("标准库es-bidding每周入库数据量:",count)
+        return count
+
+    # 标准库bidding-es 碎片化数据每周统计入库数量
+    def es_bidding_fragment(self):
+        #正式环境
+        db_config = {
+            # es
+            'es_host': '172.17.4.184',
+            'es_port': 19908,
+            'es_http_auth': ('qyfw_es_2', 'Khfdals33#'),  # 重新申请
+            'timeout': 10000,
+            'index': "bidding"
+        }
+        # #测试环境http://192.168.3.149:9201
+        # db_config = {
+        #     # es
+        #     'es_host': '192.168.3.149',
+        #     'es_port': 9201,
+        #     # 'es_http_auth': ('jianyuGr', 'we3g8glKfe#'),  # 重新申请
+        #     'timeout': 10000,
+        #     'index': "bidding"
+        # }
+        # 定义要监控的字段值
+        tags = [
+            "情报_法务",
+            "情报_财务审计",
+            "情报_招标代理",
+            "情报_管理咨询",
+            "情报_保险",
+            "情报_工程设计咨询",
+            "情报_安防",
+            "情报_印务商机",
+            "情报_环境采购",
+            "情报_家具招投标"
+        ]
+
+        # 初始化字典,将所有标签的计数设置为0
+        data = {}
+        for tag in tags:
+            query = {
+                "query": {"bool": {"must": [{"range": {"comeintime": {"from": f"{self.start_date}", "to": f"{self.end_date}"}}},
+                                            {"term": {"tag_topinformation": tag}}]}}}
+            count = esutil.get_es_count(query, **db_config)
+            print(f"标准库es-bidding{tag}每周入库数据量:", count['count'])
+            data[tag]=count['count']
+        # 检查数据字典以确保所有标签都被更新
+        print("数据字典内容:", data)  # 打印整个数据字典
+        return data
+
+    #拟在建es数据 每周统计入库数量
+    def es_nzj(self):
+        """
+        es链接
+        """
+        db_config = {
+            # es
+            'es_host': '172.17.4.184',
+            'es_port': 19908,
+            'es_http_auth': ('qyfw_es_2', 'Khfdals33#'),  # 重新申请
+            'timeout': 10000,
+            'index': "proposed_v1"
+        }
+        query = {
+            "query": {"match_all": {}}}
+        # 传入查询语句query 以及配置信息
+        # es=esutil.get_es(db_config["es_host"], db_config["es_http_auth"], db_config["es_port"],db_config["index"])
+        counts = esutil.get_es_count(query, **db_config)
+        count=counts['count']
+        print("拟在建es入库数据总量:", count)
+        return count
+
+    #医械通es,每周统计入库数量
+    def medical_es(self):
+        """
+        es链接
+        """
+        db_config = {
+            # es
+            'es_host': '172.17.4.184',
+            'es_port': 19908,
+            'es_http_auth': ('qyfw_es_2', 'Khfdals33#'),  # 重新申请
+            'timeout': 10000,
+            'index': "bidding"
+        }
+        query = {
+            "query": {"bool": {"must": [{"range": {"comeintime": {"from": f"{self.start_date}", "to": f"{self.end_date}"}}},{"term": {"bid_field": "0101"}}]}}}
+        # 传入查询语句query 以及配置信息
+        # es=esutil.get_es(db_config["es_host"], db_config["es_http_auth"], db_config["es_port"],db_config["index"])
+        counts = esutil.get_es_count(query, **db_config)
+        count = counts['count']
+        print("医械通es每周入库数据量:", count)
+        return count
+
+    #标准库bidding-mongo 每周统计入库数量
+    def bidding(self):
+        collection = MongoUtil.get_coon(host='172.31.31.202:27081', database='qfw',collection='bidding',authuser='dataFx',authpass='data@fenxi')
+        query = { "comeintime": {"$gte": self.start_date, "$lt": self.end_date}}
+        count=MongoSentence.count(collection,query)
+        print("标准库bidding-mongo 每周统计入库数量",count)
+        return count
+
+    #标准库bidding-mongo碎片化数据 每周统计入库数量
+    def bidding_fragment(self):
+        collection = MongoUtil.get_coon(host='172.31.31.202:27081', database='qfw',collection='bidding',authuser='dataFx',authpass='data@fenxi')
+        # 定义要监控的字段值
+        tags = [
+            "情报_法务",
+            "情报_财务审计",
+            "情报_招标代理",
+            "情报_管理咨询",
+            "情报_保险",
+            "情报_工程设计咨询",
+            "情报_安防",
+            "情报_印务商机",
+            "情报_环境采购",
+            "情报_家具招投标"
+        ]
+        # tags = [
+        #     "情报_环境采购",
+        #     "情报_家具招投标"
+        # ]
+        data={}
+        for tag in tags:
+            query = {"comeintime": {"$gte": self.start_date, "$lt": self.end_date},"tag_topinformation":tag}
+            count=MongoSentence.count(collection,query)
+            print(f"标准库bidding-mongo{tag}每周统计入库数量",count)
+            data[tag]=count
+        return data
+
+    #拟在建baseinfo-mysql 每周统计入库数量
+    def nzj(self):
+        # MySQL 数据库连接配置
+        # mysql_db_config = {
+        #     'host': '192.168.3.149',
+        #     'port': 4000,
+        #     'user': 'datagroup',
+        #     'password': 'Dgrpdb#2024@36',
+        #     'database': 'jianyu_subjectdb',
+        #     'charset': 'utf8mb4'
+        # }
+
+        now = datetime.now()
+        end_date = now.strftime("%Y-%m-%d %H:%M:%S")
+        start_date = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d %H:%M:%S")
+
+        # SQL 查询
+        mysql_query = "SELECT COUNT(*) FROM jianyu_subjectdb.dwd_f_nzj_baseinfo WHERE createtime >= %s AND createtime <= %s"
+        params = (start_date, end_date)
+        conn=MysqlUtil.connect_to_mysql(host='172.17.162.27',port='14000',user='jydev',password='JSuytest#s211',database='jianyu_subjectdb')
+        count=MysqlUtil.execute_sql(conn,mysql_query,params)
+        print("拟在建baseinfo-mysql每周统计入库数量", count)
+        return count
+
+    #人脉数据,每周统计入库数量
+    def connections(self):
+        client = None
+        try:
+            query = f"SELECT COUNT(*) FROM information.transaction_info_all WHERE create_time >={self.start_date} AND create_time <={self.end_date}"
+            # conn=ClickhouseUtil.connect_to_clickhouse(host='192.168.3.207',port='19000',user='jytop',password='pwdTopJy123',database='information')
+            client=ClickhouseUtil.connect_to_clickhouse(host='cc-2ze9tv451wov14w9e.clickhouse.ads.aliyuncs.com',port=9000,user='jydev',password='ClTe0331kho2025',database='information')
+            count=ClickhouseUtil.execute_sql(client,query)
+            result=count[0][0]
+            print("人脉数据每周统计入库数量", result)
+            return result
+        except Exception as e:
+            logger.error("An error occurred: %s", e)
+            raise
+        finally:
+            if client:
+                client.disconnect()  # 释放连接
+
+    #医械通,每周统计入库数量
+    def medical(self):
+        collection = MongoUtil.get_coon(host='172.31.31.202:27081', database='qfw',collection='bidding',authuser='dataFx',authpass='data@fenxi')
+        query = {"comeintime": {"$gte": self.start_date, "$lt": self.end_date},"bid_field":"0101"}
+        count = MongoSentence.count(collection, query)
+        print("医械通每周统计入库数量", count)
+        return count
+
+    #统计结果入库
+    def save_to_mongo(self,title,count):
+        collection=Data_save.save_con(host='172.20.45.129',port=27002,database='data_quality',collection='statistics')
+        now = datetime.now()
+        timestamp = int(now.timestamp())
+        document = {
+            title: {
+                "timestamp": timestamp,
+                "count": count
+            }
+        }
+        Data_save.insert_one(collection,document)
+
+
+monitor=monitorTools()
+
+
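
The module above only defines the weekly counters and instantiates `monitor`; none of the methods are invoked here. A sketch of how it might be driven, assuming the module is importable as `lib.monitor_tools_online` and that the metric titles are chosen to match the keys the weekly export script reads from the `statistics` collection:

from lib.monitor_tools_online import monitor

# Collect each weekly count and persist it through save_to_mongo. The titles are
# assumptions picked to line up with what weekly_data_store.py matches on.
monitor.save_to_mongo("bidding", monitor.bidding())
monitor.save_to_mongo("bidding_fragment", monitor.bidding_fragment())
monitor.save_to_mongo("nzj", monitor.nzj())
monitor.save_to_mongo("medical", monitor.medical())
monitor.save_to_mongo("connections", monitor.connections())
monitor.save_to_mongo("es_bidding", monitor.es_bidding())
monitor.save_to_mongo("es_bidding_fragment", monitor.es_bidding_fragment())
monitor.save_to_mongo("es_nzj", monitor.es_nzj())
monitor.save_to_mongo("es_medical", monitor.es_medical())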

+ 0 - 0
tools/临时/spidercodes


+ 0 - 0
tools/周报/mongo,es断流监控/monitor_all.py


+ 263 - 0
tools/周报/周报表格导出/weekly_data_store.py

@@ -0,0 +1,263 @@
+from pymongo import MongoClient
+from datetime import datetime, timedelta
+import pandas as pd
+import pymysql
+# 数据入库量及数据监控时效 导出execl
+# MongoDB连接配置
+host = '172.20.45.129'
+port = 27002
+dbname = 'data_quality'
+collection_name = 'statistics'
+
+# 创建MongoDB连接
+client = MongoClient(host, port)
+db = client[dbname]
+collection = db[collection_name]
+
+# 获取当前时间和一周前的时间
+end_time = datetime.now().replace(hour=23, minute=59, second=59, microsecond=999999)
+start_time = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
+
+# 将datetime转换为Unix时间戳(整数类型,去掉小数部分)
+start_timestamp = int(start_time.timestamp())
+end_timestamp = int(end_time.timestamp())
+
+# 输出调试信息:检查开始时间和结束时间
+print("Start time:", start_time)
+print("End time:", end_time)
+print("Start timestamp:", start_timestamp)
+print("End timestamp:", end_timestamp)
+
+# ----------------- 第一个Sheet: 断流监控_mongo库 -------------------
+
+# 查询过去一周的数据(断流监控_mongo库)
+pipeline_mongo = [
+    {
+        "$match": {
+            "$or": [
+                {"bidding.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"connections.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"nzj.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"medical.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"bidding_fragment.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}}
+            ]
+        }
+    },
+    {
+        "$limit": 5  # 限制查询返回的结果为前5条数据,便于调试
+    }
+]
+
+# 获取符合条件的数据
+data_mongo = list(collection.aggregate(pipeline_mongo))
+
+# 初始化MongoDB字段统计数据
+bidding_count = 0
+connections_count = 0
+nzj_count = 0
+medical_count = 0
+bidding_fragment_data = {
+    "情报_法务": 0,
+    "情报_财务审计": 0,
+    "情报_招标代理": 0,
+    "情报_管理咨询": 0,
+    "情报_保险": 0,
+    "情报_工程设计咨询": 0,
+    "情报_安防": 0,
+    "情报_印务商机": 0,
+    "情报_环境采购": 0,
+    "情报_家具招投标": 0
+}
+
+# 统计MongoDB数据
+for doc in data_mongo:
+    if 'bidding' in doc:
+        bidding_count += doc['bidding'].get('count', 0)
+    if 'connections' in doc:
+        connections_count += doc['connections'].get('count', 0)
+    if 'nzj' in doc:
+        nzj_count += doc['nzj'].get('count', 0)
+    if 'medical' in doc :
+        medical_count += doc['medical'].get('count', 0)
+    if 'bidding_fragment' in doc:
+        for key, value in doc['bidding_fragment'].get('count', {}).items():
+            if key in bidding_fragment_data:
+                bidding_fragment_data[key] += value
+
+# ----------------- 第二个Sheet: 断流监控—es -------------------
+
+# 查询过去一周的数据(断流监控—es)
+pipeline_es = [
+    {
+        "$match": {
+            "$or": [
+                {"es_bidding.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"es_nzj.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"es_medical.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"es_bidding_fragment.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}}
+            ]
+        }
+    }
+]
+
+# 获取符合条件的数据
+data_es = list(collection.aggregate(pipeline_es))
+
+# 初始化ES字段统计数据
+es_bidding_count = 0
+es_nzj_count = 0
+es_medical_count = 0
+es_bidding_fragment_data = {
+    "情报_法务": 0,
+    "情报_财务审计": 0,
+    "情报_招标代理": 0,
+    "情报_管理咨询": 0,
+    "情报_保险": 0,
+    "情报_工程设计咨询": 0,
+    "情报_安防": 0,
+    "情报_印务商机": 0,
+    "情报_环境采购": 0,
+    "情报_家具招投标": 0
+}
+
+# 统计ES数据
+for doc in data_es:
+    if 'es_bidding' in doc:
+        es_bidding_count += doc['es_bidding'].get('count', 0)
+    if 'es_nzj' in doc:
+        es_nzj_count += doc['es_nzj'].get('count', 0)
+    if 'es_medical' in doc:
+        es_medical_count += doc['es_medical'].get('count', 0)
+    if 'es_bidding_fragment' in doc:
+        for key, value in doc['es_bidding_fragment'].get('count', {}).items():
+            if key in es_bidding_fragment_data:
+                es_bidding_fragment_data[key] += value
+
+# ----------------- 第三个Sheet: 数据时效监控 -------------------
+
+# 查询过去一周的数据(数据时效监控)
+pipeline_timeliness = [
+    {
+        "$match": {
+            "data_timeliness.timestamp": {
+                "$gte": start_timestamp,  # 使用整数Unix时间戳
+                "$lt": end_timestamp  # 使用整数Unix时间戳
+            }
+        }
+    },
+    {
+        "$limit": 5  # 限制查询返回的结果为前5条数据,便于调试
+    }
+]
+
+# 获取符合条件的数据
+data_timeliness = list(collection.aggregate(pipeline_timeliness))
+
+# 初始化字段统计数据
+timeliness_data = {
+    "[0,5)分钟": 0,
+    "[5,15)分钟": 0,
+    "[15,30)分钟": 0,
+    "[30,60)分钟": 0,
+    "[1,3)小时": 0,
+    "[3,7)小时": 0,
+    "[7,15)小时": 0,
+    "[15,24)小时": 0,
+    "[1,2)天": 0,
+    "[2,3)天": 0,
+    "3天+": 0
+}
+
+# 统计数据
+for doc in data_timeliness:
+    if 'data_timeliness' in doc:
+        count_data = doc['data_timeliness'].get('count', {})
+        timeliness_data["[0,5)分钟"] += float(count_data.get("a1", "0%").replace('%', ''))
+        timeliness_data["[5,15)分钟"] += float(count_data.get("a2", "0%").replace('%', ''))
+        timeliness_data["[15,30)分钟"] += float(count_data.get("a3", "0%").replace('%', ''))
+        timeliness_data["[30,60)分钟"] += float(count_data.get("a4", "0%").replace('%', ''))
+        timeliness_data["[1,3)小时"] += float(count_data.get("a5", "0%").replace('%', ''))
+        timeliness_data["[3,7)小时"] += float(count_data.get("a6", "0%").replace('%', ''))
+        timeliness_data["[7,15)小时"] += float(count_data.get("a7", "0%").replace('%', ''))
+        timeliness_data["[15,24)小时"] += float(count_data.get("a8", "0%").replace('%', ''))
+        timeliness_data["[1,2)天"] += float(count_data.get("a9", "0%").replace('%', ''))
+        timeliness_data["[2,3)天"] += float(count_data.get("a10", "0%").replace('%', ''))
+        timeliness_data["3天+"] += float(count_data.get("a11", "0%").replace('%', ''))
+
+# 获取当前时间的一周时间范围字符串
+date_range = f"{start_time.strftime('%Y/%m/%d')}-{end_time.strftime('%Y/%m/%d')}"
+
+# 构建Excel数据
+columns = ['日期', '标讯每周入库数据量', '人脉管理数据', '拟在建数据量(全国)','医械通'] + list(bidding_fragment_data.keys())
+data_row_mongo = [date_range, bidding_count, connections_count, nzj_count,medical_count] + list(bidding_fragment_data.values())
+
+columns_es = ['日期', '标讯每周入库数据量', '拟在建数据量(全国)','医械通'] + list(es_bidding_fragment_data.keys())
+data_row_es = [date_range, es_bidding_count,  es_nzj_count,es_medical_count] + list(es_bidding_fragment_data.values())
+
+columns_timeliness = ['日期'] + list(timeliness_data.keys())
+data_row_timeliness = [date_range] + list(timeliness_data.values())
+
+def insert_mysql():
+    # MySQL 连接
+    conn = pymysql.connect(host='172.20.45.129', port='4000', user='root', password='=PDT49#80Z!RVv52_z',database='quality')
+    cursor = conn.cursor()
+
+    # 插入数据入库监控表
+    sql_monitoring = """
+    INSERT INTO data_monitoring (
+        date_range, type, total_weekly_entries, renmaitong_data, planning_projects_data, medical_device_data,
+        legal_intelligence, financial_audit_intelligence, bidding_agency_intelligence, management_consulting_intelligence, insurance_intelligence,
+        engineering_consulting_intelligence, security_intelligence, printing_business_intelligence, environmental_procurement_intelligence, furniture_bidding_intelligence
+    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    """
+
+    data_monitoring = [
+        ('2025/03/20-2025/03/27', 'mongo', 5000, 1200, 2300, 1500, 300, 500, 600, 450, 200, 180, 220, 190, 210, 170),
+        ('2025/03/20-2025/03/27', 'es', 4800, 1100, 2200, 1400, 280, 480, 590, 430, 190, 170, 210, 180, 200, 160)
+    ]
+    cursor.executemany(sql_monitoring, data_monitoring)
+
+    # 插入数据时效监控表
+    sql_timeliness = """
+    INSERT INTO response_time_distribution (
+        date_range, range_0_5_min, range_5_15_min, range_15_30_min, range_30_60_min,
+        range_1_3_hour, range_3_7_hour, range_7_15_hour, range_15_24_hour,
+        range_1_2_day, range_2_3_day, range_3_plus_day
+    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    """
+
+    data_timeliness = [
+        ('2025/03/20-2025/03/27', 28.31, 15.42, 10.85, 8.34, 12.50, 6.75, 5.20, 3.90, 4.10, 2.45, 2.18)
+    ]
+    cursor.executemany(sql_timeliness, data_timeliness)
+
+    # 提交事务
+    conn.commit()
+    cursor.close()
+    conn.close()
+
+
+# 创建DataFrame并写入Excel
+excel_file = 'mongo_data_statistics_combined1.xlsx'
+
+with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
+    # 写入第一个sheet(断流监控_mongo库)
+    df_mongo = pd.DataFrame([data_row_mongo], columns=columns)
+    df_mongo.to_excel(writer, sheet_name='入库数据量监控-mongo(每周)', index=False)
+
+    # 写入第二个sheet(断流监控—es)
+    df_es = pd.DataFrame([data_row_es], columns=columns_es)
+    df_es.to_excel(writer, sheet_name='入库量数据量监控-es(每周)', index=False)
+
+    # 将timeliness_data中的值转换为百分比字符串
+    for key in timeliness_data:
+        timeliness_data[key] = f"{timeliness_data[key]:.2f}%"
+
+    # 构建数据行
+    data_row_timeliness = [date_range] + list(timeliness_data.values())
+
+    # 写入第三个sheet(数据时效监控)
+    df_timeliness = pd.DataFrame([data_row_timeliness], columns=columns_timeliness)
+    df_timeliness.to_excel(writer, sheet_name='数据时效监控(7天平均值)', index=False)
+
+print(f"统计结果已写入Excel文件: {excel_file}")

+ 0 - 0
tools/基于抽取表ai和规则对比/ai抽取和规则抽取对比.py


+ 0 - 0
tools/基于抽取表ai和规则对比/new.py


+ 0 - 0
tools/基于抽取表ai和规则对比/一致性对比.py


+ 0 - 0
tools/样本数据导出/ai_exchange_to_multipacket.py → tools/数据抽样/ai_exchange_to_multipacket.py


+ 0 - 0
tools/样本数据导出/fix_site_data_export.py → tools/数据抽样/fix_site_data_export.py


+ 42 - 10
tools/样本数据导出/sample_data_export.py → tools/数据抽样/sample_data_export.py

@@ -1,15 +1,41 @@
 from pymongo import MongoClient
-
+from urllib.parse import quote_plus  # 可选,若密码中有特殊字符
+
+# MongodbConfigSource = {
+#     "ip_port": "127.0.0.1:27088",
+#     "user": "viewdata",
+#     "password": "viewdata",
+#     "db": "qfw",
+#     "col": "zktest_0422_fenbao"
+# }
 def sample_data(N):
-    # 连接MongoDB数据库
-    db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
-    coll_user = db["customer_data"]
-
 
+    # 连接MongoDB数据库
+    db = MongoClient('172.20.45.129', 27002, unicode_decode_error_handler="ignore").data_quality
+    coll_user = db["zktest_0422_fenbao"]
+    # 构建连接字符串(含特殊字符建议用 quote_plus 编码)
+    # user = quote_plus(MongodbConfigSource['user'])
+    # password = quote_plus(MongodbConfigSource['password'])
+    # mongo_uri = f"mongodb://{user}:{password}@{MongodbConfigSource['ip_port']}/?authSource=admin"
+
+    # client = MongoClient(mongo_uri, unicode_decode_error_handler="ignore")
+    # # 获取数据库与集合
+    # db = client[MongodbConfigSource["db"]]
+    # coll_user = db[MongodbConfigSource["col"]]
+
+    # 统计符合筛选条件的总数据量
+    filter_condition = {
+         "$or": [
+        {"toptype": "招标"},
+        {"subtype": {"$in": ["中标", "成交", "合同", "验收"]}}
+    ]
+    }
+    count_all = coll_user.count_documents(filter_condition)
+    print("Filtered Document Count:", count_all)
     # 统计总的数据量
-    count_all = coll_user.estimated_document_count()
-    # count_all = coll_user.count_documents({"tag": 1})
-    print("Total Document Count:", count_all)
+    # count_all = coll_user.estimated_document_count()
+    # # count_all = coll_user.count_documents({"tag": 1})
+    # print("Total Document Count:", count_all)
 
     # 把符合条件的站点名称存起来
     site_list = {}
@@ -45,16 +71,22 @@ def sample_data(N):
 
         # 计算每次抽取的间隔
         jiange = int(site_list[key] / num)
+        query = {
+            "$or": [
+                {"toptype": "招标"},
+                {"subtype": {"$in": ["中标", "成交", "合同", "验收"]}}
+            ]
+        }
 
         # 从每个站点等间隔地取数据
         for i in range(num):
             if marked_count >= N:
                 break  # 再次检查是否已达到目标数量
 
-            for info in coll_user.find({"site": key}).sort("_id", 1).skip(i*jiange).limit(1):
+            for info in coll_user.find(query).sort("_id", -1).skip(i*jiange).limit(1):
                 print(f"Updating document with _id: {info['_id']}")
                 # 更新文档,设置标记
-                update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 2}})
+                update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 1}})
                 if update_result.modified_count == 0:
                     print("No document updated for _id:", info["_id"])
                 else:
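
Note that the rewritten loop now filters with `query` (the toptype/subtype condition only), while the interval `jiange` is still computed per site. If the sampling is meant to stay scoped to each site, a possible combined filter, reusing the names from the function above, would be the following; this is an assumption about intent, not what the diff does:

# Keep the per-site scope together with the category filter before interval sampling
query = {
    "site": key,
    "$or": [
        {"toptype": "招标"},
        {"subtype": {"$in": ["中标", "成交", "合同", "验收"]}}
    ]
}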

+ 58 - 0
tools/数据抽样/sample_data_export_new.py

@@ -0,0 +1,58 @@
+from pymongo import MongoClient
+def sample_data(N):
+    # db = MongoClient('172.20.45.129', 27002, unicode_decode_error_handler="ignore").data_quality
+    db = MongoClient('mongodb://127.0.0.1:27087/', unicode_decode_error_handler="ignore",directConnection=True).jyqyfw  # 清洗库
+
+    coll_user = db["usermail_Unicom_1_2"]
+
+    filter_condition = {
+        "$or": [
+            {"tag": 1},
+            {"tag": 2}
+        ]
+    }
+
+    # 获取所有站点及其文档数
+    site_list = {}
+    site_count = coll_user.aggregate([
+        {"$match": filter_condition},
+        {"$group": {"_id": "$site", "count": {"$sum": 1}}},
+        {"$sort": {"count": -1}}
+    ])
+    for item in site_count:
+        site_list[item["_id"]] = item["count"]
+
+    total_docs = sum(site_list.values())
+    remaining = N
+    marked_count = 0
+
+    for site, count in site_list.items():
+        if remaining <= 0:
+            break
+
+        # 计算该站点应分配的样本数
+        num = max(1, round(N * count / total_docs))
+        num = min(num, remaining)
+
+        print(f"Processing site: {site} - Allocating {num} samples")
+
+        # 使用随机抽样
+        pipeline = [
+            {"$match": {"site": site, **filter_condition}},
+            {"$sample": {"size": num}},
+            {"$project": {"_id": 1}}
+        ]
+
+        sampled_ids = [doc["_id"] for doc in coll_user.aggregate(pipeline)]
+        if not sampled_ids:
+            continue
+
+        update_result = coll_user.update_many(
+            {"_id": {"$in": sampled_ids}},
+            {"$set": {"mark": 1}}
+        )
+        marked_count += update_result.modified_count
+        remaining -= update_result.modified_count
+
+    print(f"Total marked documents: {marked_count}")
+sample_data(2000)
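
Unlike the fixed-interval export above, this version draws documents randomly within each site via `$sample` and allocates the quota proportionally to each site's share. A quick sanity check after a run, reusing the connection details from the script (an illustrative sketch, not part of the commit):

from pymongo import MongoClient

# Count how many documents ended up carrying the mark set by sample_data()
db = MongoClient('mongodb://127.0.0.1:27087/', unicode_decode_error_handler="ignore",
                 directConnection=True).jyqyfw
coll_user = db["usermail_Unicom_1_2"]
print("Documents currently marked:", coll_user.count_documents({"mark": 1}))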

+ 0 - 0
tools/数据抽样/抽样方法最新.py


BIN
tools/数据质量监控平台/kb-数据问题统计/KB问题统计汇总.xlsx


+ 0 - 0
tools/数据质量监控平台/kb-数据问题统计/execl_kb.py


+ 0 - 0
tools/数据质量监控平台/kb-数据问题统计/task_kb.py


+ 0 - 0
tools/数据质量监控平台/基于标准数据的字段分析结果.py


+ 0 - 0
tools/数据质量监控平台/标讯基础信息分析结果入库.py


+ 0 - 0
tools/从mongo库导出数据execl/output.xlsx → tools/标准样本数据入库/标准样本数据汇总.xlsx


+ 0 - 0
tools/标讯数据附件为空数量统计/统计.py


+ 0 - 0
tools/爬虫相关/1、bid_analysis表抽取数据到抽取表.py → tools/爬虫数据质量一期/1、bid_analysis表抽取数据到抽取表.py


+ 0 - 0
tools/爬虫相关/2、bid_analysis表错误原因及数量统计输出,存入抽取表.py → tools/爬虫数据质量一期/2、bid_analysis表错误原因及数量统计输出,存入抽取表.py


+ 0 - 0
tools/爬虫相关/3、抽取表完善爬虫字段.py → tools/爬虫数据质量一期/3、抽取表完善爬虫字段.py


+ 12 - 8
tools/爬虫相关/4、bid_extract计算爬虫增长率并发邮件提醒.py → tools/爬虫数据质量一期/4、bid_extract计算爬虫增长率并发邮件提醒.py

@@ -1,12 +1,12 @@
 from pymongo import MongoClient
 import smtplib
 from email.mime.text import MIMEText
-
+from datetime import datetime
 
 def send_email(subject, body, to_email):
-    sender_email = "your_email@example.com"
-    sender_password = "your_password"
-    smtp_server = "smtp.example.com"
+    sender_email = "liumm_6064@163.com"
+    sender_password = "TPVBPYSETVHWTIDH"
+    smtp_server = "smtp.163.com"
 
     msg = MIMEText(body, "plain", "utf-8")
     msg["Subject"] = subject
@@ -23,6 +23,8 @@ def send_email(subject, body, to_email):
     except Exception as e:
         print("邮件发送失败", e)
 
+def format_timestamp(batch_id):
+    return datetime.utcfromtimestamp(batch_id).strftime('%Y-%m-%d %H:%M:%S')
 
 def calculate_growth_rate(data):
     batch_updates = data.get("batch_updates", [])
@@ -38,17 +40,19 @@ def calculate_growth_rate(data):
         prev_total = batch_updates[i - 1]["update_info"].get("总数量", 0)
         latest_total = batch_updates[i]["update_info"].get("总数量", 0)
 
+        batch_updates[i]["batch_time"] = format_timestamp(batch_updates[i]["batch_id"])  # 添加时间转换字段
+
         if prev_total == 0:
             growth_rate = "N/A"
         else:
             growth_rate_value = ((latest_total - prev_total) / prev_total) * 100
             growth_rate = f"{growth_rate_value:.2f} %"
 
-            # 低于 20% 发送邮件提醒
-            if growth_rate_value < 20:
+            # 仅在最新批次的增长率低于 20% 发送邮件
+            if i == len(batch_updates) - 1 and growth_rate_value < 20:
                 subject = "数据增长率低于 20% 提醒"
-                body = f"批次 {batch_updates[i]['batch_id']} 的增长率仅为 {growth_rate},请关注!"
-                send_email(subject, body, "recipient@example.com")
+                body = f"最新批次 {batch_updates[i]['batch_id']} ({batch_updates[i]['batch_time']}) 的增长率仅为 {growth_rate},请关注!"
+                send_email(subject, body, "liumiaomiao@topnet.net.cn.com")
 
         batch_updates[i]["update_info"]["增长率"] = growth_rate
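
For clarity, a toy document in the shape `calculate_growth_rate` expects (keys taken from the code above, numbers invented): 总数量 grows from 100 to 110, i.e. +10.00 %, so under the rewritten condition only this latest batch would trigger the reminder e-mail.

# Toy input for calculate_growth_rate(); batch_ids are Unix timestamps one week apart
sample = {
    "batch_updates": [
        {"batch_id": 1743465600, "update_info": {"总数量": 100}},
        {"batch_id": 1744070400, "update_info": {"总数量": 110}},
    ]
}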
 

+ 0 - 0
tools/爬虫相关/5、根据抽取表生成分析表格.py → tools/爬虫数据质量一期/5、根据抽取表生成分析表格.py


+ 0 - 0
tools/爬虫相关/test.py → tools/爬虫数据质量一期/test.py


+ 0 - 0
tools/爬虫相关/爬虫代码输出.py → tools/爬虫数据质量一期/爬虫代码输出.py


+ 0 - 0
tools/爬虫相关/爬虫数据动态.xlsx → tools/爬虫数据质量一期/爬虫数据动态.xlsx


+ 0 - 0
tools/爬虫相关/爬虫数据动态1.xlsx → tools/爬虫数据质量一期/爬虫数据动态1.xlsx


+ 0 - 0
tools/爬虫数据质量二期/spider_quality.py


+ 0 - 0
tools/生成标准样本库的分析数据/test.py


+ 0 - 0
tools/生成标准样本库的分析数据/test2.py


+ 0 - 0
tools/生成标准样本库的分析数据/test3.py


+ 9 - 7
tools/mongo同步至mysql/Data_synchronization.py → tools/生成标准样本库的分析数据/根据样本数据拉取正式数据生成分析表mongo.py

@@ -14,7 +14,7 @@ MongodbConfigLocal = {
 
 # MySQL 配置信息
 mysql_config = {
-    "host": "192.168.3.217",
+    "host": "172.20.45.129",
     "user": "root",
     "password": "=PDT49#80Z!RVv52_z",
     "database": "quality",
@@ -32,8 +32,7 @@ field_mapping = {
     "projectcode": "projectcode_ai",
     "budget": "budget_ai",
     "s_winner": "s_winner_ai",
-    "bidamount": "bidamount_ai",
-    "multipackage": "multipackage_ai"
+    "bidamount": "bidamount_ai"
 }
 
 def main():
@@ -50,7 +49,7 @@ def main():
     ) as mysql_conn:
         with mysql_conn.cursor() as mysql_cursor:
             # 从 MySQL 中读取 _id 列表
-            mysql_cursor.execute("SELECT _id FROM bid_llizhikun")
+            mysql_cursor.execute("SELECT _id FROM sample_bid_analysis")
             ids = mysql_cursor.fetchall()
 
             for (_id,) in ids:
@@ -66,12 +65,15 @@ def main():
                 if not mongo_data:
                     continue
 
-                # 构造更新数据
-                update_fields = {field_mapping[key]: mongo_data.get(key, None) for key in field_mapping}
+                # 构造更新数据,若值为 None 或 "",则填充为 None
+                update_fields = {
+                    field_mapping[key]: None if not mongo_data.get(key) else mongo_data[key]
+                    for key in field_mapping
+                }
 
                 # 构造更新 SQL
                 update_sql = f"""
-                UPDATE bid_llizhikun
+                UPDATE sample_bid_analysis
                 SET {", ".join([f"{field} = %s" for field in update_fields.keys()])}
                 WHERE _id = %s
                 """

+ 87 - 0
tools/生成标准样本库的分析数据/根据样本数据拉取正式数据生成分析表mysql.py

@@ -0,0 +1,87 @@
+from pymongo import MongoClient
+from bson import ObjectId  # 导入 ObjectId
+import pymysql
+from lib.mogodb_helper import MongoDBInterface
+
+# MongoDB 配置信息
+MongodbConfigLocal = {
+    "ip_port": "127.0.0.1:27088",
+    "user": "viewdata",
+    "password": "viewdata",
+    "db": "qfw",
+    "col": "bidding"  # 替换为实际集合名称
+}
+
+# MySQL 配置信息
+mysql_config = {
+    "host": "172.20.45.129",
+    "user": "root",
+    "password": "=PDT49#80Z!RVv52_z",
+    "database": "quality",
+    "port": 4000
+}
+
+# 字段映射
+field_mapping = {
+    "toptype": "toptype_ai",
+    "subtype": "subtype_ai",
+    "area": "area_ai",
+    "city": "city_ai",
+    "buyer": "buyer_ai",
+    "projectname": "projectname_ai",
+    "projectcode": "projectcode_ai",
+    "budget": "budget_ai",
+    "s_winner": "s_winner_ai",
+    "bidamount": "bidamount_ai"
+}
+
+def main():
+    # 实例化 MongoDBInterface
+    mongo_db_interface = MongoDBInterface(MongodbConfigLocal)
+
+    # 使用 MySQL 的 with 语句管理连接
+    with pymysql.connect(
+            host=mysql_config["host"],
+            port=mysql_config["port"],
+            user=mysql_config["user"],
+            password=mysql_config["password"],
+            database=mysql_config["database"]
+    ) as mysql_conn:
+        with mysql_conn.cursor() as mysql_cursor:
+            # 从 MySQL 中读取 _id 列表
+            mysql_cursor.execute("SELECT _id FROM sample_bid_analysis")
+            ids = mysql_cursor.fetchall()
+
+            for (_id,) in ids:
+                # 将 _id 转换为 ObjectId 类型
+                try:
+                    object_id = ObjectId(_id)
+                except Exception as e:
+                    print(f"Invalid ObjectId: {_id}, skipping. Error: {e}")
+                    continue
+
+                # 使用 MongoDBInterface 的 find_by_id 方法从 MongoDB 查询数据
+                mongo_data = mongo_db_interface.find_by_id(MongodbConfigLocal["col"], object_id)
+                if not mongo_data:
+                    continue
+
+                # 构造更新数据,若值为 None 或 "",则填充为 None
+                update_fields = {
+                    field_mapping[key]: None if not mongo_data.get(key) else mongo_data[key]
+                    for key in field_mapping
+                }
+
+                # 构造更新 SQL
+                update_sql = f"""
+                UPDATE sample_bid_analysis
+                SET {", ".join([f"{field} = %s" for field in update_fields.keys()])}
+                WHERE _id = %s
+                """
+                update_values = list(update_fields.values()) + [_id]
+
+                # 执行更新操作
+                mysql_cursor.execute(update_sql, update_values)
+                mysql_conn.commit()
+
+if __name__ == "__main__":
+    main()

+ 0 - 0
tools/生成标准样本库的分析数据/生成统计结果.py


+ 289 - 0
tools/生成标准样本库的分析数据/生成统计结果_入库.py

@@ -0,0 +1,289 @@
+import pymysql
+import pymongo
+import pandas as pd
+from openpyxl import Workbook
+from openpyxl.styles import Font, Alignment
+
+# # MySQL 配置信息
+# MYSQL_CONFIG = {
+#     "host": "172.20.45.129",
+#     "user": "root",
+#     "password": "=PDT49#80Z!RVv52_z",
+#     "database": "quality",
+#     "port": 4000
+# }
+# # 连接 MySQL 并读取数据
+# def fetch_data():
+#     conn = pymysql.connect(**MYSQL_CONFIG)
+#     query = "SELECT * FROM sample_bid_analysis;"
+#     df = pd.read_sql(query, conn)
+#     conn.close()
+#     return df
+# MongoDB 连接配置
+MONGO_CONFIG = {
+    "host": "172.20.45.129",
+    "port": 27002,
+    "db": "data_quality",
+    "col": "standard_sample_data_new",
+}
+# MySQL 配置
+MYSQL_CONFIG = {
+    "host": "172.20.45.129",
+    "user": "root",
+    "password": "=PDT49#80Z!RVv52_z",
+    "database": "quality",
+    "port": 4000
+}
+
+# 连接 MongoDB 并读取数据
+def fetch_data():
+    client = pymongo.MongoClient(f"mongodb://{MONGO_CONFIG['host']}:{MONGO_CONFIG['port']}")
+    db = client[MONGO_CONFIG["db"]]
+    collection = db[MONGO_CONFIG["col"]]
+
+    # 读取数据并转换为 DataFrame
+    data = list(collection.find({}, {"_id": 0}))  # 去掉 `_id` 字段
+    df = pd.DataFrame(data)
+
+    client.close()
+    return df
+
+# 判断 projectname 是否互为包含关系
+def is_contained(str1, str2):
+    """ 判断 str1 和 str2 是否互相包含(非空值情况下) """
+    if pd.isna(str1) or pd.isna(str2):  # 如果有 NaN 值,直接返回 False
+        return False
+    return str1 in str2 or str2 in str1  # 互为包含
+
+# 计算统计数据
+def calculate_metrics_and_accuracy(df, category):
+    """ 计算表格所需数据 """
+    # 确定数据类别:中标类 or 招标类
+    if category == "中标类":
+        bid_types = ["成交", "单一", "废标", "合同", "结果变更", "流标", "验收", "中标", "其它"]
+        df = df[df["subtype"].isin(bid_types)]
+        fields = ["toptype", "subtype", "area", "city", "buyer", "projectname", "projectcode", "budget", "s_winner", "bidamount"]
+
+    else:  # 招标类
+        bid_types = ["成交", "单一", "废标", "合同", "结果变更", "流标", "验收", "中标", "其它", "拟建"]
+        df = df[~df["subtype"].isin(bid_types)]
+        fields = ["toptype", "subtype", "area", "city", "buyer", "projectname", "projectcode", "budget"]
+
+
+    results = []
+    # 统一将 None、<NA> 和空字符串都转为 pd.NA
+    df = df.replace({None: pd.NA, '': pd.NA})  # 替换 None 和空字符串为 pd.NA
+    df = df.fillna(pd.NA)  # 确保所有空值都转为 pd.NA
+    correct_rows = 0  # 整行正确的计数
+    total_count = len(df)  # 样本总量
+
+    for _, row in df.iterrows():
+        row_correct = True  # 假设整行正确
+
+        for field in fields:
+            original_value = row.get(field, pd.NA)
+            ai_value = row.get(f"{field}_ai", pd.NA)
+
+            if field == "projectname":  # 特殊处理 projectname
+                is_correct = is_contained(original_value, ai_value)
+            else:
+                # 这里避免 pd.NA 直接比较导致错误
+                if pd.isna(original_value) or pd.isna(ai_value):
+                    is_correct = pd.isna(original_value) and pd.isna(ai_value)  # 如果都为空,算正确
+                else:
+                    is_correct = original_value == ai_value  # 正常比较
+
+            if not is_correct:
+                row_correct = False  # 只要有一个字段错误,整行就是错误的
+
+        if row_correct:
+            correct_rows += 1  # 统计整行正确的数量
+
+    # 计算整行正确率
+    single_row_accuracy = correct_rows / total_count if total_count else 0
+
+    for field in fields:
+        total_count = len(df)  # 样本数据总量
+        null_count = df[field].isna().sum()  # 原文无值
+        valid_count = total_count - null_count  # 原文有值的数量
+
+        if field == "projectname":  # 特殊处理 projectname
+            extract_correct_count = df.apply(lambda row: is_contained(row["projectname"], row["projectname_ai"]),axis=1).sum()
+            extract_error_count = valid_count - extract_correct_count
+            extract_correct_no_null = extract_correct_count  # 互为包含的都算正确
+            extract_error_no_null = extract_error_count
+        else:  # 其他字段的正常处理逻辑
+            extract_error_count = ((df[field].isna() & df[f"{field}_ai"].notna()) |
+                                   (df[field].notna() & df[f"{field}_ai"].isna()) |
+                                   (df[field].notna() & df[f"{field}_ai"].notna() & (
+                                               df[field] != df[f"{field}_ai"]))).sum()
+
+            # 抽取错误的数量(含原文无)
+            extract_correct_count = total_count - extract_error_count  # 抽取正确的数量(含原文无)
+            extract_error_no_null = (df[field].notna() & (df[field] != df.get(f"{field}_ai", df[field]))).sum()  # 抽取错误的数量(不含原文无)
+            extract_correct_no_null = valid_count - extract_error_no_null  # 抽取有值且正确数量(不含原文无)
+
+        # 计算比率
+        recognition_rate = valid_count / total_count if total_count else 0  # 识别率
+        recognition_correct_rate = extract_correct_count / total_count if total_count else 0  # 识别正确率
+        correct_rate = extract_correct_no_null / valid_count if valid_count else 0  # 正确率(原文存在情况下)
+
+        results.append([
+            field, total_count, null_count, valid_count, extract_error_count,
+            extract_correct_count, extract_error_no_null, extract_correct_no_null,
+            f"{recognition_rate:.2%}", f"{recognition_correct_rate:.2%}", f"{correct_rate:.2%}"
+        ])
+        results.append({
+            "field_name": field,
+            "sample_total": total_count,
+            "original_null": null_count,
+            "original_exist": valid_count,
+            "extract_error_total": extract_error_count,
+            "extract_correct_total": extract_correct_count,
+            "extract_error_exist": extract_error_no_null,
+            "extract_correct_exist": extract_correct_no_null,
+            "recognition_rate": f"{recognition_rate:.2%}",
+            "correct_recognition_rate": f"{recognition_correct_rate:.2%}",
+            "accuracy_rate": f"{correct_rate:.2%}",
+            "data_type": category
+        })
+
+    columns = ["字段", "样本数据总量", "原文无值", "原文有值的数量", "抽取错误的数量(含原文无)",
+               "抽取正确的数量(含原文无)", "抽取错误的数量(不含原文无)",
+               "抽取有值且正确数量(不含原文无)", "识别率", "识别正确率", "正确率(原文存在情况下)"]
+    df_fields = pd.DataFrame(results, columns=columns)
+
+    # 整行统计数据
+    df_overall = pd.DataFrame([["数据总量", total_count],
+                               ["整行都正确的数量", correct_rows],
+                               ["单行正确率", f"{single_row_accuracy:.2%}"]],
+                              columns=["指标", "数值"])
+
+    # 构建整体统计
+    overall_data = {
+        "total_data_count": total_count,
+        "correct_rows_count": correct_rows,
+        "row_accuracy": f"{correct_rows / total_count:.2%}" if total_count else "0.00%",
+        "data_type": category
+    }
+    return df_fields,df_overall,overall_data
+
+
+# # 计算整体正确率
+# def calculate_overall_accuracy(df, fields):
+#     """ 计算整行正确的数量及单行正确率 """
+#     total_count = len(df)  # 样本总量
+#
+#     # 判断每行所有字段是否都正确(projectname 需使用互为包含逻辑)
+#     def is_row_correct(row):
+#         for field in fields:
+#             if pd.isna(row[field]) and pd.isna(row[f"{field}_ai"]):  # 如果原值和 AI 值都为空,算正确
+#                 continue
+#             if field == "projectname":
+#                 if not is_contained(row["projectname"], row["projectname_ai"]):  # projectname 互为包含
+#                     return False
+#             else:
+#                 if row[field] != row.get(f"{field}_ai", row[field]):  # 其他字段直接对比
+#                     return False
+#         return True
+#
+#     correct_rows = df.apply(is_row_correct, axis=1).sum()  # 统计整行正确的数量
+#     single_row_accuracy = correct_rows / total_count if total_count else 0  # 计算单行正确率
+#
+#     return pd.DataFrame([["数据总量", total_count],
+#                          ["整行都正确的数量", correct_rows],
+#                          ["单行正确率", f"{single_row_accuracy:.2%}"]],
+#                         columns=["指标", "数值"])
+
+# 导出 Excel
+def export_to_excel(df_bid_fields, df_bid_overall,df_tender_fields,df_tender_overall):
+    file_path = "数据分析结果.xlsx"
+    with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
+        df_bid_fields.to_excel(writer, sheet_name="字段统计-中标类", index=False)
+        df_bid_overall.to_excel(writer, sheet_name="整体正确率-中标类", index=False)
+        df_tender_fields.to_excel(writer, sheet_name="字段统计-招标类", index=False)
+        df_tender_overall.to_excel(writer, sheet_name="整体正确率-招标类", index=False)
+
+        # Excel 格式优化
+        workbook = writer.book
+        for sheet in workbook.sheetnames:
+            ws = workbook[sheet]
+            for col in ws.columns:
+                max_length = 0
+                col_letter = col[0].column_letter
+                for cell in col:
+                    try:
+                        if cell.value:
+                            max_length = max(max_length, len(str(cell.value)))
+                    except:
+                        pass
+                ws.column_dimensions[col_letter].width = max_length + 2  # 调整列宽
+
+            # 加粗第一行
+            for cell in ws[1]:
+                cell.font = Font(bold=True)
+                cell.alignment = Alignment(horizontal="center", vertical="center")
+
+    print(f"Excel 文件已保存:{file_path}")
+
+def save_to_database(df_fields, df_overall):
+    """保存到优化后的数据库结构"""
+    conn = pymysql.connect(**MYSQL_CONFIG)
+    cursor = conn.cursor()
+
+    try:
+        # 插入字段统计
+        for _, row in df_fields.iterrows():
+            sql = """
+            INSERT INTO sample_data_analysis (
+                field_name, sample_total, original_null, original_exist,
+                extract_error_total, extract_correct_total, extract_error_exist,
+                extract_correct_exist, recognition_rate, correct_recognition_rate,
+                accuracy_rate, data_type
+            ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+            """
+            cursor.execute(sql, (
+                row['field_name'], row['sample_total'], row['original_null'],
+                row['original_exist'], row['extract_error_total'],
+                row['extract_correct_total'], row['extract_error_exist'],
+                row['extract_correct_exist'], row['recognition_rate'],
+                row['correct_recognition_rate'], row['accuracy_rate'],
+                row['data_type']
+            ))
+
+        # 插入整体统计
+        for _, row in df_overall.iterrows():
+            sql = """
+            INSERT INTO data_quality_analysis 
+            (total_data_count, correct_rows_count, row_accuracy, data_type)
+            VALUES (%s,%s,%s,%s)
+            """
+            cursor.execute(sql, (
+                row['total_data_count'], row['correct_rows_count'],
+                row['row_accuracy'], row['data_type']
+            ))
+
+        conn.commit()
+        print(f"成功插入 {len(df_fields)} 条字段记录和 {len(df_overall)} 条整体记录")
+    except Exception as e:
+        conn.rollback()
+        print(f"数据库操作失败: {str(e)}")
+        raise  # 抛出异常以便调试
+    finally:
+        cursor.close()
+        conn.close()
+
+# 主函数
+def main():
+    df = fetch_data()
+    df_bid_fields, df_bid_overall = calculate_metrics_and_accuracy(df, "中标类")
+    df_tender_fields, df_tender_overall = calculate_metrics_and_accuracy(df, "招标类")
+    export_to_excel(df_bid_fields, df_bid_overall,df_tender_fields,df_tender_overall)
+    # 合并结果
+    all_fields = pd.concat([df_bid_fields, df_tender_fields])
+    all_overall = pd.concat([df_bid_overall, df_tender_overall])
+    # 存储数据
+    save_to_database(all_fields, all_overall)
+
+if __name__ == "__main__":
+    main()
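
As a small illustration of the matching rules used in `calculate_metrics_and_accuracy` above (projectname counts as correct when either value contains the other; every other field needs exact equality, with two empty values also counting as correct), a self-contained sketch:

import pandas as pd

def is_contained(str1, str2):
    # Same rule as in the script above: containment in either direction is a match
    if pd.isna(str1) or pd.isna(str2):
        return False
    return str1 in str2 or str2 in str1

print(is_contained("学校设备采购项目", "学校设备采购项目(二次)"))  # True: one contains the other
print(is_contained("学校设备采购项目", None))                      # False: a missing value never matches
# For the remaining fields the row check treats two empty values as correct,
# otherwise it compares the original value and the *_ai value for exact equality.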

BIN
tools/高质量站点第一版/111.xlsx


+ 51 - 0
tools/高质量站点第一版/_id.csv

@@ -0,0 +1,51 @@
+spidercode
+a_qgzbgggsssyq_qbgg
+a_zgzfcgw_zydwzfcgyxgk_gjjs_01
+sc_gzzwsjjcgxt_jjgg
+gd_gdszfcgw_syss_cggg
+jx_jxszfcgdzmc_ggdt_htgg
+a_zgzfcgw_zfcghtgg_new
+a_zgjcjtcgpt_fzbgg_cggg
+a_zgjcjtcgpt_fzbgg_jggg
+hn_hnszfcgdzmc_hnsbj_ggdt
+xj_xjwwezzqzfcgw_dzmcgg_cgcg
+a_zgzfcgw_zydwzfcgyxgk_gjjs_new_01
+a_zgzbtbggfwpt_zbgg2
+js_ntszfcgwssc_xjgg_xqgg
+gx_gxzzzzqzfcg_dzmchtgg
+zj_zjzfcgw_cggg_sylx
+a_oycg_gkcggg
+ah_ahzfcgypt_cjgg
+a_zgjcjtcgpt_fzbgg_bggg
+a_jdcgwxwz_cgdtzxcgxx
+js_ntszfcgwssc_xjgg_cjgg
+gd_gdswszjfwcs_cggg
+a_zgzbtbggfwpt_zhbjggs2
+ah_ahzfcgypt_htgg
+sd_zgsdzfcgw_xxgk_sxhtgk
+gd_gdszfcgw_syss_dzmc
+a_jsxmhjyxdjbbaxt_gg_nipc
+a_zgzbtbggfwpt_wasjgf_zbgg
+gz_gzszfcgdzmc_gzsbj_ggdt_01
+ah_ahzfcgypt_ysgg
+ha_hnstzxmzxspjgptbsdt_xmbljggs_njpc
+hb_hbzwfww_bacx_njpc
+a_zgzbycgw_zbxx_zbxx
+a_gtcgpt_cgjg
+a_zgjcjtcgpt_zbzq_zhbgg
+xj_xjwwezzqzfcgw_dzmcgg_wscs
+gd_gdswszjfwcs_zxgs
+sd_zgsdzfcgw_sxzhbgg_new
+a_syjtyxgs_zh
+jx_jxswszjfwcs_cggg
+a_gjggzyjypt_gcjs_kbjl
+nm_nmgzzqzfcgw_dzmc_htgs
+a_zgzbtbggfwpt_wasjgf_kbjl
+a_gtcgpt_cggg
+jx_jxszfcgdzmc_htgg
+js_jsstzxmzxspjgpt_gsxx_bazcx_njpc
+a_zgzbtbggfwpt_zhbhxrgs2
+a_bjgc_jggs
+a_zgzbycgw_zbxx_zb
+a_zgzfcgw_dfgg_new
+a_zgzfcgw_zydwzfcgyxgk_gjjs

+ 67 - 0
tools/高质量站点第一版/test.py

@@ -0,0 +1,67 @@
+import pandas as pd
+from collections import defaultdict
+
+
+def transform_spider_data(input_file, output_file):
+    # 读取原始数据
+    raw_df = pd.read_excel(input_file)
+
+    # 初始化结果字典
+    result = defaultdict(lambda: {
+        '采购意向': 0,
+        '预告': 0,
+        '招标': 0,
+        '结果': 0,
+        '信用': 0
+    })
+
+    # 定义列名映射(原始列名 -> 类型名)
+    column_mapping = {
+        '采购意向': '采购意向',
+        '预告': '预告',
+        '招标': '招标',
+        '结果': '结果',
+        '信用': '信用'
+    }
+
+    # 处理每一对spidercode-类型列
+    for i in range(0, len(raw_df.columns), 2):
+        if i + 1 >= len(raw_df.columns):
+            break
+
+        # 获取当前列对
+        spidercode_col = raw_df.columns[i]
+        count_col = raw_df.columns[i + 1]
+
+        # 获取类型名称
+        typ = column_mapping.get(count_col, count_col)
+
+        # 处理每一行数据
+        for _, row in raw_df.iterrows():
+            spidercode = row[spidercode_col]
+            count = row[count_col]
+
+            if pd.notna(spidercode) and pd.notna(count):
+                try:
+                    result[spidercode][typ] += int(count)
+                except (ValueError, TypeError):
+                    continue
+
+    # 转换为DataFrame
+    df = pd.DataFrame.from_dict(result, orient='index')
+    df.reset_index(inplace=True)
+    df.rename(columns={'index': 'spidercode'}, inplace=True)
+
+    # 计算总量
+    df['总量'] = df[['采购意向', '预告', '招标', '结果', '信用']].sum(axis=1)
+
+    # 重新排序列
+    df = df[['spidercode', '总量', '采购意向', '预告', '招标', '结果', '信用']]
+
+    # 保存结果
+    df.to_excel(output_file, index=False)
+    print(f"转换完成,结果已保存到{output_file}")
+
+
+# 使用示例
+transform_spider_data('111.xlsx', 'transformed_data.xlsx')

+ 37 - 0
tools/高质量站点第一版/根据id找出爬虫代码.py

@@ -0,0 +1,37 @@
+import csv
+from pymongo import MongoClient
+
+
+def mark_spidercodes_in_mongo(csv_file_path, mongo_uri, db_name, collection_name):
+    # 连接到MongoDB
+    client = MongoClient(mongo_uri)
+    db = client[db_name]
+    collection = db[collection_name]
+
+    # 读取 CSV 文件中的 spidercodes
+    with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
+        csv_reader = csv.reader(csv_file)
+        next(csv_reader)  # 跳过标题行
+        spidercodes = [row[0] for row in csv_reader]  # 假设 spidercode 是第一列
+
+    # 去除重复的 spidercodes
+    unique_spidercodes = list(set(spidercodes))
+
+    # 批量更新符合条件的文档,设置 flag=1
+    result = collection.update_many(
+        {"spidercode": {"$in": unique_spidercodes}},
+        {"$set": {"tag": 1}}
+    )
+
+    print(f"成功更新了 {result.modified_count} 条文档")
+
+
+# 使用示例
+if __name__ == "__main__":
+    # 配置参数
+    csv_file_path = 'spidercodes.csv'  # 替换为你的CSV文件路径
+    mongo_uri = 'mongodb://172.20.45.129:27002/'  # MongoDB连接字符串
+    db_name = 'data_quality'  # 数据库名称
+    collection_name = 'bidding_202505'  # 集合名称
+
+    mark_spidercodes_in_mongo(csv_file_path, mongo_uri, db_name, collection_name)

+ 43 - 0
tools/高质量站点第一版/统计标讯数量.py

@@ -0,0 +1,43 @@
+import csv
+from pymongo import MongoClient
+from collections import defaultdict
+def count_spidercodes_in_mongo(csv_file_path, mongo_uri, db_name, collection_name):
+    # 连接到MongoDB
+    client = MongoClient(mongo_uri)
+    db = client[db_name]
+    collection = db[collection_name]
+
+    # 读取 CSV 文件(严格按行顺序)
+    with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
+        csv_reader = csv.reader(csv_file)
+        next(csv_reader)  # 跳过标题行
+        spidercodes = [row[0] for row in csv_reader]  # 假设 spidercode 是第一列
+
+        # 一次性查询所有符合条件的数据(避免多次查询)
+        # query = {"toptype": "招标", "spidercode": {"$in": list(set(spidercodes))}}
+        # 查询条件:subtype 是 "中标" 或 "成交"
+        query = {
+            "subtype": {"$in": ["中标", "成交"]},
+            "spidercode": {"$in": list(set(spidercodes))}
+        }
+        cursor = collection.find(query, {"spidercode": 1})
+
+        # 统计每个 spidercode 的数量
+        code_counts = {}
+        for doc in cursor:
+            code = doc["spidercode"]
+            code_counts[code] = code_counts.get(code, 0) + 1
+
+        # 按 CSV 顺序输出
+        for code in spidercodes:
+            print(f"{code}: {code_counts.get(code, 0)}")
+
+# 使用示例
+if __name__ == "__main__":
+    # 配置参数
+    csv_file_path = 'spidercodes.csv'  # 替换为你的CSV文件路径
+    mongo_uri = 'mongodb://172.20.45.129:27002/'  # MongoDB连接字符串
+    db_name = 'data_quality'  # 数据库名称
+    collection_name = 'result_new'  # 集合名称
+
+    count_spidercodes_in_mongo(csv_file_path, mongo_uri, db_name, collection_name)
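
The script above pulls every matching document and tallies the counts in Python; the same per-spidercode totals could also be computed server-side with a $group stage. A sketch of that alternative, using the names from the function above (not what the commit does):

# Group counts per spidercode inside MongoDB instead of iterating the cursor
pipeline = [
    {"$match": {"subtype": {"$in": ["中标", "成交"]},
                "spidercode": {"$in": list(set(spidercodes))}}},
    {"$group": {"_id": "$spidercode", "count": {"$sum": 1}}},
]
code_counts = {row["_id"]: row["count"] for row in collection.aggregate(pipeline)}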

+ 95 - 0
tools/高质量站点第一版/高质量站点-正文规则角度.py

@@ -0,0 +1,95 @@
+import pymongo
+from openpyxl import Workbook
+from openpyxl.styles import Font
+import datetime
+
+# MongoDB连接配置
+client = pymongo.MongoClient("mongodb://172.20.45.129:27002/")
+db = client["data_quality"]  # 替换为你的数据库名
+collection = db["final_results"]  # 替换为你的集合名
+
+# 自定义字段组合 - 这些字段的_flag值需要都为1
+required_fields = [
+    "area_flag",
+    # "multipackage_flag",
+    "projectname_flag",
+    # "projectcode_flag",
+    # "budget_flag",
+    # "s_winner_flag",
+    "buyer_flag",
+    "city_flag",
+    # "toptype_flag",
+    # "subtype_flag",
+    # "bidamount_flag"
+]
+
+# 查询条件:所有required_fields的_flag值都为1
+query = {field: 1 for field in required_fields}
+query["toptype"] = "采购意向"
+
+# query = {
+#     "$and": [
+#         {"$or": [{"subtype": "中标"}, {"subtype": "成交"}]},
+#         {field: 1 for field in required_fields}  # 直接放字典,不要用 **
+#     ]
+# }
+# 要导出的字段
+export_fields = ["site", "channel", "spidercode"]
+
+# 获取数据并按spidercode分组
+results = collection.find(query)
+grouped_data = {}
+
+for doc in results:
+    spidercode = doc.get("spidercode", "unknown")
+    if spidercode not in grouped_data:
+        # 确保存储的是字典,而不是字符串
+        grouped_data[spidercode] = {
+            "site": doc.get("site", ""),
+            "channel": doc.get("channel", ""),
+            "spidercode": spidercode
+        }
+
+
+# 创建Excel工作簿
+wb = Workbook()
+ws = wb.active
+ws.title = "Export Results"
+
+# 写入表头
+headers = ["序号"] + export_fields
+ws.append(headers)
+
+# 设置表头样式
+for cell in ws[1]:
+    cell.font = Font(bold=True)
+
+# 写入数据
+row_num = 2
+for idx, (spidercode, record) in enumerate(grouped_data.items(), start=1):
+    ws.append([
+        idx,  # 序号
+        record["site"],  # 直接访问字典,而不是用 .get()
+        record["channel"],
+        record["spidercode"]
+    ])
+    row_num += 1
+
+# 自动调整列宽
+for column in ws.columns:
+    max_length = 0
+    column_letter = column[0].column_letter
+    for cell in column:
+        try:
+            if len(str(cell.value)) > max_length:
+                max_length = len(str(cell.value))
+        except:
+            pass
+    adjusted_width = (max_length + 2)
+    ws.column_dimensions[column_letter].width = adjusted_width
+
+# 保存Excel文件
+filename = f"export_result_采购意向新.xlsx"
+wb.save(filename)
+
+print(f"导出完成,文件已保存为: {filename}")

+ 87 - 0
tools/高质量站点第一版/高质量站点-脚本1.py

@@ -0,0 +1,87 @@
+from bson import ObjectId
+from pymongo import MongoClient
+
+
+def process_tagged_documents():
+    # 直接在URI中包含用户名和密码
+    username = "viewdata"
+    password = "viewdata"
+    host = "127.0.0.1"  # 例如: localhost 或 192.168.1.100
+    port = "27088"  # 默认MongoDB端口
+
+    # 构建连接URI
+    mongo_uri = f"mongodb://{username}:{password}@{host}:{port}/"
+
+
+    # 连接MongoDB
+    client1 = MongoClient('mongodb://127.0.0.1:27087/',unicode_decode_error_handler="ignore", directConnection=True)  #清洗库
+    client2 = MongoClient(mongo_uri,unicode_decode_error_handler="ignore", directConnection=True)  #bidding库
+    client3 = MongoClient('mongodb://172.20.45.129:27002/')  #测试库
+
+    # 定义数据库和集合
+    db1 = client1['jyqykhfw']  # 替换为实际的数据库1名称
+    db2 = client2['qfw']  # 替换为实际的数据库2名称
+    db3 = client3['data_quality']  # 替换为实际的数据库3名称
+
+    collection1 = db1['f_sourceinfo_chinaunicom_zhong']  # 替换为实际的集合1名称
+    collection2 = db2['bidding']  # 替换为实际的bidding集合名称
+    collection3 = db3['result']  # 替换为实际的结果集合名称
+
+    # 定义要检测的字段列表(可自定义)
+    fields_to_check = ['projectname','projectcode','area','city','budget','bidamount', 's_winner', 'buyer']
+
+
+    # 组合查询条件(增加_id条件)
+    combined_query = {
+        '$and': [
+            {'i_ckdata': {'$gt': 1}},
+            # {'_id': {'$lt': ObjectId("67a5a0563309c0998b14b361")}},
+            *[{f'v_taginfo.{field}': 1} for field in fields_to_check]
+        ]
+    }
+    # 按照_id升序排序(1表示升序,-1表示降序)
+    sort_order = [('_id', -1)]
+    # 遍历符合条件的文档
+    for doc in collection1.find(combined_query).sort(sort_order):
+        doc_id = ObjectId(doc['id'])
+
+        # 在库2的bidding集合中查找该id
+        bidding_doc = collection2.find_one({'_id': doc_id})
+        if bidding_doc:
+            site = bidding_doc.get('site')
+            channel = bidding_doc.get('channel')
+            spidercode = bidding_doc.get('spidercode')
+            current_flag = 2
+            # 检查库3中是否已存在相同的site和channel,spidercode组合
+            existing_record = collection3.find_one({
+                'site': site,
+                'channel': channel,
+                'spidercode': spidercode,
+                'flag':current_flag
+            })
+            if existing_record:
+                print(f"记录已存在,跳过: _id={doc_id},site={site}, channel={channel}")
+                continue
+            # 提取需要的字段
+            result = {
+                '_id': doc_id,
+                'site': bidding_doc.get('site'),
+                'channel': bidding_doc.get('channel'),
+                'spidercode': bidding_doc.get('spidercode'),
+                'toptype': bidding_doc.get('toptype'),
+                'subtype': bidding_doc.get('subtype'),
+                'flag': current_flag  # 可以根据需要设置不同的flag值
+            }
+            # 存入库3
+            collection3.update_one(
+                {'_id': doc_id},
+                {'$set': result},
+                upsert=True
+            )
+            print(f"已存入新记录: _id={doc_id}, site={site}, channel={channel}")
+        else:
+            print(f"bidding记录不存在,跳过: id={doc_id}")
+
+
+if __name__ == '__main__':
+    process_tagged_documents()

+ 97 - 0
tools/高质量站点第一版/高质量站点-脚本2.py

@@ -0,0 +1,97 @@
+from bson import ObjectId
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import BulkWriteError
+import time
+
+
+def process_tagged_documents_batch(batch_size=100):
+    # 连接MongoDB
+    client1 = MongoClient('mongodb://127.0.0.1:27087/',
+                          unicode_decode_error_handler="ignore",
+                          directConnection=True)  # 清洗库
+    client3 = MongoClient('mongodb://172.20.45.129:27002/')  # 测试库
+
+    # 定义数据库和集合
+    db1 = client1['jyqykhfw']
+    db3 = client3['data_quality']
+
+    collection1 = db1['f_sourceinfo_chinaunicom_zhong']
+    collection3 = db3['result_new']
+
+    # 查询库1中i_ckdata>1的文档
+    base_query = {'i_ckdata': {'$gt': 1}}
+    # 按照_id升序排序(1表示升序,-1表示降序)
+    sort_order = [('_id', 1)]
+
+    # 批量处理变量
+    operations = []
+    processed_count = 0
+    start_time = time.time()
+
+    try:
+        # 使用批量处理方式
+        for doc in collection1.find(base_query).sort(sort_order).batch_size(batch_size):
+            try:
+                _id=doc['_id']
+                doc_id = ObjectId(doc['id'])
+
+                v_taginfo = doc.get('v_taginfo', {})
+                v_baseinfo = doc.get('v_baseinfo', {})
+                site = v_baseinfo.get('site', '')
+                channel = v_baseinfo.get('channel', '')
+                spidercode = v_baseinfo.get('spidercode', '')
+                toptype = v_baseinfo.get('toptype', '')
+                subtype = v_baseinfo.get('subtype', '')
+
+                result = {
+                    'v_taginfo': v_taginfo,
+                    'site': site,
+                    'channel': channel,
+                    'spidercode': spidercode,
+                    'toptype' : toptype,
+                    'subtype' : subtype
+                }
+                # 输出当前正在处理的ID
+                print(f"正在处理文档 _id:{_id} , id: {doc_id}")
+
+                # 使用UpdateOne构建正确的批量操作
+                operations.append(
+                    UpdateOne(
+                        {'_id': doc_id},
+                        {'$set': result},
+                        upsert=True
+                    )
+                )
+
+                # 当达到批量大小时执行批量操作
+                if len(operations) >= batch_size:
+                    collection3.bulk_write(operations, ordered=False)
+                    processed_count += len(operations)
+                    print(f"已批量处理 {processed_count} 条记录")
+                    operations = []
+
+            except Exception as e:
+                print(f"处理文档 {doc.get('_id')} 时出错: {str(e)}")
+                continue
+
+        # 处理剩余的不足一个批次的操作
+        if operations:
+            collection3.bulk_write(operations, ordered=False)
+            processed_count += len(operations)
+            print(f"最后一批处理了 {len(operations)} 条记录")
+
+    except BulkWriteError as bwe:
+        print(f"批量写入时发生错误: {bwe.details}")
+    except Exception as e:
+        print(f"处理过程中发生错误: {str(e)}")
+    finally:
+        end_time = time.time()
+        print(f"处理完成,共处理 {processed_count} 条记录")
+        print(f"总耗时: {end_time - start_time:.2f} 秒")
+        client1.close()
+        client3.close()
+
+
+if __name__ == '__main__':
+    # 可以调整batch_size参数以获得最佳性能
+    process_tagged_documents_batch(batch_size=500)
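If a long batch run is interrupted, 脚本2 restarts from the beginning; because documents are read in ascending _id order, the scan can instead resume after the last _id that was written. A sketch of the idea only, assuming the same collections as the script (the checkpoint value below is a placeholder, not a real id):

    from bson import ObjectId

    # Resume an _id-ordered scan after the last processed document (hypothetical checkpoint)
    last_seen = ObjectId("000000000000000000000000")
    resume_query = {'i_ckdata': {'$gt': 1}, '_id': {'$gt': last_seen}}
    # collection1.find(resume_query).sort([('_id', 1)]) then continues exactly as in the script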

+ 56 - 0
tools/高质量站点第一版/高质量站点-脚本3.py

@@ -0,0 +1,56 @@
+from bson import ObjectId
+from pymongo import MongoClient
+
+
+def process_tagged_documents():
+
+    # 连接MongoDB
+    client1 = MongoClient('mongodb://127.0.0.1:27087/',unicode_decode_error_handler="ignore", directConnection=True)  #清洗库
+    client3 = MongoClient('mongodb://172.20.45.129:27002/')  #测试库
+
+    # 定义数据库和集合
+    db1 = client1['jyqykhfw']  # 替换为实际的数据库1名称
+    db3 = client3['data_quality']  # 替换为实际的数据库3名称
+
+    collection1 = db1['f_sourceinfo_2025Jslt_yys']  # 替换为实际的集合1名称
+    collection3 = db3['result_new']  # 替换为实际的结果集合名称
+
+    # 查询库1中i_ckdata>1的文档
+    base_query = {
+        '$and': [
+            {'i_ckdata': {'$gt': 1}},
+        ]
+    }
+    # 按照_id升序排序(1表示升序,-1表示降序)
+    sort_order = [('_id', 1)]
+    # 遍历符合条件的文档
+    for doc in collection1.find(base_query).sort(sort_order):
+        _id=doc['_id']
+        doc_id = ObjectId(doc['id'])
+
+        # 检查v_taginfo中的字段
+        v_taginfo = doc.get('v_taginfo', {})
+        v_baseinfo =doc.get('v_baseinfo',{})
+        site = v_baseinfo.get('site','')
+        channel = v_baseinfo.get('channel','')
+        spidercode = v_baseinfo.get('spidercode','')
+
+        result = {
+            'v_taginfo' :v_taginfo,
+            'site':site,
+            'channel':channel,
+            'spidercode':spidercode
+
+        }
+        # 存入库3
+        collection3.update_one(
+            {'_id': doc_id},
+            {'$set': result},
+            upsert=True
+        )
+        print(f"已存入新记录: _id={_id},id={doc_id}")
+
+
+
+if __name__ == '__main__':
+    process_tagged_documents()

+ 77 - 0
tools/高质量站点第一版/高质量站点-脚本41.py

@@ -0,0 +1,77 @@
+from bson import ObjectId
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import BulkWriteError
+
+
+def process_tagged_documents_batch():
+    # 连接MongoDB
+    client = MongoClient('mongodb://172.20.45.129:27002/')  # 测试库
+    db = client['data_quality']
+    collection = db['result_new']
+
+    # 定义要检测的字段列表(可自定义)
+    # fields_to_check = ['projectname', 'area', 'city', 'budget', 'buyer']
+    # fields_to_check = ['projectname', 'projectcode','area', 'city', 'buyer']
+    # fields_to_check = ['projectname', 'projectcode','area', 'city','budget', 'buyer','toptype','subtype']
+    fields_to_check = ['projectname', 'projectcode', 'area', 'city', 'bidamount', 's_winner', 'toptype', 'subtype']
+    sort_order = [('_id', 1)]
+
+    # 构建查询条件:subtype为"合同"或"验收"且所有指定字段在v_taginfo中都为1
+    query = {
+        "$and": [
+            {"$or": [{"subtype": "合同"}, {"subtype": "验收"}]},
+            *[{f"v_taginfo.{field}": 1} for field in fields_to_check]
+        ]
+    }
+
+    # 批量处理参数
+    batch_size = 500  # 每批处理500个文档
+    operations = []
+    processed_count = 0
+
+    try:
+        # 查询符合条件的文档
+        cursor = collection.find(query).sort(sort_order).batch_size(batch_size)
+
+        for doc in cursor:
+            try:
+                doc_id = doc['_id']
+                # 输出当前正在处理的ID
+                print(f"正在处理文档 _id: {doc_id}")
+                # 准备批量操作
+                operations.append(
+                    UpdateOne(
+                        {'_id': doc_id},
+                        {'$set': {'flag5': 1}},
+                        upsert=True
+                    )
+                )
+
+                # 达到批量大小时执行
+                if len(operations) >= batch_size:
+                    collection.bulk_write(operations, ordered=False)
+                    processed_count += len(operations)
+                    print(f"已批量处理 {processed_count} 条记录")
+                    operations = []
+
+            except Exception as e:
+                print(f"处理文档 {doc.get('_id')} 时出错: {str(e)}")
+                continue
+
+        # 处理剩余的不足一个批次的操作
+        if operations:
+            collection.bulk_write(operations, ordered=False)
+            processed_count += len(operations)
+            print(f"最后一批处理了 {len(operations)} 条记录")
+
+    except BulkWriteError as bwe:
+        print(f"批量写入时发生错误: {bwe.details}")
+    except Exception as e:
+        print(f"处理过程中发生错误: {str(e)}")
+    finally:
+        client.close()
+        print(f"处理完成,共处理 {processed_count} 条记录")
+
+
+if __name__ == '__main__':
+    process_tagged_documents_batch()
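After the tagging pass it is worth confirming how many documents actually carry the new flag5 mark; count_documents gives a quick check. A minimal sketch against the same result_new collection used above:

    from pymongo import MongoClient

    coll = MongoClient('mongodb://172.20.45.129:27002/')['data_quality']['result_new']
    # How many 合同/验收 documents were tagged with flag5=1 by 脚本41
    print(coll.count_documents({'flag5': 1, 'subtype': {'$in': ['合同', '验收']}}))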

+ 89 - 0
tools/高质量站点第一版/高质量站点-脚本51.py

@@ -0,0 +1,89 @@
+from bson import ObjectId
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import BulkWriteError
+
+
+def process_tagged_documents_batch():
+    # 连接MongoDB
+    client = MongoClient('mongodb://172.20.45.129:27002/')  # 测试库
+    db = client['data_quality']
+    collection = db['result_new']
+
+    # 定义要检测的字段列表(可自定义)
+    # fields_to_check = ['projectname', 'area', 'city', 'budget', 'buyer']
+    # fields_to_check = ['projectname', 'projectcode', 'area', 'city', 'buyer']
+    # fields_to_check = ['projectname', 'projectcode', 'area', 'city', 'budget', 'buyer', 'toptype', 'subtype']
+    fields_to_check = ['projectname', 'projectcode', 'area', 'city', 'bidamount', 's_winner', 'toptype', 'subtype']
+
+    sort_order = [('_id', 1)]
+
+    # 批量处理参数
+    batch_size = 500  # 每批处理500个文档
+    operations = []
+    processed_count = 0
+    flagged_count = 0  # 统计被打标记的文档数量
+
+    query = {
+        "subtype": {"$in": ["合同", "验收"]}
+    }
+
+    try:
+        # 查询所有文档(或者可以根据需要添加其他查询条件)
+        cursor = collection.find(query).sort(sort_order).batch_size(batch_size)
+
+        for doc in cursor:
+            try:
+                doc_id = doc['_id']
+                # 检查是否有字段值为2
+                has_invalid_field = False  # 初始化标记变量
+
+                # 详细检查每个字段是否有值为2的情况
+                for field in fields_to_check:
+                    # 获取嵌套字段v_taginfo下的字段值,默认为0
+                    field_value = doc.get('v_taginfo', {}).get(field, 0)
+                    if field_value == 2:
+                        has_invalid_field = True
+                        break  # 发现一个无效字段就停止检查
+
+                # 仅当有字段值为2时才设置err5=1
+                if has_invalid_field:
+                    # 准备批量操作
+                    operations.append(
+                        UpdateOne(
+                            {'_id': doc_id},
+                            {'$set': {'err5': 1}}
+                            # 注意:这里移除了upsert=True,因为我们只更新已存在的文档
+                        )
+                    )
+                    flagged_count += 1
+                    print(f"标记文档 _id: {doc_id} (检测到字段值为2)")
+
+                # 达到批量大小时执行
+                if len(operations) >= batch_size:
+                    if operations:  # 确保操作列表不为空
+                        collection.bulk_write(operations, ordered=False)
+                        processed_count += len(operations)
+                        print(f"已批量处理 {processed_count} 条记录 (其中 {flagged_count} 条被标记)")
+                        operations = []
+
+            except Exception as e:
+                print(f"处理文档 {doc.get('_id')} 时出错: {str(e)}")
+                continue
+
+        # 处理剩余的不足一个批次的操作
+        if operations:
+            collection.bulk_write(operations, ordered=False)
+            processed_count += len(operations)
+            print(f"最后一批处理了 {len(operations)} 条记录")
+
+    except BulkWriteError as bwe:
+        print(f"批量写入时发生错误: {bwe.details}")
+    except Exception as e:
+        print(f"处理过程中发生错误: {str(e)}")
+    finally:
+        client.close()
+        print(f"处理完成,共处理 {processed_count} 条记录,其中 {flagged_count} 条被标记为flag=1")
+
+
+if __name__ == '__main__':
+    process_tagged_documents_batch()
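The field-by-field check in 脚本51 happens client-side after every document is pulled over the wire; the same condition can be pushed into the query with $or so MongoDB only returns documents that actually contain a 2, and update_many can then set the error mark in one call. A sketch of that alternative, assuming the same fields_to_check list and collection as the script:

    fields_to_check = ['projectname', 'projectcode', 'area', 'city',
                       'bidamount', 's_winner', 'toptype', 'subtype']
    query = {
        'subtype': {'$in': ['合同', '验收']},
        '$or': [{f'v_taginfo.{f}': 2} for f in fields_to_check],
    }
    # collection.update_many(query, {'$set': {'err5': 1}}) marks all matches in one round trip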

+ 98 - 0
tools/高质量站点第一版/高质量站点-脚本61.py

@@ -0,0 +1,98 @@
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import BulkWriteError
+
+
+def process_filtered_by_spidercode():
+    # 连接配置
+    mongo_uri = "mongodb://viewdata:viewdata@127.0.0.1:27088/"
+
+    # 连接MongoDB
+    client2 = MongoClient(mongo_uri, unicode_decode_error_handler="ignore", directConnection=True)  # bidding库
+    client3 = MongoClient('mongodb://172.20.45.129:27002/')  # 测试库
+
+    # 定义数据库和集合
+    db2 = client2['qfw']
+    db3 = client3['data_quality']
+    collection2 = db2['bidding']
+    collection3 = db3['result_new']
+    collection4 = db3['result2']
+
+    # 批量处理参数
+    batch_size = 500
+    operations = []
+    processed_spidercodes = set()  # 内存去重集合
+
+    try:
+        # 查询条件
+        query = {
+            'err1': 1
+        }
+
+        # 获取符合条件的文档ID(只获取_id字段提高性能)
+        filtered_ids = [doc['_id'] for doc in collection3.find(query, {'_id': 1})]
+        print(f"找到 {len(filtered_ids)} 条符合条件的记录")
+
+        # 批量查询bidding集合获取爬虫代码和其他信息
+        bidding_docs = collection2.find(
+            {'_id': {'$in': filtered_ids}},
+            {'site': 1, 'channel': 1, 'spidercode': 1, '_id': 1}
+        )
+
+        # 处理每个文档
+        for doc in bidding_docs:
+            try:
+                spidercode = (doc.get('spidercode') or '').strip()  # 字段值为None时也能安全strip
+
+                # 检查爬虫代码有效性
+                if not spidercode:
+                    print(f"无效爬虫代码,跳过文档: _id={doc.get('_id')}")
+                    continue
+
+                # 内存去重检查
+                if spidercode in processed_spidercodes:
+                    continue
+
+                processed_spidercodes.add(spidercode)
+
+                # 准备批量操作(以spidercode为唯一键)
+                operations.append(
+                    UpdateOne(
+                        {'spidercode': spidercode},  # 去重依据
+                        {
+                            '$set': {
+                                'site': doc.get('site'),
+                                'channel': doc.get('channel'),
+                                'spidercode': spidercode,
+                                'err1':1
+                            }
+                        },
+                        upsert=True
+                    )
+                )
+
+                # 执行批量操作
+                if len(operations) >= batch_size:
+                    collection4.bulk_write(operations, ordered=False)
+                    print(f"已处理 {len(processed_spidercodes)} 个唯一爬虫代码 | 当前批次: {len(operations)} 条")
+                    operations = []
+
+            except Exception as e:
+                print(f"处理文档 {doc.get('_id')} 时出错: {str(e)}")
+                continue
+
+        # 处理剩余操作
+        if operations:
+            collection4.bulk_write(operations, ordered=False)
+
+    except BulkWriteError as bwe:
+        print(f"批量写入错误: {bwe.details}")
+    except Exception as e:
+        print(f"处理错误: {str(e)}")
+    finally:
+        client2.close()
+        client3.close()
+        print(f"处理完成: 共存储 {len(processed_spidercodes)} 个唯一爬虫代码")
+
+
+if __name__ == '__main__':
+    process_filtered_by_spidercode()
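If err1=1 matches a very large number of ids, the single $in query against bidding can become unwieldy; slicing filtered_ids keeps each query bounded. A minimal sketch of the chunking, assuming filtered_ids and collection2 as built in the script (the helper name is illustrative):

    def chunks(ids, size=1000):
        # Yield consecutive slices of the id list so each $in query stays bounded
        for i in range(0, len(ids), size):
            yield ids[i:i + size]

    # for batch in chunks(filtered_ids):
    #     for doc in collection2.find({'_id': {'$in': batch}},
    #                                 {'site': 1, 'channel': 1, 'spidercode': 1}):
    #         ...  # same per-document handling as above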

+ 80 - 0
tools/高质量站点第一版/高质量站点-脚本7.py

@@ -0,0 +1,80 @@
+from pymongo import MongoClient
+import pandas as pd
+
+
+def export_unique_flag1_spidercodes():
+    # 连接MongoDB
+    client = MongoClient('mongodb://172.20.45.129:27002/')
+    db = client['data_quality']
+    collection = db['result_new']
+
+    try:
+        print("正在获取spidercode列表...")
+
+        # 1. 获取flag5=1的所有spidercode
+        flag5_spidercodes = set(collection.distinct(
+            "spidercode",
+            {"flag5": 1}
+        ))
+
+        # 2. 获取err5=1的所有spidercode
+        err5_spidercodes = set(collection.distinct(
+            "spidercode",
+            {"err5": 1}
+        ))
+
+        print(f"flag5=1的spidercode数量: {len(flag5_spidercodes)}")
+        print(f"err5=1的spidercode数量: {len(err5_spidercodes)}")
+
+        # 3. 找出flag5=1但不在err5=1中的spidercode
+        unique_spidercodes = flag5_spidercodes - err5_spidercodes
+        print(f"找到 {len(unique_spidercodes)} 个符合条件的唯一spidercode")
+
+        if not unique_spidercodes:
+            print("没有找到符合条件的数据")
+            return
+
+        # 4. 查询这些spidercode对应的文档(按spidercode分组,取每个组的第一条记录)
+        pipeline = [
+            {"$match": {
+                "spidercode": {"$in": list(unique_spidercodes)},
+                "flag5": 1
+            }},
+            {"$group": {
+                "_id": "$spidercode",
+                "site": {"$first": "$site"},
+                "channel": {"$first": "$channel"},
+                "spidercode": {"$first": "$spidercode"}
+            }}
+        ]
+
+        print("正在聚合查询文档...")
+        cursor = collection.aggregate(pipeline)
+
+        # 5. 创建DataFrame
+        df = pd.DataFrame(list(cursor))
+
+        # 移除MongoDB生成的_id列
+        if '_id' in df.columns:
+            df.drop('_id', axis=1, inplace=True)
+
+        # 6. 导出到Excel
+        output_file = "flag1_unique_spidercodes.xlsx"
+        df.to_excel(
+            output_file,
+            index=False,
+            columns=['spidercode', 'site', 'channel'],
+            engine='openpyxl'
+        )
+
+        print(f"成功导出 {len(df)} 条记录到 {output_file}")
+        print(f"导出的字段: {list(df.columns)}")
+
+    except Exception as e:
+        print(f"处理过程中发生错误: {str(e)}")
+    finally:
+        client.close()
+
+
+if __name__ == '__main__':
+    export_unique_flag1_spidercodes()
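A lightweight check after the export is to confirm the sheet really contains one row per unique spidercode, since the $group stage is supposed to deduplicate. A sketch, assuming the output file name used above:

    import pandas as pd

    df = pd.read_excel('flag1_unique_spidercodes.xlsx', engine='openpyxl')
    # Every spidercode should appear exactly once after the $group stage
    assert df['spidercode'].is_unique, "duplicate spidercode rows in export"
    print(f"exported {len(df)} unique spidercodes")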

+ 0 - 0
tools/高质量站点第二版/ai抽取和规则抽取对比结果.py


+ 0 - 0
tools/高质量站点第二版/增加一致性对比-智昆.py


+ 47 - 0
tools/高质量站点第二版/找出爬虫比例.py

@@ -0,0 +1,47 @@
+from pymongo import MongoClient
+import pandas as pd
+
+
+def count_spidercode_stats(db_name='your_db_name', collection_name='your_collection_name'):
+    # 连接MongoDB
+    client = MongoClient('mongodb://172.20.45.129:27002/')
+    db = client[db_name]
+    collection = db[collection_name]
+
+    # 聚合查询统计每个spidercode的数量
+    pipeline = [
+        {"$group": {
+            "_id": "$spidercode",
+            "count": {"$sum": 1}
+        }},
+        {"$sort": {"count": -1}}
+    ]
+
+    # 执行聚合查询
+    results = list(collection.aggregate(pipeline))
+
+    if not results:
+        print("没有找到数据")
+        return
+
+    # 转换为DataFrame
+    df = pd.DataFrame(results)
+    df.rename(columns={'_id': 'spidercode'}, inplace=True)
+
+    # 计算总数和占比
+    total_count = df['count'].sum()
+    df['percentage'] = (df['count'] / total_count * 100).round(2)
+
+    # 打印结果
+    print(f"总记录数: {total_count}")
+    print("\n每个spidercode的数量及占比:")
+    print(df.to_string(index=False))
+
+    # 保存到Excel
+    output_file = 'spidercode_stats.xlsx'
+    df.to_excel(output_file, index=False)
+    print(f"\n结果已保存到 {output_file}")
+
+
+# 使用示例
+count_spidercode_stats(db_name='data_quality', collection_name='bidding_20250515')
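Because the stats are already sorted by count in descending order, a cumulative-percentage column makes it easy to see how few spidercodes account for most of the sample. A sketch that extends the DataFrame built inside count_spidercode_stats (add it before the export step):

    # Cumulative share of records, in descending-count order
    df['cum_percentage'] = df['percentage'].cumsum().round(2)
    top = (df['cum_percentage'] <= 80).sum()
    print(f"{top} 个spidercode覆盖了约80%的记录")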

+ 0 - 0
tools/高质量站点第二版/找出爬虫比例2.py


+ 0 - 0
tools/高质量站点第二版/统计三个大模型和规则一致性的比例.py


+ 1 - 0
tools/高质量站点第二版/记录

@@ -0,0 +1 @@
+样本数据:2025-05-15 的数据,一共 21979 条