liumiaomiao 1 month ago
parent
commit
f687535fe7
57 files changed with 1919 additions and 26 deletions
  1. BIN
      lib/__pycache__/monitor_tools.cpython-38.pyc
  2. + 3 - 1
      lib/learn
  3. + 231 - 0
      lib/monitor_tools_online.py
  4. + 0 - 0
      tools/临时/spidercodes
  5. + 0 - 0
      tools/周报/mongo,es断流监控/monitor_all.py
  6. + 263 - 0
      tools/周报/周报表格导出/weekly_data_store.py
  7. + 0 - 0
      tools/基于抽取表ai和规则对比/ai抽取和规则抽取对比.py
  8. + 0 - 0
      tools/基于抽取表ai和规则对比/new.py
  9. + 0 - 0
      tools/基于抽取表ai和规则对比/一致性对比.py
  10. + 0 - 0
      tools/数据抽样/ai_exchange_to_multipacket.py
  11. + 0 - 0
      tools/数据抽样/fix_site_data_export.py
  12. + 42 - 10
      tools/数据抽样/sample_data_export.py
  13. + 58 - 0
      tools/数据抽样/sample_data_export_new.py
  14. + 0 - 0
      tools/数据抽样/抽样方法最新.py
  15. BIN
      tools/数据质量监控平台/kb-数据问题统计/KB问题统计汇总.xlsx
  16. + 0 - 0
      tools/数据质量监控平台/kb-数据问题统计/execl_kb.py
  17. + 0 - 0
      tools/数据质量监控平台/kb-数据问题统计/task_kb.py
  18. + 0 - 0
      tools/数据质量监控平台/基于标准数据的字段分析结果.py
  19. + 0 - 0
      tools/数据质量监控平台/标讯基础信息分析结果入库.py
  20. + 0 - 0
      tools/标准样本数据入库/标准样本数据汇总.xlsx
  21. + 0 - 0
      tools/标讯数据附件为空数量统计/统计.py
  22. + 0 - 0
      tools/爬虫数据质量一期/1、bid_analysis表抽取数据到抽取表.py
  23. + 0 - 0
      tools/爬虫数据质量一期/2、bid_analysis表错误原因及数量统计输出,存入抽取表.py
  24. + 0 - 0
      tools/爬虫数据质量一期/3、抽取表完善爬虫字段.py
  25. + 12 - 8
      tools/爬虫数据质量一期/4、bid_extract计算爬虫增长率并发邮件提醒.py
  26. + 0 - 0
      tools/爬虫数据质量一期/5、根据抽取表生成分析表格.py
  27. + 0 - 0
      tools/爬虫数据质量一期/test.py
  28. + 0 - 0
      tools/爬虫数据质量一期/爬虫代码输出.py
  29. + 0 - 0
      tools/爬虫数据质量一期/爬虫数据动态.xlsx
  30. + 0 - 0
      tools/爬虫数据质量一期/爬虫数据动态1.xlsx
  31. + 0 - 0
      tools/爬虫数据质量二期/spider_quality.py
  32. + 0 - 0
      tools/生成标准样本库的分析数据/test.py
  33. + 0 - 0
      tools/生成标准样本库的分析数据/test2.py
  34. + 0 - 0
      tools/生成标准样本库的分析数据/test3.py
  35. + 9 - 7
      tools/生成标准样本库的分析数据/根据样本数据拉取正式数据生成分析表mongo.py
  36. + 87 - 0
      tools/生成标准样本库的分析数据/根据样本数据拉取正式数据生成分析表mysql.py
  37. + 0 - 0
      tools/生成标准样本库的分析数据/生成统计结果.py
  38. + 289 - 0
      tools/生成标准样本库的分析数据/生成统计结果_入库.py
  39. BIN
      tools/高质量站点第一版/111.xlsx
  40. + 51 - 0
      tools/高质量站点第一版/_id.csv
  41. + 67 - 0
      tools/高质量站点第一版/test.py
  42. + 37 - 0
      tools/高质量站点第一版/根据id找出爬虫代码.py
  43. + 43 - 0
      tools/高质量站点第一版/统计标讯数量.py
  44. + 95 - 0
      tools/高质量站点第一版/高质量站点-正文规则角度.py
  45. + 87 - 0
      tools/高质量站点第一版/高质量站点-脚本1.py
  46. + 97 - 0
      tools/高质量站点第一版/高质量站点-脚本2.py
  47. + 56 - 0
      tools/高质量站点第一版/高质量站点-脚本3.py
  48. + 77 - 0
      tools/高质量站点第一版/高质量站点-脚本41.py
  49. + 89 - 0
      tools/高质量站点第一版/高质量站点-脚本51.py
  50. + 98 - 0
      tools/高质量站点第一版/高质量站点-脚本61.py
  51. + 80 - 0
      tools/高质量站点第一版/高质量站点-脚本7.py
  52. + 0 - 0
      tools/高质量站点第二版/ai抽取和规则抽取对比结果.py
  53. + 0 - 0
      tools/高质量站点第二版/增加一致性对比-智昆.py
  54. + 47 - 0
      tools/高质量站点第二版/找出爬虫比例.py
  55. + 0 - 0
      tools/高质量站点第二版/找出爬虫比例2.py
  56. + 0 - 0
      tools/高质量站点第二版/统计三个大模型和规则一致性的比例.py
  57. + 1 - 0
      tools/高质量站点第二版/记录

BIN
lib/__pycache__/monitor_tools.cpython-38.pyc


+ 3 - 1
lib/learn

@@ -3,4 +3,6 @@ from pymongo import MongoClient
 collection_bid = MongoClient(f'mongodb://{"viewdata"}:{"viewdata"}@{"127.0.0.1:27088"}/',unicode_decode_error_handler="ignore", directConnection=True)["qfw"]["bidding"]
 #连接测试环境mongo
 db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
-coll_user = db["standard_sample_data_all"]
+coll_user = db["standard_sample_data_all"]
+#常用的查询方式
+for doc in final_results.find({"_id" :{"$gt": ObjectId("68023bce5f834436f09d7e9f")}}).sort('_id',1):
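
For reference, a minimal self-contained version of the query pattern added above might look like the sketch below. It assumes `ObjectId` comes from `bson` and that `final_results` is a collection handle in the same test-environment database; neither appears in this hunk, so both are assumptions.

from bson import ObjectId
from pymongo import MongoClient

# Test-environment Mongo, as in the snippet above
db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
# Hypothetical collection handle; the snippet iterates `final_results` without defining it
final_results = db["final_results"]

# Resume an _id-ordered scan from a known ObjectId onwards
for doc in final_results.find({"_id": {"$gt": ObjectId("68023bce5f834436f09d7e9f")}}).sort("_id", 1):
    print(doc["_id"])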

+ 231 - 0
lib/monitor_tools_online.py

@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+# author : liumiaomiao
+#从es库中导出数据到测试环境mongo库
+from lib.es_tools import esutil
+from datetime import datetime, timedelta
+from lib.mongo_tools import MongoUtil,Data_save,MongoSentence
+from lib.mysql_tools import MysqlUtil
+from lib.clickhouse_tools import ClickhouseUtil, logger
+
+class monitorTools:
+    # 定义一周的时间范围,转换为Unix时间戳格式
+    end_date = int(datetime.now().timestamp())
+    start_date = int((datetime.now() - timedelta(days=7)).timestamp())
+    print(f"开始时间:{start_date}--结束时间{end_date}")
+
+    #标准库bidding-es 每周统计入库数量
+    def es_bidding(self):
+        """
+        es链接
+        """
+        db_config = {
+            # es
+            'es_host': '172.17.4.184',
+            'es_port': 19908,
+            'es_http_auth': ('qyfw_es_2','Khfdals33#'),  # 重新申请
+            'timeout': 10000,
+            'index': "bidding"
+        }
+        query = {"query": {"bool": {"must": [{"range": {"comeintime": {"from": f"{self.start_date}", "to": f"{self.end_date}"}}}]}}}
+        # 传入查询语句query 以及配置信息
+        # es=esutil.get_es(db_config["es_host"], db_config["es_http_auth"], db_config["es_port"],db_config["index"])
+        counts=esutil.get_es_count(query,**db_config)
+        count = counts['count']
+        print("标准库es-bidding每周入库数据量:",count)
+        return count
+
+    # 标准库bidding-es 碎片化数据每周统计入库数量
+    def es_bidding_fragment(self):
+        #正式环境
+        db_config = {
+            # es
+            'es_host': '172.17.4.184',
+            'es_port': 19908,
+            'es_http_auth': ('qyfw_es_2', 'Khfdals33#'),  # 重新申请
+            'timeout': 10000,
+            'index': "bidding"
+        }
+        # #测试环境http://192.168.3.149:9201
+        # db_config = {
+        #     # es
+        #     'es_host': '192.168.3.149',
+        #     'es_port': 9201,
+        #     # 'es_http_auth': ('jianyuGr', 'we3g8glKfe#'),  # 重新申请
+        #     'timeout': 10000,
+        #     'index': "bidding"
+        # }
+        # 定义要监控的字段值
+        tags = [
+            "情报_法务",
+            "情报_财务审计",
+            "情报_招标代理",
+            "情报_管理咨询",
+            "情报_保险",
+            "情报_工程设计咨询",
+            "情报_安防",
+            "情报_印务商机",
+            "情报_环境采购",
+            "情报_家具招投标"
+        ]
+
+        # 初始化字典,将所有标签的计数设置为0
+        data = {}
+        for tag in tags:
+            query = {
+                "query": {"bool": {"must": [{"range": {"comeintime": {"from": f"{self.start_date}", "to": f"{self.end_date}"}}},
+                                            {"term": {"tag_topinformation": tag}}]}}}
+            count = esutil.get_es_count(query, **db_config)
+            print(f"标准库es-bidding{tag}每周入库数据量:", count['count'])
+            data[tag]=count['count']
+        # 检查数据字典以确保所有标签都被更新
+        print("数据字典内容:", data)  # 打印整个数据字典
+        return data
+
+    #拟在建es数据 每周统计入库数量
+    def es_nzj(self):
+        """
+        es链接
+        """
+        db_config = {
+            # es
+            'es_host': '172.17.4.184',
+            'es_port': 19908,
+            'es_http_auth': ('qyfw_es_2', 'Khfdals33#'),  # 重新申请
+            'timeout': 10000,
+            'index': "proposed_v1"
+        }
+        query = {
+            "query": {"match_all": {}}}
+        # 传入查询语句query 以及配置信息
+        # es=esutil.get_es(db_config["es_host"], db_config["es_http_auth"], db_config["es_port"],db_config["index"])
+        counts = esutil.get_es_count(query, **db_config)
+        count=counts['count']
+        print("拟在建es入库数据总量:", count)
+        return count
+
+    #医械通es,每周统计入库数量
+    def medical_es(self):
+        """
+        es链接
+        """
+        db_config = {
+            # es
+            'es_host': '172.17.4.184',
+            'es_port': 19908,
+            'es_http_auth': ('qyfw_es_2', 'Khfdals33#'),  # 重新申请
+            'timeout': 10000,
+            'index': "bidding"
+        }
+        query = {
+            "query": {"bool": {"must": [{"range": {"comeintime": {"from": f"{self.start_date}", "to": f"{self.end_date}"}}},{"term": {"bid_field": "0101"}}]}}}
+        # 传入查询语句query 以及配置信息
+        # es=esutil.get_es(db_config["es_host"], db_config["es_http_auth"], db_config["es_port"],db_config["index"])
+        counts = esutil.get_es_count(query, **db_config)
+        count = counts['count']
+        print("医械通es每周入库数据量:", count)
+        return count
+
+    #标准库bidding-mongo 每周统计入库数量
+    def bidding(self):
+        collection = MongoUtil.get_coon(host='172.31.31.202:27081', database='qfw',collection='bidding',authuser='dataFx',authpass='data@fenxi')
+        query = { "comeintime": {"$gte": self.start_date, "$lt": self.end_date}}
+        count=MongoSentence.count(collection,query)
+        print("标准库bidding-mongo 每周统计入库数量",count)
+        return count
+
+    #标准库bidding-mongo碎片化数据 每周统计入库数量
+    def bidding_fragment(self):
+        collection = MongoUtil.get_coon(host='172.31.31.202:27081', database='qfw',collection='bidding',authuser='dataFx',authpass='data@fenxi')
+        # 定义要监控的字段值
+        tags = [
+            "情报_法务",
+            "情报_财务审计",
+            "情报_招标代理",
+            "情报_管理咨询",
+            "情报_保险",
+            "情报_工程设计咨询",
+            "情报_安防",
+            "情报_印务商机",
+            "情报_环境采购",
+            "情报_家具招投标"
+        ]
+        # tags = [
+        #     "情报_环境采购",
+        #     "情报_家具招投标"
+        # ]
+        data={}
+        for tag in tags:
+            query = {"comeintime": {"$gte": self.start_date, "$lt": self.end_date},"tag_topinformation":tag}
+            count=MongoSentence.count(collection,query)
+            print(f"标准库bidding-mongo{tag}每周统计入库数量",count)
+            data[tag]=count
+        return data
+
+    #拟在建baseinfo-mysql 每周统计入库数量
+    def nzj(self):
+        # MySQL 数据库连接配置
+        # mysql_db_config = {
+        #     'host': '192.168.3.149',
+        #     'port': 4000,
+        #     'user': 'datagroup',
+        #     'password': 'Dgrpdb#2024@36',
+        #     'database': 'jianyu_subjectdb',
+        #     'charset': 'utf8mb4'
+        # }
+
+        now = datetime.now()
+        end_date = now.strftime("%Y-%m-%d %H:%M:%S")
+        start_date = (datetime.now() - timedelta(days=7)).strftime("%Y-%m-%d %H:%M:%S")
+
+        # SQL 查询
+        mysql_query = "SELECT COUNT(*) FROM jianyu_subjectdb.dwd_f_nzj_baseinfo WHERE createtime >= %s AND createtime <= %s"
+        params = (start_date, end_date)
+        conn=MysqlUtil.connect_to_mysql(host='172.17.162.27',port='14000',user='jydev',password='JSuytest#s211',database='jianyu_subjectdb')
+        count=MysqlUtil.execute_sql(conn,mysql_query,params)
+        print("拟在建baseinfo-mysql每周统计入库数量", count)
+        return count
+
+    #人脉数据,每周统计入库数量
+    def connections(self):
+        client = None
+        try:
+            query = f"SELECT COUNT(*) FROM information.transaction_info_all WHERE create_time >={self.start_date} AND create_time <={self.end_date}"
+            # conn=ClickhouseUtil.connect_to_clickhouse(host='192.168.3.207',port='19000',user='jytop',password='pwdTopJy123',database='information')
+            client=ClickhouseUtil.connect_to_clickhouse(host='cc-2ze9tv451wov14w9e.clickhouse.ads.aliyuncs.com',port=9000,user='jydev',password='ClTe0331kho2025',database='information')
+            count=ClickhouseUtil.execute_sql(client,query)
+            result=count[0][0]
+            print("人脉数据每周统计入库数量", result)
+            return result
+        except Exception as e:
+            logger.error("An error occurred: %s", e)
+            raise
+        finally:
+            if client:
+                client.disconnect()  # 释放连接
+
+    #医械通,每周统计入库数量
+    def medical(self):
+        collection = MongoUtil.get_coon(host='172.31.31.202:27081', database='qfw',collection='bidding',authuser='dataFx',authpass='data@fenxi')
+        query = {"comeintime": {"$gte": self.start_date, "$lt": self.end_date},"bid_field":"0101"}
+        count = MongoSentence.count(collection, query)
+        print("医械通每周统计入库数量", count)
+        return count
+
+    #统计结果入库
+    def save_to_mongo(self,title,count):
+        collection=Data_save.save_con(host='172.20.45.129',port=27002,database='data_quality',collection='statistics')
+        now = datetime.now()
+        timestamp = int(now.timestamp())
+        document = {
+            title: {
+                "timestamp": timestamp,
+                "count": count
+            }
+        }
+        Data_save.insert_one(collection,document)
+
+
+monitor=monitorTools()
+
+
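
The module above only defines the weekly counters and instantiates `monitor`; none of the methods are invoked here. A sketch of how it might be driven, assuming the module is importable as `lib.monitor_tools_online` and that the metric titles are chosen to match the keys the weekly export script reads from the `statistics` collection:

from lib.monitor_tools_online import monitor

# Collect each weekly count and persist it through save_to_mongo. The titles are
# assumptions picked to line up with what weekly_data_store.py matches on.
monitor.save_to_mongo("bidding", monitor.bidding())
monitor.save_to_mongo("bidding_fragment", monitor.bidding_fragment())
monitor.save_to_mongo("nzj", monitor.nzj())
monitor.save_to_mongo("medical", monitor.medical())
monitor.save_to_mongo("connections", monitor.connections())
monitor.save_to_mongo("es_bidding", monitor.es_bidding())
monitor.save_to_mongo("es_bidding_fragment", monitor.es_bidding_fragment())
monitor.save_to_mongo("es_nzj", monitor.es_nzj())
monitor.save_to_mongo("es_medical", monitor.es_medical())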

+ 0 - 0
tools/临时/spidercodes


+ 0 - 0
tools/周报/mongo,es断流监控/monitor_all.py


+ 263 - 0
tools/周报/周报表格导出/weekly_data_store.py

@@ -0,0 +1,263 @@
+from pymongo import MongoClient
+from datetime import datetime, timedelta
+import pandas as pd
+import pymysql
+# 数据入库量及数据监控时效 导出execl
+# MongoDB连接配置
+host = '172.20.45.129'
+port = 27002
+dbname = 'data_quality'
+collection_name = 'statistics'
+
+# 创建MongoDB连接
+client = MongoClient(host, port)
+db = client[dbname]
+collection = db[collection_name]
+
+# 获取当前时间和一周前的时间
+end_time = datetime.now().replace(hour=23, minute=59, second=59, microsecond=999999)
+start_time = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
+
+# 将datetime转换为Unix时间戳(整数类型,去掉小数部分)
+start_timestamp = int(start_time.timestamp())
+end_timestamp = int(end_time.timestamp())
+
+# 输出调试信息:检查开始时间和结束时间
+print("Start time:", start_time)
+print("End time:", end_time)
+print("Start timestamp:", start_timestamp)
+print("End timestamp:", end_timestamp)
+
+# ----------------- 第一个Sheet: 断流监控_mongo库 -------------------
+
+# 查询过去一周的数据(断流监控_mongo库)
+pipeline_mongo = [
+    {
+        "$match": {
+            "$or": [
+                {"bidding.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"connections.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"nzj.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"medical.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"bidding_fragment.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}}
+            ]
+        }
+    },
+    {
+        "$limit": 5  # 限制查询返回的结果为前5条数据,便于调试
+    }
+]
+
+# 获取符合条件的数据
+data_mongo = list(collection.aggregate(pipeline_mongo))
+
+# 初始化MongoDB字段统计数据
+bidding_count = 0
+connections_count = 0
+nzj_count = 0
+medical_count = 0
+bidding_fragment_data = {
+    "情报_法务": 0,
+    "情报_财务审计": 0,
+    "情报_招标代理": 0,
+    "情报_管理咨询": 0,
+    "情报_保险": 0,
+    "情报_工程设计咨询": 0,
+    "情报_安防": 0,
+    "情报_印务商机": 0,
+    "情报_环境采购": 0,
+    "情报_家具招投标": 0
+}
+
+# 统计MongoDB数据
+for doc in data_mongo:
+    if 'bidding' in doc:
+        bidding_count += doc['bidding'].get('count', 0)
+    if 'connections' in doc:
+        connections_count += doc['connections'].get('count', 0)
+    if 'nzj' in doc:
+        nzj_count += doc['nzj'].get('count', 0)
+    if 'medical' in doc :
+        medical_count += doc['medical'].get('count', 0)
+    if 'bidding_fragment' in doc:
+        for key, value in doc['bidding_fragment'].get('count', {}).items():
+            if key in bidding_fragment_data:
+                bidding_fragment_data[key] += value
+
+# ----------------- 第二个Sheet: 断流监控—es -------------------
+
+# 查询过去一周的数据(断流监控—es)
+pipeline_es = [
+    {
+        "$match": {
+            "$or": [
+                {"es_bidding.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"es_nzj.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"es_medical.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}},
+                {"es_bidding_fragment.timestamp": {"$gte": start_timestamp, "$lt": end_timestamp}}
+            ]
+        }
+    }
+]
+
+# 获取符合条件的数据
+data_es = list(collection.aggregate(pipeline_es))
+
+# 初始化ES字段统计数据
+es_bidding_count = 0
+es_nzj_count = 0
+es_medical_count = 0
+es_bidding_fragment_data = {
+    "情报_法务": 0,
+    "情报_财务审计": 0,
+    "情报_招标代理": 0,
+    "情报_管理咨询": 0,
+    "情报_保险": 0,
+    "情报_工程设计咨询": 0,
+    "情报_安防": 0,
+    "情报_印务商机": 0,
+    "情报_环境采购": 0,
+    "情报_家具招投标": 0
+}
+
+# 统计ES数据
+for doc in data_es:
+    if 'es_bidding' in doc:
+        es_bidding_count += doc['es_bidding'].get('count', 0)
+    if 'es_nzj' in doc:
+        es_nzj_count += doc['es_nzj'].get('count', 0)
+    if 'es_medical' in doc:
+        es_medical_count += doc['es_medical'].get('count', 0)
+    if 'es_bidding_fragment' in doc:
+        for key, value in doc['es_bidding_fragment'].get('count', {}).items():
+            if key in es_bidding_fragment_data:
+                es_bidding_fragment_data[key] += value
+
+# ----------------- 第三个Sheet: 数据时效监控 -------------------
+
+# 查询过去一周的数据(数据时效监控)
+pipeline_timeliness = [
+    {
+        "$match": {
+            "data_timeliness.timestamp": {
+                "$gte": start_timestamp,  # 使用整数Unix时间戳
+                "$lt": end_timestamp  # 使用整数Unix时间戳
+            }
+        }
+    },
+    {
+        "$limit": 5  # 限制查询返回的结果为前5条数据,便于调试
+    }
+]
+
+# 获取符合条件的数据
+data_timeliness = list(collection.aggregate(pipeline_timeliness))
+
+# 初始化字段统计数据
+timeliness_data = {
+    "[0,5)分钟": 0,
+    "[5,15)分钟": 0,
+    "[15,30)分钟": 0,
+    "[30,60)分钟": 0,
+    "[1,3)小时": 0,
+    "[3,7)小时": 0,
+    "[7,15)小时": 0,
+    "[15,24)小时": 0,
+    "[1,2)天": 0,
+    "[2,3)天": 0,
+    "3天+": 0
+}
+
+# 统计数据
+for doc in data_timeliness:
+    if 'data_timeliness' in doc:
+        count_data = doc['data_timeliness'].get('count', {})
+        timeliness_data["[0,5)分钟"] += float(count_data.get("a1", "0%").replace('%', ''))
+        timeliness_data["[5,15)分钟"] += float(count_data.get("a2", "0%").replace('%', ''))
+        timeliness_data["[15,30)分钟"] += float(count_data.get("a3", "0%").replace('%', ''))
+        timeliness_data["[30,60)分钟"] += float(count_data.get("a4", "0%").replace('%', ''))
+        timeliness_data["[1,3)小时"] += float(count_data.get("a5", "0%").replace('%', ''))
+        timeliness_data["[3,7)小时"] += float(count_data.get("a6", "0%").replace('%', ''))
+        timeliness_data["[7,15)小时"] += float(count_data.get("a7", "0%").replace('%', ''))
+        timeliness_data["[15,24)小时"] += float(count_data.get("a8", "0%").replace('%', ''))
+        timeliness_data["[1,2)天"] += float(count_data.get("a9", "0%").replace('%', ''))
+        timeliness_data["[2,3)天"] += float(count_data.get("a10", "0%").replace('%', ''))
+        timeliness_data["3天+"] += float(count_data.get("a11", "0%").replace('%', ''))
+
+# 获取当前时间的一周时间范围字符串
+date_range = f"{start_time.strftime('%Y/%m/%d')}-{end_time.strftime('%Y/%m/%d')}"
+
+# 构建Excel数据
+columns = ['日期', '标讯每周入库数据量', '人脉管理数据', '拟在建数据量(全国)','医械通'] + list(bidding_fragment_data.keys())
+data_row_mongo = [date_range, bidding_count, connections_count, nzj_count,medical_count] + list(bidding_fragment_data.values())
+
+columns_es = ['日期', '标讯每周入库数据量', '拟在建数据量(全国)','医械通'] + list(es_bidding_fragment_data.keys())
+data_row_es = [date_range, es_bidding_count,  es_nzj_count,es_medical_count] + list(es_bidding_fragment_data.values())
+
+columns_timeliness = ['日期'] + list(timeliness_data.keys())
+data_row_timeliness = [date_range] + list(timeliness_data.values())
+
+def insert_mysql():
+    # MySQL 连接
+    conn = pymysql.connect(host='172.20.45.129', port='4000', user='root', password='=PDT49#80Z!RVv52_z',database='quality')
+    cursor = conn.cursor()
+
+    # 插入数据入库监控表
+    sql_monitoring = """
+    INSERT INTO data_monitoring (
+        date_range, type, total_weekly_entries, renmaitong_data, planning_projects_data, medical_device_data,
+        legal_intelligence, financial_audit_intelligence, bidding_agency_intelligence, management_consulting_intelligence, insurance_intelligence,
+        engineering_consulting_intelligence, security_intelligence, printing_business_intelligence, environmental_procurement_intelligence, furniture_bidding_intelligence
+    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    """
+
+    data_monitoring = [
+        ('2025/03/20-2025/03/27', 'mongo', 5000, 1200, 2300, 1500, 300, 500, 600, 450, 200, 180, 220, 190, 210, 170),
+        ('2025/03/20-2025/03/27', 'es', 4800, 1100, 2200, 1400, 280, 480, 590, 430, 190, 170, 210, 180, 200, 160)
+    ]
+    cursor.executemany(sql_monitoring, data_monitoring)
+
+    # 插入数据时效监控表
+    sql_timeliness = """
+    INSERT INTO response_time_distribution (
+        date_range, range_0_5_min, range_5_15_min, range_15_30_min, range_30_60_min,
+        range_1_3_hour, range_3_7_hour, range_7_15_hour, range_15_24_hour,
+        range_1_2_day, range_2_3_day, range_3_plus_day
+    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    """
+
+    data_timeliness = [
+        ('2025/03/20-2025/03/27', 28.31, 15.42, 10.85, 8.34, 12.50, 6.75, 5.20, 3.90, 4.10, 2.45, 2.18)
+    ]
+    cursor.executemany(sql_timeliness, data_timeliness)
+
+    # 提交事务
+    conn.commit()
+    cursor.close()
+    conn.close()
+
+
+# 创建DataFrame并写入Excel
+excel_file = 'mongo_data_statistics_combined1.xlsx'
+
+with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
+    # 写入第一个sheet(断流监控_mongo库)
+    df_mongo = pd.DataFrame([data_row_mongo], columns=columns)
+    df_mongo.to_excel(writer, sheet_name='入库数据量监控-mongo(每周)', index=False)
+
+    # 写入第二个sheet(断流监控—es)
+    df_es = pd.DataFrame([data_row_es], columns=columns_es)
+    df_es.to_excel(writer, sheet_name='入库量数据量监控-es(每周)', index=False)
+
+    # 将timeliness_data中的值转换为百分比字符串
+    for key in timeliness_data:
+        timeliness_data[key] = f"{timeliness_data[key]:.2f}%"
+
+    # 构建数据行
+    data_row_timeliness = [date_range] + list(timeliness_data.values())
+
+    # 写入第三个sheet(数据时效监控)
+    df_timeliness = pd.DataFrame([data_row_timeliness], columns=columns_timeliness)
+    df_timeliness.to_excel(writer, sheet_name='数据时效监控(7天平均值)', index=False)
+
+print(f"统计结果已写入Excel文件: {excel_file}")

+ 0 - 0
tools/基于抽取表ai和规则对比/ai抽取和规则抽取对比.py


+ 0 - 0
tools/基于抽取表ai和规则对比/new.py


+ 0 - 0
tools/基于抽取表ai和规则对比/一致性对比.py


+ 0 - 0
tools/样本数据导出/ai_exchange_to_multipacket.py → tools/数据抽样/ai_exchange_to_multipacket.py


+ 0 - 0
tools/样本数据导出/fix_site_data_export.py → tools/数据抽样/fix_site_data_export.py


+ 42 - 10
tools/样本数据导出/sample_data_export.py → tools/数据抽样/sample_data_export.py

@@ -1,15 +1,41 @@
 from pymongo import MongoClient
-
+from urllib.parse import quote_plus  # 可选,若密码中有特殊字符
+
+# MongodbConfigSource = {
+#     "ip_port": "127.0.0.1:27088",
+#     "user": "viewdata",
+#     "password": "viewdata",
+#     "db": "qfw",
+#     "col": "zktest_0422_fenbao"
+# }
 def sample_data(N):
-    # 连接MongoDB数据库
-    db = MongoClient('192.168.3.149', 27180, unicode_decode_error_handler="ignore").data_quality
-    coll_user = db["customer_data"]
-
 
+    # 连接MongoDB数据库
+    db = MongoClient('172.20.45.129', 27002, unicode_decode_error_handler="ignore").data_quality
+    coll_user = db["zktest_0422_fenbao"]
+    # 构建连接字符串(含特殊字符建议用 quote_plus 编码)
+    # user = quote_plus(MongodbConfigSource['user'])
+    # password = quote_plus(MongodbConfigSource['password'])
+    # mongo_uri = f"mongodb://{user}:{password}@{MongodbConfigSource['ip_port']}/?authSource=admin"
+
+    # client = MongoClient(mongo_uri, unicode_decode_error_handler="ignore")
+    # # 获取数据库与集合
+    # db = client[MongodbConfigSource["db"]]
+    # coll_user = db[MongodbConfigSource["col"]]
+
+    # 统计符合筛选条件的总数据量
+    filter_condition = {
+         "$or": [
+        {"toptype": "招标"},
+        {"subtype": {"$in": ["中标", "成交", "合同", "验收"]}}
+    ]
+    }
+    count_all = coll_user.count_documents(filter_condition)
+    print("Filtered Document Count:", count_all)
     # 统计总的数据量
-    count_all = coll_user.estimated_document_count()
-    # count_all = coll_user.count_documents({"tag": 1})
-    print("Total Document Count:", count_all)
+    # count_all = coll_user.estimated_document_count()
+    # # count_all = coll_user.count_documents({"tag": 1})
+    # print("Total Document Count:", count_all)
 
     # 把符合条件的站点名称存起来
     site_list = {}
@@ -45,16 +71,22 @@ def sample_data(N):
 
         # 计算每次抽取的间隔
         jiange = int(site_list[key] / num)
+        query = {
+            "$or": [
+                {"toptype": "招标"},
+                {"subtype": {"$in": ["中标", "成交", "合同", "验收"]}}
+            ]
+        }
 
         # 从每个站点等间隔地取数据
         for i in range(num):
             if marked_count >= N:
                 break  # 再次检查是否已达到目标数量
 
-            for info in coll_user.find({"site": key}).sort("_id", 1).skip(i*jiange).limit(1):
+            for info in coll_user.find(query).sort("_id", -1).skip(i*jiange).limit(1):
                 print(f"Updating document with _id: {info['_id']}")
                 # 更新文档,设置标记
-                update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 2}})
+                update_result = coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 1}})
                 if update_result.modified_count == 0:
                     print("No document updated for _id:", info["_id"])
                 else:
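
Note that the rewritten loop now filters with `query` (the toptype/subtype condition only), while the interval `jiange` is still computed per site. If the sampling is meant to stay scoped to each site, a possible combined filter, reusing the names from the function above, would be the following; this is an assumption about intent, not what the diff does:

# Keep the per-site scope together with the category filter before interval sampling
query = {
    "site": key,
    "$or": [
        {"toptype": "招标"},
        {"subtype": {"$in": ["中标", "成交", "合同", "验收"]}}
    ]
}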

+ 58 - 0
tools/数据抽样/sample_data_export_new.py

@@ -0,0 +1,58 @@
+from pymongo import MongoClient
+def sample_data(N):
+    # db = MongoClient('172.20.45.129', 27002, unicode_decode_error_handler="ignore").data_quality
+    db = MongoClient('mongodb://127.0.0.1:27087/', unicode_decode_error_handler="ignore",directConnection=True).jyqyfw  # 清洗库
+
+    coll_user = db["usermail_Unicom_1_2"]
+
+    filter_condition = {
+        "$or": [
+            {"tag": 1},
+            {"tag": 2}
+        ]
+    }
+
+    # 获取所有站点及其文档数
+    site_list = {}
+    site_count = coll_user.aggregate([
+        {"$match": filter_condition},
+        {"$group": {"_id": "$site", "count": {"$sum": 1}}},
+        {"$sort": {"count": -1}}
+    ])
+    for item in site_count:
+        site_list[item["_id"]] = item["count"]
+
+    total_docs = sum(site_list.values())
+    remaining = N
+    marked_count = 0
+
+    for site, count in site_list.items():
+        if remaining <= 0:
+            break
+
+        # 计算该站点应分配的样本数
+        num = max(1, round(N * count / total_docs))
+        num = min(num, remaining)
+
+        print(f"Processing site: {site} - Allocating {num} samples")
+
+        # 使用随机抽样
+        pipeline = [
+            {"$match": {"site": site, **filter_condition}},
+            {"$sample": {"size": num}},
+            {"$project": {"_id": 1}}
+        ]
+
+        sampled_ids = [doc["_id"] for doc in coll_user.aggregate(pipeline)]
+        if not sampled_ids:
+            continue
+
+        update_result = coll_user.update_many(
+            {"_id": {"$in": sampled_ids}},
+            {"$set": {"mark": 1}}
+        )
+        marked_count += update_result.modified_count
+        remaining -= update_result.modified_count
+
+    print(f"Total marked documents: {marked_count}")
+sample_data(2000)
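
Unlike the fixed-interval export above, this version draws documents randomly within each site via `$sample` and allocates the quota proportionally to each site's share. A quick sanity check after a run, reusing the connection details from the script (an illustrative sketch, not part of the commit):

from pymongo import MongoClient

# Count how many documents ended up carrying the mark set by sample_data()
db = MongoClient('mongodb://127.0.0.1:27087/', unicode_decode_error_handler="ignore",
                 directConnection=True).jyqyfw
coll_user = db["usermail_Unicom_1_2"]
print("Documents currently marked:", coll_user.count_documents({"mark": 1}))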

+ 0 - 0
tools/数据抽样/抽样方法最新.py


BIN
tools/数据质量监控平台/kb-数据问题统计/KB问题统计汇总.xlsx


+ 0 - 0
tools/数据质量监控平台/kb-数据问题统计/execl_kb.py


+ 0 - 0
tools/数据质量监控平台/kb-数据问题统计/task_kb.py


+ 0 - 0
tools/数据质量监控平台/基于标准数据的字段分析结果.py


+ 0 - 0
tools/数据质量监控平台/标讯基础信息分析结果入库.py


+ 0 - 0
tools/从mongo库导出数据execl/output.xlsx → tools/标准样本数据入库/标准样本数据汇总.xlsx


+ 0 - 0
tools/标讯数据附件为空数量统计/统计.py


+ 0 - 0
tools/爬虫相关/1、bid_analysis表抽取数据到抽取表.py → tools/爬虫数据质量一期/1、bid_analysis表抽取数据到抽取表.py


+ 0 - 0
tools/爬虫相关/2、bid_analysis表错误原因及数量统计输出,存入抽取表.py → tools/爬虫数据质量一期/2、bid_analysis表错误原因及数量统计输出,存入抽取表.py


+ 0 - 0
tools/爬虫相关/3、抽取表完善爬虫字段.py → tools/爬虫数据质量一期/3、抽取表完善爬虫字段.py


+ 12 - 8
tools/爬虫相关/4、bid_extract计算爬虫增长率并发邮件提醒.py → tools/爬虫数据质量一期/4、bid_extract计算爬虫增长率并发邮件提醒.py

@@ -1,12 +1,12 @@
 from pymongo import MongoClient
 import smtplib
 from email.mime.text import MIMEText
-
+from datetime import datetime
 
 def send_email(subject, body, to_email):
-    sender_email = "your_email@example.com"
-    sender_password = "your_password"
-    smtp_server = "smtp.example.com"
+    sender_email = "liumm_6064@163.com"
+    sender_password = "TPVBPYSETVHWTIDH"
+    smtp_server = "smtp.163.com"
 
     msg = MIMEText(body, "plain", "utf-8")
     msg["Subject"] = subject
@@ -23,6 +23,8 @@ def send_email(subject, body, to_email):
     except Exception as e:
         print("邮件发送失败", e)
 
+def format_timestamp(batch_id):
+    return datetime.utcfromtimestamp(batch_id).strftime('%Y-%m-%d %H:%M:%S')
 
 def calculate_growth_rate(data):
     batch_updates = data.get("batch_updates", [])
@@ -38,17 +40,19 @@ def calculate_growth_rate(data):
         prev_total = batch_updates[i - 1]["update_info"].get("总数量", 0)
         latest_total = batch_updates[i]["update_info"].get("总数量", 0)
 
+        batch_updates[i]["batch_time"] = format_timestamp(batch_updates[i]["batch_id"])  # 添加时间转换字段
+
         if prev_total == 0:
             growth_rate = "N/A"
         else:
             growth_rate_value = ((latest_total - prev_total) / prev_total) * 100
             growth_rate = f"{growth_rate_value:.2f} %"
 
-            # 低于 20% 发送邮件提醒
-            if growth_rate_value < 20:
+            # 仅在最新批次的增长率低于 20% 发送邮件
+            if i == len(batch_updates) - 1 and growth_rate_value < 20:
                 subject = "数据增长率低于 20% 提醒"
-                body = f"批次 {batch_updates[i]['batch_id']} 的增长率仅为 {growth_rate},请关注!"
-                send_email(subject, body, "recipient@example.com")
+                body = f"最新批次 {batch_updates[i]['batch_id']} ({batch_updates[i]['batch_time']}) 的增长率仅为 {growth_rate},请关注!"
+                send_email(subject, body, "liumiaomiao@topnet.net.cn.com")
 
         batch_updates[i]["update_info"]["增长率"] = growth_rate
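
For clarity, a toy document in the shape `calculate_growth_rate` expects (keys taken from the code above, numbers invented): 总数量 grows from 100 to 110, i.e. +10.00 %, so under the rewritten condition only this latest batch would trigger the reminder e-mail.

# Toy input for calculate_growth_rate(); batch_ids are Unix timestamps one week apart
sample = {
    "batch_updates": [
        {"batch_id": 1743465600, "update_info": {"总数量": 100}},
        {"batch_id": 1744070400, "update_info": {"总数量": 110}},
    ]
}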
 

+ 0 - 0
tools/爬虫相关/5、根据抽取表生成分析表格.py → tools/爬虫数据质量一期/5、根据抽取表生成分析表格.py


+ 0 - 0
tools/爬虫相关/test.py → tools/爬虫数据质量一期/test.py


+ 0 - 0
tools/爬虫相关/爬虫代码输出.py → tools/爬虫数据质量一期/爬虫代码输出.py


+ 0 - 0
tools/爬虫相关/爬虫数据动态.xlsx → tools/爬虫数据质量一期/爬虫数据动态.xlsx


+ 0 - 0
tools/爬虫相关/爬虫数据动态1.xlsx → tools/爬虫数据质量一期/爬虫数据动态1.xlsx


+ 0 - 0
tools/爬虫数据质量二期/spider_quality.py


+ 0 - 0
tools/生成标准样本库的分析数据/test.py


+ 0 - 0
tools/生成标准样本库的分析数据/test2.py


+ 0 - 0
tools/生成标准样本库的分析数据/test3.py


+ 9 - 7
tools/mongo同步至mysql/Data_synchronization.py → tools/生成标准样本库的分析数据/根据样本数据拉取正式数据生成分析表mongo.py

@@ -14,7 +14,7 @@ MongodbConfigLocal = {
 
 # MySQL 配置信息
 mysql_config = {
-    "host": "192.168.3.217",
+    "host": "172.20.45.129",
     "user": "root",
     "password": "=PDT49#80Z!RVv52_z",
     "database": "quality",
@@ -32,8 +32,7 @@ field_mapping = {
     "projectcode": "projectcode_ai",
     "budget": "budget_ai",
     "s_winner": "s_winner_ai",
-    "bidamount": "bidamount_ai",
-    "multipackage": "multipackage_ai"
+    "bidamount": "bidamount_ai"
 }
 
 def main():
@@ -50,7 +49,7 @@ def main():
     ) as mysql_conn:
         with mysql_conn.cursor() as mysql_cursor:
             # 从 MySQL 中读取 _id 列表
-            mysql_cursor.execute("SELECT _id FROM bid_llizhikun")
+            mysql_cursor.execute("SELECT _id FROM sample_bid_analysis")
             ids = mysql_cursor.fetchall()
 
             for (_id,) in ids:
@@ -66,12 +65,15 @@ def main():
                 if not mongo_data:
                     continue
 
-                # 构造更新数据
-                update_fields = {field_mapping[key]: mongo_data.get(key, None) for key in field_mapping}
+                # 构造更新数据,若值为 None 或 "",则填充为 None
+                update_fields = {
+                    field_mapping[key]: None if not mongo_data.get(key) else mongo_data[key]
+                    for key in field_mapping
+                }
 
                 # 构造更新 SQL
                 update_sql = f"""
-                UPDATE bid_llizhikun
+                UPDATE sample_bid_analysis
                 SET {", ".join([f"{field} = %s" for field in update_fields.keys()])}
                 WHERE _id = %s
                 """

+ 87 - 0
tools/生成标准样本库的分析数据/根据样本数据拉取正式数据生成分析表mysql.py

@@ -0,0 +1,87 @@
+from pymongo import MongoClient
+from bson import ObjectId  # 导入 ObjectId
+import pymysql
+from lib.mogodb_helper import MongoDBInterface
+
+# MongoDB 配置信息
+MongodbConfigLocal = {
+    "ip_port": "127.0.0.1:27088",
+    "user": "viewdata",
+    "password": "viewdata",
+    "db": "qfw",
+    "col": "bidding"  # 替换为实际集合名称
+}
+
+# MySQL 配置信息
+mysql_config = {
+    "host": "172.20.45.129",
+    "user": "root",
+    "password": "=PDT49#80Z!RVv52_z",
+    "database": "quality",
+    "port": 4000
+}
+
+# 字段映射
+field_mapping = {
+    "toptype": "toptype_ai",
+    "subtype": "subtype_ai",
+    "area": "area_ai",
+    "city": "city_ai",
+    "buyer": "buyer_ai",
+    "projectname": "projectname_ai",
+    "projectcode": "projectcode_ai",
+    "budget": "budget_ai",
+    "s_winner": "s_winner_ai",
+    "bidamount": "bidamount_ai"
+}
+
+def main():
+    # 实例化 MongoDBInterface
+    mongo_db_interface = MongoDBInterface(MongodbConfigLocal)
+
+    # 使用 MySQL 的 with 语句管理连接
+    with pymysql.connect(
+            host=mysql_config["host"],
+            port=mysql_config["port"],
+            user=mysql_config["user"],
+            password=mysql_config["password"],
+            database=mysql_config["database"]
+    ) as mysql_conn:
+        with mysql_conn.cursor() as mysql_cursor:
+            # 从 MySQL 中读取 _id 列表
+            mysql_cursor.execute("SELECT _id FROM sample_bid_analysis")
+            ids = mysql_cursor.fetchall()
+
+            for (_id,) in ids:
+                # 将 _id 转换为 ObjectId 类型
+                try:
+                    object_id = ObjectId(_id)
+                except Exception as e:
+                    print(f"Invalid ObjectId: {_id}, skipping. Error: {e}")
+                    continue
+
+                # 使用 MongoDBInterface 的 find_by_id 方法从 MongoDB 查询数据
+                mongo_data = mongo_db_interface.find_by_id(MongodbConfigLocal["col"], object_id)
+                if not mongo_data:
+                    continue
+
+                # 构造更新数据,若值为 None 或 "",则填充为 None
+                update_fields = {
+                    field_mapping[key]: None if not mongo_data.get(key) else mongo_data[key]
+                    for key in field_mapping
+                }
+
+                # 构造更新 SQL
+                update_sql = f"""
+                UPDATE sample_bid_analysis
+                SET {", ".join([f"{field} = %s" for field in update_fields.keys()])}
+                WHERE _id = %s
+                """
+                update_values = list(update_fields.values()) + [_id]
+
+                # 执行更新操作
+                mysql_cursor.execute(update_sql, update_values)
+                mysql_conn.commit()
+
+if __name__ == "__main__":
+    main()

+ 0 - 0
tools/生成标准样本库的分析数据/生成统计结果.py


+ 289 - 0
tools/生成标准样本库的分析数据/生成统计结果_入库.py

@@ -0,0 +1,289 @@
+import pymysql
+import pymongo
+import pandas as pd
+from openpyxl import Workbook
+from openpyxl.styles import Font, Alignment
+
+# # MySQL 配置信息
+# MYSQL_CONFIG = {
+#     "host": "172.20.45.129",
+#     "user": "root",
+#     "password": "=PDT49#80Z!RVv52_z",
+#     "database": "quality",
+#     "port": 4000
+# }
+# # 连接 MySQL 并读取数据
+# def fetch_data():
+#     conn = pymysql.connect(**MYSQL_CONFIG)
+#     query = "SELECT * FROM sample_bid_analysis;"
+#     df = pd.read_sql(query, conn)
+#     conn.close()
+#     return df
+# MongoDB 连接配置
+MONGO_CONFIG = {
+    "host": "172.20.45.129",
+    "port": 27002,
+    "db": "data_quality",
+    "col": "standard_sample_data_new",
+}
+# MySQL 配置
+MYSQL_CONFIG = {
+    "host": "172.20.45.129",
+    "user": "root",
+    "password": "=PDT49#80Z!RVv52_z",
+    "database": "quality",
+    "port": 4000
+}
+
+# 连接 MongoDB 并读取数据
+def fetch_data():
+    client = pymongo.MongoClient(f"mongodb://{MONGO_CONFIG['host']}:{MONGO_CONFIG['port']}")
+    db = client[MONGO_CONFIG["db"]]
+    collection = db[MONGO_CONFIG["col"]]
+
+    # 读取数据并转换为 DataFrame
+    data = list(collection.find({}, {"_id": 0}))  # 去掉 `_id` 字段
+    df = pd.DataFrame(data)
+
+    client.close()
+    return df
+
+# 判断 projectname 是否互为包含关系
+def is_contained(str1, str2):
+    """ 判断 str1 和 str2 是否互相包含(非空值情况下) """
+    if pd.isna(str1) or pd.isna(str2):  # 如果有 NaN 值,直接返回 False
+        return False
+    return str1 in str2 or str2 in str1  # 互为包含
+
+# 计算统计数据
+def calculate_metrics_and_accuracy(df, category):
+    """ 计算表格所需数据 """
+    # 确定数据类别:中标类 or 招标类
+    if category == "中标类":
+        bid_types = ["成交", "单一", "废标", "合同", "结果变更", "流标", "验收", "中标", "其它"]
+        df = df[df["subtype"].isin(bid_types)]
+        fields = ["toptype", "subtype", "area", "city", "buyer", "projectname", "projectcode", "budget", "s_winner", "bidamount"]
+
+    else:  # 招标类
+        bid_types = ["成交", "单一", "废标", "合同", "结果变更", "流标", "验收", "中标", "其它", "拟建"]
+        df = df[~df["subtype"].isin(bid_types)]
+        fields = ["toptype", "subtype", "area", "city", "buyer", "projectname", "projectcode", "budget"]
+
+
+    results = []
+    # 统一将 None、<NA> 和空字符串都转为 pd.NA
+    df = df.replace({None: pd.NA, '': pd.NA})  # 替换 None 和空字符串为 pd.NA
+    df = df.fillna(pd.NA)  # 确保所有空值都转为 pd.NA
+    correct_rows = 0  # 整行正确的计数
+    total_count = len(df)  # 样本总量
+
+    for _, row in df.iterrows():
+        row_correct = True  # 假设整行正确
+
+        for field in fields:
+            original_value = row.get(field, pd.NA)
+            ai_value = row.get(f"{field}_ai", pd.NA)
+
+            if field == "projectname":  # 特殊处理 projectname
+                is_correct = is_contained(original_value, ai_value)
+            else:
+                # 这里避免 pd.NA 直接比较导致错误
+                if pd.isna(original_value) or pd.isna(ai_value):
+                    is_correct = pd.isna(original_value) and pd.isna(ai_value)  # 如果都为空,算正确
+                else:
+                    is_correct = original_value == ai_value  # 正常比较
+
+            if not is_correct:
+                row_correct = False  # 只要有一个字段错误,整行就是错误的
+
+        if row_correct:
+            correct_rows += 1  # 统计整行正确的数量
+
+    # 计算整行正确率
+    single_row_accuracy = correct_rows / total_count if total_count else 0
+
+    for field in fields:
+        total_count = len(df)  # 样本数据总量
+        null_count = df[field].isna().sum()  # 原文无值
+        valid_count = total_count - null_count  # 原文有值的数量
+
+        if field == "projectname":  # 特殊处理 projectname
+            extract_correct_count = df.apply(lambda row: is_contained(row["projectname"], row["projectname_ai"]),axis=1).sum()
+            extract_error_count = valid_count - extract_correct_count
+            extract_correct_no_null = extract_correct_count  # 互为包含的都算正确
+            extract_error_no_null = extract_error_count
+        else:  # 其他字段的正常处理逻辑
+            extract_error_count = ((df[field].isna() & df[f"{field}_ai"].notna()) |
+                                   (df[field].notna() & df[f"{field}_ai"].isna()) |
+                                   (df[field].notna() & df[f"{field}_ai"].notna() & (
+                                               df[field] != df[f"{field}_ai"]))).sum()
+
+            # 抽取错误的数量(含原文无)
+            extract_correct_count = total_count - extract_error_count  # 抽取正确的数量(含原文无)
+            extract_error_no_null = (df[field].notna() & (df[field] != df.get(f"{field}_ai", df[field]))).sum()  # 抽取错误的数量(不含原文无)
+            extract_correct_no_null = valid_count - extract_error_no_null  # 抽取有值且正确数量(不含原文无)
+
+        # 计算比率
+        recognition_rate = valid_count / total_count if total_count else 0  # 识别率
+        recognition_correct_rate = extract_correct_count / total_count if total_count else 0  # 识别正确率
+        correct_rate = extract_correct_no_null / valid_count if valid_count else 0  # 正确率(原文存在情况下)
+
+        results.append([
+            field, total_count, null_count, valid_count, extract_error_count,
+            extract_correct_count, extract_error_no_null, extract_correct_no_null,
+            f"{recognition_rate:.2%}", f"{recognition_correct_rate:.2%}", f"{correct_rate:.2%}"
+        ])
+        results.append({
+            "field_name": field,
+            "sample_total": total_count,
+            "original_null": null_count,
+            "original_exist": valid_count,
+            "extract_error_total": extract_error_count,
+            "extract_correct_total": extract_correct_count,
+            "extract_error_exist": extract_error_no_null,
+            "extract_correct_exist": extract_correct_no_null,
+            "recognition_rate": f"{recognition_rate:.2%}",
+            "correct_recognition_rate": f"{recognition_correct_rate:.2%}",
+            "accuracy_rate": f"{correct_rate:.2%}",
+            "data_type": category
+        })
+
+    columns = ["字段", "样本数据总量", "原文无值", "原文有值的数量", "抽取错误的数量(含原文无)",
+               "抽取正确的数量(含原文无)", "抽取错误的数量(不含原文无)",
+               "抽取有值且正确数量(不含原文无)", "识别率", "识别正确率", "正确率(原文存在情况下)"]
+    df_fields = pd.DataFrame(results, columns=columns)
+
+    # 整行统计数据
+    df_overall = pd.DataFrame([["数据总量", total_count],
+                               ["整行都正确的数量", correct_rows],
+                               ["单行正确率", f"{single_row_accuracy:.2%}"]],
+                              columns=["指标", "数值"])
+
+    # 构建整体统计
+    overall_data = {
+        "total_data_count": total_count,
+        "correct_rows_count": correct_rows,
+        "row_accuracy": f"{correct_rows / total_count:.2%}" if total_count else "0.00%",
+        "data_type": category
+    }
+    return df_fields,df_overall,overall_data
+
+
+# # 计算整体正确率
+# def calculate_overall_accuracy(df, fields):
+#     """ 计算整行正确的数量及单行正确率 """
+#     total_count = len(df)  # 样本总量
+#
+#     # 判断每行所有字段是否都正确(projectname 需使用互为包含逻辑)
+#     def is_row_correct(row):
+#         for field in fields:
+#             if pd.isna(row[field]) and pd.isna(row[f"{field}_ai"]):  # 如果原值和 AI 值都为空,算正确
+#                 continue
+#             if field == "projectname":
+#                 if not is_contained(row["projectname"], row["projectname_ai"]):  # projectname 互为包含
+#                     return False
+#             else:
+#                 if row[field] != row.get(f"{field}_ai", row[field]):  # 其他字段直接对比
+#                     return False
+#         return True
+#
+#     correct_rows = df.apply(is_row_correct, axis=1).sum()  # 统计整行正确的数量
+#     single_row_accuracy = correct_rows / total_count if total_count else 0  # 计算单行正确率
+#
+#     return pd.DataFrame([["数据总量", total_count],
+#                          ["整行都正确的数量", correct_rows],
+#                          ["单行正确率", f"{single_row_accuracy:.2%}"]],
+#                         columns=["指标", "数值"])
+
+# 导出 Excel
+def export_to_excel(df_bid_fields, df_bid_overall,df_tender_fields,df_tender_overall):
+    file_path = "数据分析结果.xlsx"
+    with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
+        df_bid_fields.to_excel(writer, sheet_name="字段统计-中标类", index=False)
+        df_bid_overall.to_excel(writer, sheet_name="整体正确率-中标类", index=False)
+        df_tender_fields.to_excel(writer, sheet_name="字段统计-招标类", index=False)
+        df_tender_overall.to_excel(writer, sheet_name="整体正确率-招标类", index=False)
+
+        # Excel 格式优化
+        workbook = writer.book
+        for sheet in workbook.sheetnames:
+            ws = workbook[sheet]
+            for col in ws.columns:
+                max_length = 0
+                col_letter = col[0].column_letter
+                for cell in col:
+                    try:
+                        if cell.value:
+                            max_length = max(max_length, len(str(cell.value)))
+                    except:
+                        pass
+                ws.column_dimensions[col_letter].width = max_length + 2  # 调整列宽
+
+            # 加粗第一行
+            for cell in ws[1]:
+                cell.font = Font(bold=True)
+                cell.alignment = Alignment(horizontal="center", vertical="center")
+
+    print(f"Excel 文件已保存:{file_path}")
+
+def save_to_database(df_fields, df_overall):
+    """保存到优化后的数据库结构"""
+    conn = pymysql.connect(**MYSQL_CONFIG)
+    cursor = conn.cursor()
+
+    try:
+        # 插入字段统计
+        for _, row in df_fields.iterrows():
+            sql = """
+            INSERT INTO sample_data_analysis (
+                field_name, sample_total, original_null, original_exist,
+                extract_error_total, extract_correct_total, extract_error_exist,
+                extract_correct_exist, recognition_rate, correct_recognition_rate,
+                accuracy_rate, data_type
+            ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+            """
+            cursor.execute(sql, (
+                row['field_name'], row['sample_total'], row['original_null'],
+                row['original_exist'], row['extract_error_total'],
+                row['extract_correct_total'], row['extract_error_exist'],
+                row['extract_correct_exist'], row['recognition_rate'],
+                row['correct_recognition_rate'], row['accuracy_rate'],
+                row['data_type']
+            ))
+
+        # 插入整体统计
+        for _, row in df_overall.iterrows():
+            sql = """
+            INSERT INTO data_quality_analysis 
+            (total_data_count, correct_rows_count, row_accuracy, data_type)
+            VALUES (%s,%s,%s,%s)
+            """
+            cursor.execute(sql, (
+                row['total_data_count'], row['correct_rows_count'],
+                row['row_accuracy'], row['data_type']
+            ))
+
+        conn.commit()
+        print(f"成功插入 {len(df_fields)} 条字段记录和 {len(df_overall)} 条整体记录")
+    except Exception as e:
+        conn.rollback()
+        print(f"数据库操作失败: {str(e)}")
+        raise  # 抛出异常以便调试
+    finally:
+        cursor.close()
+        conn.close()
+
+# 主函数
+def main():
+    df = fetch_data()
+    df_bid_fields, df_bid_overall = calculate_metrics_and_accuracy(df, "中标类")
+    df_tender_fields, df_tender_overall = calculate_metrics_and_accuracy(df, "招标类")
+    export_to_excel(df_bid_fields, df_bid_overall,df_tender_fields,df_tender_overall)
+    # 合并结果
+    all_fields = pd.concat([df_bid_fields, df_tender_fields])
+    all_overall = pd.concat([df_bid_overall, df_tender_overall])
+    # 存储数据
+    save_to_database(all_fields, all_overall)
+
+if __name__ == "__main__":
+    main()
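
As a small illustration of the matching rules used in `calculate_metrics_and_accuracy` above (projectname counts as correct when either value contains the other; every other field needs exact equality, with two empty values also counting as correct), a self-contained sketch:

import pandas as pd

def is_contained(str1, str2):
    # Same rule as in the script above: containment in either direction is a match
    if pd.isna(str1) or pd.isna(str2):
        return False
    return str1 in str2 or str2 in str1

print(is_contained("学校设备采购项目", "学校设备采购项目(二次)"))  # True: one contains the other
print(is_contained("学校设备采购项目", None))                      # False: a missing value never matches
# For the remaining fields the row check treats two empty values as correct,
# otherwise it compares the original value and the *_ai value for exact equality.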

BIN
tools/高质量站点第一版/111.xlsx


+ 51 - 0
tools/高质量站点第一版/_id.csv

@@ -0,0 +1,51 @@
+spidercode
+a_qgzbgggsssyq_qbgg
+a_zgzfcgw_zydwzfcgyxgk_gjjs_01
+sc_gzzwsjjcgxt_jjgg
+gd_gdszfcgw_syss_cggg
+jx_jxszfcgdzmc_ggdt_htgg
+a_zgzfcgw_zfcghtgg_new
+a_zgjcjtcgpt_fzbgg_cggg
+a_zgjcjtcgpt_fzbgg_jggg
+hn_hnszfcgdzmc_hnsbj_ggdt
+xj_xjwwezzqzfcgw_dzmcgg_cgcg
+a_zgzfcgw_zydwzfcgyxgk_gjjs_new_01
+a_zgzbtbggfwpt_zbgg2
+js_ntszfcgwssc_xjgg_xqgg
+gx_gxzzzzqzfcg_dzmchtgg
+zj_zjzfcgw_cggg_sylx
+a_oycg_gkcggg
+ah_ahzfcgypt_cjgg
+a_zgjcjtcgpt_fzbgg_bggg
+a_jdcgwxwz_cgdtzxcgxx
+js_ntszfcgwssc_xjgg_cjgg
+gd_gdswszjfwcs_cggg
+a_zgzbtbggfwpt_zhbjggs2
+ah_ahzfcgypt_htgg
+sd_zgsdzfcgw_xxgk_sxhtgk
+gd_gdszfcgw_syss_dzmc
+a_jsxmhjyxdjbbaxt_gg_nipc
+a_zgzbtbggfwpt_wasjgf_zbgg
+gz_gzszfcgdzmc_gzsbj_ggdt_01
+ah_ahzfcgypt_ysgg
+ha_hnstzxmzxspjgptbsdt_xmbljggs_njpc
+hb_hbzwfww_bacx_njpc
+a_zgzbycgw_zbxx_zbxx
+a_gtcgpt_cgjg
+a_zgjcjtcgpt_zbzq_zhbgg
+xj_xjwwezzqzfcgw_dzmcgg_wscs
+gd_gdswszjfwcs_zxgs
+sd_zgsdzfcgw_sxzhbgg_new
+a_syjtyxgs_zh
+jx_jxswszjfwcs_cggg
+a_gjggzyjypt_gcjs_kbjl
+nm_nmgzzqzfcgw_dzmc_htgs
+a_zgzbtbggfwpt_wasjgf_kbjl
+a_gtcgpt_cggg
+jx_jxszfcgdzmc_htgg
+js_jsstzxmzxspjgpt_gsxx_bazcx_njpc
+a_zgzbtbggfwpt_zhbhxrgs2
+a_bjgc_jggs
+a_zgzbycgw_zbxx_zb
+a_zgzfcgw_dfgg_new
+a_zgzfcgw_zydwzfcgyxgk_gjjs

+ 67 - 0
tools/高质量站点第一版/test.py

@@ -0,0 +1,67 @@
+import pandas as pd
+from collections import defaultdict
+
+
+def transform_spider_data(input_file, output_file):
+    # 读取原始数据
+    raw_df = pd.read_excel(input_file)
+
+    # 初始化结果字典
+    result = defaultdict(lambda: {
+        '采购意向': 0,
+        '预告': 0,
+        '招标': 0,
+        '结果': 0,
+        '信用': 0
+    })
+
+    # 定义列名映射(原始列名 -> 类型名)
+    column_mapping = {
+        '采购意向': '采购意向',
+        '预告': '预告',
+        '招标': '招标',
+        '结果': '结果',
+        '信用': '信用'
+    }
+
+    # 处理每一对spidercode-类型列
+    for i in range(0, len(raw_df.columns), 2):
+        if i + 1 >= len(raw_df.columns):
+            break
+
+        # 获取当前列对
+        spidercode_col = raw_df.columns[i]
+        count_col = raw_df.columns[i + 1]
+
+        # 获取类型名称
+        typ = column_mapping.get(count_col, count_col)
+
+        # 处理每一行数据
+        for _, row in raw_df.iterrows():
+            spidercode = row[spidercode_col]
+            count = row[count_col]
+
+            if pd.notna(spidercode) and pd.notna(count):
+                try:
+                    result[spidercode][typ] += int(count)
+                except (ValueError, TypeError):
+                    continue
+
+    # 转换为DataFrame
+    df = pd.DataFrame.from_dict(result, orient='index')
+    df.reset_index(inplace=True)
+    df.rename(columns={'index': 'spidercode'}, inplace=True)
+
+    # 计算总量
+    df['总量'] = df[['采购意向', '预告', '招标', '结果', '信用']].sum(axis=1)
+
+    # 重新排序列
+    df = df[['spidercode', '总量', '采购意向', '预告', '招标', '结果', '信用']]
+
+    # 保存结果
+    df.to_excel(output_file, index=False)
+    print(f"转换完成,结果已保存到{output_file}")
+
+
+# 使用示例
+transform_spider_data('111.xlsx', 'transformed_data.xlsx')

+ 37 - 0
tools/高质量站点第一版/根据id找出爬虫代码.py

@@ -0,0 +1,37 @@
+import csv
+from pymongo import MongoClient
+
+
+def mark_spidercodes_in_mongo(csv_file_path, mongo_uri, db_name, collection_name):
+    # 连接到MongoDB
+    client = MongoClient(mongo_uri)
+    db = client[db_name]
+    collection = db[collection_name]
+
+    # 读取 CSV 文件中的 spidercodes
+    with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
+        csv_reader = csv.reader(csv_file)
+        next(csv_reader)  # 跳过标题行
+        spidercodes = [row[0] for row in csv_reader]  # 假设 spidercode 是第一列
+
+    # 去除重复的 spidercodes
+    unique_spidercodes = list(set(spidercodes))
+
+    # 批量更新符合条件的文档,设置 flag=1
+    result = collection.update_many(
+        {"spidercode": {"$in": unique_spidercodes}},
+        {"$set": {"tag": 1}}
+    )
+
+    print(f"成功更新了 {result.modified_count} 条文档")
+
+
+# 使用示例
+if __name__ == "__main__":
+    # 配置参数
+    csv_file_path = 'spidercodes.csv'  # 替换为你的CSV文件路径
+    mongo_uri = 'mongodb://172.20.45.129:27002/'  # MongoDB连接字符串
+    db_name = 'data_quality'  # 数据库名称
+    collection_name = 'bidding_202505'  # 集合名称
+
+    mark_spidercodes_in_mongo(csv_file_path, mongo_uri, db_name, collection_name)

+ 43 - 0
tools/高质量站点第一版/统计标讯数量.py

@@ -0,0 +1,43 @@
+import csv
+from pymongo import MongoClient
+from collections import defaultdict
+def count_spidercodes_in_mongo(csv_file_path, mongo_uri, db_name, collection_name):
+    # 连接到MongoDB
+    client = MongoClient(mongo_uri)
+    db = client[db_name]
+    collection = db[collection_name]
+
+    # 读取 CSV 文件(严格按行顺序)
+    with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
+        csv_reader = csv.reader(csv_file)
+        next(csv_reader)  # 跳过标题行
+        spidercodes = [row[0] for row in csv_reader]  # 假设 spidercode 是第一列
+
+        # 一次性查询所有符合条件的数据(避免多次查询)
+        # query = {"toptype": "招标", "spidercode": {"$in": list(set(spidercodes))}}
+        # 查询条件:subtype 是 "中标" 或 "成交"
+        query = {
+            "subtype": {"$in": ["中标", "成交"]},
+            "spidercode": {"$in": list(set(spidercodes))}
+        }
+        cursor = collection.find(query, {"spidercode": 1})
+
+        # 统计每个 spidercode 的数量
+        code_counts = {}
+        for doc in cursor:
+            code = doc["spidercode"]
+            code_counts[code] = code_counts.get(code, 0) + 1
+
+        # 按 CSV 顺序输出
+        for code in spidercodes:
+            print(f"{code}: {code_counts.get(code, 0)}")
+
+# 使用示例
+if __name__ == "__main__":
+    # 配置参数
+    csv_file_path = 'spidercodes.csv'  # 替换为你的CSV文件路径
+    mongo_uri = 'mongodb://172.20.45.129:27002/'  # MongoDB连接字符串
+    db_name = 'data_quality'  # 数据库名称
+    collection_name = 'result_new'  # 集合名称
+
+    count_spidercodes_in_mongo(csv_file_path, mongo_uri, db_name, collection_name)
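
The script above pulls every matching document and tallies the counts in Python; the same per-spidercode totals could also be computed server-side with a $group stage. A sketch of that alternative, using the names from the function above (not what the commit does):

# Group counts per spidercode inside MongoDB instead of iterating the cursor
pipeline = [
    {"$match": {"subtype": {"$in": ["中标", "成交"]},
                "spidercode": {"$in": list(set(spidercodes))}}},
    {"$group": {"_id": "$spidercode", "count": {"$sum": 1}}},
]
code_counts = {row["_id"]: row["count"] for row in collection.aggregate(pipeline)}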

+ 95 - 0
tools/高质量站点第一版/高质量站点-正文规则角度.py

@@ -0,0 +1,95 @@
+import pymongo
+from openpyxl import Workbook
+from openpyxl.styles import Font
+import datetime
+
+# MongoDB连接配置
+client = pymongo.MongoClient("mongodb://172.20.45.129:27002/")
+db = client["data_quality"]  # 替换为你的数据库名
+collection = db["final_results"]  # 替换为你的集合名
+
+# 自定义字段组合 - 这些字段的_flag值需要都为1
+required_fields = [
+    "area_flag",
+    # "multipackage_flag",
+    "projectname_flag",
+    # "projectcode_flag",
+    # "budget_flag",
+    # "s_winner_flag",
+    "buyer_flag",
+    "city_flag",
+    # "toptype_flag",
+    # "subtype_flag",
+    # "bidamount_flag"
+]
+
+# 查询条件:所有required_fields的_flag值都为1
+query = {field: 1 for field in required_fields}
+query["toptype"] = "采购意向"
+
+# query = {
+#     "$and": [
+#         {"$or": [{"subtype": "中标"}, {"subtype": "成交"}]},
+#         {field: 1 for field in required_fields}  # 直接放字典,不要用 **
+#     ]
+# }
+# 要导出的字段
+export_fields = ["site", "channel", "spidercode"]
+
+# 获取数据并按spidercode分组
+results = collection.find(query)
+grouped_data = {}
+
+for doc in results:
+    spidercode = doc.get("spidercode", "unknown")
+    if spidercode not in grouped_data:
+        # 确保存储的是字典,而不是字符串
+        grouped_data[spidercode] = {
+            "site": doc.get("site", ""),
+            "channel": doc.get("channel", ""),
+            "spidercode": spidercode
+        }
+
+
+# 创建Excel工作簿
+wb = Workbook()
+ws = wb.active
+ws.title = "Export Results"
+
+# 写入表头
+headers = ["序号"] + export_fields
+ws.append(headers)
+
+# 设置表头样式
+for cell in ws[1]:
+    cell.font = Font(bold=True)
+
+# 写入数据
+row_num = 2
+for idx, (spidercode, record) in enumerate(grouped_data.items(), start=1):
+    ws.append([
+        idx,  # 序号
+        record["site"],  # 直接访问字典,而不是用 .get()
+        record["channel"],
+        record["spidercode"]
+    ])
+    row_num += 1
+
+# 自动调整列宽
+for column in ws.columns:
+    max_length = 0
+    column_letter = column[0].column_letter
+    for cell in column:
+        try:
+            if len(str(cell.value)) > max_length:
+                max_length = len(str(cell.value))
+        except:
+            pass
+    adjusted_width = (max_length + 2)
+    ws.column_dimensions[column_letter].width = adjusted_width
+
+# 保存Excel文件
+filename = f"export_result_采购意向新.xlsx"
+wb.save(filename)
+
+print(f"导出完成,文件已保存为: {filename}")

+ 87 - 0
tools/高质量站点第一版/高质量站点-脚本1.py

@@ -0,0 +1,87 @@
+from bson import ObjectId
+from pymongo import MongoClient
+
+
+def process_tagged_documents():
+    # 直接在URI中包含用户名和密码
+    username = "viewdata"
+    password = "viewdata"
+    host = "127.0.0.1"  # 例如: localhost 或 192.168.1.100
+    port = "27088"  # 默认MongoDB端口
+
+    # 构建连接URI
+    mongo_uri = f"mongodb://{username}:{password}@{host}:{port}/"
+
+
+    # 连接MongoDB
+    client1 = MongoClient('mongodb://127.0.0.1:27087/',unicode_decode_error_handler="ignore", directConnection=True)  #清洗库
+    client2 = MongoClient(mongo_uri,unicode_decode_error_handler="ignore", directConnection=True)  #bidding库
+    client3 = MongoClient('mongodb://172.20.45.129:27002/')  #测试库
+
+    # 定义数据库和集合
+    db1 = client1['jyqykhfw']  # 替换为实际的数据库1名称
+    db2 = client2['qfw']  # 替换为实际的数据库2名称
+    db3 = client3['data_quality']  # 替换为实际的数据库3名称
+
+    collection1 = db1['f_sourceinfo_chinaunicom_zhong']  # 替换为实际的集合1名称
+    collection2 = db2['bidding']  # 替换为实际的bidding集合名称
+    collection3 = db3['result']  # 替换为实际的结果集合名称
+
+    # 定义要检测的字段列表(可自定义)
+    fields_to_check = ['projectname','projectcode','area','city','budget','bidamount', 's_winner', 'buyer']
+
+
+    # 组合查询条件(增加_id条件)
+    combined_query = {
+        '$and': [
+            {'i_ckdata': {'$gt': 1}},
+            # {'_id': {'$lt': ObjectId("67a5a0563309c0998b14b361")}},
+            *[{f'v_taginfo.{field}': 1} for field in fields_to_check]
+        ]
+    }
+    # 按照_id升序排序(1表示升序,-1表示降序)
+    sort_order = [('_id', -1)]
+    # 遍历符合条件的文档
+    for doc in collection1.find(combined_query).sort(sort_order):
+        doc_id = ObjectId(doc['id'])
+
+        # 在库2的bidding集合中查找该id
+        bidding_doc = collection2.find_one({'_id': doc_id})
+        if bidding_doc:
+            site = bidding_doc.get('site')
+            channel = bidding_doc.get('channel')
+            spidercode = bidding_doc.get('spidercode')
+            current_flag = 2
+            # 检查库3中是否已存在相同的site和channel,spidercode组合
+            existing_record = collection3.find_one({
+                'site': site,
+                'channel': channel,
+                'spidercode': spidercode,
+                'flag':current_flag
+            })
+            if existing_record:
+                print(f"记录已存在,跳过: _id={doc_id},site={site}, channel={channel}")
+                continue
+            # 提取需要的字段
+            result = {
+                '_id': doc_id,
+                'site': bidding_doc.get('site'),
+                'channel': bidding_doc.get('channel'),
+                'spidercode': bidding_doc.get('spidercode'),
+                'toptype': bidding_doc.get('toptype'),
+                'subtype': bidding_doc.get('subtype'),
+                'flag': current_flag  # 可以根据需要设置不同的flag值
+            }
+            # 存入库3
+            collection3.update_one(
+                {'_id': doc_id},
+                {'$set': result},
+                upsert=True
+            )
+            print(f"已存入新记录: _id={doc_id}, site={site}, channel={channel}")
+        else:
+            print(f"bidding记录不存在,跳过: id={doc_id}")
+
+
+if __name__ == '__main__':
+    process_tagged_documents()

+ 97 - 0
tools/高质量站点第一版/高质量站点-脚本2.py

@@ -0,0 +1,97 @@
+from bson import ObjectId
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import BulkWriteError
+import time
+
+
+def process_tagged_documents_batch(batch_size=100):
+    # 连接MongoDB
+    client1 = MongoClient('mongodb://127.0.0.1:27087/',
+                          unicode_decode_error_handler="ignore",
+                          directConnection=True)  # 清洗库
+    client3 = MongoClient('mongodb://172.20.45.129:27002/')  # 测试库
+
+    # 定义数据库和集合
+    db1 = client1['jyqykhfw']
+    db3 = client3['data_quality']
+
+    collection1 = db1['f_sourceinfo_chinaunicom_zhong']
+    collection3 = db3['result_new']
+
+    # 查询库1中i_ckdata>1的文档
+    base_query = {'i_ckdata': {'$gt': 1}}
+    # 按照_id升序排序(1表示升序,-1表示降序)
+    sort_order = [('_id', 1)]
+
+    # 批量处理变量
+    operations = []
+    processed_count = 0
+    start_time = time.time()
+
+    try:
+        # 使用批量处理方式
+        for doc in collection1.find(base_query).sort(sort_order).batch_size(batch_size):
+            try:
+                _id=doc['_id']
+                doc_id = ObjectId(doc['id'])
+
+                v_taginfo = doc.get('v_taginfo', {})
+                v_baseinfo = doc.get('v_baseinfo', {})
+                site = v_baseinfo.get('site', '')
+                channel = v_baseinfo.get('channel', '')
+                spidercode = v_baseinfo.get('spidercode', '')
+                toptype = v_baseinfo.get('toptype', '')
+                subtype = v_baseinfo.get('subtype', '')
+
+                result = {
+                    'v_taginfo': v_taginfo,
+                    'site': site,
+                    'channel': channel,
+                    'spidercode': spidercode,
+                    'toptype' : toptype,
+                    'subtype' : subtype
+                }
+                # 输出当前正在处理的ID
+                print(f"正在处理文档 _id:{_id} , id: {doc_id}")
+
+                # 使用UpdateOne构建正确的批量操作
+                operations.append(
+                    UpdateOne(
+                        {'_id': doc_id},
+                        {'$set': result},
+                        upsert=True
+                    )
+                )
+
+                # 当达到批量大小时执行批量操作
+                if len(operations) >= batch_size:
+                    collection3.bulk_write(operations, ordered=False)
+                    processed_count += len(operations)
+                    print(f"已批量处理 {processed_count} 条记录")
+                    operations = []
+
+            except Exception as e:
+                print(f"处理文档 {doc.get('_id')} 时出错: {str(e)}")
+                continue
+
+        # 处理剩余的不足一个批次的操作
+        if operations:
+            collection3.bulk_write(operations, ordered=False)
+            processed_count += len(operations)
+            print(f"最后一批处理了 {len(operations)} 条记录")
+
+    except BulkWriteError as bwe:
+        print(f"批量写入时发生错误: {bwe.details}")
+    except Exception as e:
+        print(f"处理过程中发生错误: {str(e)}")
+    finally:
+        end_time = time.time()
+        print(f"处理完成,共处理 {processed_count} 条记录")
+        print(f"总耗时: {end_time - start_time:.2f} 秒")
+        client1.close()
+        client3.close()
+
+
+if __name__ == '__main__':
+    # 可以调整batch_size参数以获得最佳性能
+    process_tagged_documents_batch(batch_size=500)
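If a long batch run is interrupted, 脚本2 restarts from the beginning; because documents are read in ascending _id order, the scan can instead resume after the last _id that was written. A sketch of the idea only, assuming the same collections as the script (the checkpoint value below is a placeholder, not a real id):

    from bson import ObjectId

    # Resume an _id-ordered scan after the last processed document (hypothetical checkpoint)
    last_seen = ObjectId("000000000000000000000000")
    resume_query = {'i_ckdata': {'$gt': 1}, '_id': {'$gt': last_seen}}
    # collection1.find(resume_query).sort([('_id', 1)]) then continues exactly as in the script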

+ 56 - 0
tools/高质量站点第一版/高质量站点-脚本3.py

@@ -0,0 +1,56 @@
+from bson import ObjectId
+from pymongo import MongoClient
+
+
+def process_tagged_documents():
+
+    # 连接MongoDB
+    client1 = MongoClient('mongodb://127.0.0.1:27087/',unicode_decode_error_handler="ignore", directConnection=True)  #清洗库
+    client3 = MongoClient('mongodb://172.20.45.129:27002/')  #测试库
+
+    # 定义数据库和集合
+    db1 = client1['jyqykhfw']  # 替换为实际的数据库1名称
+    db3 = client3['data_quality']  # 替换为实际的数据库3名称
+
+    collection1 = db1['f_sourceinfo_2025Jslt_yys']  # 替换为实际的集合1名称
+    collection3 = db3['result_new']  # 替换为实际的结果集合名称
+
+    # 查询库1中i_ckdata>1的文档
+    base_query = {
+        '$and': [
+            {'i_ckdata': {'$gt': 1}},
+        ]
+    }
+    # 按照_id升序排序(1表示升序,-1表示降序)
+    sort_order = [('_id', 1)]
+    # 遍历符合条件的文档
+    for doc in collection1.find(base_query).sort(sort_order):
+        _id=doc['_id']
+        doc_id = ObjectId(doc['id'])
+
+        # 检查v_taginfo中的字段
+        v_taginfo = doc.get('v_taginfo', {})
+        v_baseinfo =doc.get('v_baseinfo',{})
+        site = v_baseinfo.get('site','')
+        channel = v_baseinfo.get('channel','')
+        spidercode = v_baseinfo.get('spidercode','')
+
+        result = {
+            'v_taginfo' :v_taginfo,
+            'site':site,
+            'channel':channel,
+            'spidercode':spidercode
+
+        }
+        # 存入库3
+        collection3.update_one(
+            {'_id': doc_id},
+            {'$set': result},
+            upsert=True
+        )
+        print(f"已存入新记录: _id={_id},id={doc_id}")
+
+
+
+if __name__ == '__main__':
+    process_tagged_documents()

+ 77 - 0
tools/高质量站点第一版/高质量站点-脚本41.py

@@ -0,0 +1,77 @@
+from bson import ObjectId
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import BulkWriteError
+
+
+def process_tagged_documents_batch():
+    # 连接MongoDB
+    client = MongoClient('mongodb://172.20.45.129:27002/')  # 测试库
+    db = client['data_quality']
+    collection = db['result_new']
+
+    # 定义要检测的字段列表(可自定义)
+    # fields_to_check = ['projectname', 'area', 'city', 'budget', 'buyer']
+    # fields_to_check = ['projectname', 'projectcode','area', 'city', 'buyer']
+    # fields_to_check = ['projectname', 'projectcode','area', 'city','budget', 'buyer','toptype','subtype']
+    fields_to_check = ['projectname', 'projectcode', 'area', 'city', 'bidamount', 's_winner', 'toptype', 'subtype']
+    sort_order = [('_id', 1)]
+
+    # 构建查询条件:subtype为"合同"或"验收"且所有指定字段在v_taginfo中都为1
+    query = {
+        "$and": [
+            {"$or": [{"subtype": "合同"}, {"subtype": "验收"}]},
+            *[{f"v_taginfo.{field}": 1} for field in fields_to_check]
+        ]
+    }
+
+    # 批量处理参数
+    batch_size = 500  # 每批处理500个文档
+    operations = []
+    processed_count = 0
+
+    try:
+        # 查询符合条件的文档
+        cursor = collection.find(query).sort(sort_order).batch_size(batch_size)
+
+        for doc in cursor:
+            try:
+                doc_id = doc['_id']
+                # 输出当前正在处理的ID
+                print(f"正在处理文档 _id: {doc_id}")
+                # 准备批量操作
+                operations.append(
+                    UpdateOne(
+                        {'_id': doc_id},
+                        {'$set': {'flag5': 1}},
+                        upsert=True
+                    )
+                )
+
+                # 达到批量大小时执行
+                if len(operations) >= batch_size:
+                    collection.bulk_write(operations, ordered=False)
+                    processed_count += len(operations)
+                    print(f"已批量处理 {processed_count} 条记录")
+                    operations = []
+
+            except Exception as e:
+                print(f"处理文档 {doc.get('_id')} 时出错: {str(e)}")
+                continue
+
+        # 处理剩余的不足一个批次的操作
+        if operations:
+            collection.bulk_write(operations, ordered=False)
+            processed_count += len(operations)
+            print(f"最后一批处理了 {len(operations)} 条记录")
+
+    except BulkWriteError as bwe:
+        print(f"批量写入时发生错误: {bwe.details}")
+    except Exception as e:
+        print(f"处理过程中发生错误: {str(e)}")
+    finally:
+        client.close()
+        print(f"处理完成,共处理 {processed_count} 条记录")
+
+
+if __name__ == '__main__':
+    process_tagged_documents_batch()
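After the tagging pass it is worth confirming how many documents actually carry the new flag5 mark; count_documents gives a quick check. A minimal sketch against the same result_new collection used above:

    from pymongo import MongoClient

    coll = MongoClient('mongodb://172.20.45.129:27002/')['data_quality']['result_new']
    # How many 合同/验收 documents were tagged with flag5=1 by 脚本41
    print(coll.count_documents({'flag5': 1, 'subtype': {'$in': ['合同', '验收']}}))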

+ 89 - 0
tools/高质量站点第一版/高质量站点-脚本51.py

@@ -0,0 +1,89 @@
+from bson import ObjectId
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import BulkWriteError
+
+
+def process_tagged_documents_batch():
+    # 连接MongoDB
+    client = MongoClient('mongodb://172.20.45.129:27002/')  # 测试库
+    db = client['data_quality']
+    collection = db['result_new']
+
+    # 定义要检测的字段列表(可自定义)
+    # fields_to_check = ['projectname', 'area', 'city', 'budget', 'buyer']
+    # fields_to_check = ['projectname', 'projectcode', 'area', 'city', 'buyer']
+    # fields_to_check = ['projectname', 'projectcode', 'area', 'city', 'budget', 'buyer', 'toptype', 'subtype']
+    fields_to_check = ['projectname', 'projectcode', 'area', 'city', 'bidamount', 's_winner', 'toptype', 'subtype']
+
+    sort_order = [('_id', 1)]
+
+    # 批量处理参数
+    batch_size = 500  # 每批处理500个文档
+    operations = []
+    processed_count = 0
+    flagged_count = 0  # 统计被打标记的文档数量
+
+    query = {
+        "subtype": {"$in": ["合同", "验收"]}
+    }
+
+    try:
+        # 查询所有文档(或者可以根据需要添加其他查询条件)
+        cursor = collection.find(query).sort(sort_order).batch_size(batch_size)
+
+        for doc in cursor:
+            try:
+                doc_id = doc['_id']
+                # 检查是否有字段值为2
+                has_invalid_field = False  # 初始化标记变量
+
+                # 详细检查每个字段是否有值为2的情况
+                for field in fields_to_check:
+                    # 获取嵌套字段v_taginfo下的字段值,默认为0
+                    field_value = doc.get('v_taginfo', {}).get(field, 0)
+                    if field_value == 2:
+                        has_invalid_field = True
+                        break  # 发现一个无效字段就停止检查
+
+                # 仅当有字段值为2时才设置err5=1
+                if has_invalid_field:
+                    # 准备批量操作
+                    operations.append(
+                        UpdateOne(
+                            {'_id': doc_id},
+                            {'$set': {'err5': 1}}
+                            # 注意:这里移除了upsert=True,因为我们只更新已存在的文档
+                        )
+                    )
+                    flagged_count += 1
+                    print(f"标记文档 _id: {doc_id} (检测到字段值为2)")
+
+                # 达到批量大小时执行
+                if len(operations) >= batch_size:
+                    if operations:  # 确保操作列表不为空
+                        collection.bulk_write(operations, ordered=False)
+                        processed_count += len(operations)
+                        print(f"已批量处理 {processed_count} 条记录 (其中 {flagged_count} 条被标记)")
+                        operations = []
+
+            except Exception as e:
+                print(f"处理文档 {doc.get('_id')} 时出错: {str(e)}")
+                continue
+
+        # 处理剩余的不足一个批次的操作
+        if operations:
+            collection.bulk_write(operations, ordered=False)
+            processed_count += len(operations)
+            print(f"最后一批处理了 {len(operations)} 条记录")
+
+    except BulkWriteError as bwe:
+        print(f"批量写入时发生错误: {bwe.details}")
+    except Exception as e:
+        print(f"处理过程中发生错误: {str(e)}")
+    finally:
+        client.close()
+        print(f"处理完成,共处理 {processed_count} 条记录,其中 {flagged_count} 条被标记为flag=1")
+
+
+if __name__ == '__main__':
+    process_tagged_documents_batch()
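The field-by-field check in 脚本51 happens client-side after every document is pulled over the wire; the same condition can be pushed into the query with $or so MongoDB only returns documents that actually contain a 2, and update_many can then set the error mark in one call. A sketch of that alternative, assuming the same fields_to_check list and collection as the script:

    fields_to_check = ['projectname', 'projectcode', 'area', 'city',
                       'bidamount', 's_winner', 'toptype', 'subtype']
    query = {
        'subtype': {'$in': ['合同', '验收']},
        '$or': [{f'v_taginfo.{f}': 2} for f in fields_to_check],
    }
    # collection.update_many(query, {'$set': {'err5': 1}}) marks all matches in one round trip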

+ 98 - 0
tools/高质量站点第一版/高质量站点-脚本61.py

@@ -0,0 +1,98 @@
+from pymongo import MongoClient, UpdateOne
+from pymongo.errors import BulkWriteError
+
+
+def process_filtered_by_spidercode():
+    # 连接配置
+    mongo_uri = "mongodb://viewdata:viewdata@127.0.0.1:27088/"
+
+    # 连接MongoDB
+    client2 = MongoClient(mongo_uri, unicode_decode_error_handler="ignore", directConnection=True)  # bidding库
+    client3 = MongoClient('mongodb://172.20.45.129:27002/')  # 测试库
+
+    # 定义数据库和集合
+    db2 = client2['qfw']
+    db3 = client3['data_quality']
+    collection2 = db2['bidding']
+    collection3 = db3['result_new']
+    collection4 = db3['result2']
+
+    # 批量处理参数
+    batch_size = 500
+    operations = []
+    processed_spidercodes = set()  # 内存去重集合
+
+    try:
+        # 查询条件
+        query = {
+            'err1': 1
+        }
+
+        # 获取符合条件的文档ID(只获取_id字段提高性能)
+        filtered_ids = [doc['_id'] for doc in collection3.find(query, {'_id': 1})]
+        print(f"找到 {len(filtered_ids)} 条符合条件的记录")
+
+        # 批量查询bidding集合获取爬虫代码和其他信息
+        bidding_docs = collection2.find(
+            {'_id': {'$in': filtered_ids}},
+            {'site': 1, 'channel': 1, 'spidercode': 1, '_id': 1}
+        )
+
+        # 处理每个文档
+        for doc in bidding_docs:
+            try:
+                spidercode = (doc.get('spidercode') or '').strip()  # 字段值为None时也能安全strip
+
+                # 检查爬虫代码有效性
+                if not spidercode:
+                    print(f"无效爬虫代码,跳过文档: _id={doc.get('_id')}")
+                    continue
+
+                # 内存去重检查
+                if spidercode in processed_spidercodes:
+                    continue
+
+                processed_spidercodes.add(spidercode)
+
+                # 准备批量操作(以spidercode为唯一键)
+                operations.append(
+                    UpdateOne(
+                        {'spidercode': spidercode},  # 去重依据
+                        {
+                            '$set': {
+                                'site': doc.get('site'),
+                                'channel': doc.get('channel'),
+                                'spidercode': spidercode,
+                                'err1':1
+                            }
+                        },
+                        upsert=True
+                    )
+                )
+
+                # 执行批量操作
+                if len(operations) >= batch_size:
+                    collection4.bulk_write(operations, ordered=False)
+                    print(f"已处理 {len(processed_spidercodes)} 个唯一爬虫代码 | 当前批次: {len(operations)} 条")
+                    operations = []
+
+            except Exception as e:
+                print(f"处理文档 {doc.get('_id')} 时出错: {str(e)}")
+                continue
+
+        # 处理剩余操作
+        if operations:
+            collection4.bulk_write(operations, ordered=False)
+
+    except BulkWriteError as bwe:
+        print(f"批量写入错误: {bwe.details}")
+    except Exception as e:
+        print(f"处理错误: {str(e)}")
+    finally:
+        client2.close()
+        client3.close()
+        print(f"处理完成: 共存储 {len(processed_spidercodes)} 个唯一爬虫代码")
+
+
+if __name__ == '__main__':
+    process_filtered_by_spidercode()
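If err1=1 matches a very large number of ids, the single $in query against bidding can become unwieldy; slicing filtered_ids keeps each query bounded. A minimal sketch of the chunking, assuming filtered_ids and collection2 as built in the script (the helper name is illustrative):

    def chunks(ids, size=1000):
        # Yield consecutive slices of the id list so each $in query stays bounded
        for i in range(0, len(ids), size):
            yield ids[i:i + size]

    # for batch in chunks(filtered_ids):
    #     for doc in collection2.find({'_id': {'$in': batch}},
    #                                 {'site': 1, 'channel': 1, 'spidercode': 1}):
    #         ...  # same per-document handling as above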

+ 80 - 0
tools/高质量站点第一版/高质量站点-脚本7.py

@@ -0,0 +1,80 @@
+from pymongo import MongoClient
+import pandas as pd
+
+
+def export_unique_flag1_spidercodes():
+    # 连接MongoDB
+    client = MongoClient('mongodb://172.20.45.129:27002/')
+    db = client['data_quality']
+    collection = db['result_new']
+
+    try:
+        print("正在获取spidercode列表...")
+
+        # 1. 获取flag5=1的所有spidercode
+        flag5_spidercodes = set(collection.distinct(
+            "spidercode",
+            {"flag5": 1}
+        ))
+
+        # 2. 获取err5=1的所有spidercode
+        err5_spidercodes = set(collection.distinct(
+            "spidercode",
+            {"err5": 1}
+        ))
+
+        print(f"flag5=1的spidercode数量: {len(flag5_spidercodes)}")
+        print(f"err5=1的spidercode数量: {len(err5_spidercodes)}")
+
+        # 3. 找出flag5=1但不在err5=1中的spidercode
+        unique_spidercodes = flag5_spidercodes - err5_spidercodes
+        print(f"找到 {len(unique_spidercodes)} 个符合条件的唯一spidercode")
+
+        if not unique_spidercodes:
+            print("没有找到符合条件的数据")
+            return
+
+        # 4. 查询这些spidercode对应的文档(按spidercode分组,取每个组的第一条记录)
+        pipeline = [
+            {"$match": {
+                "spidercode": {"$in": list(unique_spidercodes)},
+                "flag5": 1
+            }},
+            {"$group": {
+                "_id": "$spidercode",
+                "site": {"$first": "$site"},
+                "channel": {"$first": "$channel"},
+                "spidercode": {"$first": "$spidercode"}
+            }}
+        ]
+
+        print("正在聚合查询文档...")
+        cursor = collection.aggregate(pipeline)
+
+        # 5. 创建DataFrame
+        df = pd.DataFrame(list(cursor))
+
+        # 移除MongoDB生成的_id列
+        if '_id' in df.columns:
+            df.drop('_id', axis=1, inplace=True)
+
+        # 6. 导出到Excel
+        output_file = "flag1_unique_spidercodes.xlsx"
+        df.to_excel(
+            output_file,
+            index=False,
+            columns=['spidercode', 'site', 'channel'],
+            engine='openpyxl'
+        )
+
+        print(f"成功导出 {len(df)} 条记录到 {output_file}")
+        print(f"导出的字段: {list(df.columns)}")
+
+    except Exception as e:
+        print(f"处理过程中发生错误: {str(e)}")
+    finally:
+        client.close()
+
+
+if __name__ == '__main__':
+    export_unique_flag1_spidercodes()
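A lightweight check after the export is to confirm the sheet really contains one row per unique spidercode, since the $group stage is supposed to deduplicate. A sketch, assuming the output file name used above:

    import pandas as pd

    df = pd.read_excel('flag1_unique_spidercodes.xlsx', engine='openpyxl')
    # Every spidercode should appear exactly once after the $group stage
    assert df['spidercode'].is_unique, "duplicate spidercode rows in export"
    print(f"exported {len(df)} unique spidercodes")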

+ 0 - 0
tools/高质量站点第二版/ai抽取和规则抽取对比结果.py


+ 0 - 0
tools/高质量站点第二版/增加一致性对比-智昆.py


+ 47 - 0
tools/高质量站点第二版/找出爬虫比例.py

@@ -0,0 +1,47 @@
+from pymongo import MongoClient
+import pandas as pd
+
+
+def count_spidercode_stats(db_name='your_db_name', collection_name='your_collection_name'):
+    # 连接MongoDB
+    client = MongoClient('mongodb://172.20.45.129:27002/')
+    db = client[db_name]
+    collection = db[collection_name]
+
+    # 聚合查询统计每个spidercode的数量
+    pipeline = [
+        {"$group": {
+            "_id": "$spidercode",
+            "count": {"$sum": 1}
+        }},
+        {"$sort": {"count": -1}}
+    ]
+
+    # 执行聚合查询
+    results = list(collection.aggregate(pipeline))
+
+    if not results:
+        print("没有找到数据")
+        return
+
+    # 转换为DataFrame
+    df = pd.DataFrame(results)
+    df.rename(columns={'_id': 'spidercode'}, inplace=True)
+
+    # 计算总数和占比
+    total_count = df['count'].sum()
+    df['percentage'] = (df['count'] / total_count * 100).round(2)
+
+    # 打印结果
+    print(f"总记录数: {total_count}")
+    print("\n每个spidercode的数量及占比:")
+    print(df.to_string(index=False))
+
+    # 保存到Excel
+    output_file = 'spidercode_stats.xlsx'
+    df.to_excel(output_file, index=False)
+    print(f"\n结果已保存到 {output_file}")
+
+
+# 使用示例
+count_spidercode_stats(db_name='data_quality', collection_name='bidding_20250515')
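Because the stats are already sorted by count in descending order, a cumulative-percentage column makes it easy to see how few spidercodes account for most of the sample. A sketch that extends the DataFrame built inside count_spidercode_stats (add it before the export step):

    # Cumulative share of records, in descending-count order
    df['cum_percentage'] = df['percentage'].cumsum().round(2)
    top = (df['cum_percentage'] <= 80).sum()
    print(f"{top} 个spidercode覆盖了约80%的记录")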

+ 0 - 0
tools/高质量站点第二版/找出爬虫比例2.py


+ 0 - 0
tools/高质量站点第二版/统计三个大模型和规则一致性的比例.py


+ 1 - 0
tools/高质量站点第二版/记录

@@ -0,0 +1 @@
+样本数据:2025-05-15 的数据,一共 21979 条