liumiaomiao 1 рік тому
батько
коміт
67d9258578

+ 4 - 0
Dataquality/.idea/encodings.xml

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding" defaultCharsetForPropertiesFiles="UTF-8" />
+</project>

BIN
Dataquality/BasicMethods/__pycache__/quality_tools.cpython-38.pyc


+ 0 - 20
Dataquality/BasicMethods/quality_tools.py

@@ -1,20 +0,0 @@
-import openpyxl
-import pandas as pd
-import cpca
-class AreaUtil:
-    def export_execl(self,data):
-        # 创建一个 DataFrame 对象
-        # data = {'Name': ['Alice', 'Bob', 'Charlie'],
-        #         'Age': [25, 28, 30]}
-        df = pd.DataFrame(data)
-        # 导出 DataFrame 到 Excel 文件
-        df.to_excel('./output.xlsx', index=False)
-
-    def export_area(self,name):
-        # 调用transform()函数进行城市名称提取和省份转换
-        result = cpca.transform([name])
-        # 提取省份
-        province = result['省'][0]
-        # 打印结果
-        return  province
-au=AreaUtil()

+ 62 - 0
Dataquality/dataquality/buyer_export.py

@@ -0,0 +1,62 @@
+'''
+正式环境tidb采购单位数据导出
+从tidb库导出到tidb库
+'''
+import datetime
+import re
+import pymysql
+# Production database connection: the export query below is executed through it.
+# NOTE(review): host, user and password are hard-coded here; consider loading them from configuration.
+mysqlconn = pymysql.connect(
+    host='127.0.0.1',
+    port=4000,
+    db='global_common_data',
+    user='liumiaomiao',
+    password='Lmm#20230731M',
+    charset='utf8')
+cursor = mysqlconn.cursor()
+# Test database connection: the exported rows are inserted through it.
+mysqlconn_217 = pymysql.connect(
+    host='192.168.3.217',
+    port=4000,
+    db='data_quality',
+    user='root',
+    password='=PDT49#80Z!RVv52_z',
+    charset='utf8')
+cursor_217 = mysqlconn_217.cursor()
+# SQL statement selecting the data to export.
+sql = """SELECT a.name_id,b.`name`,b.area_code,b.city_code
+FROM dws_f_ent_tags a 
+LEFT JOIN dws_f_ent_baseinfo b 
+ON a.name_id=b.name_id 
+WHERE a.`status`=2 AND (b.identity_type&(1<<0))>0"""
+# Columns of the destination table, in the order used by the INSERT statement.
+key_list = ['name_id','name','area_code','city_code']
+# INSERT statement with one %s placeholder per column. The column list and the placeholder
+# list are produced by rendering a tuple with str() and stripping the single quotes.
+values_d = ["%s" for i in range(0, len(key_list))]
+sql_insert = "INSERT INTO " + "buyer0801" + re.sub("\'", "", str(tuple(key_list))) + "values" + re.sub(
+            "\'", "", str(tuple(values_d)))
+# Execute the export query.
+cursor.execute(sql)
+# Fetch every result row into memory.
+tj_result = cursor.fetchall()
+# Commit on the source connection.
+mysqlconn.commit()
+
+# Save the data: rows are buffered and written with executemany + commit every 5000 rows.
+saveData = []
+count = 0
+for info in tj_result:
+    count += 1
+    saveData.append(info)
+    if len(saveData) % 5000 == 0:
+        cursor_217.executemany(sql_insert,saveData)
+        mysqlconn_217.commit()
+        # Progress log: current time and number of rows handled so far.
+        print(datetime.datetime.now(),count)
+        saveData=[]
+# Flush the final, partially filled batch (if any).
+if saveData:
+    cursor_217.executemany(sql_insert, saveData)
+    mysqlconn_217.commit()
+    saveData = []
+
+
+

+ 5 - 3
Dataquality/dataquality/es.py

@@ -7,7 +7,6 @@ from pymongo import MongoClient
 def ES_bidding(es_query):
     """
     操作样例:直接拉取数据
-    :return:
     """
     db_config = {
         # es
@@ -22,7 +21,7 @@ def ES_bidding(es_query):
         'mg_host': '192.168.3.206',
         'mg_port': 27080,
         'database': 'data_quality',
-        'collection': 'bidding_20230707'
+        'collection': 'bidding_20230807'
     }
     query = es_query
     # 传入查询语句query 以及配置信息
@@ -30,8 +29,11 @@ def ES_bidding(es_query):
 
 def run():
     ## 根据ES语句查找bidding
+    # es_query = {"track_total_hits": True,
+    #             "query": {"bool": {"must": [{"range": {"publishtime": {"from": "1691337600", "to": "1691424000"}}}]}}}
     es_query = {"track_total_hits": True,
-                "query": {"bool": {"must": [{"range": {"publishtime": {"from": "1688659200", "to": "1688745600"}}}]}}}
+                "query": {"bool": {"must": [{"range": {"publishtime": {"from": "1691337600", "to": "1691424000"}}},
+                                            {"terms": {"subtype": ["中标", "合同","成交"]}}]}}}
     ES_bidding(es_query)
 
 run()

+ 28 - 14
Dataquality/dataquality/export_execl.py

@@ -2,9 +2,16 @@ import openpyxl
 from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
 from openpyxl.utils import get_column_letter
 from pymongo import MongoClient
+from bson import ObjectId
 
-db = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore").data_quality
-coll_user = db["bidding_20230707"]
+# db = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore").data_quality
+# coll_user = db["bidding_20230707"]
+
+# db = MongoClient('192.168.3.71', 29099, unicode_decode_error_handler="ignore").quality
+# coll_user = db["bidding_source"]
+
+db = MongoClient('192.168.3.166', 27082, unicode_decode_error_handler="ignore").yantianlei
+coll_user = db["2023Zglt_qgyys222"]
 # 设置列宽
 def setExcelWith(ws):
     # 第一步:计算每列最大宽度,并存储在列表lks中。
@@ -14,7 +21,7 @@ def setExcelWith(ws):
         for j in range(1, ws.max_row + 1):  # 每行循环
             sz = ws.cell(row=j, column=i).value  # 每个单元格内容
             if isinstance(sz, str):  # 中文占用多个字节,需要分开处理
-                lk1 = len(sz.encode('gbk'))  # gbk解码一个中文两字节,utf-8一个中文三字节,gbk合适
+                lk1 = len(sz.encode('utf-8'))  # gbk解码一个中文两字节,utf-8一个中文三字节,gbk合适
             else:
                 lk1 = len(str(sz))
             if lk < lk1:
@@ -42,32 +49,39 @@ def writeExcel( path, sheetname, tongji_title, tongji_list):
     # 写入excel
     for i in range(0, len(tongji_list)):
         for j in range(0, len(tongji_list[i])):
-            value = ILLEGAL_CHARACTERS_RE.sub(r'', tongji_list[i][j])
+            value = ILLEGAL_CHARACTERS_RE.sub(r'', str(tongji_list[i][j]))
             ws.cell(row=i + 2, column=j + 1, value=value)
     setExcelWith(ws)
     book.save(path)
-
 def pankong(key, item):
     if key in item and item[key]:
         value = item[key]
     else:
         value = ""
     return value
-
 # 读取数据
 def run_data_daochu():
-    title_name =['id','标题','采购单位','中标单位','区域','flag_title','flag_buyer']
-    mongo_list = ["id", "title", "buyer", "s_winner", "area",'flag_title','flag_buyer']
+    # title_name =['id','标题','项目名称','项目编号','采购单位','中标单位','中标金额','省','市','县','开标日期','原文地址','title_qa','projectname_qa',"region_qa","projectcode_qa","bidopentime_qa","budget_qa","bidamount_qa"]
+    # mongo_list = ["_id", "title", "projectname","projectcode","buyer","s_winner", "bidamount","area","city","district","bidopentime",'href','title_qa','projectname_qa','region_qa','projectcode_qa','bidopentime_qa','budget_qa','bidamount_qa']
+    # title_name = ['id', '标题', '项目名称', '项目编号', '采购单位', '中标单位', '中标金额', '省', '市', '县', '开标日期', '原文地址']
+    # mongo_list = ["_id", "title", "projectname", "projectcode", "buyer", "s_winner", "bidamount", "area", "city","district", "bidopentime", 'href']
+    title_name = ['id','得分']
+    mongo_list = ["id", 'score']
     data_list_correct = []
     data_list_error =[]
-    for item in coll_user.find().batch_size(1000):
+    for item in coll_user.find():
+    # for item in coll_user.find({"_id": {"$in": [ObjectId("64dc46bfb44bf08751f1dc1a"), ObjectId("64dae31fb44bf08751ed9e59"), ObjectId("64d9df9ab44bf08751eb028b"), ObjectId("64f00847e2d7d34fa00d50eb")]}}):
         info_temp = []
+        #这个是遍历这条数据中要存的字段
         for i in range(0, len(mongo_list)):
             info_temp.append(pankong(mongo_list[i],item))
-        if item["flag_title"]==1 and  item["flag_buyer"]==1:
-            data_list_correct.append(info_temp)
-        else:
-            data_list_error.append(info_temp)
-    writeExcel("output_correct.xlsx", "统计详情", title_name, data_list_correct)
+        # if item["title_qa"] or item["projectname_qa"]or item["region_qa"]or item["projectcode_qa"]or item["bidopentime_qa"]or item["budget_qa"]or item["bidamount_qa"]:
+        #是你所有要写入的数据
+        # if  "flag" in item and item["flag"] == 1 :
+        data_list_error.append(info_temp)
+        # else:
+        #     data_list_correct.append(info_temp)
+        # print(data_list_error)
     writeExcel("output_error.xlsx", "统计详情", title_name, data_list_error)
+    # writeExcel("output_error.xlsx", "统计详情", title_name, data_list_error)
 run_data_daochu()

+ 14 - 9
Dataquality/dataquality/inspect_area.py

@@ -1,6 +1,10 @@
+'''
+区域字段数据质量检查脚本
+编写人:刘苗苗
+'''
 from bson import ObjectId
 from pymongo import MongoClient
-from BasicMethods.area_quality import au
+from BasicMethods.quality_tools import qu
 
 db = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore").data_quality
 coll_user = db["bidding_20230707"]
@@ -16,38 +20,39 @@ def pankong(key, item):
 correct_count=0
 count = 0
 #find里面 单条数据用法:{"_id":ObjectId("64a8bb45990ffa1883accd78")}
+#for item in coll_user.find({"_id":ObjectId("64a8bb45990ffa1883accd78")})
 for item in coll_user.find().batch_size(1000):
     count += 1
     if count % 1000==0:
         print(count)
     title = pankong("title", item)
     #根据title抽取出来的省份
-    expect_area1 = au.export_area(title)
+    expect_area1 = qu.export_area(title)
     # print(expect_area1)
     # 根据buyer抽取出来的省份
     buyer = pankong("buyer", item)
-    expect_area = au.export_area(buyer)
+    expect_area = qu.export_area(buyer)
     # 根据buyeraddr抽取出来的省份
     buyeraddr = pankong("buyeraddr", item)
-    expect_area4 = au.export_area(buyeraddr)
+    expect_area4 = qu.export_area(buyeraddr)
     # print(expect_area)
     #根据s_winner抽取出来的省份
     s_winner=pankong("s_winner",item)
-    expect_area2 = au.export_area(s_winner)
+    expect_area2 = qu.export_area(s_winner)
     # 根据winneradder抽取出来的省份
     winneraddr = pankong("winneraddr", item)
-    expect_area3 = au.export_area(winneraddr)
+    expect_area3 = qu.export_area(winneraddr)
     # 根据agency抽取出来的省份
     agency = pankong("agency", item)
-    expect_area5 = au.export_area(agency)
+    expect_area5 = qu.export_area(agency)
     # 根据agencyaddr抽取出来的省份
     agencyaddr = pankong("agencyaddr", item)
-    expect_area6 = au.export_area(agencyaddr)
+    expect_area6 = qu.export_area(agencyaddr)
 
     #数据组抽取出来的省份,需要验证的字段
     tmp_area = pankong("area", item)
     #抽取出来的地区,通过函数在转换一遍
-    actual_area = au.export_area(tmp_area)
+    actual_area = qu.export_area(tmp_area)
     flag_buyer = 0
     flag_title = 0
     flag_s_winner = 0

+ 63 - 0
Dataquality/dataquality/inspect_buyer.py

@@ -0,0 +1,63 @@
+'''
+采购单位数据质量检查脚本
+编写人:刘苗苗
+'''
+import datetime
+import requests
+import pymysql
+#构造请求
+def buyer_portrait(name):
+        # Query the jianyu360 portrait endpoints for the buyer called `name`.
+        #
+        # Returns a 2-tuple: (value of data.bidamount_count from the getData response,
+        # number of entries in data.list from the contacts response).
+        #
+        # A new requests session is created and logged in on every call; the login
+        # response itself is never inspected.
+        # NOTE(review): the login phone number and password are hard-coded below, and no
+        # timeout or HTTP status check is applied to any of the three requests.
+        headers = {
+            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67"
+        }
+        s = requests.Session()
+        # Log in first so that the session (cookies) is reused by the two calls below.
+        response = s.post("https://www.jianyu360.cn/phone/login", headers=headers,data=
+        {
+            'reqType': 'phoneLogin',
+            'isAutoLogin': 'false',
+            'phone': '19937989931',
+            'password': '123456'
+        })
+        # Buyer portrait data for `name`.
+        response1 = s.post("https://www.jianyu360.cn/bigmember/subVipPortrait/buyer/getData", headers=headers,params=
+            {"buyer":name,"match":"","exactMatch":"0","matchRange":"","scopeClass":"","timeRange":""}
+        )
+        # Contact list for `name`.
+        response2 = s.post("https://www.jianyu360.cn/bigmember/portrait/contacts", headers=headers, params=
+        {"entName":name,"entType":"0"}
+                           )
+        r1=response1.json()
+        r2=response2.json()
+        # Both lookups raise KeyError if the response has no 'data' object of the expected shape.
+        result=r1['data']['bidamount_count']
+        result_list=r2['data']['list']
+        result1=len(result_list)
+        return  result,result1
+# Test database connection.
+# NOTE(review): host, user and password are hard-coded here; consider loading them from configuration.
+mysqlconn = pymysql.connect(
+    host='192.168.3.217',
+    port=4000,
+    db='data_quality',
+    user='root',
+    password='=PDT49#80Z!RVv52_z',
+    charset='utf8')
+cursor = mysqlconn.cursor()
+# Rows of buyer0801 to inspect.
+sql = """SELECT * FROM buyer0801 where id < 500000 """
+# Execute the query and load the whole result set into memory.
+cursor.execute(sql)
+tj_result = cursor.fetchall()
+mysqlconn.commit()
+# portrait_empty_flag=0 marks a buyer for which both portrait lookups came back empty.
+# (Original note also says: status=2 is about 80% correct.)
+# The statement is parameterised so that a quote inside name_id can neither break
+# nor alter the SQL; the driver does the escaping.
+update_sql = "UPDATE buyer0801 SET portrait_empty_flag=0 WHERE name_id=%s"
+
+for info in tj_result:
+    # presumably column 1 is name_id and column 2 is name (SELECT * order) - verify against the table
+    name = info[2]
+    result, result1 = buyer_portrait(name)
+    if result == 0 and result1 == 0:
+        print(info, result, result1)
+        cursor.execute(update_sql, (info[1],))
+        mysqlconn.commit()
+
+
+
+

+ 9 - 32
Dataquality/dataquality/inspect_buyer_area.py

@@ -1,35 +1,11 @@
 '''
-采购单位数据质量检查脚本
+采购单位数据质量检查脚本--采购单位名称中不含区域
 编写人:刘苗苗
 '''
 import datetime
 import requests
 import pymysql
-#构造请求
-def buyer_portrait(name):
-        headers = {
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67"
-        }
-        s = requests.Session()
-        response = s.post("https://www.jianyu360.cn/phone/login", headers=headers,data=
-        {
-            'reqType': 'phoneLogin',
-            'isAutoLogin': 'false',
-            'phone': '19937989931',
-            'password': '123456'
-        })
-        response1 = s.post("https://www.jianyu360.cn/bigmember/subVipPortrait/buyer/getData", headers=headers,params=
-            {"buyer":name,"match":"","exactMatch":"0","matchRange":"","scopeClass":"","timeRange":""}
-        )
-        response2 = s.post("https://www.jianyu360.cn/bigmember/portrait/contacts", headers=headers, params=
-        {"entName":name,"entType":"0"}
-                           )
-        r1=response1.json()
-        r2=response2.json()
-        result=r1['data']['bidamount_count']
-        result_list=r2['data']['list']
-        result1=len(result_list)
-        return  result,result1
+from BasicMethods.quality_tools import qu
 #测试库数据源链接
 mysqlconn = pymysql.connect(
     host='192.168.3.217',
@@ -40,21 +16,22 @@ mysqlconn = pymysql.connect(
     charset='utf8')
 cursor = mysqlconn.cursor()
 #需要导出的数据,sql语句
-sql = """SELECT * FROM buyer0801 where id < 500000 """
+sql = """SELECT * FROM buyer0801 where id > 391115 """
 #执行单条sql语句
 cursor.execute(sql)
 #执行多条
 tj_result = cursor.fetchall()
 #一起提交
 mysqlconn.commit()
-#status=2 80%正确,portrait_empty_flag=0地区,画像内容都是空,
+#status=2 80%正确,area_empty_flag=0采购单位名称中不含区域
+keywords=['']
 
 for info in tj_result:
     name=info[2]
-    result,result1=buyer_portrait(name)
-    if result == 0 and result1 == 0:
-        print(info,result,result1)
-        update_sql = "UPDATE buyer0801 SET portrait_empty_flag=0 WHERE name_id='{}'".format(info[1])
+    expect_area = qu.export_area(name)
+    print(expect_area,info[0])
+    if expect_area is None :
+        update_sql = "UPDATE buyer0801 SET area_empty_flag=0 WHERE name_id='{}'".format(info[1])
         cursor.execute(update_sql)
         mysqlconn.commit()
 

+ 34 - 0
Dataquality/dataquality/inspect_infotype.py

@@ -0,0 +1,34 @@
+from pymongo import MongoClient
+from BasicMethods.quality_tools import qu
+# Connect to the database.
+db = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore").data_quality
+coll_user = db["bidding_20230707"]
+# Information types (grouped): proposed project / procurement intention;
+# advance notice / prequalification / prequalification result / argumentation opinion / requirement publicity;
+# tender / invitation / inquiry / competitive negotiation / single source / bidding / change;
+# winning bid / deal / failed bid / aborted bid; contract / acceptance / violation.
+correct_count=0
+count = 0
+# Keyword tables: [title keyword(s), body keyword, information type]; list1 holds the same
+# data as dicts. Neither table is referenced by the loop below.
+# NOTE(review): the name `list` shadows the builtin.
+list=[['处罚决定','严重违法','违规'],['验收公告,验收单','验收单标号','验收']]
+list1=[{
+    'keyword_title':'处罚决定',
+    'keyword_text':'严重违法',
+    'info_type':'违规'
+},{
+    'keyword_title':'验收公告 验收单',
+    'keyword_text':'验收单标号',
+    'info_type':'验收'
+}]
+
+# To inspect a single document, pass a filter to find(), e.g. {"_id":ObjectId("64a8bb45990ffa1883accd78")}
+for item in coll_user.find().batch_size(1000):
+    count += 1
+    if count % 1000==0:
+        print(count)
+    title =qu.pankong("title", item)
+    text=qu.pankong("detail", item)
+    # NOTE(review): these loops iterate over the title/detail values themselves (presumably
+    # strings, i.e. character by character) and set the same values on every pass; the
+    # keyword tables above are not consulted. Looks unfinished - confirm intent.
+    for key in title:
+        actural_info_type='违规'
+        flag_info_title=1
+    for key in text:
+        flag_info_text=1
+    # Information type as extracted (the document's "subtype" field).
+    # NOTE(review): the name `type` shadows the builtin; nothing computed in this loop is
+    # written back or printed.
+    type=qu.pankong("subtype", item)
+

+ 95 - 0
Dataquality/dataquality/inspect_projectname.py

@@ -0,0 +1,95 @@
+'''
+"01长度类型"--"01、少于5个字  02、超过40字"
+02汉字占比--"01、非汉字:汉字占比<1:3,02、有特殊符号如:!@#¥%……&*"
+03语义表述不完整---"01、仅有单位名称(实体)02、排除通用后缀(中标公告)03、排除敏感词(测试等)"
+
+'''
+from pymongo import MongoClient
+from bson import ObjectId
+def pankong(key, item):
+    # Null-safe field read: return item[key] when the key is present and its value is
+    # truthy, otherwise return the empty string "".
+    if key in item and item[key]:
+        value = item[key]
+    else:
+        value = ""
+    return value
+def inspect_projectname():
+    # Run the project-name quality rules over every document of the collection and store
+    # the list of triggered rule codes in the document's "projectname_qa" field:
+    #   "0000" - "projectname" is missing or falsy
+    #   "0101" - 5 characters or fewer
+    #   "0102" - 100 characters or more (the module docstring mentions 40; the code uses 100)
+    #   "0201" - more than half of the characters are outside U+4E00..U+9FFF
+    #   "0303" - contains / ends with / starts with one of the abnormal tokens below
+    # An alternative data source (192.168.3.71:29099, quality.bidding_source) was used previously.
+    db = MongoClient('192.168.3.166', 27082, unicode_decode_error_handler="ignore").yantianlei
+    coll_user = db["2023Zglt_qgyys222"]
+    count=0
+    projectname_qa=[]
+    # A list of generic announcement keywords (key_list) and the rule "0302" that used it
+    # are disabled in this script.
+    abnormal_list=["公告公告","项目项目","合同合同","nbsp","..."]
+    abnormal_end_list=["--"]
+    abnormal_start_list = ["3","6","7","8","0","."]
+    # To inspect a single document: coll_user.find({"_id":ObjectId("64e823d1bc72bfca1010b85f")})
+    for item in coll_user.find().batch_size(1000):
+        count += 1
+        # Progress output every 1000 documents.
+        if count % 1000 == 0:
+            print(count)
+        if  "projectname"  not in item:
+            projectname_qa.append("0000")
+        else:
+            if item["projectname"]:
+                projectname=item["projectname"]
+                # NOTE(review): unreachable - this branch is only entered when the value is
+                # truthy, so it can never equal []; "0402" is never recorded.
+                if projectname==[]:
+                    projectname_qa.append("0402")
+                if len(projectname)  <=  5:
+                    projectname_qa.append("0101")
+                if len(projectname) >= 100:
+                    projectname_qa.append("0102")
+                # Rule 2: share of Chinese vs. non-Chinese characters.
+                chinese_chars = [char for char in projectname if '\u4e00' <= char <= '\u9fff']  # characters in U+4E00..U+9FFF
+                print(chinese_chars)
+                non_chinese_chars = [char for char in projectname if not ('\u4e00' <= char <= '\u9fff')]  # every other character (letters and digits included)
+                print(non_chinese_chars)
+                non_chinese_chars_radio = len(non_chinese_chars) / len(projectname)
+                if non_chinese_chars_radio >0.5:
+                      projectname_qa.append("0201")
+                # Contains an abnormal token (count of matching tokens).
+                key_flag1 = 0
+                for key in abnormal_list:
+                    if key in projectname:
+                        key_flag1 += 1
+                # Ends with an abnormal token.
+                key_flag3 = 0
+                for key in abnormal_end_list:
+                    if projectname[-len(key):]== key:
+                        key_flag3 += 1
+                # Starts with an abnormal token (compares the first character only).
+                key_flag4 = 0
+                for key in abnormal_start_list:
+                    if projectname[0] == key :
+                        key_flag4 += 1
+                print(item['_id'],key_flag1,key_flag3,key_flag4)
+                # Any of the three abnormal-token checks triggers "0303".
+                if key_flag1 or key_flag3 or key_flag4 :
+                    projectname_qa.append("0303")
+            else:
+                projectname_qa.append("0000")
+        print(projectname_qa)
+        coll_user.update_one({"_id": item["_id"]}, {"$set": {"projectname_qa": projectname_qa}})
+        # Reset the accumulator for the next document.
+        projectname_qa = []
+
+
+inspect_projectname()

+ 87 - 0
Dataquality/dataquality/inspect_title.py

@@ -0,0 +1,87 @@
+'''
+"01长度类型"--"01、少于5个字  02、超过40字"
+02汉字占比--"01、非汉字:汉字占比<1:3,02、有特殊符号如:!@#¥%……&*"
+03语义表述不完整---"01、仅有单位名称(实体)02、排除通用后缀(中标公告)03、排除敏感词(测试等)"
+
+'''
+from pymongo import MongoClient
+from bson import ObjectId
+
+def inspect_title():
+    """Run the title quality rules over every document and store the result in ``title_qa``.
+
+    Each document of the ``2023Zglt_qgyys222`` collection gets a list of triggered rule
+    codes written to its ``title_qa`` field (an empty list means no rule fired):
+
+    * ``0000`` - title missing or empty (the code inspect_projectname records for a
+      missing/empty value); the remaining rules are skipped because they would
+      otherwise divide by zero or index into an empty string.
+    * ``0101`` - title has 5 characters or fewer.
+    * ``0102`` - title has 100 characters or more.
+    * ``0201`` - more than half of the characters are outside U+4E00..U+9FFF.
+    * ``0302`` - title contains none of the generic keywords in ``key_list``.
+    * ``0303`` - title contains an entry of ``abnormal_list``, ends with an entry of
+      ``abnormal_end_list`` or starts with an entry of ``abnormal_start_list``.
+
+    Progress is printed every 1000 documents; the abnormal-token counters and the
+    resulting code list are printed for each document.
+    """
+    db = MongoClient('192.168.3.166', 27082, unicode_decode_error_handler="ignore").yantianlei
+    coll_user = db["2023Zglt_qgyys222"]
+    count = 0
+    key_list= ["采购计划任务", "采购公告", "招标文件预公示", "招标预公告", "计划招标公告", "调研公告", "预申公告", "预审文件", "预审公告更正公告", "预审结果",
+                        "预审结果公示", "预审结果", "预审结果的公示", "预审结果变更", "论证意见公示", "需求论证公示", "征求意见公告", "进口产品公示", "需求公告", "直接采购公告",
+                        "需求公示", "采购公告", "采购项目", "项目公告", "招标公告", "意向公开", "比选公告", "邀请公告", "采购邀请", "邀请书", "邀请函", "询价采购",
+                        "询价公告", "比价公告", "比价项目公告", "建设项目", "中标(成交)公告", "服务","项目","合同公告","比价项目公告","比质比价","比质比价公告","询价采购公告",
+                   "询价书","询价单","询价采购","公开询价","询价邀请书","竞价交易公告","竞价公告","竞价项目","竞价的公告","竞价采购公告","变更公告","更正公告","暂停公告","候选人公示",
+                   "候选人公示","结果公告","结果公示","服务项目","成交公告","中选人公示","中选结果公示","中标公示","中标公告","成交公示","废标公告","终止公告","异常公告","流标公告",
+                   "失败公告","合同公告","合同信息","采购合同","销售合同","集成合同","项目合同","合同公示","服务合同","验收公告","验收单公示","标段","单一来源采购公示","考试培训",
+                   "比价单","项目询价","采购项目","询价","竞争性谈判","竞争性磋商","单一来源","竞价处置公告","网上竞价","结果公告","结果公示","中标公示","中标公告","公告","合同",
+               "评审失败","招标失败","比选失败","采购失败","流标公示","项目合同","验收结果","验收公告","行政处罚","违约行为","投诉","不良行为","协议书","候选人","购置",
+               "公示","建设","中标","招标","工程","采购","成交通知书"]
+    abnormal_list=["公告公告","项目项目","合同合同","nbsp","..."]
+    abnormal_end_list=["--"]
+    abnormal_start_list = ["3","6","7","8","0","."]
+    for item in coll_user.find().batch_size(1000):
+        count += 1
+        if count % 1000 == 0:
+            print(count)
+        # Fresh result list per document, so nothing can leak over from a previous one.
+        title_qa = []
+        title = item.get("title") or ""
+        if not title:
+            title_qa.append("0000")
+        else:
+            # Rule 01: length.
+            if len(title) <= 5:
+                title_qa.append("0101")
+            if len(title) >= 100:
+                title_qa.append("0102")
+            # Rule 02: share of characters outside the CJK range U+4E00..U+9FFF.
+            non_chinese_count = sum(1 for char in title if not ('\u4e00' <= char <= '\u9fff'))
+            if non_chinese_count / len(title) > 0.5:
+                title_qa.append("0201")
+            # Rule 0302: no generic keyword present.
+            if not any(key in title for key in key_list):
+                title_qa.append("0302")
+            # Rule 0303: abnormal token contained / at the end / at the start.
+            key_flag1 = sum(1 for key in abnormal_list if key in title)
+            key_flag3 = sum(1 for key in abnormal_end_list if title.endswith(key))
+            key_flag4 = sum(1 for key in abnormal_start_list if title.startswith(key))
+            print(item['_id'], key_flag1, key_flag3, key_flag4)
+            if key_flag1 or key_flag3 or key_flag4:
+                title_qa.append("0303")
+        print(title_qa)
+        coll_user.update_one({"_id": item["_id"]}, {"$set": {"title_qa": title_qa}})
+
+inspect_title()

BIN
Dataquality/dataquality/output_correct.xlsx


BIN
Dataquality/dataquality/output_error.xlsx


+ 56 - 0
Dataquality/dataquality/sample_data_export.py

@@ -0,0 +1,56 @@
+'''
+样本数据选取
+①只针对某一天全量数据进行抽样,本次样本:8月7号es中结果类数据
+②只抽查不重复的数据,即extracttype标记为1的数据
+③抽样数量:N条
+④按照站点域名的比例进行抽取数据,占比高的抽样数据量多
+⑤每个站点按比例等距离抽样,不足1%的数据,每个站点抽取1条
+'''
+from pymongo import MongoClient
+def sample_data(N):
+    # Mark a sample of roughly N documents (among those with score == 100) by setting
+    # flag=1 on them. Sites are weighted by their document count and documents are picked
+    # at equal intervals within each site, in _id order.
+    db = MongoClient('192.168.3.71', 29099, unicode_decode_error_handler="ignore").quality
+    coll_user = db["bidding_source"]
+    # To work on a single document, pass a filter to find(), e.g. {"_id":ObjectId("64a8bb45990ffa1883accd78")}
+    # Earlier variants computed the total from an aggregation over sites with more than
+    # 30 documents, or from estimated_document_count(); both are disabled.
+    # Total number of documents considered (score == 100).
+    count_all = coll_user.count_documents({'score':100})
+    print(count_all)
+    # Selected sites: site name -> number of score==100 documents on that site.
+    site_list = {}
+    n=0
+    # Per-site document counts, largest site first.
+    site_count=coll_user.aggregate([{"$match": {'score':100}},
+                         {"$group": {"_id": "$site", "count": {"$sum": 1}}},
+                         {"$sort": {"count": -1}}])
+    for item in site_count:
+        # Keep adding sites while the running total is at most 95% of count_all; the
+        # site that crosses the threshold is still included, later ones are skipped.
+        if (n/count_all)<=0.95 :
+            # n accumulates the document count of the selected sites.
+            n += item["count"]
+            site_list[item["_id"]] = item["count"]
+    # site_list now maps each selected site to its count,
+    # e.g. {'<site name>': 6767, '<site name>': 6019, ...}; n is the sum of those counts.
+    # NOTE(review): m is assigned but never used.
+    m=0
+    for key in site_list:
+        ratio=(site_list[key])/n
+        # num: how many documents to take from this site (at least 1).
+        num=int(ratio*N)
+        if num==0:
+            num=1
+        print(key,site_list[key], num,ratio)
+        # jiange: spacing between picked documents within the site.
+        jiange=int((site_list[key])/num)
+        print(jiange,num)
+        for i in range(0, num):
+            print(i,jiange)
+            # Pick the document at offset i*jiange (in _id order) and flag it.
+            for info in coll_user.find({"site":key,'score':100}).sort("_id",1).skip(i * jiange).limit(1):
+                print(info)
+                coll_user.update_one({"_id": info["_id"]}, {"$set": {"flag": 1} })
+sample_data(100)

+ 34 - 0
Dataquality/dataquality/score.py

@@ -0,0 +1,34 @@
+from pymongo import MongoClient
+from bson import ObjectId
+
+def bid_score():
+    # Compute a quality score for every document of the collection and store it in the
+    # document's "score" field: start from 100 and subtract 10 for each *_qa field below
+    # whose value is truthy.
+    # An alternative data source (192.168.3.71:29099, quality.bidding_source) was used previously.
+    db = MongoClient('192.168.3.166', 27082, unicode_decode_error_handler="ignore").yantianlei
+    coll_user = db["2023Zglt_qgyys222"]
+    # NOTE(review): count is assigned but never used.
+    count=0
+    score=100
+    # To score a single document: coll_user.find({"_id":ObjectId("64dc2bea5b7b9126edac6845")})
+    for item in coll_user.find():
+        # NOTE(review): item['..._qa'] raises KeyError when a document lacks the field;
+        # presumably every inspect_* script has been run beforehand - confirm.
+        if item['title_qa']:
+             score-=10
+        if item['projectname_qa']:
+            score-=10
+        if item['region_qa']:
+            score-=10
+        if item['projectcode_qa']:
+            score-=10
+        if item['bidopentime_qa']:
+            score-=10
+        # The buyer_qa and winner_qa deductions are currently disabled:
+        # if item['buyer_qa']:
+        #     score-=10
+        # if item['winner_qa']:
+        #     score-=10
+        if item['budget_qa']:
+            score-=10
+        if item['bidamount_qa']:
+            score-=10
+        print(score)
+        coll_user.update_one({"_id": item["_id"]}, {"$set": {"score": score}})
+        # Reset for the next document.
+        score = 100
+bid_score()

+ 16 - 11
Dataquality/main.py

@@ -1,16 +1,21 @@
-# 这是一个示例 Python 脚本。
+import pandas as pd
 
-# 按 ⌃R 执行或将其替换为您的代码。
-# 按 双击 ⇧ 在所有地方搜索类、文件、工具窗口、操作和设置。
+def export_execl():
+    # data ='测试' # 此处是要写入的数据
+    # wb = openpyxl.Workbook()
+    # ws = wb.create_sheet("test")
+    #
+    # # 写数据函数cell,cell中column和row至少为1
+    # ws.cell(row=1, column=1, value=data)
+    # wb.save("/Users/miaobao/Documents/work/quality_result/quality_result.xls")
 
+    # 创建一个 DataFrame 对象
+    data = {'Name': ['Alice', 'Bob', 'Charlie'],
+            'Age': [25, 28, 30]}
+    df = pd.DataFrame(data)
 
-def print_hi(name):
-    # 在下面的代码行中使用断点来调试脚本。
-    print(f'Hi, {name}')  # 按 ⌘F8 切换断点。
+    # 导出 DataFrame 到 Excel 文件
+    df.to_excel('./output.xlsx', index=False)
 
+export_execl()
 
-# 按间距中的绿色按钮以运行脚本。
-if __name__ == '__main__':
-    print_hi('PyCharm')
-
-# 访问 https://www.jetbrains.com/help/pycharm/ 获取 PyCharm 帮助