Browse Source

区域优化

liumiaomiao 2 years ago
parent
commit
d816c6e4d4

+ 8 - 2
Dataquality/BasicMethods/area_quality.py

@@ -1,5 +1,6 @@
 import openpyxl
 import pandas as pd
+import cpca
 class AreaUtil:
     def export_execl(self,data):
         # 创建一个 DataFrame 对象
@@ -10,5 +11,10 @@ class AreaUtil:
         df.to_excel('./output.xlsx', index=False)
 
     def export_area(self,name):
-        pass
-
+        """Return the province that cpca extracts from *name*.
+
+        Args:
+            name: Free text to analyse; it is passed to cpca as a
+                one-element list.
+
+        Returns:
+            The value in the '省' (province) column for the single input row.
+            NOTE(review): what cpca yields when no province is recognised is
+            not visible here -- confirm against the installed cpca version.
+        """
+        # transform() works on a list of strings, so wrap the single name and
+        # keep only the province column of the result.
+        provinces = cpca.transform([name])['省']
+        return provinces[0]
+# Module-level AreaUtil instance, created once at import time so that other
+# modules can import `au` instead of constructing their own.
+au=AreaUtil()

+ 73 - 0
Dataquality/dataquality/export_execl.py

@@ -0,0 +1,73 @@
+import openpyxl
+from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
+from openpyxl.utils import get_column_letter
+from pymongo import MongoClient
+
+# Connect to MongoDB; unicode_decode_error_handler="ignore" tells the driver
+# not to raise on strings that cannot be decoded.
+_client = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore")
+# Database holding the data-quality collections.
+db = _client["data_quality"]
+# Collection that the export reads from.
+coll_user = db["bidding_20230707"]
+# Set column widths
+def setExcelWith(ws):
+    """Size every column of *ws* to fit its widest cell.
+
+    Text width is measured in GBK-encoded bytes, so a Chinese character counts
+    as two units and an ASCII character as one. Non-text values are measured
+    by the length of their ``str()`` form. Empty cells are ignored.
+
+    Args:
+        ws: Worksheet to adjust; its ``column_dimensions`` are updated in
+            place. Nothing is returned.
+    """
+    for col_idx in range(1, ws.max_column + 1):
+        widest = 1  # minimum width, reset for every column
+        for row_idx in range(1, ws.max_row + 1):
+            content = ws.cell(row=row_idx, column=col_idx).value
+            if content is None:
+                # An empty cell must not be measured as the text "None".
+                continue
+            if isinstance(content, str):
+                # errors='replace': characters that GBK cannot represent are
+                # counted as a single "?" instead of raising
+                # UnicodeEncodeError and aborting the whole export.
+                cell_width = len(content.encode('gbk', errors='replace'))
+            else:
+                cell_width = len(str(content))
+            widest = max(widest, cell_width)
+
+        # Convert the 1-based column number to its letter name and apply the
+        # width with two extra units of padding.
+        column_letter = get_column_letter(col_idx)
+        ws.column_dimensions[column_letter].width = widest + 2
+# Write data to an Excel file
+def writeExcel( path, sheetname, tongji_title, tongji_list):
+    """Write a header row and data rows to a new single-sheet workbook.
+
+    Args:
+        path: Destination file path handed to ``Workbook.save``.
+        sheetname: Title given to the active worksheet.
+        tongji_title: Iterable of column headers; each is written as ``str``.
+        tongji_list: Sequence of rows, each a sequence of cell values.
+            Strings are stripped of characters Excel rejects; ``None`` and
+            numbers are written unchanged; any other value is written as its
+            sanitised ``str()`` form.
+    """
+    book = openpyxl.Workbook()
+    ws = book.active
+    ws.title = sheetname
+
+    # The header occupies row 1.
+    for col_num, col in enumerate(tongji_title, start=1):
+        ws.cell(row=1, column=col_num, value=str(col))
+
+    # Data rows start at row 2.
+    for row_num, row in enumerate(tongji_list, start=2):
+        for col_num, raw in enumerate(row, start=1):
+            if raw is None or isinstance(raw, (int, float)):
+                # Numbers (e.g. the 0/1 flag columns) are not text: running
+                # the regex over them would raise TypeError.
+                value = raw
+            else:
+                text = raw if isinstance(raw, str) else str(raw)
+                value = ILLEGAL_CHARACTERS_RE.sub(r'', text)
+            ws.cell(row=row_num, column=col_num, value=value)
+    setExcelWith(ws)
+    book.save(path)
+
+def pankong(key, item):
+    """Return ``item[key]`` when the key exists and its value is truthy, else ``""``."""
+    if key not in item:
+        return ""
+    found = item[key]
+    return found if found else ""
+
+# Read the data and export it
+def run_data_daochu():
+    """Export every document of ``coll_user`` into two Excel workbooks.
+
+    Documents whose ``flag_title`` and ``flag_buyer`` both equal 1 are written
+    to ``output_correct.xlsx``; every other document, including those that
+    lack the flags, is written to ``output_error.xlsx``.
+    """
+    title_name =['id','标题','采购单位','中标单位','区域','flag_title','flag_buyer']
+    mongo_list = ["id", "title", "buyer", "s_winner", "area",'flag_title','flag_buyer']
+    data_list_correct = []
+    data_list_error = []
+    for item in coll_user.find().batch_size(1000):
+        # One output row per document; missing or empty fields become "".
+        info_temp = [pankong(field, item) for field in mongo_list]
+        # .get(): a document without the flags is classified as an error
+        # instead of aborting the export with a KeyError.
+        if item.get("flag_title") == 1 and item.get("flag_buyer") == 1:
+            data_list_correct.append(info_temp)
+        else:
+            data_list_error.append(info_temp)
+    writeExcel("output_correct.xlsx", "统计详情", title_name, data_list_correct)
+    writeExcel("output_error.xlsx", "统计详情", title_name, data_list_error)
+# Script entry point: the export starts as soon as this module is executed.
+# NOTE(review): there is no `if __name__ == "__main__"` guard, so importing
+# this module also triggers the export -- confirm that is intended.
+run_data_daochu()

+ 76 - 0
Dataquality/dataquality/inspect_area.py

@@ -0,0 +1,76 @@
+from bson import ObjectId
+from pymongo import MongoClient
+from BasicMethods.area_quality import au
+
+# Connect to MongoDB; unicode_decode_error_handler="ignore" tells the driver
+# not to raise on strings that cannot be decoded.
+_client = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore")
+# Database holding the data-quality collections.
+db = _client["data_quality"]
+# Collection whose documents are inspected and flagged below.
+coll_user = db["bidding_20230707"]
+
+def pankong(key, item):
+    """Return ``item[key]`` when the key exists and its value is truthy, else ``""``."""
+    if key not in item:
+        return ""
+    found = item[key]
+    return found if found else ""
+# Source field of a document -> name of the flag that records whether the
+# province extracted from that field equals the province extracted from `area`.
+FIELD_FLAGS = {
+    "buyer": "flag_buyer",
+    "title": "flag_title",
+    "s_winner": "flag_s_winner",
+    "winneraddr": "flag_winneraddr",
+    "buyeraddr": "flag_buyeraddr",
+    "agency": "flag_agency",
+    "agencyaddr": "flag_agencyaddr",
+}
+
+# Number of documents for which at least one field agrees with `area`.
+correct_count = 0
+# To inspect a single document, pass a filter to find(), e.g.
+# {"_id": ObjectId("64a8bb45990ffa1883accd78")}
+for processed, item in enumerate(coll_user.find().batch_size(1000), start=1):
+    # Progress indicator: print the running total every 1000 documents.
+    if processed % 1000 == 0:
+        print(processed)
+
+    # `area` is the stored value under verification; it is run through the
+    # same extractor so both sides of the comparison use the same form.
+    actual_area = au.export_area(pankong("area", item))
+
+    flags = {}
+    for field, flag_name in FIELD_FLAGS.items():
+        expected_area = au.export_area(pankong(field, item))
+        # A flag is set only when both provinces are non-empty and equal.
+        matched = expected_area and actual_area and expected_area == actual_area
+        flags[flag_name] = 1 if matched else 0
+
+    # Flags are 0/1, so any() is true when at least one field matched.
+    if any(flags.values()):
+        correct_count += 1
+    coll_user.update_one({"_id": item["_id"]}, {"$set": flags})
+print(correct_count)