liumiaomiao 1 year ago
parent
commit
663a1c3945
5 changed files with 88 additions and 73 deletions
  1. 16 18
      score.py
  2. 2 2
      tables/fields/area.py
  3. 28 17
      tables/fields/projectname.py
  4. 26 15
      tables/fields/title.py
  5. 16 21
      util/get_region.py

+ 16 - 18
score.py

@@ -2,35 +2,33 @@ from pymongo import MongoClient
 from bson import ObjectId
 from bson import ObjectId
 
 
 def bid_score():
 def bid_score():
-    db = MongoClient('192.168.3.167', 27080, unicode_decode_error_handler="ignore").jyqyfw_historyData2023_1
-    coll_user = db["20230921Ssk_endo"]
-    # db = MongoClient('192.168.3.166', 27082, unicode_decode_error_handler="ignore").yantianlei
-    # coll_user = db["20230920Zglt_9_1"]
-    # db = MongoClient('192.168.3.166', 27082, unicode_decode_error_handler="ignore").zhaoxiuzhen
-    # coll_user = db["20230917LT_ycl"]
+    # db = MongoClient('192.168.3.167', 27080, unicode_decode_error_handler="ignore").jyqyfw_historyData2023_1
+    # coll_user = db["20230921Ssk_endo"]
+    db = MongoClient('192.168.3.206', 27080, unicode_decode_error_handler="ignore").data_quality
+    coll_user = db["bidding_20231122"]
     count=0
     count=0
     score=100
     score=100
-    # for item in coll_user.find({"_id":ObjectId("64dc2bea5b7b9126edac6845")}):
-    for item in coll_user.find().sort("_id",1):
-        # if item['title_qa']:
-        #      score-=10
-        if item['projectname_qa']:
+    for item in coll_user.find({"_id":ObjectId("655ec5319aed6eb2ffa5d77f")}):
+    # for item in coll_user.find().sort("_id",1):
+        if item.get('title_qa'):
+             score-=10
+        if item.get('projectname_qa'):
             score-=10
             score-=10
-        if item['area_qa']:
+        if item.get('area_qa'):
             score-=10
             score-=10
-        if item['projectcode_qa']:
+        if item.get('projectcode_qa'):
             score-=10
             score-=10
         # if item['bidopentime_qa']:
         # if item['bidopentime_qa']:
         #     score-=10
         #     score-=10
-        if item['buyer_qa']:
+        if item.get('buyer_qa'):
             score-=10
             score-=10
-        if item['winner_qa']:
+        if item.get('winner_qa'):
             score-=10
             score-=10
-        if item['budget_qa']:
+        if item.get('budget_qa'):
             score-=10
             score-=10
-        if item['bidamount_qa']:
+        if item.get('bidamount_qa'):
             score-=10
             score-=10
-        if item["multipackage_qa"]:
+        if item.get("multipackage_qa"):
             score -= 10
             score -= 10
         print(score)
         print(score)
         coll_user.update_one({"_id": item["_id"]}, {"$set": {"score": score}})
         coll_user.update_one({"_id": item["_id"]}, {"$set": {"score": score}})

+ 2 - 2
tables/fields/area.py

@@ -37,10 +37,10 @@ class AreaChecker(object):
 
 
         # 获取当前脚本所在目录的上一级目录
         # 获取当前脚本所在目录的上一级目录
         current_dir = os.path.dirname(__file__)
         current_dir = os.path.dirname(__file__)
-        parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
+        # parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
 
 
         # 构建 Excel 文件路径
         # 构建 Excel 文件路径
-        xls_file = os.path.join(parent_dir, 'docs', 'area.xlsx')
+        xls_file = os.path.join(current_dir, '..', '..', 'docs', 'aera.xls')
         # 加载地区代码的XLS文件
         # 加载地区代码的XLS文件
         # xls_file = "C:\\Users\\25503\\PycharmProjects\\data_quality\\docs\\table_head_doc\\aera.xls"
         # xls_file = "C:\\Users\\25503\\PycharmProjects\\data_quality\\docs\\table_head_doc\\aera.xls"
         df = pd.read_excel(xls_file)
         df = pd.read_excel(xls_file)

+ 28 - 17
tables/fields/projectname.py

@@ -31,11 +31,17 @@ class ProjectnameChecker(object):
                 "parent_code": "02",
                 "parent_code": "02",
                 "checkFn": self.check0201
                 "checkFn": self.check0201
             },
             },
-            "0302": {
-                "name": "不包含通用词汇(中标公告)",
+            # "0302": {
+            #     "name": "不包含通用词汇(中标公告)",
+            #     "parent_name": "语义表述不完整",
+            #     "parent_code": "03",
+            #     "checkFn": self.check0302
+            # },
+            "0303": {
+                "name": "包含叠词,异常词汇,特殊词汇(测试,公告公告等)",
                 "parent_name": "语义表述不完整",
                 "parent_name": "语义表述不完整",
                 "parent_code": "03",
                 "parent_code": "03",
-                "checkFn": self.check0302
+                "checkFn": self.check0303
             }
             }
         }
         }
 
 
@@ -75,29 +81,34 @@ class ProjectnameChecker(object):
         :param projectname:
         :param projectname:
         :return:返回true 代表异常
         :return:返回true 代表异常
         """
         """
-        #标题中包含异常字符
+        # 项目名称必须以通用词汇结尾
+        with open(general_config["table_field_config"]["path"], "r") as f:
+            reads = csv.reader(f)
+            for w in reads:
+                if w[0] in projectname:
+                    return False
+                else:
+                    return True
+        return True
+    def check0303(self,projectname: str) -> bool:
+        """
+        没有通用后缀
+        :param projectname:
+        :return:返回true 代表异常
+        """
+        # 项目名称中包含异常字符
         with open(abnormal_config["table_field_config"]["path6"], "r") as f:
         with open(abnormal_config["table_field_config"]["path6"], "r") as f:
             reads = csv.reader(f)
             reads = csv.reader(f)
             for w in reads:
             for w in reads:
                 if w[0] in projectname:
                 if w[0] in projectname:
                     return True
                     return True
-        #项目名称以异常字符结尾
+        # 项目名称以异常字符结尾
         with open(abnormal_config["table_field_config"]["path5"], "r") as f:
         with open(abnormal_config["table_field_config"]["path5"], "r") as f:
             reads = csv.reader(f)
             reads = csv.reader(f)
             for w in reads:
             for w in reads:
-                if re.search(f"{w[0]}$", projectname) !=None:
+                if re.search(f"{w[0]}$", projectname) != None:
                     return True
                     return True
         # 项目名称以异常字符开始
         # 项目名称以异常字符开始
         p1 = re.compile(r"^[3|6|7|8|0|\.]")
         p1 = re.compile(r"^[3|6|7|8|0|\.]")
         if p1.match(projectname):
         if p1.match(projectname):
-            return True
-        # 放在最后判断
-        # 项目名称必须以通用词汇结尾
-        with open(general_config["table_field_config"]["path"], "r") as f:
-            reads = csv.reader(f)
-            for w in reads:
-                if re.search(f"{w[0]}$", projectname) != None:
-                    return False
-                else:
-                    return True
-        return True
+            return True

+ 26 - 15
tables/fields/title.py

@@ -30,11 +30,17 @@ class TitleChecker(object):
                 "parent_code": "02",
                 "parent_code": "02",
                 "checkFn": self.check0201
                 "checkFn": self.check0201
             },
             },
-            "0302": {
-                "name": "不包含通用词汇(中标公告)",
+            # "0302": {
+            #     "name": "不包含通用词汇(中标公告)",
+            #     "parent_name": "语义表述不完整",
+            #     "parent_code": "03",
+            #     "checkFn": self.check0302
+            # },
+            "0303": {
+                "name": "包含叠词,异常词汇,特殊词汇(测试,公告公告等)",
                 "parent_name": "语义表述不完整",
                 "parent_name": "语义表述不完整",
                 "parent_code": "03",
                 "parent_code": "03",
-                "checkFn": self.check0302
+                "checkFn": self.check0303
             }
             }
         }
         }
 
 
@@ -70,6 +76,22 @@ class TitleChecker(object):
             return True
             return True
         return False
         return False
     def check0302(self,title: str) -> bool:
     def check0302(self,title: str) -> bool:
+        """
+        没有通用后缀
+        :param title:
+        :return:返回true 代表异常
+        """
+        #标题必须以通用词汇结尾
+        with open(general_config["table_field_config"]["path"], "r") as f:
+            reads = csv.reader(f)
+            for w in reads:
+                if w[0] in title:
+                    return False
+                else:
+                    return True
+        return False
+
+    def check0303(self, title: str) -> bool:
         """
         """
         没有通用后缀
         没有通用后缀
         :param title:
         :param title:
@@ -92,15 +114,4 @@ class TitleChecker(object):
         #标题以异常字符开始
         #标题以异常字符开始
         p1 = re.compile(r"^[3|6|7|8|0|\.]")
         p1 = re.compile(r"^[3|6|7|8|0|\.]")
         if p1.match(title):
         if p1.match(title):
-            return True
-
-        #放在最后判断
-        #标题必须以通用词汇结尾
-        with open(general_config["table_field_config"]["path"], "r") as f:
-            reads = csv.reader(f)
-            for w in reads:
-                if re.search(f"{w[0]}$", title) !=None:
-                    return False
-                else:
-                    return True
-        return False
+            return True

+ 16 - 21
util/get_region.py

@@ -1,16 +1,23 @@
+import os
 import cpca
 import cpca
 import re
 import re
 import pandas as pd
 import pandas as pd
 
 
 def get_city_info(text):
 def get_city_info(text):
-    # 读取区县数据
-    df_county = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//区县.xlsx")
+    # 获取当前脚本所在目录的上一级目录
+    current_dir = os.path.dirname(__file__)
+    parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
 
 
+    # 读取区县数据
+    df_county_addr = os.path.join(parent_dir, 'docs', '区县.xlsx')
+    df_county=pd.read_excel(df_county_addr)
     # 读取乡镇数据
     # 读取乡镇数据
-    df_town = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//乡镇.xlsx")
+    df_town_addr = os.path.join(parent_dir, 'docs', '乡镇.xlsx')
+    df_town = pd.read_excel(df_town_addr)
 
 
     # 读取市级数据
     # 读取市级数据
-    df_city = pd.read_excel("//Users//miaobao//Documents//work//PycharmProjects//data_quality//docs//市.xlsx")
+    df_city_addr = os.path.join(parent_dir, 'docs', '市.xlsx')
+    df_city = pd.read_excel(df_city_addr)
 
 
     # 使用cpca库提取地名
     # 使用cpca库提取地名
     df = cpca.transform([text])
     df = cpca.transform([text])
@@ -26,7 +33,7 @@ def get_city_info(text):
 
 
     if province is None and city is None and district is None:
     if province is None and city is None and district is None:
         # 使用正则表达式提取乡镇信息
         # 使用正则表达式提取乡镇信息
-        towns = re.findall(r'[\u4e00-\u9fa5]+镇|[\u4e00-\u9fa5]+乡', text)
+        towns = re.findall(r'[\u4e00-\u9fa5]+镇|[\u4e00-\u9fa5]+乡|[\u4e00-\u9fa5]+街道|[\u4e00-\u9fa5]+庄|[\u4e00-\u9fa5]+营|[\u4e00-\u9fa5]+店', text)
         if towns:
         if towns:
             for town in towns:
             for town in towns:
                 town_name = None
                 town_name = None
@@ -55,27 +62,15 @@ def get_city_info(text):
                         province = df_city_result.iloc[0]['省']
                         province = df_city_result.iloc[0]['省']
 
 
                     break# 找到乡镇信息后跳出循环
                     break# 找到乡镇信息后跳出循环
-
-            if not province and not city and not district and '区县代码' in df_county.columns:
-                county_code = df.iloc[0]['区县代码']
-                city_info = df_city[df_city['城市代码'] == county_code]
-                if not city_info.empty:
-                    city = city_info.iloc[0]['城市名称']
-
-                    # 将城市名称转换成对应的省份名称
-                    df_city_result = cpca.transform([city])
-                    province = df_city_result.iloc[0]['省']
-
-                county_info = df_county[df_county['区县代码'] == county_code].iloc[0]
-                district = county_info['区县名称']
-
+                else:
+                    continue
     return province, city, district
     return province, city, district
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
 
 
     # 使用方法示例
     # 使用方法示例
-    province, city, district = get_city_info("河南省开发的")
-    if get_city_info("电动蝶阀待开发的")==[None,None,None]:
+    province, city, district = get_city_info("杞县文化广电新闻出版旅游局")
+    if province==None or city==None or district==None:
         print("44444")
         print("44444")
     print(province, city, district)
     print(province, city, district)