|
@@ -0,0 +1,95 @@
|
|
|
+'''
|
|
|
+"01长度类型"--"01、少于5个字 02、超过40字"
|
|
|
+02汉字占比--"01、非汉字:汉字占比<1:3,02、有特殊符号如:!@#¥%……&*"
|
|
|
+03语义表述不完整---"01、仅有单位名称(实体)02、排除通用后缀(中标公告)03、排除敏感词(测试等)"
|
|
|
+
|
|
|
+'''
|
|
|
+from pymongo import MongoClient
|
|
|
+from bson import ObjectId
|
|
|
def pankong(key, item):
    """Return ``item[key]`` when it is present and truthy, otherwise ``""``.

    Args:
        key: Key to look up in ``item``.
        item: Container supporting ``in`` and ``[]`` (for example a document
            dict).

    Returns:
        The stored value, or an empty string when the key is missing or its
        value is falsy (``None``, ``""``, ``0``, ``[]`` ...).
    """
    if key not in item:
        return ""
    value = item[key]
    # Falsy values are normalised to "" so that "missing" and "empty" look
    # the same to the caller.
    return value if value else ""
|
|
|
# Default MongoDB target, used only when the caller passes no collection.
_DEFAULT_MONGO_HOST = '192.168.3.166'
_DEFAULT_MONGO_PORT = 27082
_DEFAULT_DATABASE = "yantianlei"
_DEFAULT_COLLECTION = "2023Zglt_qgyys222"

# Markers that make a project name "abnormal" (code "0303"): a substring
# anywhere in the name, a trailing suffix, or a specific first character.
_ABNORMAL_SUBSTRINGS = ("公告公告", "项目项目", "合同合同", "nbsp", "...")
_ABNORMAL_SUFFIXES = ("--",)
_ABNORMAL_FIRST_CHARS = ("3", "6", "7", "8", "0", ".")


def _projectname_defect_codes(projectname):
    """Return the list of defect codes raised by a non-empty ``projectname``.

    Codes produced:
        "0101": length is 5 or less.
        "0102": length is 100 or more.
        "0201": more than half of the characters fall outside U+4E00..U+9FFF.
        "0303": contains an abnormal substring, ends with an abnormal suffix,
            or starts with an abnormal first character.

    An empty list means none of the rules fired.
    """
    # NOTE(review): the module header describes these limits as "fewer than
    # 5" and "more than 40" characters; the code uses <= 5 and >= 100.
    # Confirm which is intended.
    codes = []
    length = len(projectname)
    if length <= 5:
        codes.append("0101")
    if length >= 100:
        codes.append("0102")

    non_chinese = sum(
        1 for char in projectname if not ('\u4e00' <= char <= '\u9fff')
    )
    if non_chinese / length > 0.5:
        codes.append("0201")

    # Slice/index comparisons (rather than str.startswith/endswith) keep the
    # checks working for any sequence value, as the original code did.
    has_abnormal_word = any(word in projectname for word in _ABNORMAL_SUBSTRINGS)
    has_abnormal_end = any(
        projectname[-len(suffix):] == suffix for suffix in _ABNORMAL_SUFFIXES
    )
    has_abnormal_start = projectname[0] in _ABNORMAL_FIRST_CHARS
    if has_abnormal_word or has_abnormal_end or has_abnormal_start:
        codes.append("0303")
    return codes


def inspect_projectname(collection=None):
    """Check the ``projectname`` of every document and store the result.

    For each document returned by ``collection.find()`` a list of codes is
    written back to that document's ``projectname_qa`` field:

    * ``["0000"]`` when ``projectname`` is missing or falsy;
    * otherwise the codes from ``_projectname_defect_codes`` (possibly an
      empty list).

    Args:
        collection: Object exposing ``find()`` (whose result supports
            ``batch_size``) and ``update_one(filter, update)``. When ``None``
            a connection to the default MongoDB collection is opened and
            closed again once processing finishes.

    Returns:
        None. Results are written to the collection; a progress counter is
        printed every 1000 documents.

    NOTE(review): this function was restored from text that had lost its
    indentation. The trailing ``else`` that records "0000" is read here as
    pairing with the "projectname is non-empty" check; confirm against the
    original file. The original also contained an unreachable
    ``projectname == []`` -> "0402" check inside that non-empty branch, and
    a commented-out generic-keyword rule ("0302"); neither is applied.
    """
    client = None
    if collection is None:
        client = MongoClient(
            _DEFAULT_MONGO_HOST,
            _DEFAULT_MONGO_PORT,
            unicode_decode_error_handler="ignore",
        )
        collection = client[_DEFAULT_DATABASE][_DEFAULT_COLLECTION]

    try:
        cursor = collection.find().batch_size(1000)
        for count, item in enumerate(cursor, start=1):
            if count % 1000 == 0:
                print(count)

            projectname = item.get("projectname")
            if projectname:
                projectname_qa = _projectname_defect_codes(projectname)
            else:
                projectname_qa = ["0000"]

            collection.update_one(
                {"_id": item["_id"]},
                {"$set": {"projectname_qa": projectname_qa}},
            )
    finally:
        # Only close a connection this function opened itself.
        if client is not None:
            client.close()
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Run the inspection only when executed as a script, so importing this
    # module does not trigger a database-wide update.
    inspect_projectname()
|