liumiaomiao 1 rok temu
rodzic
commit
e4fd28c25d

BIN
a2s-0.0.2-py3-none-any.whl


Plik diff jest za duży
+ 10 - 9
client.py


+ 2 - 0
docs/config.py

@@ -27,6 +27,8 @@ abnormal_config = {
         "path2": "./docs/table_head_doc/abnormal_buyer_contain.csv",
         "path3": "./docs/table_head_doc/abnormal_buyer_end.csv",
         "path4": "./docs/table_head_doc/projectcode.csv",
+        "path5": "./docs/table_head_doc/title_abnormal.csv",
+        "path6": "./docs/table_head_doc/title_abnormal_contain.csv",
     }}
 
 # 调试

+ 126 - 128
docs/table_head_doc/general_label.csv

@@ -1,128 +1,126 @@
-"采购计划任务"
-"采购公告"
-"招标文件预公示"
-"招标预公告"
-"计划招标公告"
-"调研公告"
-"预申公告"
-"预审文件"
-"预审公告更正公告"
-"预审结果"
-"预审结果公示"
-"预审结果"
-"预审结果的公示"
-"预审结果变更"
-"论证意见公示"
-"需求论证公示"
-"征求意见公告"
-"进口产品公示"
-"需求公告"
-"直接采购公告"
-"需求公示"
-"采购公告"
-"采购项目"
-"项目公告"
-"招标公告"
-"意向公开"
-"比选公告"
-"邀请公告"
-"采购邀请"
-"邀请书"
-"邀请函"
-"询价采购"
-"询价公告"
-"比价公告"
-"比价项目公告"
-"建设项目"
-"中标(成交)公告"
-"服务"
-"项目"
-"合同公告"
-"比价项目公告"
-"比质比价"
-"比质比价公告"
-"询价采购公告"
-"询价书"
-"询价单"
-"询价采购"
-"公开询价"
-"询价邀请书"
-"竞价交易公告"
-"竞价公告"
-"竞价项目"
-"竞价的公告"
-"竞价采购公告"
-"变更公告"
-"更正公告"
-"暂停公告"
-"候选人公示"
-"候选人公示"
-"结果公告"
-"结果公示"
-"服务项目"
-"成交公告"
-"中选人公示"
-"中选结果公示"
-"中标公示"
-"中标公告"
-"成交公示"
-"废标公告"
-"终止公告"
-"异常公告"
-"流标公告"
-"失败公告"
-"合同公告"
-"合同信息"
-"采购合同"
-"销售合同"
-"集成合同"
-"项目合同"
-"合同公示"
-"服务合同"
-"验收公告"
-"验收单公示"
-"标段"
-"单一来源采购公示"
-"考试培训"
-"比价单"
-"项目询价"
-"采购项目"
-"询价"
-"竞争性谈判"
-"竞争性磋商"
-"单一来源"
-"竞价处置公告"
-"网上竞价"
-"结果公告"
-"结果公示"
-"中标公示"
-"中标公告"
-"公告"
-"合同"
-"评审失败"
-"招标失败"
-"比选失败"
-"采购失败"
-"流标公示"
-"项目合同"
-"验收结果"
-"验收公告"
-"行政处罚"
-"违约行为"
-"投诉"
-"不良行为"
-"协议书"
-"候选人"
-"购置"
-"公示"
-"建设"
-"中标"
-"招标"
-"工程"
-"采购"
-"成交通知书"
-"公告公告"
-"项目项目"
-"合同合同"
-"nbsp"
-"..."
+采购计划任务
+采购公告
+招标文件预公示
+招标预公告
+计划招标公告
+调研公告
+预申公告
+预审文件
+预审公告更正公告
+预审结果
+预审结果公示
+预审结果
+预审结果的公示
+预审结果变更
+论证意见公示
+需求论证公示
+征求意见公告
+进口产品公示
+需求公告
+直接采购公告
+需求公示
+采购公告
+采购项目
+项目公告
+招标公告
+意向公开
+比选公告
+邀请公告
+采购邀请
+邀请书
+邀请函
+询价采购
+询价公告
+比价公告
+比价项目公告
+建设项目
+中标(成交)公告
+服务
+项目
+合同公告
+比价项目公告
+比质比价
+比质比价公告
+询价采购公告
+询价书
+询价单
+询价采购
+公开询价
+询价邀请书
+竞价交易公告
+竞价公告
+竞价项目
+竞价的公告
+竞价采购公告
+变更公告
+更正公告
+暂停公告
+候选人公示
+候选人公示
+结果公告
+结果公示
+服务项目
+成交公告
+中选人公示
+中选结果公示
+中标公示
+中标公告
+成交公示
+废标公告
+终止公告
+异常公告
+流标公告
+失败公告
+合同公告
+合同信息
+采购合同
+销售合同
+集成合同
+项目合同
+合同公示
+服务合同
+验收公告
+验收单公示
+标段
+单一来源采购公示
+考试培训
+比价单
+项目询价
+采购项目
+询价
+竞争性谈判
+竞争性磋商
+单一来源
+竞价处置公告
+网上竞价
+结果公告
+结果公示
+中标公示
+中标公告
+公告
+合同
+评审失败
+招标失败
+比选失败
+采购失败
+流标公示
+项目合同
+验收结果
+验收公告
+行政处罚
+违约行为
+投诉
+不良行为
+协议书
+候选人
+购置
+公示
+建设
+中标
+招标
+工程
+采购
+成交通知书
+公告公告
+项目项目
+合同合同

+ 1 - 0
docs/table_head_doc/title_abnormal.csv

@@ -0,0 +1 @@
+nbsp

+ 4 - 0
docs/table_head_doc/title_abnormal_contain.csv

@@ -0,0 +1,4 @@
+nbsp
+...
+.
+..

+ 13 - 120
tables/fields/NoField.py

@@ -31,44 +31,13 @@ class NoFieldChecker(object):
         :param obj:代表一个item
         :return:返回true 代表异常
         """
-        self.check_bidamount_ac = AcAutomation()
-        with open(amount_config["table_field_config"]["path"], "r") as f:
-            reads = csv.reader(f)
-            [self.check_bidamount_ac.add_word(w[0]) for w in reads]
-
-        detail = obj.get("detail", "")
-        attach_text = obj.get("attach_text", {})
         subtype = obj.get("subtype", "")
         if subtype in ["中标", "成交","合同","验收"]:
-            contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告") #返回值是字典
-            content = "\n".join(contents) #字典处理成字符串
-            if self.check_bidamount_ac.search(content):
-                return True
-
-            for attach_index, attach_content in attach_text.items():
-                if attach_content:
-                    for topic_index, topic_detail in attach_content.items():
-                        # oss地址
-                        attach_url = topic_detail.get("attach_url", "")
-                        if attach_url:
-                            # 获取附件内容
-                            st, content = fsc.download_text_content(attach_url)
-
-                            # 下载成功
-                            # 超长文本不处理,暂定30万字
-                            if st and content.strip():
-                                if len(content) > 300000:
-                                    continue
-                            # 开始检测
-                            contents = catch_content.public_attachment_catch(content, platform="attach",document_id=attach_url)
-                            content = "\n".join(contents)
-                            if self.check_bidamount_ac.search(content):
-                                    return True
-            return False
+            bidamount = obj.get("bidamount", "")
+            if bidamount:
+                return False
+            return True
         return False
-        # 处理正文
-        # 检查因素
-        # 是否返回 0000
 
     def check_winner(self,obj, catch_content: CatchContentObject) -> bool:
         """
@@ -83,9 +52,7 @@ class NoFieldChecker(object):
                 return False
             return True
         return  False
-        # 处理正文
-        # 检查因素
-        # 是否返回 0000
+
 
     def check_buyer(self,obj,catch_content: CatchContentObject) -> bool:
         """
@@ -99,55 +66,19 @@ class NoFieldChecker(object):
             return False
         return True
 
-        # 处理正文
-        # 检查因素
-        # 是否返回 0000
-
     def check_budget(self,obj, catch_content: CatchContentObject) -> bool:
         """
         预算为空检测
         :param obj:代表一个item
         :return:返回true 代表异常
         """
-        self.check_budget_ac = AcAutomation()
-        with open(budget_config["table_field_config"]["path"],"r") as f :
-            reads=csv.reader(f)
-            [self.check_budget_ac.add_word(w[0]) for w in reads ]
-
-        detail = obj.get("detail", "")
-        attach_text = obj.get("attach_text", {})
         subtype = obj.get("subtype", "")
         if subtype not in ["中标", "成交", "合同", "验收"]:
-            contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告")  # 返回值是字典
-            content = "\n".join(contents)  # 字典处理成字符串
-            if self.check_budget_ac.search(content):
-                return True
-
-            for attach_index, attach_content in attach_text.items():
-                if attach_content:
-                    for topic_index, topic_detail in attach_content.items():
-                        # oss地址
-                        attach_url = topic_detail.get("attach_url", "")
-                        if attach_url:
-                            # 获取附件内容
-                            st, content = fsc.download_text_content(attach_url)
-
-                            # 下载成功
-                            # 超长文本不处理,暂定30万字
-                            if st and content.strip():
-                                if len(content) > 300000:
-                                    continue
-                            # 开始检测
-                            contents = catch_content.public_attachment_catch(content, platform="attach",
-                                                                             document_id=attach_url)
-                            content = "\n".join(contents)
-                            if self.check_budget_ac.search(content):
-                                return True
-            return False
+            budget = obj.get("budget", "")
+            if budget:
+                return False
+            return True
         return False
-        # 处理正文
-        # 检查因素
-        # 是否返回 0000
 
     def check_region(self,obj, catch_content: CatchContentObject) -> bool:
         """
@@ -169,9 +100,6 @@ class NoFieldChecker(object):
         if title :
             return False
         return True
-        # 处理正文
-        # 检查因素
-        # 是否返回 0000
 
     def check_projectname(self,obj, catch_content: CatchContentObject) -> bool:
         """
@@ -182,9 +110,7 @@ class NoFieldChecker(object):
         if projectname :
             return False
         return True
-        # 处理正文
-        # 检查因素
-        # 是否返回 0000
+
 
     def check_projectcode(self,obj, catch_content: CatchContentObject) -> bool:
         """
@@ -192,44 +118,11 @@ class NoFieldChecker(object):
         :param obj:代表一个item
         :return:返回true 代表异常
         """
-        self.check_projectcode_ac = AcAutomation()
-        with open(abnormal_config["table_field_config"]["path4"], "r") as f:
-            reads = csv.reader(f)
-            [self.check_projectcode_ac.add_word(w[0]) for w in reads]
-
         projectcode = obj.get("projectcode", "")
-        detail = obj.get("detail", "")
-        attach_text = obj.get("attach_text", {})
-        if projectcode == "":
-            contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告") #返回值是字典
-            content = "\n".join(contents) #字典处理成字符串
-            if self.check_projectcode_ac.search(content):
-                return True
-
-            for attach_index, attach_content in attach_text.items():
-                if attach_content:
-                    for topic_index, topic_detail in attach_content.items():
-                        # oss地址
-                        attach_url = topic_detail.get("attach_url", "")
-                        if attach_url:
-                            # 获取附件内容
-                            st, content = fsc.download_text_content(attach_url)
-
-                            # 下载成功
-                            # 超长文本不处理,暂定30万字
-                            if st and content.strip():
-                                if len(content) > 300000:
-                                    continue
-                            # 开始检测
-                            contents = catch_content.public_attachment_catch(content, platform="attach",document_id=attach_url)
-                            content = "\n".join(contents)
-                            if self.check_projectcode_ac.search(content):
-                                    return True
+        if projectcode:
             return False
-        return False
-        # 处理正文
-        # 检查因素
-        # 是否返回 0000
+        return True
+
     def check_subpackage(self,obj, catch_content: CatchContentObject) -> bool:
         """
         公司名称检测

+ 26 - 20
tables/fields/buyer.py

@@ -26,18 +26,18 @@ class BuyerChecker(object):
         采购单位0101判断不准确,备用
         """
         self.errors_tables = {
-            "0101": {
-                "name": "实体识别",
-                "parent_name": "名称错误",
-                "parent_code": "01",
-                "checkFn": self.check0101
-            },
-            "0201": {
-                "name": "看数据的标签是不是采购单位",
-                "parent_name": "数据标签错误",
-                "parent_code": "02",
-                "checkFn": self.check0201
-            },
+            # "0101": {
+            #     "name": "实体识别",
+            #     "parent_name": "名称错误",
+            #     "parent_code": "01",
+            #     "checkFn": self.check0101
+            # },
+            # "0201": {
+            #     "name": "看数据的标签是不是采购单位",
+            #     "parent_name": "数据标签错误",
+            #     "parent_code": "02",
+            #     "checkFn": self.check0201
+            # },
             "0103": {
                 "name": "包含叠词,异常词汇,特殊词汇",
                 "parent_name": "名称错误",
@@ -281,7 +281,10 @@ class BuyerChecker(object):
         return True
 
     def check0103(self, buyer: str):
-
+        """
+        return  True 代表异常
+        """
+        # 采购单位名称以异常词开始
         with open(abnormal_config["table_field_config"]["path1"], "r") as f:
             reads = csv.reader(f)
             for n in reads:
@@ -289,24 +292,27 @@ class BuyerChecker(object):
                 if p1.match(buyer):
                     return True
 
-        # 包含词 使用敏感词检验方法
+        # 采购单位名称中包含异常
         self.check_abnormal_ac = AcAutomation()
         with open(abnormal_config["table_field_config"]["path2"], "r") as f:
             reads = csv.reader(f)
-            [self.check_abnormal_ac.add_word(w[0]) for w in reads]
-        if self.check_abnormal_ac.search(buyer):
-            return True
+            for k in reads:
+                if k[0] in (buyer):
+                    return True
 
+        # 采购单位名称以异常词结尾
         with open(abnormal_config["table_field_config"]["path3"], "r") as f:
             reads = csv.reader(f)
             for m in reads:
-                p2 = re.compile(".*$" + m[0])
-                if p2.match(buyer):
+                p2 = re.compile(f"{m[0]}$")
+                if p2.search(buyer):
                     return True
         return False
+        # 如果采购单位类型in ("学校","教育","卫健委","医疗","政府办","政务中心"),则采购单位名称中一般都含有地名
 
     def check0104(self, buyer: str, buyerclass: str):
         if buyerclass in ("学校", "教育", "卫健委", "医疗", "政府办", "政务中心"):
-            if get_city_info(buyer) == [None, None, None]:
+            province, city, district = get_city_info(buyer)
+            if province == None and city == None and district == None:
                 return True
         return False

+ 24 - 11
tables/fields/projectname.py

@@ -5,6 +5,8 @@ import re
 from docs.config import general_config
 from util.sensitive_word import AcAutomation
 import csv
+from docs.config import abnormal_config
+
 class ProjectnameChecker(object):
     """
         项目名称字段检查
@@ -70,21 +72,32 @@ class ProjectnameChecker(object):
     def check0302(self,projectname: str) -> bool:
         """
         没有通用后缀
-        :param title:
+        :param projectname:
         :return:返回true 代表异常
         """
-        self.check_general_ac = AcAutomation()
-        with open(general_config["table_field_config"]["path"], "r") as f:
+        #标题中包含异常字符
+        with open(abnormal_config["table_field_config"]["path6"], "r") as f:
+            reads = csv.reader(f)
+            for w in reads:
+                if w[0] in projectname:
+                    return True
+        #项目名称以异常字符结尾
+        with open(abnormal_config["table_field_config"]["path5"], "r") as f:
             reads = csv.reader(f)
-            [self.check_general_ac.add_word(w[0]) for w in reads]
+            for w in reads:
+                if re.search(f"{w[0]}$", projectname) !=None:
+                    return True
+        # 项目名称以异常字符开始
         p1 = re.compile(r"^[3|6|7|8|0|\.]")
-        p2 = re.compile(".*--")
         if p1.match(projectname):
-            # print(11111)
             return True
-        if p2.match(projectname):
-            # print(2222)
-            return True
-        if self.check_general_ac.search(projectname):
-            return False
+        # 放在最后判断
+        # 项目名称必须以通用词汇结尾
+        with open(general_config["table_field_config"]["path"], "r") as f:
+            reads = csv.reader(f)
+            for w in reads:
+                if re.search(f"{w[0]}$", projectname) != None:
+                    return False
+                else:
+                    return True
         return True

+ 1 - 0
tables/fields/purchasing.py

@@ -363,6 +363,7 @@ class PurchasingChecker(object):
         """
         purchasing_evaluate_list, score = purchasing_evaluate_start(purchasinglist, purchasingsource)
         print(score)
+        #大于0.85的通过验证的
         if score < 0.85:
             return True
         else:

+ 26 - 12
tables/fields/title.py

@@ -4,7 +4,7 @@
 import re
 
 from docs.config import general_config
-from util.sensitive_word import AcAutomation
+from docs.config import abnormal_config
 import csv
 class TitleChecker(object):
     """
@@ -75,18 +75,32 @@ class TitleChecker(object):
         :param title:
         :return:返回true 代表异常
         """
-        self.check_general_ac = AcAutomation()
-        with open(general_config["table_field_config"]["path"], "r") as f:
+        #标题中包含异常字符
+        with open(abnormal_config["table_field_config"]["path6"], "r") as f:
+            reads = csv.reader(f)
+            for w in reads:
+                if w[0] in title:
+                    return True
+        #标题以异常字符结尾
+        #re.search()匹配整个字符串,并返回第一个成功的匹配,如果匹配失败,则返回None
+        with open(abnormal_config["table_field_config"]["path5"], "r") as f:
             reads = csv.reader(f)
-            [self.check_general_ac.add_word(w[0]) for w in reads]
+            for w in reads:
+                ret=re.search(f"{w[0]}$", title)
+                if  ret != None:
+                    return True
+        #标题以异常字符开始
         p1 = re.compile(r"^[3|6|7|8|0|\.]")
-        p2 = re.compile(".*--")
         if p1.match(title):
-            print(11111)
-            return True
-        if p2.match(title):
-            print(2222)
             return True
-        if self.check_general_ac.search(title):
-            return False
-        return True
+
+        #放在最后判断
+        #标题必须以通用词汇结尾
+        with open(general_config["table_field_config"]["path"], "r") as f:
+            reads = csv.reader(f)
+            for w in reads:
+                if re.search(f"{w[0]}$", title) !=None:
+                    return False
+                else:
+                    return True
+        return False

+ 23 - 24
tables/fields/winner.py

@@ -21,18 +21,18 @@ class WinnerChecker(object):
 
     def __init__(self):
         self.errors_tables = {
-            "0101": {
-                "name": "实体识别",
-                "parent_name": "名称错误",
-                "parent_code": "01",
-                "checkFn": self.check0101
-            },
-            "0201": {
-                "name": "看数据的标签是不是之中标单位",
-                "parent_name": "数据标签错误",
-                "parent_code": "02",
-                "checkFn": self.check0201
-            },
+            # "0101": {
+            #     "name": "实体识别",
+            #     "parent_name": "名称错误",
+            #     "parent_code": "01",
+            #     "checkFn": self.check0101
+            # },
+            # "0201": {
+            #     "name": "看数据的标签是不是之中标单位",
+            #     "parent_name": "数据标签错误",
+            #     "parent_code": "02",
+            #     "checkFn": self.check0201
+            # },
             "0103": {
                 "name": "包含叠词,异常词汇,特殊词汇",
                 "parent_name": "名称错误",
@@ -265,26 +265,25 @@ class WinnerChecker(object):
                             return False
         return True
 
-    def check0103(self, winner: str):
+    def check0103(self,winner:str):
+        #中标单位名称以异常词开始
         with open(abnormal_config["table_field_config"]["path1"], "r") as f:
             reads = csv.reader(f)
-            for n in reads:
-                p1 = re.compile("^" + n[0])
+            for n in  reads:
+                p1 = re.compile("^"+n[0])
                 if p1.match(winner):
                     return True
 
-        # 包含词 使用敏感词检验方法
-        self.check_abnormal_ac = AcAutomation()
+        # 中标单位名称包含异常词
         with open(abnormal_config["table_field_config"]["path2"], "r") as f:
             reads = csv.reader(f)
-            [self.check_abnormal_ac.add_word(w[0]) for w in reads]
-        if self.check_abnormal_ac.search(winner):
-            return True
-
+            for n in  reads:
+                if n[0] in winner:
+                    return True
+        # 中标单位名称以异常词结尾
         with open(abnormal_config["table_field_config"]["path3"], "r") as f:
             reads = csv.reader(f)
-            for m in reads:
-                p2 = re.compile(".*$" + m[0])
-                if p2.match(winner):
+            for w in reads:
+                if re.search(f"{w[0]}$", winner):
                     return True
         return False

Niektóre pliki nie zostały wyświetlone z powodu dużej ilości zmienionych plików