# coding:utf-8
import csv

from tables import CatchContentObject, fsc
from util.sensitive_word import AcAutomation
from docs.config import amount_config
from docs.config import budget_config
from docs.config import DEBUG
from docs.config import abnormal_config


class NoFieldChecker(object):
    """Checks for missing fields or empty values on a single item.

    ``errors_tables`` maps a field name to the bound method that checks it.
    Every check takes ``(obj, catch_content)`` and returns ``True`` when the
    item is considered abnormal for that field.

    Keyword automatons are built from CSV files on first use and then cached
    on the instance, so a changed CSV is only picked up by a new instance.
    """

    # Subtypes treated as award-stage announcements by the checks below.
    _AWARD_SUBTYPES = ("中标", "成交", "合同", "验收")

    # Downloaded attachment texts longer than this many characters are skipped.
    _MAX_ATTACH_CHARS = 300000

    def __init__(self):
        self.errors_tables = {
            "title": self.check_title,
            "projectname": self.check_projectname,
            "buyer": self.check_buyer,
            "winner": self.check_winner,
            "budget": self.check_budget,
            "bidamount": self.check_bidamount,
            "area": self.check_region,
            "projectcode": self.check_projectcode,
            "multipackage": self.check_subpackage,
        }

    def _get_automaton(self, attr_name, csv_path):
        """Return the keyword automaton stored under ``attr_name``, building it once.

        The first column of every non-empty row of ``csv_path`` is added as a
        keyword. The automaton is stored on ``self`` under ``attr_name`` so
        later calls reuse it instead of re-reading the file.
        """
        automaton = getattr(self, attr_name, None)
        if automaton is None:
            automaton = AcAutomation()
            # newline="" is what the csv module expects for file objects.
            # NOTE(review): the platform default encoding is still used, as in
            # the original code - confirm the CSV encoding before pinning one.
            with open(csv_path, "r", newline="") as f:
                for row in csv.reader(f):
                    if row:  # blank lines come back as [] and have no column 0
                        automaton.add_word(row[0])
            setattr(self, attr_name, automaton)
        return automaton

    def _keyword_in_sources(self, automaton, obj, catch_content):
        """Return True if ``automaton`` matches the announcement or any attachment.

        The announcement HTML (``obj["detail"]``) is scanned first; after that
        every attachment that has an ``attach_url`` is downloaded and scanned.
        Scanning stops at the first match.
        """
        detail = obj.get("detail", "")
        # NOTE(review): the original comment says public_attachment_catch
        # returns a dict; joining a dict joins its keys - confirm the type.
        pieces = catch_content.public_attachment_catch(
            detail, platform="html", document_id="公告")
        if automaton.search("\n".join(pieces)):
            return True

        # "attach_text" may be present but null, so fall back to an empty dict.
        attach_text = obj.get("attach_text") or {}
        for attach_content in attach_text.values():
            if not attach_content:
                continue
            for topic_detail in attach_content.values():
                # Storage (OSS) address of the attachment.
                attach_url = topic_detail.get("attach_url", "")
                if not attach_url:
                    continue
                # Fetch the attachment text; skip failed or blank downloads.
                ok, text = fsc.download_text_content(attach_url)
                if not ok or not text or not text.strip():
                    continue
                # Over-long texts are not processed.
                if len(text) > self._MAX_ATTACH_CHARS:
                    continue
                pieces = catch_content.public_attachment_catch(
                    text, platform="attach", document_id=attach_url)
                if automaton.search("\n".join(pieces)):
                    return True
        return False

    def check_bidamount(self, obj, catch_content: CatchContentObject) -> bool:
        """Bid-amount check for award-stage items.

        For items whose ``subtype`` is one of the award-stage subtypes, return
        True when a keyword from the amount CSV occurs in the announcement or
        in one of its attachments. Other subtypes always return False.

        NOTE(review): the item's own bid-amount value is not inspected here;
        presumably the caller only runs this when that value is empty.

        :param obj: one item (dict-like)
        :param catch_content: extractor used to pull text out of html/attachments
        :return: True means abnormal
        """
        automaton = self._get_automaton(
            "check_bidamount_ac", amount_config["table_field_config"]["path"])
        if obj.get("subtype", "") not in self._AWARD_SUBTYPES:
            return False
        return self._keyword_in_sources(automaton, obj, catch_content)
    # Template notes: process body text / check factors / whether to return 0000

    def check_winner(self, obj, catch_content: CatchContentObject) -> bool:
        """Winner-name emptiness check.

        Only award-stage subtypes are checked; every other subtype returns
        False regardless of the ``winner`` field.

        :param obj: one item (dict-like)
        :return: True means abnormal (award-stage item with an empty winner)
        """
        if obj.get("subtype", "") not in self._AWARD_SUBTYPES:
            return False
        return not obj.get("winner", "")
    # Template notes: process body text / check factors / whether to return 0000

    def check_buyer(self, obj, catch_content: CatchContentObject) -> bool:
        """Buyer-name emptiness check.

        :param obj: one item (dict-like); ``buyer`` holds the purchasing
            unit(s), comma separated when there are several
        :return: True means abnormal (``buyer`` missing or empty)
        """
        return not obj.get("buyer", "")
    # Template notes: process body text / check factors / whether to return 0000

    def check_budget(self, obj, catch_content: CatchContentObject) -> bool:
        """Budget check for items that are NOT award-stage.

        For items whose ``subtype`` is not one of the award-stage subtypes,
        return True when a keyword from the budget CSV occurs in the
        announcement or in one of its attachments. Award-stage items always
        return False.

        NOTE(review): the item's own budget value is not inspected here;
        presumably the caller only runs this when that value is empty.

        :param obj: one item (dict-like)
        :param catch_content: extractor used to pull text out of html/attachments
        :return: True means abnormal
        """
        automaton = self._get_automaton(
            "check_budget_ac", budget_config["table_field_config"]["path"])
        if obj.get("subtype", "") in self._AWARD_SUBTYPES:
            return False
        return self._keyword_in_sources(automaton, obj, catch_content)
    # Template notes: process body text / check factors / whether to return 0000

    def check_region(self, obj, catch_content: CatchContentObject) -> bool:
        """Region emptiness check - not implemented yet.

        :param obj: one item (dict-like)
        :return: always False (the original stub fell through and returned
            None, which is equally falsy)
        """
        return False
    # Template notes: process body text / check factors / whether to return 0000

    def check_title(self, obj, catch_content: CatchContentObject) -> bool:
        """Title emptiness check.

        :param obj: one item (dict-like)
        :return: True means abnormal (``title`` missing or empty)
        """
        return not obj.get("title", "")
    # Template notes: process body text / check factors / whether to return 0000

    def check_projectname(self, obj, catch_content: CatchContentObject) -> bool:
        """Project-name emptiness check.

        :param obj: one item (dict-like)
        :return: True means abnormal (``projectname`` missing or empty)
        """
        return not obj.get("projectname", "")
    # Template notes: process body text / check factors / whether to return 0000

    def check_projectcode(self, obj, catch_content: CatchContentObject) -> bool:
        """Project-code emptiness check.

        When ``projectcode`` is the empty string (or missing), return True if
        a keyword from the project-code CSV occurs in the announcement or in
        one of its attachments.

        NOTE(review): only ``""`` counts as empty, exactly as before; a
        ``None`` value is treated as "has a code" - confirm that is intended.

        :param obj: one item (dict-like)
        :param catch_content: extractor used to pull text out of html/attachments
        :return: True means abnormal
        """
        automaton = self._get_automaton(
            "check_projectcode_ac", abnormal_config["table_field_config"]["path4"])
        if obj.get("projectcode", "") != "":
            return False
        return self._keyword_in_sources(automaton, obj, catch_content)
    # Template notes: process body text / check factors / whether to return 0000

    def check_subpackage(self, obj, catch_content: CatchContentObject) -> bool:
        """Check registered under "multipackage" - not implemented yet.

        NOTE(review): the original docstring described this as a company-name
        check, which does not match the method name - confirm the intent.

        :param obj: one item (dict-like)
        :return: always False (the original stub fell through and returned
            None, which is equally falsy)
        """
        return False
    # Template notes: process body text / check factors / whether to return 0000