# coding:utf-8
from tables.ai import org_ner
from tables import clear_spacing
from html_table_extractor.extractor import Extractor
from tables import CatchContentObject, fsc
from util.sensitive_word import AcAutomation
from tables import match_company_index, key_value_header
from tables import tfc_object
from docs.config import ai2config, abnormal_config
from docs.config import DEBUG
import csv
import re

pattern = r',|。|\?|!|;'


class WinnerChecker(object):
    """Checks on the "winning bidder" (中标单位) field of a notice.

    ``errors_tables`` maps an error code to its display names and to the
    bound method that implements the check.  Checks 0101 and 0201 are
    implemented below but are currently not registered.
    """

    # Notice subtypes for which the winner-name checks are evaluated.
    _WINNER_SUBTYPES = ("中标", "成交", "合同", "验收")

    # Attachments longer than this many characters are skipped.
    _MAX_ATTACHMENT_CHARS = 300000

    def __init__(self):
        """Register the active checks and load the winner-label keywords."""
        self.errors_tables = {
            # Disabled checks, kept verbatim so they can be re-enabled:
            # "0101": {
            #     "name": "实体识别",
            #     "parent_name": "名称错误",
            #     "parent_code": "01",
            #     "checkFn": self.check0101
            # },
            # "0201": {
            #     "name": "看数据的标签是不是之中标单位",
            #     "parent_name": "数据标签错误",
            #     "parent_code": "02",
            #     "checkFn": self.check0201
            # },
            "0103": {
                "name": "包含叠词,异常词汇,特殊词汇",
                "parent_name": "名称错误",
                "parent_code": "01",
                "checkFn": self.check0103
            },
            "0104": {
                "name": "中标单位包含采购单位",
                "parent_name": "名称错误",
                "parent_code": "01",
                "checkFn": self.check0104
            },
            "0301": {
                "name": "中标单位名称长度<3",
                "parent_name": "名称长度异常错误",
                "parent_code": "03",
                "checkFn": self.check0301
            }
        }
        # The matcher must exist before words are added to it below.
        self.winner_ac = AcAutomation()
        # NOTE(review): the file is opened with the locale's default
        # encoding, as before - confirm the corpus encoding.
        with open(ai2config["table_field_config"]["corpus_path"], "r") as f:
            for row in csv.reader(f):
                # Rows need a keyword (column 0) and a label (column 1);
                # shorter rows (e.g. blank lines) are skipped.
                if len(row) < 2:
                    continue
                word = row[0].strip()
                if word and "中标单位" in row[1]:
                    self.winner_ac.add_word(word)

    def intention_check(self, header):
        """Return True when ``header`` is recognised as a winner label.

        The header is first looked up in ``self.winner_ac.catch``; if it
        is not there, ``tfc_object.predict`` is asked and the first
        prediction is searched for "中标单位".

        :param header: candidate label text
        :return: True when recognised, otherwise False
        """
        if header in self.winner_ac.catch:
            if DEBUG:
                print(f"中标单位意图:::>> **{header}**==> [中标单位]")
            return True
        tags = tfc_object.predict([header])
        if tags and "中标单位" in tags[0]:
            if DEBUG:
                print(f"中标单位意图:::>> **{header}**==> {tags}")
            return True
        return False

    def winner_intention_table(self, tables: list, companies):
        """Confirm companies whose table cell is labelled as the winner.

        For every cell that contains one or more of the pending company
        names, the cell to its left and the cell above it are passed to
        ``intention_check``.  When either is a winner label, the
        companies found in that cell are removed from ``companies``.
        Cells that also trigger ``self.winner_ac.search`` are handed to
        ``winner_intention_content`` for an in-text check.

        ``companies`` is modified in place.

        :param tables: table as a list of rows, each a list of cell texts
        :param companies: company names still to be confirmed
        :return: False once every company is confirmed, otherwise the
            list of companies still pending
        """
        if not companies:
            return False
        for row_ind, row in enumerate(tables):
            for col_ind, column in enumerate(row):
                # Literal substring test: company names routinely contain
                # regex metacharacters such as parentheses.
                matched = [name for name in companies if name in column]
                if not matched:
                    continue

                confirmed = (
                    col_ind > 0 and self.intention_check(row[col_ind - 1])
                )
                if not confirmed and row_ind > 0:
                    above = tables[row_ind - 1]
                    if len(above) > col_ind:
                        confirmed = self.intention_check(above[col_ind])

                if confirmed:
                    # Remove only the companies found in this cell; the
                    # names were collected before any removal, so the
                    # list is never mutated while being iterated.
                    for name in matched:
                        if name in companies:
                            companies.remove(name)
                    if not companies:
                        return False

                if self.winner_ac.search(column):
                    companies = self.winner_intention_content(
                        companies, column)
                    if not companies:
                        return False
        return companies

    def winner_intention_content(self, companies, column):
        """Confirm companies that follow a winner label inside free text.

        For each company occurrence reported by ``match_company_index``
        a window of text around the occurrence is passed to
        ``key_value_header``; if any returned header passes
        ``intention_check`` the company is removed from ``companies``.

        ``companies`` is modified in place.

        :param companies: company names still to be confirmed
        :param column: text to search
        :return: False once every company is confirmed, otherwise the
            list of companies still pending
        """
        # (start, end, company_name) for each occurrence in the text.
        indexes = match_company_index(companies, column)
        if not indexes:
            return companies

        start_ind = 0
        for start, end, company_name in indexes:
            if company_name not in companies:
                start_ind = end
                continue
            # Up to 10 characters of context on each side, but never
            # reaching back before the end of the previous occurrence.
            start_ind = max(start_ind, start - 10)
            text_ = column[start_ind:end + 10]
            start_ind = end
            for val, _ind in key_value_header(text_):
                if self.intention_check(val) and company_name in companies:
                    # ``companies`` is a list, so remove by value.
                    companies.remove(company_name)
                    if not companies:
                        return False
        return companies

    @staticmethod
    def check_company_name(contents: list, companies: list):
        """Company-name detection over the split document text.

        NOTE(review): the body of this method is damaged in the source -
        everything between the first ``if "<...`` test and the trailing
        debug print was lost.  The surviving fragments show that it
        built a merged text list, tested segments for markup, removed
        each detected company from ``companies`` and returned False once
        none remained (otherwise the remaining list).  The detection
        step itself is not recoverable from the file, so this method
        fails loudly instead of guessing; restore it from version
        control before re-enabling check 0101.

        :param contents: document text split into segments
        :param companies: company names still to be confirmed
        :return: False to stop processing, or the list still pending
        """
        raise NotImplementedError(
            "check_company_name: method body was lost from the source "
            "file and must be restored from version control"
        )

    def check_intention(self, contents: list, companies: list):
        """Run the winner-label checks over every content segment.

        NOTE(review): only the first statements of this method survive
        in the source (``for ind, content in enumerate(contents): if
        "<...``).  The dispatch below - table markup to
        ``winner_intention_table`` via ``Extractor``, everything else to
        ``winner_intention_content`` - is reconstructed from those
        sibling methods and the ``Extractor`` import; the exact markup
        test is an assumption.  Confirm against version control.

        :param contents: document text split into segments
        :param companies: company names still to be confirmed
        :return: False once every company is confirmed, otherwise the
            list of companies still pending
        """
        for content in contents:
            if "<table" in content:
                extractor = Extractor(content)
                extractor.parse()
                companies = self.winner_intention_table(
                    extractor.return_list(), companies)
            else:
                companies = self.winner_intention_content(
                    companies, content)
            if not companies:
                return False
        return companies

    def _unconfirmed_after_scan(self, winner, detail, attach_text,
                                catch_content, checker):
        """Apply ``checker`` to the notice body and then each attachment.

        Shared driver for check0101 and check0201.

        :param winner: winner names joined with ","
        :param detail: notice body
        :param attach_text: attachment parse result (nested dicts whose
            leaf dicts may carry an "attach_url")
        :param catch_content: per-record content cache
        :param checker: callable ``(contents, companies)`` returning the
            companies still pending, or a falsy value when none remain
        :return: True when some company is still unconfirmed after all
            sources were scanned, False otherwise
        """
        companies = [company for company in winner.split(",") if company]

        contents = catch_content.public_attachment_catch(
            detail, platform="html", document_id="公告")
        companies = checker(contents, companies)
        if not companies:
            return False

        for attach_content in attach_text.values():
            if not attach_content:
                continue
            for topic_detail in attach_content.values():
                attach_url = topic_detail.get("attach_url", "")
                if not attach_url:
                    continue
                st, content = fsc.download_text_content(attach_url)
                # Skip failed or empty downloads.
                if not st or not content.strip():
                    continue
                # Over-long attachments are not processed.
                if len(content) > self._MAX_ATTACHMENT_CHARS:
                    continue
                contents = catch_content.public_attachment_catch(
                    content, platform="attach", document_id=attach_url)
                companies = checker(contents, companies)
                if not companies:
                    return False
        return True

    def check0101(self, winner: str, detail: str, attach_text: dict,
                  catch_content: CatchContentObject) -> bool:
        """Company-name detection over the notice and its attachments.

        NOTE(review): the ``def`` line of this method was lost in the
        source; the signature is restored from the docstring parameters,
        the names used in the body and the parallel ``check0201``.

        :param winner: winner names joined with ","
        :param detail: notice body
        :param attach_text: attachment parse result
        :param catch_content: per-record content cache
        :return: True means anomaly
        """
        return self._unconfirmed_after_scan(
            winner, detail, attach_text, catch_content,
            self.check_company_name)

    def check0201(self, winner: str, detail: str, attach_text: dict,
                  catch_content: CatchContentObject) -> bool:
        """Winner-label detection over the notice and its attachments.

        :param winner: winner names joined with ","
        :param detail: notice body
        :param attach_text: attachment parse result
        :param catch_content: per-record content cache
        :return: True means anomaly
        """
        return self._unconfirmed_after_scan(
            winner, detail, attach_text, catch_content,
            self.check_intention)

    @staticmethod
    def _abnormal_words(path):
        """Return the non-empty first cells of the CSV file at ``path``.

        Blank rows and empty cells are dropped: an empty word would
        match every name.
        """
        with open(path, "r") as f:
            return [row[0] for row in csv.reader(f) if row and row[0]]

    def check0103(self, s_winner: str, subtype: str):
        """Flag winner names that contain abnormal words.

        Three word lists are read from ``abnormal_config``: words the
        name must not start with (path1), must not contain (path2) and
        must not end with (path3).  The start/end lists are applied as
        regular expressions, exactly as written in the files.

        :param s_winner: winner name
        :param subtype: notice subtype; other subtypes are never flagged
        :return: True means anomaly
        """
        if subtype not in self._WINNER_SUBTYPES:
            return False
        paths = abnormal_config["table_field_config"]

        # Name starts with an abnormal word.
        for word in self._abnormal_words(paths["path1"]):
            if re.compile("^" + word).match(s_winner):
                return True
        # Name contains an abnormal word.
        for word in self._abnormal_words(paths["path2"]):
            if word in s_winner:
                return True
        # Name ends with an abnormal word.
        for word in self._abnormal_words(paths["path3"]):
            if re.search(f"{word}$", s_winner):
                return True
        return False

    def check0301(self, s_winner: str, subtype: str):
        """Flag winner names shorter than three characters.

        :param s_winner: winner name
        :param subtype: notice subtype; other subtypes are never flagged
        :return: True means anomaly
        """
        return subtype in self._WINNER_SUBTYPES and len(s_winner) < 3

    def check0104(self, s_winner: str, buyer: str, subtype: str):
        """Flag winner names that contain the buyer's name.

        :param s_winner: winner name
        :param buyer: buyer name
        :param subtype: notice subtype; other subtypes are never flagged
        :return: True means anomaly
        """
        if subtype not in self._WINNER_SUBTYPES:
            return False
        if not buyer or not s_winner:
            return False
        return buyer in s_winner