# coding:utf-8 from tables.ai import org_ner from tables import clear_spacing from html_table_extractor.extractor import Extractor from tables import CatchContentObject, fsc from util.sensitive_word import AcAutomation from tables import match_company_index, key_value_header from tables import tfc_object from docs.config import ai2config, abnormal_config from docs.config import DEBUG import csv import re pattern = r',|。|\?|!|;' class WinnerChecker(object): """ 中标字段检查 """ def __init__(self): self.errors_tables = { # "0101": { # "name": "实体识别", # "parent_name": "名称错误", # "parent_code": "01", # "checkFn": self.check0101 # }, # "0201": { # "name": "看数据的标签是不是之中标单位", # "parent_name": "数据标签错误", # "parent_code": "02", # "checkFn": self.check0201 # }, "0103": { "name": "包含叠词,异常词汇,特殊词汇", "parent_name": "名称错误", "parent_code": "01", "checkFn": self.check0103 } } # self.winner_ac = AcAutomation() with open(ai2config["table_field_config"]["corpus_path"], "r") as f: reads = csv.reader(f) [self.winner_ac.add_word(d[0].strip()) for d in reads if "中标单位" in d[1] and d[0].strip()] def intention_check(self, header): """ 意图结果检测 :param header: :return: """ if header in self.winner_ac.catch: if DEBUG: print(f"中标单位意图:::>> **{header}**==> [中标单位]") return True tags = tfc_object.predict([header]) if tags: if "中标单位" in tags[0]: if DEBUG: print(f"中标单位意图:::>> **{header}**==> {tags}") return True def winner_intention_table(self, tables: list, companies): """ 表格意图检测 :param tables: :param companies: :return: """ for row_ind, row in enumerate(tables): for col_ind, column in enumerate(row): extract_companies = re.findall("|".join(companies), column) if extract_companies: if col_ind > 0: status = self.intention_check(row[col_ind - 1]) if status: for company in companies: companies.remove(company) if not companies: return False if row_ind > 0 and len(tables[row_ind - 1]) > col_ind: status = self.intention_check(tables[row_ind - 1][col_ind]) if status: for company in companies: companies.remove(company) if not companies: return False if self.winner_ac.search(column): companies = self.winner_intention_content(companies, column) if not companies: return False return companies def winner_intention_content(self, companies, column): """ 文本意图检测 :param companies: :param column: :return: """ # 公司名称的下标 indexes = match_company_index(companies, column) if not indexes: return companies # 实体提取的head字段 start_ind = 0 for r in indexes: start, end, company_name = r if company_name not in companies: start_ind = end continue start_ind = start_ind if start_ind > start - 10 else start - 10 text_ = column[start_ind:end + 10] start_ind = end head = key_value_header(text_) for val, ind in head: if self.intention_check(val): if company_name in companies: companies.pop(company_name) if not companies: return False return companies @staticmethod def check_company_name(contents: list, companies: list): """ 公司名称检测 :param contents:正文段落分割后 :param companies: 公司list :return:返回False结束流程,list继续流程 """ new_content_list = [] # 合并文本 for ind, con in enumerate(contents): if "