# coding:utf-8
from docs.config import ai2config
from tables.ai.table_field_category import TableFieldCategoryModel
from tables.ai import fields_ner
from util.info_preprocess import Preprocess
from util.fs_client import FileServeClient
import re
from util.mogodb_helper import MongoDBInterface

# Initialisation of module-level singletons.
fsc = FileServeClient()  # OSS client
preprocess = Preprocess()  # text preprocessing (html, attach)
# Table-field recognition model instance.
tfc_object = TableFieldCategoryModel(ai2config["table_field_config"])


class CatchContentObject(object):
    """
    In-memory cache of preprocessed document content, keyed by document id.
    """

    def __init__(self):
        # Maps document_id -> list of non-blank text lines.
        self.catch = {}

    def public_attachment_catch(self, content, platform, document_id):
        """
        Return the preprocessed, non-blank lines of a document, using the cache.

        On a cache hit the stored lines are returned and ``content`` /
        ``platform`` are ignored, because the cache is keyed on
        ``document_id`` only.

        :param content: raw content handed to the platform's preprocessor
        :param platform: key passed to ``preprocess.get_preprocess``
        :param document_id: cache key
        :return: list of lines that contain non-whitespace characters
        """
        if document_id in self.catch:
            return self.catch[document_id]

        text = preprocess.get_preprocess(platform).preprocess(content)
        # Normalise full-width parentheses (U+FF08 / U+FF09) to ASCII.
        # NOTE(review): the code as received replaced "(" with "(" and ")"
        # with ")", which is a no-op -- presumably the full-width originals
        # were lost in transit. Confirm against the repository copy.
        text = text.replace("\uff08", "(").replace("\uff09", ")")
        lines = [line for line in text.split("\n") if line.strip()]

        self.catch[document_id] = lines
        return lines

    def initialize(self):
        """Drop every cached entry."""
        self.catch = {}


def match_company_index(companies: list, content: str):
    """
    Locate every occurrence of the given company names inside a text.

    Names are matched literally: each one is passed through ``re.escape``
    so that characters such as ``+``, ``[`` or ``|`` in a name are not
    interpreted as regex syntax. Empty names are ignored, and the caller's
    list is not modified.

    :param companies: list of company names
    :param content: text to search
    :return: list of ``(start_index, end_index, company_name)`` tuples in
        order of appearance; empty when there is nothing to search for
    """
    # Longest names first: regex alternation takes the first alternative that
    # matches, so a name that is a prefix of a longer one must come later.
    ordered = sorted((name for name in companies if name), key=len, reverse=True)
    if not ordered:
        # An empty pattern would match the empty string at every position.
        return []

    pattern = "|".join(re.escape(name) for name in ordered)
    return [
        (found.start(), found.end(), found.group())
        for found in re.finditer(pattern, content)
    ]


def key_value_header(content):
    """
    Get the ``HEAD`` entries that ``fields_ner`` reports for a text.

    :param content: text handed to ``fields_ner``
    :return: the ``"HEAD"`` value of the first item under ``"result"``, or
        an empty list when ``"result"`` is missing/empty or has no ``"HEAD"``
    """
    ret = fields_ner(text=content)
    result = ret.get("result", [])
    if not result:
        return []
    return result[0].get("HEAD", [])


def clear_spacing(data_list):
    """
    Clean empty values and spaces out of a two-dimensional list.

    Every falsy element (``None``, ``""``, ...) becomes ``""``; every other
    element has its space characters removed via ``str.replace``.

    :param data_list: two-dimensional list to process
    :return: a new two-dimensional list; the input is left unchanged
    """
    return [
        [elem.replace(" ", "") if elem else "" for elem in row]
        for row in data_list
    ]


if __name__ == '__main__':
    # Smoke check. The earlier call passed only the text and raised TypeError
    # because ``companies`` was missing.
    ret = match_company_index(["xxxx公司"], "中标单位:xxxx公司")
    print(ret)