123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- # coding:utf-8
- from docs.config import ai2config
- from tables.ai.table_field_category import TableFieldCategoryModel
- from tables.ai import fields_ner
- from util.info_preprocess import Preprocess
- from util.fs_client import FileServeClient
- import re
- from util.mogodb_helper import MongoDBInterface
- # Initialization of module-level shared instances
- fsc = FileServeClient() # OSS client
- preprocess = Preprocess() # text preprocessor (html, attach)
- tfc_object = TableFieldCategoryModel(ai2config["table_field_config"]) # table field recognition model instance
class CatchContentObject(object):
    """Per-document cache of preprocessed attachment text lines.

    Results are stored in ``self.catch`` keyed by ``document_id`` so that a
    document is preprocessed at most once until :meth:`initialize` is called.
    """

    # Translation table mapping full-width parentheses to ASCII ones.
    # Written with escapes so the intent survives any encoding normalization.
    _PAREN_TABLE = str.maketrans({"\uff08": "(", "\uff09": ")"})

    def __init__(self):
        # document_id -> list of non-blank text lines
        self.catch = {}

    @staticmethod
    def _split_clean_lines(text):
        """Normalize parentheses in *text* and return its non-blank lines.

        Lines are split on ``"\\n"``; lines that are empty or whitespace-only
        are dropped, the remaining lines are returned unstripped.
        """
        normalized = text.translate(CatchContentObject._PAREN_TABLE)
        return [line for line in normalized.split("\n") if line.strip()]

    def public_attachment_catch(self, content, platform, document_id):
        """Return the preprocessed lines for a document, using the cache.

        :param content: raw content handed to the platform preprocessor
        :param platform: key passed to ``preprocess.get_preprocess``
        :param document_id: cache key for this document
        :return: list of non-blank text lines; the same list object is kept
            in the cache, so callers should not mutate it
        """
        if document_id in self.catch:
            return self.catch[document_id]
        # ``preprocess`` is the module-level Preprocess instance.
        text = preprocess.get_preprocess(platform).preprocess(content)
        lines = self._split_clean_lines(text)
        self.catch[document_id] = lines
        return lines

    def initialize(self):
        """Discard every cached entry."""
        self.catch = {}
def match_company_index(companies: list, content: str) -> list:
    """Locate every occurrence of the given company names in *content*.

    Names are matched literally: all regex metacharacters are escaped.
    Longer names are tried first, so a name wins over any shorter name
    that is a prefix of it.

    :param companies: company names to search for; the list is not modified
    :param content: text to scan
    :return: list of ``(start_index, end_index, company_name)`` tuples in
        order of appearance; empty when there are no (non-empty) names
    """
    # Regex alternation picks the first alternative that matches, so the
    # longest names must come first.
    names = sorted((name for name in companies if name), key=len, reverse=True)
    if not names:
        # An empty pattern would match the empty string at every position.
        return []
    pattern = re.compile("|".join(re.escape(name) for name in names))
    return [(m.start(), m.end(), m.group()) for m in pattern.finditer(content)]
def key_value_header(content):
    """Extract the header keys recognised in *content*.

    Calls ``fields_ner`` on the text and reads the ``"HEAD"`` entry of the
    first item under ``"result"``.

    :param content: text to analyse
    :return: the ``"HEAD"`` value of the first result, or an empty list when
        there is no result or no ``"HEAD"`` entry
    """
    ner_output = fields_ner(text=content)
    entries = ner_output.get("result", [])
    if not entries:
        return []
    return entries[0].get("HEAD", [])
def clear_spacing(data_list):
    """Blank out empty cells and remove spaces in a two-dimensional list.

    :param data_list: list of rows, each row a list of cells; non-empty
        cells must support ``.replace`` (i.e. be strings)
    :return: a new two-dimensional list in which every falsy cell
        (``None``, ``""``) is ``""`` and every other cell has all ``" "``
        characters removed; the input is left unmodified
    """
    return [
        [cell.replace(" ", "") if cell else "" for cell in row]
        for row in data_list
    ]
if __name__ == '__main__':
    # Manual smoke test: match_company_index needs both the list of company
    # names and the text to search.
    ret = match_company_index(["xxxx公司"], "中标单位:xxxx公司")
    print(ret)
|