__init__.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. # coding:utf-8
  2. from docs.config import ai2config
  3. from tables.ai.table_field_category import TableFieldCategoryModel
  4. from tables.ai import fields_ner
  5. from util.info_preprocess import Preprocess
  6. from util.fs_client import FileServeClient
  7. import re
  8. from util.mogodb_helper import MongoDBInterface
  9. # 初始化
  10. fsc = FileServeClient() # oss客户端
  11. preprocess = Preprocess() # 文本预处理程序(html,attach)
  12. tfc_object = TableFieldCategoryModel(ai2config["table_field_config"]) # 表格字段识别模型实例
  13. class CatchContentObject(object):
  14. """
  15. 缓存
  16. """
  17. def __init__(self):
  18. self.catch = {}
  19. def public_attachment_catch(self, content, platform, document_id):
  20. if document_id in self.catch:
  21. return self.catch[document_id]
  22. contents = preprocess.get_preprocess(platform).preprocess(content)
  23. contents=contents.replace("(","(").replace(")",")")
  24. contents = contents.split("\n")
  25. contents = [c for c in contents if c.strip()]
  26. self.catch[document_id] = contents
  27. return contents
  28. def initialize(self):
  29. self.catch = {}
  30. def match_company_index(companies: list, content: str):
  31. """
  32. 获取公司所在文本位置
  33. :param companies:公司列表
  34. :param content:文本
  35. :return:
  36. """
  37. companies.sort(key=lambda x: len(x), reverse=True)
  38. match_index = []
  39. pattern = r"|".join(companies)
  40. repatten = pattern.replace(")", "\)").replace("(", "\(").replace(".", "\.").replace("*", "\*")
  41. for company in re.finditer(repatten, content):
  42. start_index = company.start()
  43. company_name = company.group()
  44. end_index = company.end()
  45. match_index.append((start_index, end_index, company_name)) # (start_index,company_name)
  46. return match_index
  47. def key_value_header(content):
  48. """
  49. 获取content中的键值
  50. :param content:文本
  51. :return:
  52. """
  53. head = []
  54. ret = fields_ner(text=content)
  55. result = ret.get("result", [])
  56. if result:
  57. head = result[0].get("HEAD", [])
  58. return head
  59. def clear_spacing(data_list):
  60. '''
  61. 清理二维列表中的空值、空格
  62. :param data_list: 待处理的二维列表
  63. :return: 处理后的二维列表
  64. '''
  65. new_data_list = []
  66. # 遍历原始列表的每一行
  67. for row in data_list:
  68. new_row = []
  69. for elem in row:
  70. # 如果元素为None,则将其转换为空字符串
  71. new_row.append("") if not elem else new_row.append(elem.replace(" ", ""))
  72. new_data_list.append(new_row)
  73. return new_data_list
  74. if __name__ == '__main__':
  75. ret = match_company_index("中标单位:xxxx公司")
  76. print(ret)