# s_winner.py (12 KB)
  1. # coding:utf-8
  2. from tables.ai import org_ner
  3. from tables import clear_spacing
  4. from html_table_extractor.extractor import Extractor
  5. from tables import CatchContentObject, fsc
  6. from util.sensitive_word import AcAutomation
  7. from tables import match_company_index, key_value_header
  8. from tables import tfc_object
  9. from docs.config import ai2config, abnormal_config
  10. from docs.config import DEBUG
  11. import csv
  12. import re
  13. pattern = r',|。|\?|!|;'
  14. class WinnerChecker(object):
  15. """
  16. 中标字段检查
  17. """
  18. def __init__(self):
  19. self.errors_tables = {
  20. # "0101": {
  21. # "name": "实体识别",
  22. # "parent_name": "名称错误",
  23. # "parent_code": "01",
  24. # "checkFn": self.check0101
  25. # },
  26. # "0201": {
  27. # "name": "看数据的标签是不是之中标单位",
  28. # "parent_name": "数据标签错误",
  29. # "parent_code": "02",
  30. # "checkFn": self.check0201
  31. # },
  32. "0103": {
  33. "name": "包含叠词,异常词汇,特殊词汇",
  34. "parent_name": "名称错误",
  35. "parent_code": "01",
  36. "checkFn": self.check0103
  37. },
  38. "0104": {
  39. "name": "中标单位包含采购单位",
  40. "parent_name": "名称错误",
  41. "parent_code": "01",
  42. "checkFn": self.check0104
  43. },
  44. "0301": {
  45. "name": "中标单位名称长度<3",
  46. "parent_name": "名称长度异常错误",
  47. "parent_code": "03",
  48. "checkFn": self.check0301
  49. }
  50. }
  51. #
  52. self.winner_ac = AcAutomation()
  53. with open(ai2config["table_field_config"]["corpus_path"], "r") as f:
  54. reads = csv.reader(f)
  55. [self.winner_ac.add_word(d[0].strip()) for d in reads if "中标单位" in d[1] and d[0].strip()]
  56. def intention_check(self, header):
  57. """
  58. 意图结果检测
  59. :param header:
  60. :return:
  61. """
  62. if header in self.winner_ac.catch:
  63. if DEBUG:
  64. print(f"中标单位意图:::>> **{header}**==> [中标单位]")
  65. return True
  66. tags = tfc_object.predict([header])
  67. if tags:
  68. if "中标单位" in tags[0]:
  69. if DEBUG:
  70. print(f"中标单位意图:::>> **{header}**==> {tags}")
  71. return True
  72. def winner_intention_table(self, tables: list, companies):
  73. """
  74. 表格意图检测
  75. :param tables:
  76. :param companies:
  77. :return:
  78. """
  79. for row_ind, row in enumerate(tables):
  80. for col_ind, column in enumerate(row):
  81. extract_companies = re.findall("|".join(companies), column)
  82. if extract_companies:
  83. if col_ind > 0:
  84. status = self.intention_check(row[col_ind - 1])
  85. if status:
  86. for company in companies:
  87. companies.remove(company)
  88. if not companies:
  89. return False
  90. if row_ind > 0 and len(tables[row_ind - 1]) > col_ind:
  91. status = self.intention_check(tables[row_ind - 1][col_ind])
  92. if status:
  93. for company in companies:
  94. companies.remove(company)
  95. if not companies:
  96. return False
  97. if self.winner_ac.search(column):
  98. companies = self.winner_intention_content(companies, column)
  99. if not companies:
  100. return False
  101. return companies
  102. def winner_intention_content(self, companies, column):
  103. """
  104. 文本意图检测
  105. :param companies:
  106. :param column:
  107. :return:
  108. """
  109. # 公司名称的下标
  110. indexes = match_company_index(companies, column)
  111. if not indexes:
  112. return companies
  113. # 实体提取的head字段
  114. start_ind = 0
  115. for r in indexes:
  116. start, end, company_name = r
  117. if company_name not in companies:
  118. start_ind = end
  119. continue
  120. start_ind = start_ind if start_ind > start - 10 else start - 10
  121. text_ = column[start_ind:end + 10]
  122. start_ind = end
  123. head = key_value_header(text_)
  124. for val, ind in head:
  125. if self.intention_check(val):
  126. if company_name in companies:
  127. companies.pop(company_name)
  128. if not companies:
  129. return False
  130. return companies
  131. @staticmethod
  132. def check_company_name(contents: list, companies: list):
  133. """
  134. 公司名称检测
  135. :param contents:正文段落分割后
  136. :param companies: 公司list
  137. :return:返回False结束流程,list继续流程
  138. """
  139. new_content_list = []
  140. # 合并文本
  141. for ind, con in enumerate(contents):
  142. if "<table" in con:
  143. table = Extractor(con).parse()
  144. _tables = table.return_list()
  145. _tables = clear_spacing(_tables)
  146. for text in _tables:
  147. new_content_list.extend(text)
  148. else:
  149. # 一段文本
  150. new_content_list.append(con.replace(" ", ""))
  151. # 开始判断公司名称
  152. for text in new_content_list:
  153. p = r"|".join(companies)
  154. repatten = p.replace(")", "\)").replace("(", "\(").replace(".", "\.")
  155. s = re.split(pattern, text)
  156. for t in s:
  157. if re.search(repatten, t):
  158. al_result = org_ner(t)
  159. for company in al_result:
  160. if company in companies:
  161. if DEBUG:
  162. print(f"中标单位实体识别:::>> **{text}**==> {company}")
  163. companies.remove(company)
  164. if not companies:
  165. return False
  166. return companies
  167. def check_intention(self, contents: list, companies: list):
  168. """
  169. 意图检测
  170. :param contents:正文段落分割后
  171. :param companies: 公司list
  172. :return:返回False结束流程,list继续流程
  173. """
  174. for ind, content in enumerate(contents):
  175. if "<table" in content:
  176. # 表格处理
  177. table = Extractor(content).parse()
  178. _tables = table.return_list()
  179. _tables = clear_spacing(_tables)
  180. _table_str = str(_tables)
  181. if re.search("|".join(companies), _table_str):
  182. companies = self.winner_intention_table(_tables, companies)
  183. if not companies:
  184. return False
  185. continue
  186. # 非表格处理
  187. companies = self.winner_intention_content(companies, content)
  188. if not companies:
  189. return False
  190. return companies
  191. def check0101(self, winner: str, detail: str, attach_text: dict, catch_content: CatchContentObject) -> bool:
  192. """
  193. 公司名称检测
  194. :param winner:中标单位,多个逗号分割
  195. :param detail: 公告
  196. :param attach_text: 附件解析结果
  197. :param catch_content: 单挑数据缓存
  198. :return:返回true 代表异常
  199. """
  200. companies = [company for company in winner.split(",") if company]
  201. contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告")
  202. companies = self.check_company_name(contents, companies)
  203. if not companies:
  204. return False
  205. for attach_index, attach_content in attach_text.items():
  206. if attach_content:
  207. for topic_index, topic_detail in attach_content.items():
  208. # oss地址
  209. attach_url = topic_detail.get("attach_url", "")
  210. if attach_url:
  211. # 获取附件内容
  212. st, content = fsc.download_text_content(attach_url)
  213. # 下载成功
  214. # 超长文本不处理,暂定30万字
  215. if st and content.strip():
  216. if len(content) > 300000:
  217. continue
  218. # 开始检测
  219. contents = catch_content.public_attachment_catch(content, platform="attach",
  220. document_id=attach_url)
  221. companies = self.check_company_name(contents, companies)
  222. if not companies:
  223. return False
  224. return True
  225. def check0201(self, winner: str, detail: str, attach_text: dict, catch_content: CatchContentObject) -> bool:
  226. """
  227. 公司名称检测
  228. :param winner:中标单位,多个逗号分割
  229. :param detail: 公告
  230. :param attach_text: 附件解析结果
  231. :param catch_content: 单挑数据缓存
  232. :return:返回true 代表异常
  233. """
  234. companies = [company for company in winner.split(",") if company] # 多中标人
  235. # 公告意图检测
  236. contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告")
  237. companies = self.check_intention(contents, companies)
  238. if not companies:
  239. return False
  240. # 附件意图检测
  241. for attach_index, attach_content in attach_text.items():
  242. if attach_content:
  243. for topic_index, topic_detail in attach_content.items():
  244. # oss地址
  245. attach_url = topic_detail.get("attach_url", "")
  246. if attach_url:
  247. # 获取附件内容
  248. st, content = fsc.download_text_content(attach_url)
  249. # 下载成功
  250. # 超长文本不处理,暂定30万字
  251. if st and content.strip():
  252. if len(content) > 300000:
  253. continue
  254. # 开始检测
  255. contents = catch_content.public_attachment_catch(content, platform="attach",
  256. document_id=attach_url)
  257. companies = self.check_intention(contents, companies)
  258. if not companies:
  259. return False
  260. return True
  261. def check0103(self,s_winner:str,subtype:str):
  262. if subtype in ("中标", "成交", "合同", "验收"):
  263. #中标单位名称以异常词开始
  264. with open(abnormal_config["table_field_config"]["path1"], "r") as f:
  265. reads = csv.reader(f)
  266. for n in reads:
  267. p1 = re.compile("^"+n[0])
  268. if p1.match(s_winner):
  269. return True
  270. # 中标单位名称包含异常词
  271. with open(abnormal_config["table_field_config"]["path2"], "r") as f:
  272. reads = csv.reader(f)
  273. for n in reads:
  274. if n[0] in s_winner:
  275. return True
  276. # 中标单位名称以异常词结尾
  277. with open(abnormal_config["table_field_config"]["path3"], "r") as f:
  278. reads = csv.reader(f)
  279. for w in reads:
  280. if re.search(f"{w[0]}$", s_winner):
  281. return True
  282. return False
  283. def check0301(self,s_winner:str,subtype:str):
  284. """
  285. 中标单位长度异常检测
  286. :param obj:代表一个item
  287. :return:返回true 代表异常
  288. """
  289. if subtype in ("中标", "成交", "合同", "验收"):
  290. if len(s_winner) < 3:
  291. return True
  292. return False
  293. def check0104(self,s_winner:str,buyer:str,subtype:str):
  294. """
  295. 中标单位包含采购单位
  296. :param obj:代表一个item
  297. :return:返回true 代表异常
  298. """
  299. if subtype in ("中标", "成交", "合同", "验收"):
  300. if buyer and s_winner:
  301. if buyer in s_winner:
  302. return True
  303. return False