buyer.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. # coding:utf-8
  2. from tables.ai import org_ner
  3. from tables import clear_spacing
  4. from html_table_extractor.extractor import Extractor
  5. from tables import CatchContentObject, fsc
  6. from util.sensitive_word import AcAutomation
  7. from tables import match_company_index, key_value_header
  8. from tables import tfc_object
  9. from docs.config import ai2config
  10. from docs.config import DEBUG
  11. from util.get_region import get_city_info
  12. import csv
  13. import re
  14. from docs.config import abnormal_config
  15. pattern = r',|。|\?|!|;'
  16. class BuyerChecker(object):
  17. """
  18. 中标字段检查
  19. """
  20. def __init__(self):
  21. """
  22. 采购单位0101判断不准确,备用
  23. """
  24. self.errors_tables = {
  25. # "0101": {
  26. # "name": "实体识别",
  27. # "parent_name": "名称错误",
  28. # "parent_code": "01",
  29. # "checkFn": self.check0101
  30. # },
  31. # "0201": {
  32. # "name": "看数据的标签是不是采购单位",
  33. # "parent_name": "数据标签错误",
  34. # "parent_code": "02",
  35. # "checkFn": self.check0201
  36. # },
  37. "0103": {
  38. "name": "包含叠词,异常词汇,特殊词汇",
  39. "parent_name": "名称错误",
  40. "parent_code": "01",
  41. "checkFn": self.check0103
  42. },
  43. "0104": {
  44. "name": "名称不完整",
  45. "parent_name": "名称错误",
  46. "parent_code": "01",
  47. "checkFn": self.check0104
  48. },
  49. "0301": {
  50. "name": "采购单位名称长度<3",
  51. "parent_name": "名称长度异常错误",
  52. "parent_code": "03",
  53. "checkFn": self.check0301
  54. },
  55. "0105": {
  56. "name": "采购单位与中标单位一致",
  57. "parent_name": "名称错误",
  58. "parent_code": "01",
  59. "checkFn": self.check0105
  60. }
  61. }
  62. #
  63. self.buyer_ac = AcAutomation()
  64. with open(ai2config["table_field_config"]["corpus_path"], "r") as f:
  65. reads = csv.reader(f)
  66. [self.buyer_ac.add_word(d[0].strip()) for d in reads if "采购单位" in d[1] and d[0].strip()]
  67. def intention_check(self, header):
  68. """
  69. 意图结果检测
  70. :param header:
  71. :return:
  72. """
  73. if header in self.buyer_ac.catch:
  74. if DEBUG:
  75. print(f"采购单位意图:::>> **{header}**==> [采购单位]")
  76. return True
  77. tags = tfc_object.predict([header])
  78. if tags:
  79. if "采购单位" in tags[0]:
  80. if DEBUG:
  81. print(f"采购单位意图:::>> **{header}**==> {tags}")
  82. return True
  83. def buyer_intention_table(self, tables: list, companies):
  84. """
  85. 表格意图检测
  86. :param tables:
  87. :param companies:
  88. :return:
  89. """
  90. for row_ind, row in enumerate(tables):
  91. for col_ind, column in enumerate(row):
  92. extract_companies = re.findall("|".join(companies), column)
  93. if extract_companies:
  94. if col_ind > 0:
  95. status = self.intention_check(row[col_ind - 1])
  96. if status:
  97. for company in companies:
  98. companies.remove(company)
  99. if not companies:
  100. return False
  101. if row_ind > 0 and len(tables[row_ind - 1]) > col_ind:
  102. status = self.intention_check(tables[row_ind - 1][col_ind])
  103. if status:
  104. for company in companies:
  105. companies.remove(company)
  106. if not companies:
  107. return False
  108. if self.buyer_ac.search(column):
  109. companies = self.buyer_intention_content(companies, column)
  110. if not companies:
  111. return False
  112. return companies
  113. def buyer_intention_content(self, companies, column):
  114. """
  115. 文本意图检测
  116. :param companies:
  117. :param column:
  118. :return:
  119. """
  120. # 公司名称的下标
  121. indexes = match_company_index(companies, column)
  122. if not indexes:
  123. return companies
  124. # 实体提取的head字段
  125. start_ind = 0
  126. for r in indexes:
  127. start, end, company_name = r
  128. if company_name not in companies:
  129. start_ind = end
  130. continue
  131. start_ind = start_ind if start_ind > start - 10 else start - 10
  132. text_ = column[start_ind:end + 10]
  133. start_ind = end
  134. head = key_value_header(text_)
  135. for val, ind in head:
  136. if self.intention_check(val):
  137. if company_name in companies:
  138. companies.pop(company_name)
  139. if not companies:
  140. return False
  141. return companies
  142. @staticmethod
  143. def check_company_name(contents: list, companies: list):
  144. """
  145. 公司名称检测
  146. :param contents:正文段落分割后
  147. :param companies: 公司list
  148. :return:返回False结束流程,list继续流程
  149. """
  150. new_content_list = []
  151. # 合并文本
  152. for ind, con in enumerate(contents):
  153. if "<table" in con:
  154. table = Extractor(con).parse()
  155. _tables = table.return_list()
  156. _tables = clear_spacing(_tables)
  157. for text in _tables:
  158. new_content_list.extend(text)
  159. else:
  160. # 一段文本
  161. new_content_list.append(con.replace(" ", ""))
  162. # 开始判断公司名称
  163. for text in new_content_list:
  164. p = r"|".join(companies)
  165. repatten = p.replace(")", "\)").replace("(", "\(").replace(".", "\.")
  166. s = re.split(pattern, text)
  167. for t in s:
  168. if re.search(repatten, t):
  169. al_result = org_ner(t)
  170. for company in al_result:
  171. if company in companies:
  172. if DEBUG:
  173. print(f"采购单位实体识别:::>> **{text}**==> {company}")
  174. companies.remove(company)
  175. if not companies:
  176. return False
  177. return companies
  178. def check_intention(self, contents: list, companies: list):
  179. """
  180. 意图检测
  181. :param contents:正文段落分割后
  182. :param companies: 公司list
  183. :return:返回False结束流程,list继续流程
  184. """
  185. for ind, content in enumerate(contents):
  186. if "<table" in content:
  187. # 表格处理
  188. table = Extractor(content).parse()
  189. _tables = table.return_list()
  190. _tables = clear_spacing(_tables)
  191. _table_str = str(_tables)
  192. if re.search("|".join(companies), _table_str):
  193. companies = self.buyer_intention_table(_tables, companies)
  194. if not companies:
  195. return False
  196. continue
  197. # 非表格处理
  198. companies = self.buyer_intention_content(companies, content)
  199. if not companies:
  200. return False
  201. return companies
  202. def check0101(self, buyer: str, detail: str, attach_text: dict, catch_content: CatchContentObject) -> bool:
  203. """
  204. 公司名称检测
  205. :param buyer:采购单位
  206. :param detail: 公告
  207. :param attach_text: 附件解析结果
  208. :param catch_content: 单挑数据缓存
  209. :return:返回true 代表异常
  210. """
  211. try:
  212. companies = [buyer]
  213. contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告")
  214. companies = self.check_company_name(contents, companies)
  215. if not companies:
  216. return False
  217. for attach_index, attach_content in attach_text.items():
  218. if attach_content:
  219. for topic_index, topic_detail in attach_content.items():
  220. # oss地址
  221. attach_url = topic_detail.get("attach_url", "")
  222. if attach_url:
  223. # 获取附件内容
  224. st, content = fsc.download_text_content(attach_url)
  225. # 下载成功
  226. # 超长文本不处理,暂定30万字
  227. if st and content.strip():
  228. if len(content) > 300000:
  229. continue
  230. # 开始检测
  231. contents = catch_content.public_attachment_catch(content, platform="attach",
  232. document_id=attach_url)
  233. companies = self.check_company_name(contents, companies)
  234. if not companies:
  235. return False
  236. except Exception as e:
  237. print(e)
  238. return True
  239. def check0201(self, buyer: str, detail: str, attach_text: dict, catch_content: CatchContentObject) -> bool:
  240. """
  241. 公司名称检测
  242. :param buyer:采购单位
  243. :param detail: 公告
  244. :param attach_text: 附件解析结果
  245. :param catch_content: 单挑数据缓存
  246. :return:返回true 代表异常
  247. """
  248. companies = [buyer] # 多中标人
  249. # 公告意图检测
  250. contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告")
  251. companies = self.check_intention(contents, companies)
  252. if not companies:
  253. return False
  254. # 附件意图检测
  255. for attach_index, attach_content in attach_text.items():
  256. if attach_content:
  257. for topic_index, topic_detail in attach_content.items():
  258. # oss地址
  259. attach_url = topic_detail.get("attach_url", "")
  260. if attach_url:
  261. # 获取附件内容
  262. st, content = fsc.download_text_content(attach_url)
  263. # 下载成功
  264. # 超长文本不处理,暂定30万字
  265. if st and content.strip():
  266. if len(content) > 300000:
  267. continue
  268. # 开始检测
  269. contents = catch_content.public_attachment_catch(content, platform="attach",
  270. document_id=attach_url)
  271. companies = self.check_intention(contents, companies)
  272. if not companies:
  273. return False
  274. return True
  275. def check0103(self, buyer: str):
  276. """
  277. return True 代表异常
  278. """
  279. # 采购单位名称以异常词开始
  280. with open(abnormal_config["table_field_config"]["path1"], "r") as f:
  281. reads = csv.reader(f)
  282. for n in reads:
  283. p1 = re.compile("^" + n[0])
  284. if p1.match(buyer):
  285. return True
  286. # 采购单位名称中包含异常词
  287. self.check_abnormal_ac = AcAutomation()
  288. with open(abnormal_config["table_field_config"]["path2"], "r") as f:
  289. reads = csv.reader(f)
  290. for k in reads:
  291. if k[0] in (buyer):
  292. return True
  293. # 采购单位名称以异常词结尾
  294. with open(abnormal_config["table_field_config"]["path3"], "r") as f:
  295. reads = csv.reader(f)
  296. for m in reads:
  297. p2 = re.compile(f"{m[0]}$")
  298. if p2.search(buyer):
  299. return True
  300. return False
  301. def check0301(self, buyer: str):
  302. """
  303. return True 代表异常
  304. """
  305. if len(buyer) < 3:
  306. return True
  307. return False
  308. def check0104(self, buyer: str, buyerclass: str):
  309. """
  310. #如果采购单位类型in ("学校","教育","卫健委","医疗","政府办","政务中心"),则采购单位名称中一般都含有地名
  311. """
  312. if buyerclass in ("学校", "教育", "卫健委", "医疗", "政府办", "政务中心"):
  313. province, city, district = get_city_info(buyer)
  314. if province == None and city == None and district == None:
  315. return True
  316. return False
  317. def check0105(self, buyer: str, s_winner: str):
  318. if buyer and s_winner:
  319. if buyer == s_winner:
  320. return True
  321. return False