# coding:utf-8
from tables.ai import org_ner
from tables import clear_spacing
from html_table_extractor.extractor import Extractor
from tables import CatchContentObject, fsc
from util.sensitive_word import AcAutomation
from tables import match_company_index, key_value_header
from tables import tfc_object
from docs.config import ai2config, abnormal_config
from docs.config import DEBUG
import csv
import re
# Regex alternation of clause/sentence delimiters: an ASCII comma, the CJK
# full stop, and ASCII "?", "!" and ";".
# NOTE(review): not referenced anywhere in the visible part of this file.
pattern = r',|。|\?|!|;'
class WinnerChecker(object):
"""
中标字段检查
"""
def __init__(self):
    """Register the active winner-field checks and load the header vocabulary.

    ``errors_tables`` maps a check code to its display name, its parent
    category name/code and the bound method (``checkFn``) that implements it.

    ``winner_ac`` is an ``AcAutomation`` filled from the corpus CSV configured
    under ``ai2config["table_field_config"]["corpus_path"]``: the stripped
    first column of a row is added when it is non-empty and the second column
    contains "中标单位".
    """
    self.errors_tables = {
        # Checks 0101 and 0201 are currently not registered; their entries
        # are kept below for reference.
        # "0101": {
        #     "name": "实体识别",
        #     "parent_name": "名称错误",
        #     "parent_code": "01",
        #     "checkFn": self.check0101
        # },
        # "0201": {
        #     "name": "看数据的标签是不是之中标单位",
        #     "parent_name": "数据标签错误",
        #     "parent_code": "02",
        #     "checkFn": self.check0201
        # },
        "0103": {
            "name": "包含叠词,异常词汇,特殊词汇",
            "parent_name": "名称错误",
            "parent_code": "01",
            "checkFn": self.check0103
        },
        "0104": {
            "name": "中标单位包含采购单位",
            "parent_name": "名称错误",
            "parent_code": "01",
            "checkFn": self.check0104
        },
        "0301": {
            "name": "中标单位名称长度<3",
            "parent_name": "名称长度异常错误",
            "parent_code": "03",
            "checkFn": self.check0301
        }
    }

    self.winner_ac = AcAutomation()
    # newline="" lets the csv module do its own newline handling.
    # NOTE(review): the file is read with the locale's default encoding —
    # confirm the corpus encoding and pass it explicitly if it is fixed.
    with open(ai2config["table_field_config"]["corpus_path"], "r", newline="") as f:
        for row in csv.reader(f):
            # Blank lines come back as [] and malformed rows may have a
            # single column; neither can carry a label, so skip them
            # instead of failing on row[1].
            if len(row) < 2:
                continue
            word = row[0].strip()
            if word and "中标单位" in row[1]:
                self.winner_ac.add_word(word)
def intention_check(self, header):
    """Tell whether a header string carries the winner ("中标单位") intent.

    The automaton's ``catch`` collection is consulted first. On a miss the
    header is passed to ``tfc_object.predict`` and the first prediction is
    inspected for the "中标单位" tag.

    :param header: header text to classify
    :return: True when the header is recognised as a winner header,
        otherwise None (callers only test truthiness)
    """
    if header in self.winner_ac.catch:
        if DEBUG:
            print(f"中标单位意图:::>> **{header}**==> [中标单位]")
        return True

    predicted = tfc_object.predict([header])
    if not predicted:
        return None
    if "中标单位" not in predicted[0]:
        return None
    if DEBUG:
        print(f"中标单位意图:::>> **{header}**==> {predicted}")
    return True
def winner_intention_table(self, tables: list, companies):
    """Strike from ``companies`` every name a table confirms as a winner.

    For each cell that contains pending company names, the cell to its left
    and the cell directly above it are treated as candidate headers. If
    either one passes ``intention_check``, the names found in that cell are
    removed from ``companies``. Cells in which ``winner_ac.search`` reports a
    hit are additionally handed to ``winner_intention_content``.

    :param tables: table rows, each a list of cell strings
    :param companies: pending company names; the list is modified in place
    :return: False once no pending company is left (or none was given),
        otherwise the list of companies still pending
    """
    if not companies:
        # An empty alternation would match at every position of every cell.
        return False

    for row_ind, row in enumerate(tables):
        for col_ind, column in enumerate(row):
            # Company names are literal text: escape regex metacharacters
            # (parentheses are common in names) and try longer names first
            # so a name is not shadowed by one of its prefixes.
            name_pattern = "|".join(
                re.escape(name) for name in sorted(companies, key=len, reverse=True)
            )
            found = re.findall(name_pattern, column)
            if found:
                confirmed = col_ind > 0 and self.intention_check(row[col_ind - 1])
                if (not confirmed and row_ind > 0
                        and len(tables[row_ind - 1]) > col_ind):
                    confirmed = self.intention_check(tables[row_ind - 1][col_ind])
                if confirmed:
                    # Remove only the names actually present in this cell.
                    # Iterating `found` rather than `companies` avoids
                    # mutating the list that is being iterated.
                    for name in found:
                        if name in companies:
                            companies.remove(name)
                    if not companies:
                        return False
            # NOTE(review): the source's indentation was lost, so the original
            # nesting of this step is not recoverable; it is run for every
            # cell here. Confirm against version control.
            if self.winner_ac.search(column):
                companies = self.winner_intention_content(companies, column)
                if not companies:
                    return False
    return companies
def winner_intention_content(self, companies, column):
    """Strike from ``companies`` the names whose nearby text has winner intent.

    ``match_company_index`` supplies ``(start, end, company_name)`` triples
    for the text. For each still-pending name, a window of up to 10
    characters before ``start`` (never reaching back past the previous
    match's ``end``) and 10 characters after ``end`` is passed to
    ``key_value_header``. The company is removed when any returned header
    value passes ``intention_check``.

    :param companies: pending company names; the list is modified in place
    :param column: text to inspect
    :return: False once no pending company is left, otherwise the list of
        companies still pending (unchanged when nothing was matched)
    """
    # Positions of the company names inside the text.
    indexes = match_company_index(companies, column)
    if not indexes:
        return companies

    # Lower bound of the next context window: the end of the previous match.
    window_floor = 0
    for start, end, company_name in indexes:
        if company_name not in companies:
            window_floor = end
            continue
        window_start = max(window_floor, start - 10)
        snippet = column[window_start:end + 10]
        window_floor = end
        for val, _ in key_value_header(snippet):
            if self.intention_check(val):
                # list.remove deletes by value; list.pop expects an integer
                # index and raises TypeError when given a name.
                companies.remove(company_name)
                break
        if not companies:
            return False
    return companies
@staticmethod
def check_company_name(contents: list, companies: list):
"""
Company-name detection.
:param contents: body paragraphs after splitting
:param companies: list of company names
:return: False to end the flow, a list to continue it
"""
new_content_list = []
# Merge text
# NOTE(review): the two lines after the loop header look truncated in this
# copy (an unterminated string followed by the tail of a debug message);
# restore the missing body from version control before changing this method.
for ind, con in enumerate(contents):
if "
> **{text}**==> {company}")
companies.remove(company)
if not companies:
return False
return companies
def check_intention(self, contents: list, companies: list):
"""
Intent detection.
:param contents: body paragraphs after splitting
:param companies: list of company names
:return: False to end the flow, a list to continue it
"""
for ind, content in enumerate(contents):
# NOTE(review): the next line looks truncated in this copy — the rest of
# check_intention and the signature of the following method (presumably
# check0101, judging by the commented-out registry entry) are missing.
# Restore them from version control before editing.
if " bool:
"""
Company-name detection.
:param winner: winning bidder(s); multiple names are comma separated
:param detail: announcement
:param attach_text: attachment parsing result
:param catch_content: per-record data cache
:return: True means the record is abnormal
"""
companies = [company for company in winner.split(",") if company]
contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告")
companies = self.check_company_name(contents, companies)
if not companies:
return False
for attach_index, attach_content in attach_text.items():
if attach_content:
for topic_index, topic_detail in attach_content.items():
# OSS address
attach_url = topic_detail.get("attach_url", "")
if attach_url:
# Fetch the attachment content
st, content = fsc.download_text_content(attach_url)
# Download succeeded
# Over-long text is skipped; the limit is provisionally 300,000 characters
if st and content.strip():
if len(content) > 300000:
continue
# Run the check
contents = catch_content.public_attachment_catch(content, platform="attach",
document_id=attach_url)
companies = self.check_company_name(contents, companies)
if not companies:
return False
return True
def check0201(self, winner: str, detail: str, attach_text: dict, catch_content: CatchContentObject) -> bool:
    """Check 0201: run intent detection for the winners over all documents.

    The announcement is examined first, then every attachment that has an
    ``attach_url``, downloads successfully, is not blank and is at most
    300,000 characters long. Each document is split through
    ``catch_content.public_attachment_catch`` and passed to
    ``check_intention`` together with the still-pending names.

    :param winner: winning bidder(s); multiple names are comma separated
    :param detail: announcement content
    :param attach_text: attachment parsing result, a two-level mapping whose
        inner values expose ``attach_url``
    :param catch_content: per-record cache used to split documents
    :return: False as soon as ``check_intention`` leaves no pending company,
        True (abnormal) when some remain after every document was examined
    """
    # Several winners may be listed, separated by commas.
    pending = [name for name in winner.split(",") if name]

    # Intent detection on the announcement itself.
    announcement_parts = catch_content.public_attachment_catch(detail, platform="html", document_id="公告")
    pending = self.check_intention(announcement_parts, pending)
    if not pending:
        return False

    # Intent detection on the attachments.
    for attach_content in attach_text.values():
        if not attach_content:
            continue
        for topic_detail in attach_content.values():
            attach_url = topic_detail.get("attach_url", "")  # OSS address
            if not attach_url:
                continue
            st, content = fsc.download_text_content(attach_url)
            if not (st and content.strip()):
                # Download failed or the attachment is blank.
                continue
            if len(content) > 300000:
                # Over-long text is skipped; limit provisionally 300,000 chars.
                continue
            attachment_parts = catch_content.public_attachment_catch(content, platform="attach",
                                                                     document_id=attach_url)
            pending = self.check_intention(attachment_parts, pending)
            if not pending:
                return False
    return True
def check0103(self, s_winner: str, subtype: str):
    """Check 0103: flag a winner name that matches a configured abnormal word.

    Only the subtypes "中标", "成交", "合同" and "验收" are checked. Three CSV
    files configured under ``abnormal_config["table_field_config"]`` are
    consulted in order, using the first column of each row:

    * ``path1`` — the name starts with the entry (entry used as a regex),
    * ``path2`` — the name contains the entry (plain substring),
    * ``path3`` — the name ends with the entry (entry used as a regex).

    :param s_winner: winner name to inspect
    :param subtype: announcement subtype
    :return: True when the name is abnormal, otherwise False
    """
    if subtype not in ("中标", "成交", "合同", "验收"):
        return False

    def first_column_words(path_key):
        """Return the non-empty first-column entries of one word list."""
        # NOTE(review): read with the locale's default encoding — confirm
        # the files' encoding and pass it explicitly if it is fixed.
        with open(abnormal_config["table_field_config"][path_key], "r", newline="") as f:
            # Blank lines come back as [] and would fail on row[0]; an empty
            # entry would match every name, so both are skipped.
            return [row[0] for row in csv.reader(f) if row and row[0]]

    # Winner name starts with an abnormal word.
    for word in first_column_words("path1"):
        if re.match("^" + word, s_winner):
            return True
    # Winner name contains an abnormal word.
    for word in first_column_words("path2"):
        if word in s_winner:
            return True
    # Winner name ends with an abnormal word.
    for word in first_column_words("path3"):
        if re.search(f"{word}$", s_winner):
            return True
    return False
def check0301(self, s_winner: str, subtype: str):
    """Check 0301: flag a winner name shorter than three characters.

    :param s_winner: winner name to inspect
    :param subtype: announcement subtype; only "中标", "成交", "合同" and
        "验收" are checked
    :return: True when the name is abnormal, otherwise False
    """
    # The length is only evaluated for the checked subtypes.
    return subtype in ("中标", "成交", "合同", "验收") and len(s_winner) < 3
def check0104(self, s_winner: str, buyer: str, subtype: str):
    """Check 0104: flag a winner name that contains the buyer name.

    :param s_winner: winner name to inspect
    :param buyer: purchasing unit name
    :param subtype: announcement subtype; only "中标", "成交", "合同" and
        "验收" are checked
    :return: True when both names are non-empty and the buyer occurs inside
        the winner name, otherwise False
    """
    checked_subtype = subtype in ("中标", "成交", "合同", "验收")
    # bool() keeps the result a strict True/False even when a name is "" or None.
    return bool(checked_subtype and buyer and s_winner and buyer in s_winner)