123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244 |
- # coding:utf-8
- from tables import CatchContentObject, fsc
- from util.sensitive_word import AcAutomation
- from docs.config import amount_config
- from docs.config import budget_config
- from docs.config import DEBUG
- from docs.config import abnormal_config
- import csv
class NoFieldChecker(object):
    """Detect records whose key fields are missing or empty.

    Each ``check_*`` method receives one record (``obj``, a dict-like item)
    and a content extractor, and returns ``True`` when the field is judged
    abnormal. ``errors_tables`` maps every supported field name to the bound
    method that checks it.

    Keyword automatons used by the text-based checks are built lazily from
    their CSV files the first time they are needed and then reused for the
    lifetime of the instance, so edits to a CSV are only picked up by a new
    ``NoFieldChecker`` instance.
    """

    # Record subtypes that describe an awarded / concluded procurement.
    _AWARD_SUBTYPES = ("中标", "成交", "合同", "验收")

    # Attachments longer than this many characters are skipped
    # (provisional limit carried over from the original code).
    _MAX_ATTACHMENT_CHARS = 300000

    def __init__(self):
        self.errors_tables = {
            "title": self.check_title,
            "projectname": self.check_projectname,
            "buyer": self.check_buyer,
            "winner": self.check_winner,
            "budget": self.check_budget,
            "bidamount": self.check_bidamount,
            "area": self.check_region,
            "projectcode": self.check_projectcode,
            "multipackage": self.check_subpackage,
        }

    def _automaton(self, attr_name, csv_path):
        """Return the keyword automaton stored on ``attr_name``, building it once.

        :param attr_name: instance attribute used as the cache slot
        :param csv_path: CSV file whose first column holds one keyword per row
        :return: an ``AcAutomation`` loaded with every keyword from the file
        """
        automaton = getattr(self, attr_name, None)
        if automaton is None:
            automaton = AcAutomation()
            # NOTE(review): the file is opened with the platform default
            # encoding, exactly as before - confirm the CSVs match it.
            with open(csv_path, "r", newline="") as handle:
                for row in csv.reader(handle):
                    # Blank lines come back as empty rows; an empty keyword
                    # carries no information, so skip both.
                    if row and row[0]:
                        automaton.add_word(row[0])
            setattr(self, attr_name, automaton)
        return automaton

    def _matches_keyword(self, obj, catch_content, automaton):
        """Return True if the record body or any attachment hits a keyword.

        The announcement body (``obj["detail"]``) is examined first; the
        attachments listed under ``obj["attach_text"]`` are downloaded and
        examined only when the body produced no match.

        :param obj: the record being checked
        :param catch_content: extractor providing ``public_attachment_catch``
        :param automaton: keyword matcher exposing ``search(text)``
        :return: True on the first keyword hit, otherwise False
        """
        # NOTE(review): the original comment says the extractor returns a
        # dict; "\n".join() then concatenates whatever iterating that value
        # yields - confirm against CatchContentObject.
        body_parts = catch_content.public_attachment_catch(
            obj.get("detail", ""), platform="html", document_id="公告"
        )
        if automaton.search("\n".join(body_parts)):
            return True

        # Tolerate an explicit None as well as a missing key.
        attachments = obj.get("attach_text") or {}
        for attach_content in attachments.values():
            if not attach_content:
                continue
            for topic_detail in attach_content.values():
                # Storage (OSS) address of the attachment text.
                attach_url = topic_detail.get("attach_url", "")
                if not attach_url:
                    continue
                ok, raw_text = fsc.download_text_content(attach_url)
                # Ignore failed downloads and blank documents.
                if not (ok and raw_text and raw_text.strip()):
                    continue
                # Over-long texts are not processed.
                if len(raw_text) > self._MAX_ATTACHMENT_CHARS:
                    continue
                parts = catch_content.public_attachment_catch(
                    raw_text, platform="attach", document_id=attach_url
                )
                if automaton.search("\n".join(parts)):
                    return True
        return False

    def check_bidamount(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check for a missing bid amount on award-type records.

        Only records whose ``subtype`` is one of the award subtypes are
        examined; the record is flagged when its body or attachments contain
        a keyword from the bid-amount keyword file. The ``bidamount`` value
        itself is not inspected here - presumably the caller only runs this
        check when the field is empty (TODO confirm).

        :param obj: the record (item) being checked
        :param catch_content: content extractor for body and attachments
        :return: True means the record is abnormal
        """
        if obj.get("subtype", "") not in self._AWARD_SUBTYPES:
            return False
        automaton = self._automaton(
            "check_bidamount_ac", amount_config["table_field_config"]["path"]
        )
        return self._matches_keyword(obj, catch_content, automaton)

    def check_winner(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check for a missing winning-bidder name.

        Only award-type records are examined; every other subtype is never
        flagged by this check.

        :param obj: the record (item) being checked
        :param catch_content: unused, kept for a uniform checker signature
        :return: True means the record is abnormal
        """
        if obj.get("subtype", "") not in self._AWARD_SUBTYPES:
            return False
        return not obj.get("winner", "")

    def check_buyer(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check for a missing purchaser name.

        :param obj: the record (item) being checked
        :param catch_content: unused, kept for a uniform checker signature
        :return: True means the record is abnormal
        """
        return not obj.get("buyer", "")

    def check_budget(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check for a missing budget on non-award records.

        Records whose ``subtype`` is an award subtype are never flagged;
        any other record is flagged when its body or attachments contain a
        keyword from the budget keyword file. The ``budget`` value itself is
        not inspected here - presumably the caller only runs this check when
        the field is empty (TODO confirm).

        :param obj: the record (item) being checked
        :param catch_content: content extractor for body and attachments
        :return: True means the record is abnormal
        """
        if obj.get("subtype", "") in self._AWARD_SUBTYPES:
            return False
        automaton = self._automaton(
            "check_budget_ac", budget_config["table_field_config"]["path"]
        )
        return self._matches_keyword(obj, catch_content, automaton)

    def check_region(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check for a missing region (not implemented yet).

        :param obj: the record (item) being checked
        :param catch_content: unused, kept for a uniform checker signature
        :return: always False, i.e. the record is never flagged
        """
        return False

    def check_title(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check for a missing title.

        :param obj: the record (item) being checked
        :param catch_content: unused, kept for a uniform checker signature
        :return: True means the record is abnormal
        """
        return not obj.get("title", "")

    def check_projectname(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check for a missing project name.

        :param obj: the record (item) being checked
        :param catch_content: unused, kept for a uniform checker signature
        :return: True means the record is abnormal
        """
        return not obj.get("projectname", "")

    def check_projectcode(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check for a missing project code.

        A record that already carries a project code is never flagged. When
        the code is empty, the record is flagged if its body or attachments
        contain a keyword from the project-code keyword file.

        :param obj: the record (item) being checked
        :param catch_content: content extractor for body and attachments
        :return: True means the record is abnormal
        """
        # Truthiness (rather than == "") so an explicit None counts as empty,
        # matching the other field checks in this class.
        if obj.get("projectcode", ""):
            return False
        automaton = self._automaton(
            "check_projectcode_ac", abnormal_config["table_field_config"]["path4"]
        )
        return self._matches_keyword(obj, catch_content, automaton)

    def check_subpackage(self, obj, catch_content: "CatchContentObject") -> bool:
        """Check the multi-package field (not implemented yet).

        :param obj: the record (item) being checked
        :param catch_content: unused, kept for a uniform checker signature
        :return: always False, i.e. the record is never flagged
        """
        return False
|