NoField.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. # coding:utf-8
  2. from tables import CatchContentObject, fsc
  3. from util.sensitive_word import AcAutomation
  4. from docs.config import amount_config
  5. from docs.config import budget_config
  6. from docs.config import DEBUG
  7. from docs.config import abnormal_config
  8. import csv
  9. class NoFieldChecker(object):
  10. """
  11. 无字段或空值检查
  12. """
  13. def __init__(self):
  14. self.errors_tables = {
  15. "title": self.check_title,
  16. "projectname": self.check_projectname,
  17. "buyer":self.check_buyer,
  18. "winner": self.check_winner,
  19. "budget": self.check_budget,
  20. "bidamount": self.check_bidamount,
  21. "area":self.check_region,
  22. "projectcode": self.check_projectcode,
  23. "multipackage":self.check_subpackage,
  24. }
  25. def check_bidamount(self,obj,catch_content: CatchContentObject) -> bool:
  26. """
  27. 中标金额为空检测
  28. :param obj:代表一个item
  29. :return:返回true 代表异常
  30. """
  31. self.check_bidamount_ac = AcAutomation()
  32. with open(amount_config["table_field_config"]["path"], "r") as f:
  33. reads = csv.reader(f)
  34. [self.check_bidamount_ac.add_word(w[0]) for w in reads]
  35. detail = obj.get("detail", "")
  36. attach_text = obj.get("attach_text", {})
  37. subtype = obj.get("subtype", "")
  38. if subtype in ["中标", "成交","合同","验收"]:
  39. contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告") #返回值是字典
  40. content = "\n".join(contents) #字典处理成字符串
  41. if self.check_bidamount_ac.search(content):
  42. return True
  43. for attach_index, attach_content in attach_text.items():
  44. if attach_content:
  45. for topic_index, topic_detail in attach_content.items():
  46. # oss地址
  47. attach_url = topic_detail.get("attach_url", "")
  48. if attach_url:
  49. # 获取附件内容
  50. st, content = fsc.download_text_content(attach_url)
  51. # 下载成功
  52. # 超长文本不处理,暂定30万字
  53. if st and content.strip():
  54. if len(content) > 300000:
  55. continue
  56. # 开始检测
  57. contents = catch_content.public_attachment_catch(content, platform="attach",document_id=attach_url)
  58. content = "\n".join(contents)
  59. if self.check_bidamount_ac.search(content):
  60. return True
  61. return False
  62. return False
  63. # 处理正文
  64. # 检查因素
  65. # 是否返回 0000
  66. def check_winner(self,obj, catch_content: CatchContentObject) -> bool:
  67. """
  68. 中标单位名称为空检测,除中标类型的标讯,其他类型标讯不检查这个字段是否为空
  69. :param obj:代表一个item
  70. :return:返回true 代表异常
  71. """
  72. subtype = obj.get("subtype", "")
  73. if subtype in ["中标", "成交", "合同", "验收"]:
  74. winner = obj.get("winner", "")
  75. if winner:
  76. return False
  77. return True
  78. return False
  79. # 处理正文
  80. # 检查因素
  81. # 是否返回 0000
  82. def check_buyer(self,obj,catch_content: CatchContentObject) -> bool:
  83. """
  84. 采购单位名称是否为空检测
  85. :param buyer:采购单位,多个逗号分割
  86. :param obj:代表一个item
  87. :return:返回true 代表异常
  88. """
  89. buyer = obj.get("buyer", "")
  90. if buyer :
  91. return False
  92. return True
  93. # 处理正文
  94. # 检查因素
  95. # 是否返回 0000
  96. def check_budget(self,obj, catch_content: CatchContentObject) -> bool:
  97. """
  98. 预算为空检测
  99. :param obj:代表一个item
  100. :return:返回true 代表异常
  101. """
  102. self.check_budget_ac = AcAutomation()
  103. with open(budget_config["table_field_config"]["path"],"r") as f :
  104. reads=csv.reader(f)
  105. [self.check_budget_ac.add_word(w[0]) for w in reads ]
  106. detail = obj.get("detail", "")
  107. attach_text = obj.get("attach_text", {})
  108. subtype = obj.get("subtype", "")
  109. if subtype not in ["中标", "成交", "合同", "验收"]:
  110. contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告") # 返回值是字典
  111. content = "\n".join(contents) # 字典处理成字符串
  112. if self.check_budget_ac.search(content):
  113. return True
  114. for attach_index, attach_content in attach_text.items():
  115. if attach_content:
  116. for topic_index, topic_detail in attach_content.items():
  117. # oss地址
  118. attach_url = topic_detail.get("attach_url", "")
  119. if attach_url:
  120. # 获取附件内容
  121. st, content = fsc.download_text_content(attach_url)
  122. # 下载成功
  123. # 超长文本不处理,暂定30万字
  124. if st and content.strip():
  125. if len(content) > 300000:
  126. continue
  127. # 开始检测
  128. contents = catch_content.public_attachment_catch(content, platform="attach",
  129. document_id=attach_url)
  130. content = "\n".join(contents)
  131. if self.check_budget_ac.search(content):
  132. return True
  133. return False
  134. return False
  135. # 处理正文
  136. # 检查因素
  137. # 是否返回 0000
  138. def check_region(self,obj, catch_content: CatchContentObject) -> bool:
  139. """
  140. 区域为空检测
  141. :param obj:代表一个item
  142. :return:返回true 代表异常
  143. """
  144. pass
  145. # 处理正文
  146. # 检查因素
  147. # 是否返回 0000
  148. def check_title(self,obj, catch_content: CatchContentObject) -> bool:
  149. """
  150. :param obj:代表一个item
  151. :return:返回true 代表异常
  152. """
  153. title = obj.get("title", "")
  154. if title :
  155. return False
  156. return True
  157. # 处理正文
  158. # 检查因素
  159. # 是否返回 0000
  160. def check_projectname(self,obj, catch_content: CatchContentObject) -> bool:
  161. """
  162. :param obj:代表一个item
  163. :return:返回true 代表异常
  164. """
  165. projectname = obj.get("projectname", "")
  166. if projectname :
  167. return False
  168. return True
  169. # 处理正文
  170. # 检查因素
  171. # 是否返回 0000
  172. def check_projectcode(self,obj, catch_content: CatchContentObject) -> bool:
  173. """
  174. 项目编号为空检测
  175. :param obj:代表一个item
  176. :return:返回true 代表异常
  177. """
  178. self.check_projectcode_ac = AcAutomation()
  179. with open(abnormal_config["table_field_config"]["path4"], "r") as f:
  180. reads = csv.reader(f)
  181. [self.check_projectcode_ac.add_word(w[0]) for w in reads]
  182. projectcode = obj.get("projectcode", "")
  183. detail = obj.get("detail", "")
  184. attach_text = obj.get("attach_text", {})
  185. if projectcode == "":
  186. contents = catch_content.public_attachment_catch(detail, platform="html", document_id="公告") #返回值是字典
  187. content = "\n".join(contents) #字典处理成字符串
  188. if self.check_projectcode_ac.search(content):
  189. return True
  190. for attach_index, attach_content in attach_text.items():
  191. if attach_content:
  192. for topic_index, topic_detail in attach_content.items():
  193. # oss地址
  194. attach_url = topic_detail.get("attach_url", "")
  195. if attach_url:
  196. # 获取附件内容
  197. st, content = fsc.download_text_content(attach_url)
  198. # 下载成功
  199. # 超长文本不处理,暂定30万字
  200. if st and content.strip():
  201. if len(content) > 300000:
  202. continue
  203. # 开始检测
  204. contents = catch_content.public_attachment_catch(content, platform="attach",document_id=attach_url)
  205. content = "\n".join(contents)
  206. if self.check_projectcode_ac.search(content):
  207. return True
  208. return False
  209. return False
  210. # 处理正文
  211. # 检查因素
  212. # 是否返回 0000
  213. def check_subpackage(self,obj, catch_content: CatchContentObject) -> bool:
  214. """
  215. 公司名称检测
  216. :param obj:代表一个item
  217. :return:返回true 代表异常
  218. """
  219. pass
  220. # 处理正文
  221. # 检查因素
  222. # 是否返回 0000