server.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. # coding:utf-8
  2. from utils.sensitive_word import AcAutomation
  3. from utils.reading_data import loading_keywords
  4. from utils.analyse import cal_overlap_score
  5. from docs.config import ContentsKeywordsPath, TitleKeywordsPath
  6. from utils.reading_data import loading_condition
  7. from utils.reading_data import format_values
  8. from docs.config import RulesHead
  9. from utils.rules import verify_condition_start
  10. from utils.mongo_helper import MongoConnect
  11. from bson import ObjectId
  12. import schedule
  13. from docs.config import mongo_db
  14. from docs.config import MaxId
  15. import joblib
  16. import os
  17. import time
  18. from loguru import logger
  19. from collections import defaultdict
  20. logger.add("./logs/file.log", rotation="12:00")
  21. conditions = loading_condition()
  22. content_keywords_objects, content_total_keywords = loading_keywords(ContentsKeywordsPath)
  23. title_keywords_objects, title_total_keywords = loading_keywords(TitleKeywordsPath)
  24. ac = AcAutomation()
  25. for word in content_total_keywords:
  26. ac.add_word(word)
  27. for word in title_total_keywords:
  28. ac.add_word(word)
  29. Convert = {'title_words': "标题", 'content_words': "全文", 'title_content_words': "标题&全文"}
  30. def get_position(words: list):
  31. '''
  32. 显示位置
  33. :param words:
  34. :return:
  35. '''
  36. keys_word = set()
  37. keys_dict = defaultdict(list)
  38. for word, start, end in words:
  39. keys_dict[word].append(start)
  40. keys_word.add(word)
  41. return keys_word, keys_dict
  42. def cal_frequency(statistic_keyword, title_keys_dict, content_keys_dict):
  43. '''
  44. 计算词频新增
  45. :param statistic_keyword:
  46. :param title_keys_dict:
  47. :param content_keys_dict:
  48. :return:
  49. '''
  50. words_position = {}
  51. for topic, words in statistic_keyword.items():
  52. if topic == "title_words":
  53. words_detail = title_keys_dict
  54. elif topic == "content_words":
  55. words_detail = content_keys_dict
  56. else:
  57. continue
  58. for key_word, (_, _, topic) in words.items():
  59. if topic not in words_position:
  60. words_position[topic] = {}
  61. if key_word not in words_position[topic]:
  62. words_position[topic][key_word] = {}
  63. words_position[topic][key_word]["frequency"] = len(words_detail.get(key_word, []))
  64. words_position[topic][key_word]["position"] = words_detail.get(key_word, [])
  65. return words_position
  66. def score(words_detail):
  67. words_label = {}
  68. for keyword, (exists_len, label, scope) in words_detail.items():
  69. if label not in words_label:
  70. words_label[label] = 1
  71. else:
  72. words_label[label] += 1
  73. return words_label
  74. def process_content(topic_words, keywords_objects):
  75. '''
  76. 处理全文使用
  77. :param topic_words:
  78. :param keywords_objects:
  79. :return:
  80. '''
  81. statistic_keyword = cal_overlap_score(topic_words, keywords_objects)
  82. title_result = statistic_keyword.get('title_words', {})
  83. content_result = statistic_keyword.get('content_words', {})
  84. del_words = []
  85. for word in title_result:
  86. if word not in content_result:
  87. del_words.append(word)
  88. for word in del_words:
  89. del title_result[word]
  90. content_words_label = score(content_result)
  91. title_words_label = score(title_result)
  92. content_params = format_values(content_words_label, RulesHead)
  93. title_params = format_values(title_words_label, RulesHead)
  94. title_content_params = content_params + title_params
  95. is_save, rule = verify_condition_start(conditions, title_params, content_params, title_content_params)
  96. return is_save, rule, statistic_keyword, title_words_label, content_words_label
  97. def process_title(topic_words, keywords_objects):
  98. '''
  99. 处理标题匹配
  100. :param topic_words:
  101. :param keywords_objects:
  102. :return:
  103. '''
  104. statistic_keyword = cal_overlap_score(topic_words, keywords_objects)
  105. title_result = statistic_keyword.get('title_words', {})
  106. content_result = statistic_keyword.get('content_words', {})
  107. content_words_label = score(content_result)
  108. title_words_label = score(title_result)
  109. content_params = format_values(content_words_label, RulesHead)
  110. title_params = format_values(title_words_label, RulesHead)
  111. title_content_params = content_params + title_params
  112. is_save, rule = verify_condition_start(conditions, title_params, content_params, title_content_params)
  113. return is_save, rule, statistic_keyword, title_words_label, content_words_label
  114. def start(title: str, content: str):
  115. """
  116. 新华三程序测试入口
  117. :param title:
  118. :param content:
  119. :return:
  120. """
  121. global content_keywords_objects, title_keywords_objects, ac, conditions
  122. topic_words = {}
  123. title_ac_words = ac.search_and_position(title)
  124. content_ac__words = ac.search_and_position(content + " " + title)
  125. title_keys_word, title_keys_dict = get_position(title_ac_words)
  126. content_keys_word, content_keys_dict = get_position(content_ac__words)
  127. topic_words["title_words"] = title_keys_word
  128. topic_words["content_words"] = content_keys_word
  129. # 执行计算程序
  130. is_save, rule, statistic_keyword, title_label, content_label = process_title(topic_words, title_keywords_objects)
  131. if not is_save:
  132. is_save, rule, statistic_keyword, title_label, content_label = process_content(topic_words,
  133. content_keywords_objects)
  134. rule_title = ""
  135. if rule:
  136. rule_title = ",".join([Convert[r] for r in rule if r in Convert])
  137. words_position = cal_frequency(statistic_keyword, title_keys_dict, content_keys_dict)
  138. statistic_label = {"content_words": content_label, "title_words": title_label}
  139. if is_save:
  140. return statistic_keyword, statistic_label, 1, words_position, rule_title
  141. return statistic_keyword, statistic_label, 0, words_position, rule_title
  142. def routine_work(mongo):
  143. logger.warning(f"本批次开始-->")
  144. try:
  145. mongo.connect()
  146. max_id = ""
  147. count = 0
  148. if os.path.exists(MaxId):
  149. max_id = joblib.load(MaxId)
  150. if not max_id:
  151. max_id = ObjectId("0" * 24)
  152. rule_title = ""
  153. words_position = ""
  154. for row in mongo.col.find({"_id": {"$gt": max_id}}, {"title": 1, "detail": 1}).sort("_id", 1):
  155. title = row.get("title", "")
  156. detail = row.get("detail", "")
  157. count += 1
  158. max_id = row["_id"]
  159. try:
  160. keywords_detail, label_detail, label, words_position, rule_title = start(title, detail)
  161. except Exception as e:
  162. print(e)
  163. label = 0
  164. print(max_id, label, rule_title)
  165. mongo.col.update_one({"_id": row["_id"]},
  166. {"$set": {"valid": label, "matchplace": rule_title, "matchnum": str(words_position)}})
  167. if max_id:
  168. joblib.dump(max_id, MaxId)
  169. logger.warning(f"本次运行 {count} 条")
  170. mongo.client.close()
  171. except Exception as e:
  172. time.sleep(5)
  173. def main(mongo):
  174. # 定义定时任务时直接打标签
  175. schedule.every(40).minutes.do(routine_work, mongo)
  176. while True:
  177. schedule.run_pending()
  178. def test(mongo):
  179. # title = row.get("title", "")
  180. title = "天网机房天津泰达有线电视有限公司经开区统一运营系统泰达投资服务中心、图书馆、档案馆监控系统改造项目招标公告监控"
  181. # detail = row.get("detail", "")
  182. detail = "天网机房天津泰达有线电视有限公司经开区统一运营系统泰达投资服务中心、图书馆、档案馆监控系统改造项目招标公告空调大华"
  183. # try:
  184. keywords_detail, label_detail, label, words_position, rule_title = start(title, detail)
  185. print(keywords_detail)
  186. print(label_detail)
  187. print(label)
  188. print(words_position)
  189. print(rule_title)
  190. # except Exception as e:
  191. # logger.warning(str(e))
  192. # label = 0
  193. if __name__ == "__main__":
  194. mongo = MongoConnect(mongo_db)
  195. routine_work(mongo)
  196. # main(mongo)
  197. # test(mongo)
  198. # 2022年工作总结
  199. #"工作内容、成果(不要记流水账)"
  200. # 一、项目需求分析、功能模块拆分、新技术的验证、更新现有业务用到的技术(OCR、语音识别、实体识别)
  201. # 二、对其它部门的业务做技术支持(文本分类、评标专家抽取、标的物抽取、附件抽取)
  202. # 三、评标专家抽取(稳定运行)
  203. # 四、标的物抽取更新两版,准确率有表格64%,非表格0%。综合提升到 77%
  204. # 五、附件抽取更新一版(乱码率)减少80%
  205. #
  206. # "收获、体会、改进、成长的地方"
  207. # 对人工智能技术的理解更加的深入,丰富了知识储备。项目管理、项目开发能力、程序功能模块的拆分能力得到提升
  208. #
  209. # "规划、期望、期待对(个人、公司、团队)"
  210. # 1、提高项目管理能力
  211. # 2、提高文档的写作能力
  212. # 3、提升程序架构的认知能力,模块拆分更加的合理
  213. # 4、拓展人工智能技术应用到更多的场景当中去
  214. # 5、提升自身的开发能力