# coding:utf-8
"""Label MongoDB documents by matching title/content keywords against rules.

The module loads keyword lists and rule conditions at import time, builds one
shared keyword matcher from both lists, and exposes:

* ``start``        - classify a single (title, content) pair;
* ``routine_work`` - label every document newer than the stored checkpoint;
* ``main``         - run ``routine_work`` on a 40-minute schedule.
"""
import os
import time
from collections import defaultdict

import joblib
import schedule
from bson import ObjectId
from loguru import logger

from utils.sensitive_word import AcAutomation
from utils.reading_data import loading_keywords, loading_condition, format_values
from utils.analyse import cal_overlap_score
from docs.config import ContentsKeywordsPath, TitleKeywordsPath, RulesHead, mongo_db, MaxId
from utils.rules import verify_condition_start
from utils.mongo_helper import MongoConnect

logger.add("./logs/file.log", rotation="12:00")

conditions = loading_condition()
content_keywords_objects, content_total_keywords = loading_keywords(ContentsKeywordsPath)
title_keywords_objects, title_total_keywords = loading_keywords(TitleKeywordsPath)

# A single matcher holds both the content keywords and the title keywords.
ac = AcAutomation()
for word in content_total_keywords:
    ac.add_word(word)
for word in title_total_keywords:
    ac.add_word(word)

# Maps a matched rule name to the text stored in the "matchplace" field.
Convert = {'title_words': "标题", 'content_words': "全文", 'title_content_words': "标题&全文"}


def get_position(words: list):
    """Collect the distinct matched words and the start offsets of each.

    :param words: iterable of ``(word, start, end)`` triples
    :return: ``(keys_word, keys_dict)`` - the set of matched words and a
        ``defaultdict(list)`` mapping each word to its start offsets
    """
    keys_word = set()
    keys_dict = defaultdict(list)
    for word, start_offset, _end in words:
        keys_dict[word].append(start_offset)
        keys_word.add(word)
    return keys_word, keys_dict


def cal_frequency(statistic_keyword, title_keys_dict, content_keys_dict):
    """Compute the frequency and positions of every matched keyword.

    :param statistic_keyword: mapping of topic (``"title_words"`` /
        ``"content_words"``; other topics are skipped) to a mapping of
        keyword -> 3-tuple whose last item is used as the grouping key
    :param title_keys_dict: keyword -> start offsets found in the title
    :param content_keys_dict: keyword -> start offsets found in the content
    :return: ``{scope: {keyword: {"frequency": int, "position": list}}}``
    """
    offsets_by_topic = {
        "title_words": title_keys_dict,
        "content_words": content_keys_dict,
    }
    words_position = {}
    for topic, words in statistic_keyword.items():
        if topic not in offsets_by_topic:
            continue
        words_detail = offsets_by_topic[topic]
        # The grouping key comes from the keyword's own tuple; it gets its own
        # name so that it no longer rebinds the outer loop variable.
        for key_word, (_, _, scope) in words.items():
            positions = words_detail.get(key_word, [])
            entry = words_position.setdefault(scope, {}).setdefault(key_word, {})
            entry["frequency"] = len(positions)
            entry["position"] = positions
    return words_position


def score(words_detail):
    """Count how many keywords carry each label.

    :param words_detail: mapping of keyword -> ``(exists_len, label, scope)``
    :return: plain dict mapping label -> number of keywords with that label
    """
    words_label = {}
    for _exists_len, label, _scope in words_detail.values():
        words_label[label] = words_label.get(label, 0) + 1
    return words_label


def _evaluate_rules(statistic_keyword, title_result, content_result):
    """Score both keyword groups and check them against the loaded rules."""
    content_words_label = score(content_result)
    title_words_label = score(title_result)
    content_params = format_values(content_words_label, RulesHead)
    title_params = format_values(title_words_label, RulesHead)
    title_content_params = content_params + title_params
    is_save, rule = verify_condition_start(conditions, title_params, content_params, title_content_params)
    return is_save, rule, statistic_keyword, title_words_label, content_words_label


def process_content(topic_words, keywords_objects):
    """Evaluate the rules for full-text matching.

    Title keywords that were not also matched in the content are removed (in
    place) from the title result before scoring.

    :param topic_words: mapping with ``"title_words"`` / ``"content_words"``
    :param keywords_objects: keyword definitions passed to ``cal_overlap_score``
    :return: ``(is_save, rule, statistic_keyword, title_words_label,
        content_words_label)``
    """
    statistic_keyword = cal_overlap_score(topic_words, keywords_objects)
    title_result = statistic_keyword.get('title_words', {})
    content_result = statistic_keyword.get('content_words', {})
    # Snapshot the keys to drop first: the dict cannot shrink while iterated.
    title_only = [word for word in title_result if word not in content_result]
    for word in title_only:
        del title_result[word]
    return _evaluate_rules(statistic_keyword, title_result, content_result)


def process_title(topic_words, keywords_objects):
    """Evaluate the rules for title matching.

    :param topic_words: mapping with ``"title_words"`` / ``"content_words"``
    :param keywords_objects: keyword definitions passed to ``cal_overlap_score``
    :return: ``(is_save, rule, statistic_keyword, title_words_label,
        content_words_label)``
    """
    statistic_keyword = cal_overlap_score(topic_words, keywords_objects)
    title_result = statistic_keyword.get('title_words', {})
    content_result = statistic_keyword.get('content_words', {})
    return _evaluate_rules(statistic_keyword, title_result, content_result)


def start(title: str, content: str):
    """Classify one document (original note: "H3C program test entry point").

    The title rules are tried first; the full-text rules are tried only when
    the title rules do not accept the document.

    :param title: document title
    :param content: document body; the title is appended to it before matching
    :return: ``(statistic_keyword, statistic_label, label, words_position,
        rule_title)`` where ``label`` is 1 when a rule accepted the document
        and 0 otherwise
    """
    title_ac_words = ac.search_and_position(title)
    content_ac_words = ac.search_and_position(content + " " + title)
    title_keys_word, title_keys_dict = get_position(title_ac_words)
    content_keys_word, content_keys_dict = get_position(content_ac_words)
    topic_words = {
        "title_words": title_keys_word,
        "content_words": content_keys_word,
    }

    # Run the rule evaluation.
    is_save, rule, statistic_keyword, title_label, content_label = process_title(
        topic_words, title_keywords_objects)
    if not is_save:
        is_save, rule, statistic_keyword, title_label, content_label = process_content(
            topic_words, content_keywords_objects)

    rule_title = ""
    if rule:
        rule_title = ",".join([Convert[r] for r in rule if r in Convert])
    words_position = cal_frequency(statistic_keyword, title_keys_dict, content_keys_dict)
    statistic_label = {"content_words": content_label, "title_words": title_label}
    label = 1 if is_save else 0
    return statistic_keyword, statistic_label, label, words_position, rule_title


def _close_quietly(mongo):
    """Close the Mongo client, logging (never raising) if that fails."""
    try:
        mongo.client.close()
    except Exception:
        logger.exception("Failed to close the MongoDB client")


def routine_work(mongo):
    """Label every document whose ``_id`` is greater than the checkpoint.

    For each document the fields ``valid``, ``matchplace`` and ``matchnum``
    are written back. The checkpoint file ``MaxId`` is rewritten only after
    the whole batch finished. Errors never propagate to the caller: they are
    logged and followed by a 5 second pause.

    :param mongo: connection helper providing ``connect()``, ``col`` and
        ``client``
    """
    logger.warning("本批次开始-->")
    try:
        mongo.connect()
        max_id = ""
        count = 0
        if os.path.exists(MaxId):
            # NOTE(review): joblib.load unpickles the file; MaxId must only
            # ever be written by this program.
            max_id = joblib.load(MaxId)
        if not max_id:
            max_id = ObjectId("0" * 24)

        cursor = mongo.col.find({"_id": {"$gt": max_id}}, {"title": 1, "detail": 1}).sort("_id", 1)
        for row in cursor:
            title = row.get("title", "")
            detail = row.get("detail", "")
            count += 1
            max_id = row["_id"]
            # Reset per document so that a failure in start() can never write
            # the previous document's match details to this one.
            label, rule_title, words_position = 0, "", ""
            try:
                _, _, label, words_position, rule_title = start(title, detail)
            except Exception:
                logger.exception("Failed to classify document {}", max_id)
            print(max_id, label, rule_title)
            mongo.col.update_one(
                {"_id": row["_id"]},
                {"$set": {"valid": label, "matchplace": rule_title, "matchnum": str(words_position)}},
            )

        if max_id:
            joblib.dump(max_id, MaxId)
        logger.warning(f"本次运行 {count} 条")
    except Exception:
        # Best effort by design: report the failure, back off briefly and let
        # the next scheduled run retry from the last saved checkpoint.
        logger.exception("Batch aborted")
        time.sleep(5)
    finally:
        _close_quietly(mongo)


def main(mongo):
    """Run ``routine_work`` every 40 minutes, forever.

    :param mongo: connection helper forwarded to ``routine_work``
    """
    # The scheduled job labels the documents directly.
    schedule.every(40).minutes.do(routine_work, mongo)
    while True:
        schedule.run_pending()
        # Sleep between polls; without it the loop spins a CPU core at 100%.
        time.sleep(1)


def test(mongo):
    """Classify one hard-coded sample and print every returned value.

    :param mongo: unused; kept so the entry points share one signature
    """
    # title = row.get("title", "")
    title = "天网机房天津泰达有线电视有限公司经开区统一运营系统泰达投资服务中心、图书馆、档案馆监控系统改造项目招标公告监控"
    # detail = row.get("detail", "")
    detail = "天网机房天津泰达有线电视有限公司经开区统一运营系统泰达投资服务中心、图书馆、档案馆监控系统改造项目招标公告空调大华"
    # try:
    keywords_detail, label_detail, label, words_position, rule_title = start(title, detail)
    print(keywords_detail)
    print(label_detail)
    print(label)
    print(words_position)
    print(rule_title)
    # except Exception as e:
    #     logger.warning(str(e))
    #     label = 0


if __name__ == "__main__":
    mongo = MongoConnect(mongo_db)
    routine_work(mongo)
    # main(mongo)
    # test(mongo)

# 2022 work summary
# "Work content and results (do not keep a running log)"
# 1. Project requirement analysis, splitting of functional modules, validation of new technologies, and updating the technologies used by existing business (OCR, speech recognition, entity recognition)
# 2. Technical support for other departments' business (text classification, bid-evaluation expert extraction, bid subject-matter extraction, attachment extraction)
# 3. Bid-evaluation expert extraction (running stably)
# 4. Bid subject-matter extraction updated twice; accuracy was 64% with tables and 0% without tables. Overall raised to 77%
# 5. Attachment extraction updated once; (garbled-text rate) reduced by 80%
#
# "Gains, reflections, improvements and areas of growth"
# Deeper understanding of artificial-intelligence technology and a richer knowledge base. Project management, project development and module-decomposition skills improved
#
# "Plans, hopes and expectations (for myself, the company, the team)"
# 1. Improve project-management skills
# 2. Improve documentation-writing skills
# 3. Improve understanding of program architecture so that modules are split more sensibly
# 4. Extend the application of artificial-intelligence technology to more scenarios
# 5. Improve my own development skills