# coding:utf-8
# ai/keywords_engine

from utils.sensitive_word import AcAutomation
from utils.reading_data import loading_keywords
from utils.analyse import cal_overlap_score
from docs.config import ContentsKeywordsPath, TitleKeywordsPath
from utils.reading_data import loading_condition
from utils.reading_data import format_values
from docs.config import RulesHead
from utils.rules import verify_condition_start
from utils.mongo_helper import MongoConnect
from bson import ObjectId
import schedule
from docs.config import mongo_db
from docs.config import MaxId
import joblib
import os
import time
from loguru import logger
from collections import defaultdict

# Import-time setup: everything below runs once when the module is loaded.

# Extra loguru sink writing to ./logs/file.log; rotation is configured as "12:00".
logger.add("./logs/file.log", rotation="12:00")

# Rule set passed to verify_condition_start() by process_title/process_content.
conditions = loading_condition()
# loading_keywords() returns a pair: the keyword objects handed to
# cal_overlap_score(), and the iterable of keyword strings fed to the matcher below.
content_keywords_objects, content_total_keywords = loading_keywords(ContentsKeywordsPath)
title_keywords_objects, title_total_keywords = loading_keywords(TitleKeywordsPath)
# One shared matcher for both vocabularies.
# NOTE(review): presumably an Aho-Corasick automaton, judging by the class name - confirm.
ac = AcAutomation()

# Register the content vocabulary first, then the title vocabulary.
for word in content_total_keywords:
    ac.add_word(word)
for word in title_total_keywords:
    ac.add_word(word)

# Display labels for the scope names found in a matched rule (joined in start()):
# "标题" = title, "全文" = full text, "标题&全文" = title & full text.
Convert = {'title_words': "标题", 'content_words': "全文", 'title_content_words': "标题&全文"}


def get_position(words: list):
    """
    Group match start offsets by the word that matched.

    :param words: iterable of ``(word, start, end)`` triples; ``end`` is ignored
    :return: ``(matched_words, starts_by_word)`` - the set of distinct words and
             a ``defaultdict(list)`` mapping each word to its start offsets in
             input order
    """
    starts_by_word = defaultdict(list)
    for token, begin, _finish in words:
        starts_by_word[token].append(begin)
    # Every word seen is a key of the mapping, so the set comes straight from it.
    return set(starts_by_word), starts_by_word


def cal_frequency(statistic_keyword, title_keys_dict, content_keys_dict):
    """
    Build per-keyword frequency and position details for the matched keywords.

    Only the ``"title_words"`` and ``"content_words"`` entries of
    ``statistic_keyword`` are used; any other entry is skipped.

    :param statistic_keyword: mapping of topic name to ``{keyword: (x, y, scope)}``;
                              only the third tuple element is read here
    :param title_keys_dict: ``{keyword: [start, ...]}`` for matches in the title
    :param content_keys_dict: ``{keyword: [start, ...]}`` for matches in the content
    :return: ``{scope: {keyword: {"frequency": int, "position": list}}}``
    """
    offsets_by_topic = {
        "title_words": title_keys_dict,
        "content_words": content_keys_dict,
    }
    words_position = {}
    for topic, words in statistic_keyword.items():
        if topic not in offsets_by_topic:
            continue
        offsets = offsets_by_topic[topic]
        for key_word, (_, _, scope) in words.items():
            # Results are grouped by the scope stored with the keyword,
            # not by the topic that selected the offset table.
            entry = words_position.setdefault(scope, {}).setdefault(key_word, {})
            hits = offsets.get(key_word, [])
            entry["frequency"] = len(hits)
            entry["position"] = hits
    return words_position


def score(words_detail):
    """
    Count how many keywords carry each label.

    :param words_detail: mapping of keyword to a 3-tuple whose second element
                         is the label
    :return: plain dict ``{label: number_of_keywords}``
    """
    label_counts = {}
    for _length, label, _scope in words_detail.values():
        label_counts[label] = label_counts.get(label, 0) + 1
    return label_counts


def process_content(topic_words, keywords_objects):
    """
    Evaluate the rules using the full-text keyword set.

    Title keywords that do not also appear among the content keywords are
    removed from the overlap result before the labels are counted.

    :param topic_words: dict with the ``"title_words"`` / ``"content_words"`` hits
    :param keywords_objects: keyword objects passed to ``cal_overlap_score``
    :return: ``(is_save, rule, statistic_keyword, title_words_label,
             content_words_label)``
    """
    statistic_keyword = cal_overlap_score(topic_words, keywords_objects)
    title_hits = statistic_keyword.get('title_words', {})
    content_hits = statistic_keyword.get('content_words', {})

    # Snapshot the keys to drop first; the dict is pruned in place because it
    # is part of the returned statistic_keyword.
    for orphan in [w for w in title_hits if w not in content_hits]:
        del title_hits[orphan]

    content_words_label = score(content_hits)
    title_words_label = score(title_hits)
    content_params = format_values(content_words_label, RulesHead)
    title_params = format_values(title_words_label, RulesHead)
    is_save, rule = verify_condition_start(
        conditions,
        title_params,
        content_params,
        content_params + title_params,
    )
    return is_save, rule, statistic_keyword, title_words_label, content_words_label


def process_title(topic_words, keywords_objects):
    """
    Evaluate the rules using the title keyword set.

    :param topic_words: dict with the ``"title_words"`` / ``"content_words"`` hits
    :param keywords_objects: keyword objects passed to ``cal_overlap_score``
    :return: ``(is_save, rule, statistic_keyword, title_words_label,
             content_words_label)``
    """
    statistic_keyword = cal_overlap_score(topic_words, keywords_objects)

    # Count labels per scope; a scope missing from the result counts as empty.
    content_words_label = score(statistic_keyword.get('content_words', {}))
    title_words_label = score(statistic_keyword.get('title_words', {}))

    content_params = format_values(content_words_label, RulesHead)
    title_params = format_values(title_words_label, RulesHead)
    is_save, rule = verify_condition_start(
        conditions,
        title_params,
        content_params,
        content_params + title_params,
    )
    return is_save, rule, statistic_keyword, title_words_label, content_words_label


def start(title: str, content: str):
    """
    Entry point for labelling a single document (H3C program test entry).

    Title rules are tried first; the content rules are only evaluated when
    the title rules do not accept the document.

    :param title: document title
    :param content: document body
    :return: ``(statistic_keyword, statistic_label, flag, words_position,
             rule_title)`` where ``flag`` is 1 when a rule accepted the
             document and 0 otherwise, and ``rule_title`` is the
             comma-joined display label of the matched rule scopes
    """
    title_hits = ac.search_and_position(title)
    # The content scan runs over the body followed by a space and the title.
    content_hits = ac.search_and_position(content + " " + title)
    title_keys_word, title_keys_dict = get_position(title_hits)
    content_keys_word, content_keys_dict = get_position(content_hits)
    topic_words = {
        "title_words": title_keys_word,
        "content_words": content_keys_word,
    }

    # Run the rule evaluation: title keywords first, content keywords as fallback.
    outcome = process_title(topic_words, title_keywords_objects)
    if not outcome[0]:
        outcome = process_content(topic_words, content_keywords_objects)
    is_save, rule, statistic_keyword, title_label, content_label = outcome

    rule_title = ",".join([Convert[r] for r in rule if r in Convert]) if rule else ""
    words_position = cal_frequency(statistic_keyword, title_keys_dict, content_keys_dict)
    statistic_label = {"content_words": content_label, "title_words": title_label}
    return statistic_keyword, statistic_label, 1 if is_save else 0, words_position, rule_title


def routine_work(mongo):
    """
    Label, in one batch, every document whose ``_id`` is above the checkpoint.

    The checkpoint (the last processed ``_id``) is read from the ``MaxId``
    file when it exists; otherwise processing starts from the all-zero
    ObjectId. Each document is passed to ``start`` and the result is written
    back to its ``valid``, ``matchplace`` and ``matchnum`` fields. The
    checkpoint file is rewritten only when the whole batch completes.

    Errors are logged rather than raised, so a failing batch does not stop a
    scheduler that calls this function repeatedly.

    :param mongo: connection helper exposing ``connect()``, ``col`` and ``client``
    :return: None
    """
    logger.warning(f"本批次开始-->")
    try:
        mongo.connect()
        max_id = ""
        count = 0
        if os.path.exists(MaxId):
            # NOTE(review): joblib.load unpickles the file; keep the checkpoint
            # path writable only by this service.
            max_id = joblib.load(MaxId)
        if not max_id:
            max_id = ObjectId("0" * 24)
        for row in mongo.col.find({"_id": {"$gt": max_id}}, {"title": 1, "detail": 1}).sort("_id", 1):
            title = row.get("title", "")
            detail = row.get("detail", "")
            count += 1
            max_id = row["_id"]
            # Reset per row: when start() fails, the row must not inherit the
            # match details computed for the previous row.
            rule_title = ""
            words_position = ""
            try:
                _, _, label, words_position, rule_title = start(title, detail)
            except Exception:
                # Best effort: record the failure and mark the row as not matched.
                logger.exception("Failed to label document {}", max_id)
                label = 0
            print(max_id, label, rule_title)
            mongo.col.update_one({"_id": row["_id"]},
                                 {"$set": {"valid": label, "matchplace": rule_title, "matchnum": str(words_position)}})

        if max_id:
            joblib.dump(max_id, MaxId)
        logger.warning(f"本次运行 {count} 条")
    except Exception:
        # Keep the caller alive, but leave a trace of what went wrong.
        logger.exception("Batch aborted before completion")
        time.sleep(5)
    finally:
        # Close the client on every path so an aborted batch does not leak a
        # connection; connect() may have failed before a client existed.
        client = getattr(mongo, "client", None)
        if client is not None:
            try:
                client.close()
            except Exception:
                logger.exception("Failed to close the MongoDB client")


def main(mongo):
    """
    Run ``routine_work`` every 40 minutes, forever.

    The first run happens once the first 40-minute interval has elapsed.

    :param mongo: connection helper forwarded to ``routine_work``
    :return: never returns
    """
    # Register the scheduled job that labels new documents.
    schedule.every(40).minutes.do(routine_work, mongo)
    while True:
        schedule.run_pending()
        # Pause between polls; without this the loop spins a CPU core at 100%.
        time.sleep(1)


def test(mongo):
    """
    Run ``start`` on one hard-coded sample and print each returned value.

    :param mongo: unused by this function
    :return: None
    """
    title = "天网机房天津泰达有线电视有限公司经开区统一运营系统泰达投资服务中心、图书馆、档案馆监控系统改造项目招标公告监控"
    detail = "天网机房天津泰达有线电视有限公司经开区统一运营系统泰达投资服务中心、图书馆、档案馆监控系统改造项目招标公告空调大华"
    keywords_detail, label_detail, label, words_position, rule_title = start(title, detail)
    # Print in the same order as the returned tuple, one value per line.
    for value in (keywords_detail, label_detail, label, words_position, rule_title):
        print(value)


if __name__ == "__main__":
    # Script entry point: build the Mongo helper from the configured settings
    # and run a single labelling batch.
    mongo = MongoConnect(mongo_db)
    routine_work(mongo)
    # Alternative entry points, currently disabled:
    # main(mongo)   # scheduled mode, repeats the batch every 40 minutes
    # test(mongo)   # prints the result for one hard-coded sample

# 2022年工作总结
#"工作内容、成果（不要记流水账）"
# 一、项目需求分析、功能模块拆分、新技术的验证、更新现有业务用到的技术（OCR、语音识别、实体识别）
# 二、对其它部门的业务做技术支持（文本分类、评标专家抽取、标的物抽取、附件抽取）
# 三、评标专家抽取（稳定运行）
# 四、标的物抽取更新两版，准确率有表格64%，非表格0%。综合提升到 77%
# 五、附件抽取更新一版（乱码率）减少80%
#

# "收获、体会、改进、成长的地方"
# 对人工智能技术的理解更加的深入，丰富了知识储备。项目管理、项目开发能力、程序功能模块的拆分能力得到提升
#


# "规划、期望、期待对（个人、公司、团队）"
# 1、提高项目管理能力
# 2、提高文档的写作能力
# 3、提升程序架构的认知能力，模块拆分更加的合理
# 4、拓展人工智能技术应用到更多的场景当中去
# 5、提升自身的开发能力