create_dict.py

# coding:utf-8
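# Build a TF-IDF "dictionary" (a fitted vectorizer) from pre-segmented bidding
# texts stored in MongoDB, plus one local target document, and persist it with joblib.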
from machine_models.tools import link_db
from machine_models.tools import chinese2vector
from machine_models.tools import tfidf
import joblib

if __name__ == '__main__':
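    # MongoDB connection settings (database, collection, host) consumed by link_db.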
    m_config = {
        "db": "re4art",
        "col": "bidding_china_4_9",
        "host": "192.168.3.207:27092",
    }
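    # Load stopwords, one per line.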
    with open("stopwords.txt", "r") as f:
        stop_words = [word.strip() for word in f.readlines()]
    client, col = link_db(m_config)
    corpus = []
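    # Seed the corpus with the segmented local target document.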
    with open("./target.csv", "r") as f:
        read_data = f.read()
    read_data = read_data.replace("\n", " ")
    other = chinese2vector(read_data, remove_word=["x"], stopwords=stop_words)
    print(other)
    corpus.append(other)
    count = 0
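    # Append each document's pre-segmented text (cut_detail) to the corpus.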
    for row in col.find({}).sort("_id", 1):
        # detail = row.get("detail", "")
        # title = row.get("title", "")
        count += 1
        print(count)
        # corpus = chinese2vector(title + detail.lower(), remove_word=["x", "m"], stopwords=stop_words)
        # col.update_one({"_id": row["_id"]}, {"$set": {"cut_detail": corpus}})
        cut_detail = row.get("cut_detail", "")
        corpus.append(cut_detail)
        # if len(contents) > 10000:
        #     cut_ret = chinese2vectors(contents, remove_word=["x"], stop_words=stop_words)
        #     corpus.extend(cut_ret)
        #     contents = []
    # if contents:
    #     cut_ret = chinese2vectors(contents, remove_word=["x"], stop_words=stop_words)
    #     corpus.extend(cut_ret)
    #     contents = []
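    # Fit the TF-IDF vectorizer on the full corpus and persist it as the "dictionary".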
    tfidf_vec, tfidf_ret = tfidf(analyzer="word", space_words=corpus)
    joblib.dump(tfidf_vec, "docs/model/dictionary")
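    # Reuse sketch: assuming the project's tfidf helper returns a
    # scikit-learn-style TfidfVectorizer (an assumption, not confirmed here),
    # the saved model can be reloaded and applied to new pre-segmented text:
    #     vec = joblib.load("docs/model/dictionary")
    #     matrix = vec.transform(["token1 token2 token3"])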