# coding:utf-8
"""Build and persist a TF-IDF vectorizer over a bidding corpus.

Pipeline:
  1. Load stopwords from ``stopwords.txt``.
  2. Read ``./target.csv`` as one flattened text, cut it into tokens with
     ``chinese2vector`` and seed the corpus with it (so the target document
     shares the same vocabulary/IDF space as the collection).
  3. Append the pre-cut ``cut_detail`` field of every document in the
     configured MongoDB collection.
  4. Fit TF-IDF on the whole corpus and dump the fitted vectorizer to
     ``docs/model/dictionary`` for later reuse.
"""
from machine_models.tools import link_db
from machine_models.tools import chinese2vector
from machine_models.tools import tfidf
import joblib

if __name__ == '__main__':
    # Connection/collection settings consumed by link_db().
    m_config = {
        "db": "re4art",
        "col": "bidding_china_4_9",
        "host": "192.168.3.207:27092",
    }

    # Explicit encoding: this pipeline handles Chinese text, so relying on
    # the platform default encoding is a portability bug.
    with open("stopwords.txt", "r", encoding="utf-8") as f:
        stop_words = [word.strip() for word in f]

    client, col = link_db(m_config)
    try:
        corpus = []

        # Seed the corpus with the flattened target document so it is
        # vectorized in the same space as the collection documents.
        with open("./target.csv", "r", encoding="utf-8") as f:
            read_data = f.read().replace("\n", " ")
            other = chinese2vector(read_data, remove_word=["x"], stopwords=stop_words)
            print(other)
            corpus.append(other)

        # Documents were already cut in a previous pass; reuse the stored
        # ``cut_detail`` instead of re-segmenting. Sorted by _id so the
        # corpus order is deterministic across runs.
        count = 0
        for row in col.find({}).sort("_id", 1):
            count += 1
            print(count)  # progress indicator for long collections
            cut_detail = row.get("cut_detail", "")
            corpus.append(cut_detail)

        # Fit the vectorizer; only the fitted vectorizer (vocabulary + IDF)
        # is persisted, the transformed matrix is discarded here.
        tfidf_vec, tfidf_ret = tfidf(analyzer="word", space_words=corpus)
        joblib.dump(tfidf_vec, "docs/model/dictionary")
    finally:
        # link_db hands back the MongoClient; release the connection.
        client.close()