```python
# coding:utf-8
from machine_models.tools import link_db
from machine_models.tools import chinese2vector
from machine_models.tools import tfidf
import joblib

if __name__ == '__main__':
    # MongoDB connection settings for the bidding-document collection.
    m_config = {
        "db": "re4art",
        "col": "bidding_china_4_9",
        "host": "192.168.3.207:27092",
    }

    # Load the stop-word list, one word per line.
    with open("stopwords.txt", "r", encoding="utf-8") as f:
        stop_words = [word.strip() for word in f]

    client, col = link_db(m_config)
    corpus = []

    # Segment the target document and use it as the first corpus entry.
    with open("./target.csv", "r", encoding="utf-8") as f:
        read_data = f.read()
    read_data = read_data.replace("\n", " ")
    other = chinese2vector(read_data, remove_word=["x"], stopwords=stop_words)
    print(other)
    corpus.append(other)

    # Append the pre-segmented text of every document in the collection.
    # An earlier pass segmented title + detail and cached the result in the
    # "cut_detail" field:
    #   detail = row.get("detail", "")
    #   title = row.get("title", "")
    #   cut = chinese2vector(title + detail.lower(), remove_word=["x", "m"], stopwords=stop_words)
    #   col.update_one({"_id": row["_id"]}, {"$set": {"cut_detail": cut}})
    count = 0
    for row in col.find({}).sort("_id", 1):
        count += 1
        print(count)  # progress indicator
        cut_detail = row.get("cut_detail", "")
        corpus.append(cut_detail)

    # Fit the TF-IDF model on the full corpus and persist the fitted vectorizer.
    tfidf_vec, tfidf_ret = tfidf(analyzer="word", space_words=corpus)
    joblib.dump(tfidf_vec, "docs/model/dictionary")
```
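
For context, `link_db`, `chinese2vector`, and `tfidf` come from the project's private `machine_models.tools` module, which is not shown. Below is a minimal sketch of what those helpers plausibly look like, inferred only from their call sites in the script above: connecting via pymongo, segmenting with jieba, and fitting scikit-learn's `TfidfVectorizer` are all assumptions, and the real implementations may differ.

```python
# Hypothetical reconstructions of the machine_models.tools helpers, inferred
# solely from how the script calls them; the real module may differ.
import jieba
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer


def link_db(config):
    # Connect to MongoDB and return both the client and the target collection.
    client = MongoClient("mongodb://" + config["host"])
    col = client[config["db"]][config["col"]]
    return client, col


def chinese2vector(text, remove_word=None, stopwords=None):
    # Segment Chinese text with jieba, drop stop words and unwanted tokens,
    # and return one space-joined string suitable for TfidfVectorizer.
    remove_word = set(remove_word or [])
    stopwords = set(stopwords or [])
    tokens = [
        t for t in jieba.lcut(text)
        if t.strip() and t not in stopwords and t not in remove_word
    ]
    return " ".join(tokens)


def tfidf(analyzer="word", space_words=None):
    # Fit a TF-IDF vectorizer on the pre-segmented corpus and return the
    # fitted vectorizer together with the document-term matrix.
    vec = TfidfVectorizer(analyzer=analyzer)
    matrix = vec.fit_transform(space_words or [])
    return vec, matrix
```

Assuming `tfidf_vec` is a fitted `TfidfVectorizer` as sketched here, the dumped file can later be reloaded with `vec = joblib.load("docs/model/dictionary")` and applied to newly segmented text via `vec.transform([...])` without refitting on the corpus.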