create_dict.py

# coding:utf-8
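# Build a TF-IDF "dictionary" (a fitted vectorizer) from pre-segmented bidding
# texts stored in MongoDB, plus one local target document, and persist it with joblib.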
from machine_models.tools import link_db
from machine_models.tools import chinese2vector
from machine_models.tools import tfidf
import joblib

if __name__ == '__main__':
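    # MongoDB connection settings (database, collection, host) consumed by link_db.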
    m_config = {
        "db": "re4art",
        "col": "bidding_china_4_9",
        "host": "192.168.3.207:27092",
    }
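    # Load stopwords, one per line.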
    with open("stopwords.txt", "r") as f:
        stop_words = [word.strip() for word in f.readlines()]
    client, col = link_db(m_config)
    corpus = []
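    # Seed the corpus with the segmented local target document.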
    with open("./target.csv", "r") as f:
        read_data = f.read()
    read_data = read_data.replace("\n", " ")
    other = chinese2vector(read_data, remove_word=["x"], stopwords=stop_words)
    print(other)
    corpus.append(other)
    count = 0
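    # Append each document's pre-segmented text (cut_detail) to the corpus.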
    for row in col.find({}).sort("_id", 1):
        # detail = row.get("detail", "")
        # title = row.get("title", "")
        count += 1
        print(count)
        # corpus = chinese2vector(title + detail.lower(), remove_word=["x", "m"], stopwords=stop_words)
        # col.update_one({"_id": row["_id"]}, {"$set": {"cut_detail": corpus}})
        cut_detail = row.get("cut_detail", "")
        corpus.append(cut_detail)
        # if len(contents) > 10000:
        #     cut_ret = chinese2vectors(contents, remove_word=["x"], stop_words=stop_words)
        #     corpus.extend(cut_ret)
        #     contents = []
    # if contents:
    #     cut_ret = chinese2vectors(contents, remove_word=["x"], stop_words=stop_words)
    #     corpus.extend(cut_ret)
    #     contents = []
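    # Fit the TF-IDF vectorizer on the full corpus and persist it as the "dictionary".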
    tfidf_vec, tfidf_ret = tfidf(analyzer="word", space_words=corpus)
    joblib.dump(tfidf_vec, "docs/model/dictionary")
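    # Reuse sketch: assuming the project's tfidf helper returns a
    # scikit-learn-style TfidfVectorizer (an assumption, not confirmed here),
    # the saved model can be reloaded and applied to new pre-segmented text:
    #     vec = joblib.load("docs/model/dictionary")
    #     matrix = vec.transform(["token1 token2 token3"])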