dictionary.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Corpus helper: dictionary construction and vectorization utilities.
'''
from typing import List
from collections import defaultdict
import jieba.posseg as psg


class Dictionary():
    def __init__(self, stopwords: List[str]):
        self.word_ids = defaultdict(int)   # word -> occurrence count
        self.word_docs = defaultdict(int)  # word -> number of documents containing it
        self.corpus_size = 0
        # dict used as a set for O(1) stopword lookups
        self.stopwords = dict(zip(stopwords, [0] * len(stopwords)))
        self.dictionary = None
        self.dictionary_words = None

    # Tokenize, dropping stopwords as well as numerals ("m") and non-word tokens ("x")
    def cut(self, text):
        return [w for w, x in psg.cut(text) if w not in self.stopwords and x not in ["m", "x"]]
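
    # Illustrative sketch of cut() (the exact segmentation depends on jieba's model):
    #   Dictionary(stopwords=['的']).cut('今天的天气不错')  ->  ['今天', '天气', '不错']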

    # Add texts to the vocabulary; supports incremental updates
    def append_vocab(self, text: List[str], need_cut=True):
        for c in text:
            cut_text = self.cut(c) if need_cut else c.split(' ')
            for w in cut_text:
                self.word_ids[w] += 1
            for w in set(cut_text):
                self.word_docs[w] += 1
            self.corpus_size += 1

    # Build the dictionary: assign each surviving word an id and a TF-IDF weight
    def build_dictionary(self, tfidf_limit=1e-4, vocab_file='./data/vocab'):
        import math, joblib
        word_size = len(self.word_ids)
        # Term frequency, normalized by vocabulary size
        word_tf = dict([(w, c / word_size) for w, c in self.word_ids.items()])
        # Inverse document frequency, with +1 smoothing on the document count
        word_idf = dict([(w, math.log(self.corpus_size / (c + 1))) for w, c in self.word_docs.items()])
        # word -> (id, tf-idf); ids start at 1 so that 0 can serve as padding
        word_tfidf = [(w, (pos + 1, c * word_idf[w])) for pos, (w, c) in enumerate(word_tf.items())]
        word_tfidf = sorted(word_tfidf, key=lambda x: x[1][1], reverse=True)
        b_size = len(word_tfidf)
        # Drop entries whose TF-IDF is at or below the threshold
        print('Discarded words:', list(filter(lambda x: x[1][1] <= tfidf_limit, word_tfidf)))
        word_tfidf = list(filter(lambda x: x[1][1] > tfidf_limit, word_tfidf))
        a_size = len(word_tfidf)
        print('Number of discarded words:', b_size - a_size)
        self.dictionary = dict(word_tfidf)
        self.dictionary[''] = (0, 0)  # padding entry
        self.dictionary_words = list(self.dictionary.keys())
        joblib.dump(self.dictionary, vocab_file)
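
    # Minimal usage sketch (texts and path are illustrative; a negative tfidf_limit
    # keeps every word, since words occurring in all documents get a slightly
    # negative smoothed idf):
    #   d = Dictionary(stopwords=[])
    #   d.append_vocab(['今天 天气 不错', '明天 下雨', '天气 预报'], need_cut=False)
    #   d.build_dictionary(tfidf_limit=-1.0, vocab_file='/tmp/vocab')
    #   d.dictionary['天气']  ->  (id, tfidf)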

    # Quick build: stream documents from MongoDB, then build the dictionary
    def quick_build_by_mongo(self, host: str, port: int, db: str, col: str, fields: List[str],
                             need_cut=False, maxsize=1000, tfidf_limit=1e-3, vocab_file='./data/vocab'):
        import sys
        mi = MyIterator(host=host, port=port, db=db, col=col, fields=fields, maxsize=maxsize)
        for text in mi:
            self.append_vocab(text, need_cut=need_cut)
            print('>', end='')  # progress indicator
            sys.stdout.flush()
        self.build_dictionary(tfidf_limit=tfidf_limit, vocab_file=vocab_file)

    # Load a previously built dictionary from disk
    def load_dictionary(self, vocab_file='./data/vocab'):
        import joblib
        self.dictionary = joblib.load(vocab_file)
        self.dictionary_words = list(self.dictionary.keys())

    # Vectorize a corpus into fixed-width arrays
    def vector_corpus(self, corpus: List[str], dim=100, need_cut=True, return_type='one_hot', use_tfidf=False):
        import numpy as np
        # With use_tfidf each position holds an (id, tfidf) pair, otherwise a bare word id;
        # use_tfidf is only meaningful with return_type='bow'
        ret = np.zeros((len(corpus), dim, 2), dtype=np.float64) if use_tfidf else np.zeros((len(corpus), dim), dtype=np.int64)
        for i, c in enumerate(corpus):
            try:
                text_cut = self.cut(c) if need_cut else c.split(' ')
                text_cut_dict = dict(zip(text_cut, [0] * len(text_cut)))
                if return_type == 'one_hot':
                    # For each of the first dim dictionary words, emit its id if present in the text
                    vect = [self.dictionary[w][0] if w in text_cut_dict else 0 for w in self.dictionary_words[:dim]]
                    ret[i, :len(vect)] = vect
                else:  # bag of words
                    if use_tfidf:
                        vect = [self.dictionary[w] if w in self.dictionary else (0, 0) for w in text_cut]
                    else:
                        vect = [self.dictionary[w][0] if w in self.dictionary else 0 for w in text_cut]
                    # Truncate anything beyond dim
                    seg_len = len(vect) if len(vect) < dim else dim
                    ret[i, :seg_len] = vect[:seg_len]
            except Exception as e:
                print('Vectorization failed for item %d: %s' % (i, e))
        return ret
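
# Usage sketch (assumes a dictionary was built or loaded; ids depend on the vocabulary):
#   d.load_dictionary(vocab_file='./data/vocab')
#   ids = d.vector_corpus(['今天 天气 不错'], dim=10, need_cut=False, return_type='bow')
#   ids.shape  ->  (1, 10): word ids, zero-padded or truncated to dim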

'''
Iterator that streams batches of documents from a MongoDB collection,
used to build the dictionary quickly from database contents.
'''
from pymongo import MongoClient, ASCENDING
from bson import ObjectId


class MyIterator(object):
    def __init__(self, host: str, port: int, db: str, col: str, fields: List[str], maxsize=1000):
        self.col = MongoClient(host=host, port=port)[db][col]
        self.fields = fields
        self.fields_filter = dict(zip(self.fields, [1] * len(self.fields)))
        self.maxid = ObjectId('0' * 24)  # resume point: smallest possible ObjectId
        self.size = 0
        self.maxsize = maxsize

    # Return the iterator itself
    def __iter__(self):
        return self

    def __next__(self):
        hasdata = False
        ret = []
        # Page through the collection in _id order, resuming after the last seen id
        for row in self.col.find({'_id': {'$gt': self.maxid}}, self.fields_filter) \
                           .sort('_id', ASCENDING) \
                           .limit(100):
            self.size += 1
            if self.size >= self.maxsize:
                break
            hasdata = True
            self.maxid = row['_id']
            ret.append(' '.join([row[f] for f in self.fields]))
        if hasdata:
            return ret
        raise StopIteration
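
# Minimal usage sketch (connection details are placeholders):
#   it = MyIterator(host='localhost', port=27017, db='mydb', col='docs',
#                   fields=['title'], maxsize=500)
#   for batch in it:
#       print(len(batch))  # up to 100 space-joined documents per batch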

if __name__ == '__main__':
    dic = Dictionary(stopwords=[])
    dic.quick_build_by_mongo(host='192.168.3.207', port=27092,
                             db='re4art', col='bidding_china_4_9',
                             fields=['title', 'detail'], need_cut=True,
                             maxsize=50000,
                             tfidf_limit=1e-5,
                             vocab_file='vocab')
    dic.load_dictionary(vocab_file='vocab')
    print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], use_tfidf=True, return_type='bow', dim=20))
    # print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], use_tfidf=False, return_type='bow', dim=20))
    # print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], return_type='one_hot', dim=20))