#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Corpus helper utilities: dictionary building and vectorization.
'''
from typing import List
from collections import defaultdict

import jieba.posseg as psg


class Dictionary():
    def __init__(self, stopwords: List[str]):
        self.word_ids = defaultdict(int)   # term frequency per word
        self.word_docs = defaultdict(int)  # document frequency per word
        self.corpus_size = 0               # number of documents seen
        self.stopwords = set(stopwords)
        self.dictionary = None
        self.dictionary_words = None

    # Tokenize, dropping stopwords, numerals ("m") and non-word tokens ("x")
    def cut(self, text):
        return [w for w, x in psg.cut(text)
                if w not in self.stopwords and x not in ["m", "x"]]

    # Add documents to the vocabulary; supports incremental updates
    def append_vocab(self, text: List[str], need_cut=True):
        for c in text:
            cut_text = self.cut(c) if need_cut else c.split(' ')
            for w in cut_text:
                self.word_ids[w] += 1      # count every occurrence (term frequency)
            for w in set(cut_text):
                self.word_docs[w] += 1     # count each document once
            self.corpus_size += 1

    # Build the dictionary, discarding words whose tf-idf is at or below tfidf_limit
    def build_dictionary(self, tfidf_limit=1e-4, vocab_file='./data/vocab'):
        import math
        import joblib
        word_size = len(self.word_ids)
        word_tf = dict([(w, c / word_size) for w, c in self.word_ids.items()])
        word_idf = dict([(w, math.log(self.corpus_size / (c + 1)))
                         for w, c in self.word_docs.items()])
        # Each entry maps word -> (index, tf-idf); index 0 is reserved for padding
        word_tfidf = [(w, (pos + 1, c * word_idf[w]))
                      for pos, (w, c) in enumerate(word_tf.items())]
        word_tfidf = sorted(word_tfidf, key=lambda x: x[1][1], reverse=True)
        b_size = len(word_tfidf)
        print('discarded words:', list(filter(lambda x: x[1][1] <= tfidf_limit, word_tfidf)))
        word_tfidf = list(filter(lambda x: x[1][1] > tfidf_limit, word_tfidf))
        a_size = len(word_tfidf)
        print('discarded word count:', b_size - a_size)
        self.dictionary = dict(word_tfidf)
        self.dictionary[''] = (0, 0)
        self.dictionary_words = list(self.dictionary.keys())
        joblib.dump(self.dictionary, vocab_file)

    # Quick build: stream documents from MongoDB, then build the dictionary
    def quick_build_by_mongo(self, host: str, port: int, db: str, col: str,
                             fields: List[str], need_cut=False, maxsize=1000,
                             tfidf_limit=1e-3, vocab_file=''):
        import sys
        mi = MyIterator(host=host, port=port, db=db, col=col,
                        fields=fields, maxsize=maxsize)
        for text in mi:
            self.append_vocab(text, need_cut=need_cut)
            print('>', end='')
            sys.stdout.flush()
        self.build_dictionary(tfidf_limit=tfidf_limit, vocab_file=vocab_file)

    # Load a previously built dictionary
    def load_dictionary(self, vocab_file='./data/vocab'):
        import joblib
        self.dictionary = joblib.load(vocab_file)
        self.dictionary_words = list(self.dictionary.keys())

    # Vectorize a corpus as one-hot or bag-of-words, optionally with tf-idf weights
    def vector_corpus(self, corpus: List[str], dim=100, need_cut=True,
                      return_type='one_hot', use_tfidf=False):
        import numpy as np
        ret = (np.zeros((len(corpus), dim, 2), dtype=np.float64) if use_tfidf
               else np.zeros((len(corpus), dim), dtype=np.int64))
        for i, c in enumerate(corpus):
            try:
                text_cut = self.cut(c) if need_cut else c.split(' ')
                text_cut_dict = dict(zip(text_cut, [0] * len(text_cut)))
                if return_type == 'one_hot':
                    vect = [self.dictionary[w][0] if w in text_cut_dict else 0
                            for w in self.dictionary_words[:dim]]
                    ret[i, :len(vect)] = vect
                else:  # bow
                    if use_tfidf:
                        vect = [self.dictionary[w] if w in self.dictionary else (0, 0)
                                for w in text_cut]
                    else:
                        vect = [self.dictionary[w][0] if w in self.dictionary else 0
                                for w in text_cut]
                    # Anything beyond dim is truncated automatically
                    seg_len = len(vect) if len(vect) < dim else dim
                    ret[i, :seg_len] = vect[:seg_len]
            except Exception as e:  # report per-document failures, keep going
                print(e)
        return ret


# Batched iterator over a MongoDB collection. Parts of this class were lost in
# the source: __init__, __iter__ and the paging query (ascending _id, with
# batch_size as a hypothetical parameter) are reconstructed from the surviving
# fragments and the call site in quick_build_by_mongo.
class MyIterator():
    def __init__(self, host: str, port: int, db: str, col: str,
                 fields: List[str], maxsize=1000, batch_size=1000):
        from pymongo import MongoClient
        self.col = MongoClient(host=host, port=port)[db][col]
        self.fields = fields
        self.maxsize = maxsize
        self.batch_size = batch_size
        self.count = 0
        self.maxid = None

    def __iter__(self):
        return self

    def __next__(self):
        # Page forward from the last _id seen (assumed paging scheme)
        query = {} if self.maxid is None else {'_id': {'$gt': self.maxid}}
        ret = []
        hasdata = False
        for row in self.col.find(query).sort('_id', 1).limit(self.batch_size):
            if self.count >= self.maxsize:
                break
            self.count += 1
            hasdata = True
            self.maxid = row['_id']
            ret.append(' '.join([row[f] for f in self.fields]))
        if hasdata:
            return ret
        else:
            raise StopIteration


if __name__ == '__main__':
    dic = Dictionary(stopwords=[])
    dic.quick_build_by_mongo(host='192.168.3.207', port=27092,
                             db='re4art', col='bidding_china_4_9',
                             fields=['title', 'detail'], need_cut=True,
                             maxsize=50000,
                             tfidf_limit=1e-5,
                             vocab_file='vocab')
    dic.load_dictionary(vocab_file='vocab')
    print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'],
                            use_tfidf=True, return_type='bow', dim=20))
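    # The call above returns an array of shape (1, 20, 2): for each of the
    # first 20 tokens of the document, a (dictionary index, tf-idf weight)
    # pair; out-of-vocabulary tokens map to (0, 0) and short documents are
    # zero-padded.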
    print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'],
                            use_tfidf=False, return_type='bow', dim=20))
    # print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'],
    #                         return_type='one_hot', dim=20))
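
# A minimal offline usage sketch (illustrative only, not part of the original
# pipeline): it exercises append_vocab / build_dictionary / vector_corpus
# without a MongoDB instance. `demo_offline_build`, `sample_docs` and the
# 'vocab_demo' path are hypothetical names chosen for this example; invoke the
# function manually, e.g. from a REPL.
def demo_offline_build():
    sample_docs = ['大连市竞争性谈判公告', '政府采购项目招标公告', '项目 采购 谈判']
    dic = Dictionary(stopwords=['的'])
    dic.append_vocab(sample_docs, need_cut=True)
    dic.build_dictionary(tfidf_limit=0, vocab_file='vocab_demo')
    # Bag-of-words indices for a new document, zero-padded/truncated to dim=10
    print(dic.vector_corpus(corpus=['大连市 采购 公告'], need_cut=True,
                            return_type='bow', dim=10))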