#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Corpus helpers: vocabulary/dictionary construction and vectorization.
'''
from typing import List
from collections import defaultdict
import jieba.posseg as psg
class Dictionary:
    def __init__(self, stopwords: List[str]):
        self.word_ids = defaultdict(int)   # word -> total occurrence count
        self.word_docs = defaultdict(int)  # word -> number of documents containing it
        self.corpus_size = 0               # number of documents seen so far
        self.stopwords = set(stopwords)    # a set gives O(1) membership tests
        self.dictionary = None             # word -> (id, tfidf), filled by build/load
        self.dictionary_words = None       # dictionary keys, in id order
    # Tokenize, dropping stopwords, numerals (POS 'm') and non-words (POS 'x')
    def cut(self, text):
        return [w for w, x in psg.cut(text) if w not in self.stopwords and x not in ('m', 'x')]
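    # A sketch of what `cut` drops, assuming jieba's default dictionary
    # (exact segmentation varies across jieba versions):
    #   Dictionary(stopwords=['的']).cut('采购3台服务器的公告')
    #   -> ['采购', '台', '服务器', '公告']  # '3' dropped as a numeral, '的' as a stopword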
    # Add documents to the vocabulary counters; may be called repeatedly to update
    def append_vocab(self, text: List[str], need_cut=True):
        for c in text:
            cut_text = self.cut(c) if need_cut else c.split(' ')
            for w in cut_text:
                self.word_ids[w] += 1   # total occurrences (used as tf below)
            for w in set(cut_text):
                self.word_docs[w] += 1  # document frequency (used as idf below)
            self.corpus_size += 1
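    # A minimal sketch of the counters after two pre-tokenized documents
    # (hypothetical tokens, need_cut=False):
    #   d = Dictionary(stopwords=[])
    #   d.append_vocab(['a b a', 'a c'], need_cut=False)
    #   d.word_ids  -> {'a': 3, 'b': 1, 'c': 1}
    #   d.word_docs -> {'a': 2, 'b': 1, 'c': 1}
    #   d.corpus_size -> 2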
    # Build the word -> (id, tfidf) dictionary and persist it with joblib
    def build_dictionary(self, tfidf_limit=1e-4, vocab_file='./data/vocab'):
        import math
        import joblib
        word_size = len(self.word_ids)
        word_tf = {w: c / word_size for w, c in self.word_ids.items()}
        word_idf = {w: math.log(self.corpus_size / (c + 1)) for w, c in self.word_docs.items()}
        # ids start at 1; 0 is reserved for padding / unknown words
        word_tfidf = [(w, (pos + 1, c * word_idf[w])) for pos, (w, c) in enumerate(word_tf.items())]
        word_tfidf = sorted(word_tfidf, key=lambda x: x[1][1], reverse=True)
        b_size = len(word_tfidf)
        # Drop words whose tf-idf does not exceed tfidf_limit
        print('discarded words', [x for x in word_tfidf if x[1][1] <= tfidf_limit])
        word_tfidf = [x for x in word_tfidf if x[1][1] > tfidf_limit]
        a_size = len(word_tfidf)
        print('number of discarded words:', b_size - a_size)
        self.dictionary = dict(word_tfidf)
        self.dictionary[''] = (0, 0)
        self.dictionary_words = list(self.dictionary.keys())
        joblib.dump(self.dictionary, vocab_file)
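    # Worked numbers for the counters sketched above (vocab {'a', 'b', 'c'},
    # word_size=3, corpus_size=2):
    #   tf('a')  = 3/3 = 1.0,  tf('b') = tf('c') = 1/3
    #   idf('a') = log(2/(2+1)) ≈ -0.405  # negative: 'a' occurs in every document
    #   idf('b') = log(2/(1+1)) = 0.0
    # With the +1 smoothing, words present in every document get non-positive
    # tf-idf and are discarded by any tfidf_limit >= 0.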
    # One-shot build: stream documents from MongoDB, then build the dictionary
    def quick_build_by_mongo(self, host: str, port: int, db: str, col: str, fields: List[str],
                             need_cut=False, maxsize=1000, tfidf_limit=1e-3, vocab_file='./data/vocab'):
        import sys
        mi = MyIterator(host=host, port=port, db=db, col=col, fields=fields, maxsize=maxsize)
        for text in mi:
            self.append_vocab(text, need_cut=need_cut)
            print('>', end='')  # progress indicator
            sys.stdout.flush()
        self.build_dictionary(tfidf_limit=tfidf_limit, vocab_file=vocab_file)
    # Load a previously built dictionary from disk
    def load_dictionary(self, vocab_file='./data/vocab'):
        import joblib
        self.dictionary = joblib.load(vocab_file)
        self.dictionary_words = list(self.dictionary.keys())
    # Vectorize a corpus into fixed-width arrays
    def vector_corpus(self, corpus: List[str], dim=100, need_cut=True, return_type='one_hot', use_tfidf=False):
        import numpy as np
        # np.float and np.long were removed from NumPy; use explicit dtypes
        ret = np.zeros((len(corpus), dim, 2), dtype=np.float32) if use_tfidf \
            else np.zeros((len(corpus), dim), dtype=np.int64)
        for i, c in enumerate(corpus):
            try:
                text_cut = self.cut(c) if need_cut else c.split(' ')
                text_cut_set = set(text_cut)
                if return_type == 'one_hot':
                    # one slot per dictionary word: the word's id if it occurs in the text, else 0
                    vect = [self.dictionary[w][0] if w in text_cut_set else 0
                            for w in self.dictionary_words[:dim]]
                    ret[i, :len(vect)] = vect
                else:  # bag of words: one slot per token, in text order
                    if use_tfidf:
                        vect = [self.dictionary.get(w, (0, 0)) for w in text_cut]
                    else:
                        vect = [self.dictionary[w][0] if w in self.dictionary else 0 for w in text_cut]
                    # anything beyond dim is truncated
                    seg_len = min(len(vect), dim)
                    ret[i, :seg_len] = vect[:seg_len]
            except Exception as e:
                print('vectorization failed for document %d: %s' % (i, e))
        return ret
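# A sketch of the three output formats of vector_corpus, assuming a built
# dictionary mapping 'a' -> (1, ...) and 'b' -> (2, ...):
#   d.vector_corpus(['b a'], dim=4, need_cut=False, return_type='bow')
#       -> [[2, 1, 0, 0]]   # word ids in text order, zero-padded
#   d.vector_corpus(['b a'], dim=4, need_cut=False, return_type='bow', use_tfidf=True)
#       -> shape (1, 4, 2)  # one (id, tfidf) pair per token
#   d.vector_corpus(['b a'], dim=4, need_cut=False, return_type='one_hot')
#       -> [[1, 2, 0, 0]]   # one slot per dictionary word, id if present
# Note that 'one_hot' assumes use_tfidf=False: the (dim, 2) array does not broadcast.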
'''
Fast dictionary construction from a MongoDB collection.
'''
from pymongo import MongoClient, ASCENDING
from bson import ObjectId
class MyIterator(object):
    def __init__(self, host: str, port: int, db: str, col: str, fields: List[str], maxsize=1000):
        self.col = MongoClient(host=host, port=port)[db][col]
        self.fields = fields
        self.fields_filter = dict(zip(self.fields, [1] * len(self.fields)))  # projection
        self.maxid = ObjectId('0' * 24)  # resume point: smallest possible ObjectId
        self.size = 0                    # documents consumed so far
        self.maxsize = maxsize           # hard cap on documents to read
    # return the iterator itself
    def __iter__(self):
        return self
    # fetch the next batch of up to 100 documents, paged on _id
    def __next__(self):
        hasdata = False
        ret = []
        for row in self.col.find({'_id': {'$gt': self.maxid}}, self.fields_filter) \
                           .sort('_id', ASCENDING).limit(100):
            if self.size >= self.maxsize:  # check before consuming, so exactly maxsize docs are read
                break
            self.size += 1
            hasdata = True
            self.maxid = row['_id']
            ret.append(' '.join([row[f] for f in self.fields]))
        if hasdata:
            return ret
        raise StopIteration
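# Standalone usage sketch (hypothetical connection details): each batch is a
# list of space-joined field values, paged by _id so no document is read twice.
#   for batch in MyIterator(host='localhost', port=27017, db='corpus',
#                           col='docs', fields=['title'], maxsize=1000):
#       print(len(batch))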
if __name__ == '__main__':
    dic = Dictionary(stopwords=[])
    dic.quick_build_by_mongo(host='192.168.3.207', port=27092,
                             db='re4art', col='bidding_china_4_9',
                             fields=['title', 'detail'], need_cut=True,
                             maxsize=50000,
                             tfidf_limit=1e-5,
                             vocab_file='vocab')
    dic.load_dictionary(vocab_file='vocab')
    print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], use_tfidf=True, return_type='bow', dim=20))
    # print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], use_tfidf=False, return_type='bow', dim=20))
    # print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], return_type='one_hot', dim=20))