#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Corpus helpers: vocabulary/dictionary construction and vectorization.
'''
from typing import List
from collections import defaultdict
import jieba.posseg as psg
class Dictionary:
    def __init__(self, stopwords: List[str]):
        self.word_ids = defaultdict(int)   # word -> total occurrence count
        self.word_docs = defaultdict(int)  # word -> number of documents containing it
        self.corpus_size = 0               # number of documents seen so far
        self.stopwords = set(stopwords)    # a set gives O(1) membership tests
        self.dictionary = None             # word -> (id, tfidf), filled by build/load
        self.dictionary_words = None       # dictionary keys, in id order
    # Tokenize, dropping stopwords, numerals (POS 'm') and non-words (POS 'x')
    def cut(self, text):
        return [w for w, x in psg.cut(text) if w not in self.stopwords and x not in ('m', 'x')]
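    # A sketch of what `cut` drops, assuming jieba's default dictionary
    # (exact segmentation varies across jieba versions):
    #   Dictionary(stopwords=['的']).cut('采购3台服务器的公告')
    #   -> ['采购', '台', '服务器', '公告']  # '3' dropped as a numeral, '的' as a stopword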
    # Add documents to the vocabulary counters; may be called repeatedly to update
    def append_vocab(self, text: List[str], need_cut=True):
        for c in text:
            cut_text = self.cut(c) if need_cut else c.split(' ')
            for w in cut_text:
                self.word_ids[w] += 1   # total occurrences (used as tf below)
            for w in set(cut_text):
                self.word_docs[w] += 1  # document frequency (used as idf below)
            self.corpus_size += 1
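    # A minimal sketch of the counters after two pre-tokenized documents
    # (hypothetical tokens, need_cut=False):
    #   d = Dictionary(stopwords=[])
    #   d.append_vocab(['a b a', 'a c'], need_cut=False)
    #   d.word_ids  -> {'a': 3, 'b': 1, 'c': 1}
    #   d.word_docs -> {'a': 2, 'b': 1, 'c': 1}
    #   d.corpus_size -> 2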
    # Build the word -> (id, tfidf) dictionary and persist it with joblib
    def build_dictionary(self, tfidf_limit=1e-4, vocab_file='./data/vocab'):
        import math
        import joblib
        word_size = len(self.word_ids)
        word_tf = {w: c / word_size for w, c in self.word_ids.items()}
        word_idf = {w: math.log(self.corpus_size / (c + 1)) for w, c in self.word_docs.items()}
        # ids start at 1; 0 is reserved for padding / unknown words
        word_tfidf = [(w, (pos + 1, c * word_idf[w])) for pos, (w, c) in enumerate(word_tf.items())]
        word_tfidf = sorted(word_tfidf, key=lambda x: x[1][1], reverse=True)
        b_size = len(word_tfidf)
        # Drop words whose tf-idf does not exceed tfidf_limit
        print('discarded words', [x for x in word_tfidf if x[1][1] <= tfidf_limit])
        word_tfidf = [x for x in word_tfidf if x[1][1] > tfidf_limit]
        a_size = len(word_tfidf)
        print('number of discarded words:', b_size - a_size)
        self.dictionary = dict(word_tfidf)
        self.dictionary[''] = (0, 0)
        self.dictionary_words = list(self.dictionary.keys())
        joblib.dump(self.dictionary, vocab_file)
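    # Worked numbers for the counters sketched above (vocab {'a', 'b', 'c'},
    # word_size=3, corpus_size=2):
    #   tf('a')  = 3/3 = 1.0,  tf('b') = tf('c') = 1/3
    #   idf('a') = log(2/(2+1)) ≈ -0.405  # negative: 'a' occurs in every document
    #   idf('b') = log(2/(1+1)) = 0.0
    # With the +1 smoothing, words present in every document get non-positive
    # tf-idf and are discarded by any tfidf_limit >= 0.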
    # One-shot build: stream documents from MongoDB, then build the dictionary
    def quick_build_by_mongo(self, host: str, port: int, db: str, col: str, fields: List[str],
                             need_cut=False, maxsize=1000, tfidf_limit=1e-3, vocab_file='./data/vocab'):
        import sys
        mi = MyIterator(host=host, port=port, db=db, col=col, fields=fields, maxsize=maxsize)
        for text in mi:
            self.append_vocab(text, need_cut=need_cut)
            print('>', end='')  # progress indicator
            sys.stdout.flush()
        self.build_dictionary(tfidf_limit=tfidf_limit, vocab_file=vocab_file)
    # Load a previously built dictionary from disk
    def load_dictionary(self, vocab_file='./data/vocab'):
        import joblib
        self.dictionary = joblib.load(vocab_file)
        self.dictionary_words = list(self.dictionary.keys())
    # Vectorize a corpus into fixed-width arrays
    def vector_corpus(self, corpus: List[str], dim=100, need_cut=True, return_type='one_hot', use_tfidf=False):
        import numpy as np
        # np.float and np.long were removed from NumPy; use explicit dtypes
        ret = np.zeros((len(corpus), dim, 2), dtype=np.float32) if use_tfidf \
            else np.zeros((len(corpus), dim), dtype=np.int64)
        for i, c in enumerate(corpus):
            try:
                text_cut = self.cut(c) if need_cut else c.split(' ')
                text_cut_set = set(text_cut)
                if return_type == 'one_hot':
                    # one slot per dictionary word: the word's id if it occurs in the text, else 0
                    vect = [self.dictionary[w][0] if w in text_cut_set else 0
                            for w in self.dictionary_words[:dim]]
                    ret[i, :len(vect)] = vect
                else:  # bag of words: one slot per token, in text order
                    if use_tfidf:
                        vect = [self.dictionary.get(w, (0, 0)) for w in text_cut]
                    else:
                        vect = [self.dictionary[w][0] if w in self.dictionary else 0 for w in text_cut]
                    # anything beyond dim is truncated
                    seg_len = min(len(vect), dim)
                    ret[i, :seg_len] = vect[:seg_len]
            except Exception as e:
                print('vectorization failed for document %d: %s' % (i, e))
        return ret
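# A sketch of the three output formats of vector_corpus, assuming a built
# dictionary mapping 'a' -> (1, ...) and 'b' -> (2, ...):
#   d.vector_corpus(['b a'], dim=4, need_cut=False, return_type='bow')
#       -> [[2, 1, 0, 0]]   # word ids in text order, zero-padded
#   d.vector_corpus(['b a'], dim=4, need_cut=False, return_type='bow', use_tfidf=True)
#       -> shape (1, 4, 2)  # one (id, tfidf) pair per token
#   d.vector_corpus(['b a'], dim=4, need_cut=False, return_type='one_hot')
#       -> [[1, 2, 0, 0]]   # one slot per dictionary word, id if present
# Note that 'one_hot' assumes use_tfidf=False: the (dim, 2) array does not broadcast.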
'''
Fast dictionary construction from a MongoDB collection.
'''
from pymongo import MongoClient, ASCENDING
from bson import ObjectId
class MyIterator(object):
    def __init__(self, host: str, port: int, db: str, col: str, fields: List[str], maxsize=1000):
        self.col = MongoClient(host=host, port=port)[db][col]
        self.fields = fields
        self.fields_filter = dict(zip(self.fields, [1] * len(self.fields)))  # projection
        self.maxid = ObjectId('0' * 24)  # resume point: smallest possible ObjectId
        self.size = 0                    # documents consumed so far
        self.maxsize = maxsize           # hard cap on documents to read
    # return the iterator itself
    def __iter__(self):
        return self
    # fetch the next batch of up to 100 documents, paged on _id
    def __next__(self):
        hasdata = False
        ret = []
        for row in self.col.find({'_id': {'$gt': self.maxid}}, self.fields_filter) \
                           .sort('_id', ASCENDING).limit(100):
            if self.size >= self.maxsize:  # check before consuming, so exactly maxsize docs are read
                break
            self.size += 1
            hasdata = True
            self.maxid = row['_id']
            ret.append(' '.join([row[f] for f in self.fields]))
        if hasdata:
            return ret
        raise StopIteration
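# Standalone usage sketch (hypothetical connection details): each batch is a
# list of space-joined field values, paged by _id so no document is read twice.
#   for batch in MyIterator(host='localhost', port=27017, db='corpus',
#                           col='docs', fields=['title'], maxsize=1000):
#       print(len(batch))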
if __name__ == '__main__':
    dic = Dictionary(stopwords=[])
    dic.quick_build_by_mongo(host='192.168.3.207', port=27092,
                             db='re4art', col='bidding_china_4_9',
                             fields=['title', 'detail'], need_cut=True,
                             maxsize=50000,
                             tfidf_limit=1e-5,
                             vocab_file='vocab')
    dic.load_dictionary(vocab_file='vocab')
    print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], use_tfidf=True, return_type='bow', dim=20))
    # print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], use_tfidf=False, return_type='bow', dim=20))
    # print(dic.vector_corpus(corpus=['大连市竞争性谈判以下展示了使用 filter 函数的实例:'], return_type='one_hot', dim=20))