123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120 |
- # coding:utf-8
- import jieba.posseg as psg
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.preprocessing import LabelEncoder, OneHotEncoder
- from sklearn.preprocessing import MultiLabelBinarizer
- from util.htmlutil.htmltag import Clean
- import jieba
- import multiprocessing
- jieba.enable_parallel(multiprocessing.cpu_count())
def chinese2vectors(chinese: list, remove_word: list, stop_words: list) -> list:
    """
    Tokenize multiple Chinese texts into space-joined word strings.

    :param chinese: list of raw text strings
    :param remove_word: POS flags to drop (e.g. "x", "n", "eng"); defaults to ["x"]
    :param stop_words: stop words to drop; defaults to none
    :return: list of space-separated token strings, one per input text
    """
    if not remove_word:
        remove_word = ["x"]
    # Sets give O(1) membership tests in the per-word filter below;
    # the original did an O(n) list scan for every token.
    remove_set = set(remove_word)
    stop_set = set(stop_words) if stop_words else set()
    space_words = []
    for row in chinese:
        # Clean() strips HTML markup; psg.lcut yields (word, POS-flag) pairs.
        cut_ret = [word for word, flag in psg.lcut(Clean(row))
                   if flag not in remove_set and word not in stop_set]
        space_words.append(" ".join(cut_ret))
    return space_words
def chinese2vector(chinese: str, remove_word: list, stopwords: list) -> str:
    """
    Tokenize a single Chinese text into one space-joined word string.

    :param chinese: raw text string
    :param remove_word: POS flags to drop (e.g. "x", "n", "eng"); defaults to ["x"]
    :param stopwords: stop words to drop; defaults to none
    :return: space-separated token string
    """
    if not remove_word:
        remove_word = ["x"]
    # Sets give O(1) membership tests in the filter below;
    # the original scanned both lists for every token.
    remove_set = set(remove_word)
    stop_set = set(stopwords) if stopwords else set()
    # Clean() strips HTML markup; psg.lcut yields (word, POS-flag) pairs.
    cut_ret = [word for word, flag in psg.lcut(Clean(chinese))
               if flag not in remove_set and word not in stop_set]
    return " ".join(cut_ret)
def tfidf(analyzer, space_words) -> tuple:
    """
    Fit a TF-IDF encoder over pre-tokenized, space-joined documents.

    :param analyzer: analyzer passed straight through to TfidfVectorizer
    :param space_words: iterable of space-separated token strings
    :return: (fitted TfidfVectorizer, sparse TF-IDF matrix)
    """
    vectorizer = TfidfVectorizer(analyzer=analyzer)
    matrix = vectorizer.fit_transform(space_words)
    return vectorizer, matrix
def one2hot(space_words) -> tuple:
    """
    One-hot encode a sequence of token strings (one category per sample).

    :param space_words: sequence of strings, one per sample
    :return: (fitted OneHotEncoder, sparse one-hot matrix)
    """
    oht = OneHotEncoder()
    # Bug fix: OneHotEncoder requires a 2D input of shape (n_samples,
    # n_features); feeding the flat list of strings raises
    # "Expected 2D array" in sklearn. Wrap each sample in its own row.
    rows = [[row] for row in space_words]
    oht_ret = oht.fit_transform(rows)
    return oht, oht_ret
def combine_row(target_one: list, target_two: list) -> list:
    """
    Element-wise concatenate the rows of two equal-length lists of rows.

    Mutates ``target_one`` in place — each of its rows is extended with the
    matching row of ``target_two`` — and returns it.

    :param target_one: list of rows; modified in place
    :param target_two: list of rows to append, same length as ``target_one``
    :return: ``target_one`` with every row extended
    :raises ValueError: when the two lists differ in length
    """
    if len(target_one) != len(target_two):
        raise ValueError("两个列表维度不同")
    # The original wrapped this loop in a try/except that only re-raised
    # the same exception — a no-op, so it has been removed.
    for ind, row in enumerate(target_two):
        target_one[ind] += row
    return target_one
def label2encode(labels: list) -> tuple:
    """
    Fit a LabelEncoder over every label seen and binarize each sample's set.

    :param labels: list of per-sample label lists
    :return: (fitted LabelEncoder,
              binary indicator matrix from MultiLabelBinarizer)
    """
    le = LabelEncoder()
    # Flatten all per-sample labels so the encoder learns the full vocabulary.
    # The original called fit_transform and discarded the transform result;
    # fit() alone is equivalent and avoids the wasted work.
    le.fit([lab for row in labels for lab in row])
    encoded = [le.transform(row) for row in labels]
    le_ret = MultiLabelBinarizer().fit_transform(encoded)
    return le, le_ret
def encode2label(le, predict_results, target_label: list) -> list:
    """
    Convert binary prediction vectors back into comma-joined label strings.

    :param le: fitted label encoder; its ``inverse_transform`` maps indices
               back to label names
    :param predict_results: iterable of 0/1 indicator vectors (one per sample,
                            must support ``.sum()``, e.g. numpy arrays)
    :param target_label: whitelist of label names to keep in the output
    :return: one comma-joined label string per sample ("" when no label fires)
    """
    detail_labels = []
    for row in predict_results:
        # Bug fix: the original left label_str undefined (NameError) when the
        # very first row was all-zero, and appended the previous row's stale
        # value for any later all-zero row. Reset it every iteration.
        label_str = ""
        if row.sum() > 0:
            positive = [i for i, x in enumerate(row) if x > 0]
            label_str = ",".join(
                name for name in le.inverse_transform(positive)
                if name in target_label
            )
        detail_labels.append(label_str)
    return detail_labels
|