# coding:utf-8
import multiprocessing

import jieba
import jieba.posseg as psg
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, OneHotEncoder

from util.htmlutil.htmltag import Clean

# Use all available CPU cores for segmentation (not supported on Windows).
jieba.enable_parallel(multiprocessing.cpu_count())


def chinese2vectors(chinese: list, remove_word: list, stop_words: list) -> list:
    """
    Convert Chinese texts to space-separated token strings (multiple documents).
    :param chinese: list of raw texts
    :param remove_word: POS tags to drop, e.g. x, n, eng
    :param stop_words: stop words to drop
    :return: list of space-joined token strings
    """
    if not remove_word:
        remove_word = ["x"]
    if not stop_words:
        stop_words = []
    space_words = []
    for row in chinese:
        cut_ret = [word for word, pos in psg.lcut(Clean(row))
                   if pos not in remove_word and word not in stop_words]
        space_words.append(" ".join(cut_ret))
    return space_words


def chinese2vector(chinese: str, remove_word: list, stopwords: list) -> str:
    """
    Convert a single Chinese text to a space-separated token string.
    :param chinese: raw text
    :param remove_word: POS tags to drop, e.g. x, n, eng
    :param stopwords: stop words to drop
    :return: space-joined token string
    """
    if not stopwords:
        stopwords = []
    if not remove_word:
        remove_word = ["x"]
    cut_ret = [word for word, pos in psg.lcut(Clean(chinese))
               if pos not in remove_word and word not in stopwords]
    return " ".join(cut_ret)


def tfidf(analyzer, space_words) -> tuple:
    """
    TF-IDF encoding.
    :param analyzer: analyzer passed to TfidfVectorizer, e.g. "word" or "char"
    :param space_words: list of space-joined token strings
    :return: (fitted vectorizer, sparse TF-IDF matrix)
    """
    tfidf_vec = TfidfVectorizer(analyzer=analyzer)
    tfidf_ret = tfidf_vec.fit_transform(space_words)
    return tfidf_vec, tfidf_ret


def one2hot(space_words) -> tuple:
    """
    One-hot encoding.
    :param space_words: list of categorical values, one per sample
    :return: (fitted encoder, sparse one-hot matrix)
    """
    oht = OneHotEncoder()
    # OneHotEncoder requires a 2-D input: wrap each value in its own row.
    oht_ret = oht.fit_transform([[row] for row in space_words])
    return oht, oht_ret


def combine_row(target_one: list, target_two: list) -> list:
    """
    Concatenate two 2-D lists row by row (target_two's columns are appended to target_one's).
    :param target_one: 2-D list, modified in place
    :param target_two: 2-D list with the same number of rows
    :return: target_one with the extra columns appended
    """
    if len(target_one) != len(target_two):
        raise ValueError("The two lists have different numbers of rows")
    for ind, row in enumerate(target_two):
        target_one[ind] += row
    return target_one


def label2encode(labels: list) -> tuple:
    """
    Encode label sets into a binary indicator matrix (LabelEncoder + MultiLabelBinarizer).
    :param labels: list of label lists, one list per sample
    :return: (fitted LabelEncoder, binary indicator matrix)
    """
    le = LabelEncoder()
    train_labels = []
    for row in labels:
        train_labels += row
    le.fit(train_labels)
    le_ret = [le.transform(row) for row in labels]
    le_ret = MultiLabelBinarizer().fit_transform(le_ret)
    return le, le_ret


def encode2label(le, predict_results, target_label: list) -> list:
    """
    Convert predicted indicator vectors back into label strings.
    :param le: fitted LabelEncoder returned by label2encode
    :param predict_results: predicted indicator matrix, one row per sample
    :param target_label: categories to keep in the output
    :return: one comma-separated label string per sample
    """
    detail_labels = []
    for row in predict_results:
        if row.sum() > 0:
            indices = [i for i, x in enumerate(row) if x > 0]
            names = le.inverse_transform(indices)
            detail_labels.append(",".join(name for name in names if name in target_label))
        else:
            # Keep the output aligned with the input when nothing was predicted for a row.
            detail_labels.append("")
    return detail_labels
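

# --- Usage sketch (illustrative only) ---
# A minimal end-to-end example showing how the helpers chain together: segment the
# documents, build TF-IDF features, append extra feature columns, binarize multi-label
# targets, and map predictions back to label strings. The sample texts, labels, and
# extra_features below are hypothetical placeholders, not part of the original module.
if __name__ == "__main__":
    docs = ["这是一段测试文本", "另一段用于演示的文本"]        # hypothetical sample documents
    extra_features = [[0.1, 0.2], [0.3, 0.4]]                  # hypothetical per-document extra columns
    labels = [["科技", "体育"], ["科技"]]                      # hypothetical multi-label targets

    # Segment and join with spaces so TfidfVectorizer's default tokenizer works.
    space_words = chinese2vectors(docs, remove_word=["x"], stop_words=[])

    # Word-level TF-IDF matrix; convert to dense rows before appending extra columns.
    vec, matrix = tfidf("word", space_words)
    rows = [list(r) for r in matrix.toarray()]
    combined = combine_row(rows, extra_features)

    # Encode the label sets into a binary indicator matrix, then map rows back to strings.
    le, y = label2encode(labels)
    print(len(combined), y.shape)
    print(encode2label(le, y, ["科技", "体育"]))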