123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120 |
- # coding:utf-8
- import jieba.posseg as psg
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.preprocessing import LabelEncoder, OneHotEncoder
- from sklearn.preprocessing import MultiLabelBinarizer
- from util.htmlutil.htmltag import Clean
- import jieba
- import multiprocessing
- jieba.enable_parallel(multiprocessing.cpu_count())
def chinese2vectors(chinese: list, remove_word: list, stop_words: list) -> list:
    """
    Tokenize multiple Chinese texts into space-joined word strings.

    :param chinese: list of raw text strings
    :param remove_word: POS flags to drop (e.g. "x", "n", "eng"); defaults to ["x"]
    :param stop_words: stop words to drop; defaults to none
    :return: list of space-separated token strings, one per input text
    """
    if not remove_word:
        remove_word = ["x"]
    # Sets give O(1) membership tests in the per-word filter below;
    # the original did an O(n) list scan for every token.
    remove_set = set(remove_word)
    stop_set = set(stop_words) if stop_words else set()
    space_words = []
    for row in chinese:
        # Clean() strips HTML markup; psg.lcut yields (word, POS-flag) pairs.
        cut_ret = [word for word, flag in psg.lcut(Clean(row))
                   if flag not in remove_set and word not in stop_set]
        space_words.append(" ".join(cut_ret))
    return space_words
def chinese2vector(chinese: str, remove_word: list, stopwords: list) -> str:
    """
    Tokenize a single Chinese text into one space-joined word string.

    :param chinese: raw text string
    :param remove_word: POS flags to drop (e.g. "x", "n", "eng"); defaults to ["x"]
    :param stopwords: stop words to drop; defaults to none
    :return: space-separated token string
    """
    if not remove_word:
        remove_word = ["x"]
    # Sets give O(1) membership tests in the filter below;
    # the original scanned both lists for every token.
    remove_set = set(remove_word)
    stop_set = set(stopwords) if stopwords else set()
    # Clean() strips HTML markup; psg.lcut yields (word, POS-flag) pairs.
    cut_ret = [word for word, flag in psg.lcut(Clean(chinese))
               if flag not in remove_set and word not in stop_set]
    return " ".join(cut_ret)
def tfidf(analyzer, space_words) -> tuple:
    """
    Fit a TF-IDF encoder over pre-tokenized, space-joined documents.

    :param analyzer: analyzer passed straight through to TfidfVectorizer
    :param space_words: iterable of space-separated token strings
    :return: (fitted TfidfVectorizer, sparse TF-IDF matrix)
    """
    vectorizer = TfidfVectorizer(analyzer=analyzer)
    matrix = vectorizer.fit_transform(space_words)
    return vectorizer, matrix
def one2hot(space_words) -> tuple:
    """
    One-hot encode a sequence of token strings (one category per sample).

    :param space_words: sequence of strings, one per sample
    :return: (fitted OneHotEncoder, sparse one-hot matrix)
    """
    oht = OneHotEncoder()
    # Bug fix: OneHotEncoder requires a 2D input of shape (n_samples,
    # n_features); feeding the flat list of strings raises
    # "Expected 2D array" in sklearn. Wrap each sample in its own row.
    rows = [[row] for row in space_words]
    oht_ret = oht.fit_transform(rows)
    return oht, oht_ret
def combine_row(target_one: list, target_two: list) -> list:
    """
    Element-wise concatenate the rows of two equal-length lists of rows.

    Mutates ``target_one`` in place — each of its rows is extended with the
    matching row of ``target_two`` — and returns it.

    :param target_one: list of rows; modified in place
    :param target_two: list of rows to append, same length as ``target_one``
    :return: ``target_one`` with every row extended
    :raises ValueError: when the two lists differ in length
    """
    if len(target_one) != len(target_two):
        raise ValueError("两个列表维度不同")
    # The original wrapped this loop in a try/except that only re-raised
    # the same exception — a no-op, so it has been removed.
    for ind, row in enumerate(target_two):
        target_one[ind] += row
    return target_one
def label2encode(labels: list) -> tuple:
    """
    Fit a LabelEncoder over every label seen and binarize each sample's set.

    :param labels: list of per-sample label lists
    :return: (fitted LabelEncoder,
              binary indicator matrix from MultiLabelBinarizer)
    """
    le = LabelEncoder()
    # Flatten all per-sample labels so the encoder learns the full vocabulary.
    # The original called fit_transform and discarded the transform result;
    # fit() alone is equivalent and avoids the wasted work.
    le.fit([lab for row in labels for lab in row])
    encoded = [le.transform(row) for row in labels]
    le_ret = MultiLabelBinarizer().fit_transform(encoded)
    return le, le_ret
def encode2label(le, predict_results, target_label: list) -> list:
    """
    Convert binary prediction vectors back into comma-joined label strings.

    :param le: fitted label encoder; its ``inverse_transform`` maps indices
               back to label names
    :param predict_results: iterable of 0/1 indicator vectors (one per sample,
                            must support ``.sum()``, e.g. numpy arrays)
    :param target_label: whitelist of label names to keep in the output
    :return: one comma-joined label string per sample ("" when no label fires)
    """
    detail_labels = []
    for row in predict_results:
        # Bug fix: the original left label_str undefined (NameError) when the
        # very first row was all-zero, and appended the previous row's stale
        # value for any later all-zero row. Reset it every iteration.
        label_str = ""
        if row.sum() > 0:
            positive = [i for i, x in enumerate(row) if x > 0]
            label_str = ",".join(
                name for name in le.inverse_transform(positive)
                if name in target_label
            )
        detail_labels.append(label_str)
    return detail_labels
|