#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Text classification (multi-label) for table fields.
import os

import jieba
import joblib
import pandas as pd
import torch as t
from sklearn.utils import shuffle

from docs.config import ai2config
from util.dictionary import Dictionary

table_field_config = ai2config["table_field_config"]

# Register domain terms so jieba segments them as single tokens.
jieba.add_word('型号')
jieba.add_word('规格')
jieba.add_word('设备')
jieba.add_word('名称')

EMBED_DIM = 300  # placeholder; overwritten with the vocabulary size once the dictionary is built
vocab_file = table_field_config['vocab_file']
ct = Dictionary(stopwords=[])


class TableFieldCategoryModel(object):
    def __init__(self, config):
        self._corpus_path = config.get("corpus_path")
        self._epochs = config.get("epochs", 500)
        self._lr = config.get("lr", 1e-3)
        self._momentum = config.get("momentum", 0.5)
        self._output = config.get("output", 2)
        self._vocab_file = config.get("vocab_file")
        self._model_path = config.get("model_path")
        self._vocab_label = config.get("vocab_label")
        self._model = None
        self.label2decode = None

    def create_train_data(self):
        """Generate training data: one-hot text vectors, multi-hot label
        vectors and the label encode/decode maps."""
        if not os.path.exists(self._corpus_path):
            raise FileNotFoundError("corpus file not found: %s" % self._corpus_path)
        label_data = pd.read_csv(self._corpus_path)
        label_data.drop_duplicates(['corpus'], inplace=True)
        label_data = shuffle(label_data)
        corpus = label_data['corpus'].values
        category = label_data['label'].values
        ct.append_vocab(text=corpus, need_cut=True)
        ct.build_dictionary(tfidf_limit=1e-6, vocab_file=vocab_file)
        # Sanity check, then size the input layer to the vocabulary.
        print('vocab size:', len(ct.dictionary))
        global EMBED_DIM
        EMBED_DIM = len(ct.dictionary)
        x_vector = ct.vector_corpus(corpus=corpus, dim=EMBED_DIM, use_tfidf=False, return_type='one_hot')
        # Build the label <-> index maps; one row may carry several ';'-separated labels.
        label2encode = {}
        category = [str(c).split(';') for c in category]
        for c in category:
            for w in c:
                if w not in label2encode:
                    label2encode[w] = len(label2encode)
        label2decode = {v: k for k, v in label2encode.items()}
        label_len = len(label2encode)
        print('classes:', label_len)
        # Multi-hot targets: y[i][j] == 1 iff sample i carries label j.
        y = []
        for c in category:
            y1 = [0] * label_len
            for w in c:
                y1[label2encode[w]] = 1
            y.append(y1)
        joblib.dump((label2encode, label2decode), self._vocab_label)
        return x_vector, y, label2encode, label2decode

    @staticmethod
    def make_nn(input_size, output_size):
        # Sigmoid on the output layer so each label score is an independent
        # probability; this matches the BCELoss used in train() and the
        # per-label thresholds applied in predict()/val(). A Softmax here
        # would force the scores to sum to 1, which is wrong for multi-label.
        return t.nn.Sequential(
            t.nn.Linear(input_size, input_size // 2),
            t.nn.ReLU(inplace=True),
            t.nn.Linear(input_size // 2, input_size // 4),
            t.nn.ReLU(inplace=True),
            t.nn.Linear(input_size // 4, output_size),
            t.nn.Sigmoid(),
        )

    def train(self):
        x_vector, y, le, _ = self.create_train_data()
        print('classes:', len(le))
        mlp = self.make_nn(EMBED_DIM, len(le))
        optimizer = t.optim.SGD(mlp.parameters(), lr=self._lr, momentum=self._momentum)
        lossfunc = t.nn.BCELoss()
        x_vector = t.tensor(x_vector).float()
        y = t.tensor(y).float()
        for epoch in range(self._epochs):
            outputs = mlp(x_vector)
            optimizer.zero_grad()
            acc(outputs, y)
            loss = lossfunc(outputs, y)
            loss.backward()
            optimizer.step()
            print('epoch:', epoch, 'loss:', loss.item())
            if loss.item() < 0.01:
                print('early stop: loss below 0.01')
                break
        t.save(mlp.state_dict(), self._model_path)

    def predict(self, corpus):
        global EMBED_DIM
        if not self._model:
            # Lazy-load the dictionary, label maps and weights on first call.
            ct.load_dictionary(vocab_file=vocab_file)
            EMBED_DIM = len(ct.dictionary)
            _, self.label2decode = joblib.load(self._vocab_label)
            self._model = self.make_nn(EMBED_DIM, len(self.label2decode))
            self._model.load_state_dict(t.load(self._model_path))
        x_vector = ct.vector_corpus(corpus, dim=EMBED_DIM, return_type='one_hot', use_tfidf=False)
        x = t.tensor(x_vector).float()
        x = x.view(x.size(0), -1)
        y = self._model(x)
        ret = []
        for r in y.detach().numpy():
            # Keep every label whose score clears the threshold (multi-label).
            ret.append([self.label2decode[i] for i, w in enumerate(r) if w >= 0.20])
        return ret

    def val(self):
        """Evaluate the saved model against the labelled corpus."""
        global EMBED_DIM
        ct.load_dictionary(vocab_file=vocab_file)
        EMBED_DIM = len(ct.dictionary)
        # The label maps were dumped to self._vocab_label in create_train_data.
        _, de = joblib.load(self._vocab_label)
        label_data = pd.read_csv(self._corpus_path)
        label_data.drop_duplicates(['corpus'], inplace=True)
        label_data = shuffle(label_data)
        corpus = label_data['corpus'].values
        category = label_data['label'].values
        x_vector = ct.vector_corpus(corpus, dim=EMBED_DIM, return_type='one_hot', use_tfidf=False)
        mlp = self.make_nn(EMBED_DIM, len(de))
        mlp.load_state_dict(t.load(self._model_path))
        x = t.tensor(x_vector).float()
        x = x.view(x.size(0), -1)
        y = mlp(x)
        ret = []
        for r in y.detach().numpy():
            # NOTE: stricter threshold here than in predict() (0.35 vs 0.20).
            ret.append([de[i] for i, w in enumerate(r) if w >= 0.35])
        print(list(zip(category, ret)))
        # A prediction counts as correct only if it reproduces the full label
        # set; compare as sets so label order does not matter.
        count = 0
        for gold, pred in zip(category, ret):
            if set(str(gold).split(';')) == set(pred):
                count += 1
            elif len(pred) == 1:
                print((gold, pred))
        print('accuracy:', count / len(category))
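
# Illustration only: a minimal, self-contained sketch (with made-up label
# rows) of the multi-hot scheme create_train_data() uses, so the encode/
# decode round trip is easy to see outside the pipeline.
def _multi_hot_demo():
    rows = ['型号;规格', '设备', '名称;设备']  # hypothetical 'label' column values
    label2encode = {}
    for row in rows:
        for w in row.split(';'):
            label2encode.setdefault(w, len(label2encode))
    vectors = []
    for row in rows:
        v = [0] * len(label2encode)
        for w in row.split(';'):
            v[label2encode[w]] = 1
        vectors.append(v)
    # label2encode == {'型号': 0, '规格': 1, '设备': 2, '名称': 3}
    # vectors     == [[1, 1, 0, 0], [0, 0, 1, 0], [0, 0, 1, 1]]
    return label2encode, vectors
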
def acc(outputs, y):
    """Exact-match accuracy: a sample counts only when every label matches."""
    pred = (outputs > 0.25).detach().numpy()
    gold = (y > 0.5).detach().numpy()
    count = 0
    for i in range(len(gold)):
        if (gold[i] == pred[i]).all():
            count += 1
    print('samples:', len(gold), 'exact-match accuracy:', count / len(gold))
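

# Usage sketch, assumptions flagged: it assumes table_field_config also
# provides 'corpus_path', 'model_path' and 'vocab_label' entries alongside
# 'vocab_file', and the input strings below are hypothetical; adjust both to
# the real config and data.
if __name__ == '__main__':
    tfc_model = TableFieldCategoryModel(table_field_config)
    tfc_model.train()  # fit the MLP on the CSV corpus and save the weights
    # predict() returns one list of labels per input string (threshold 0.20).
    print(tfc_model.predict(['设备名称', '规格型号']))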