#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Text classification (multi-label: a sample may carry several ';'-separated labels).
import pandas as pd
from sklearn.utils import shuffle
import torch as t
from util.dictionary import Dictionary
import jieba
import os
import joblib
from docs.config import ai2config

table_field_config = ai2config["table_field_config"]
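# Register domain terms so jieba keeps them intact when segmenting the corpus
# (e.g. '型号' stays one token instead of being cut into finer pieces).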
jieba.add_word('型号')  # model number
jieba.add_word('规格')  # specification
jieba.add_word('设备')  # equipment
jieba.add_word('名称')  # name
EMBED_DIM = 300  # placeholder; overwritten with the real vocabulary size once the dictionary is built
vocab_file = table_field_config['vocab_file']
ct = Dictionary(stopwords=[])


class TableFieldCategoryModel(object):
    def __init__(self, config):
        self._corpus_path = config.get("corpus_path")
        self._epochs = config.get("epochs", 500)
        self._lr = config.get("lr", 1e-3)
        self._momentum = config.get("momentum", 0.5)
        self._output = config.get("output", 2)
        self._vocab_file = config.get("vocab_file")
        self._model_path = config.get("model_path")
        self._vocab_label = config.get("vocab_label")
        self._model = None  # lazily initialised on the first predict() call
        self.label2decode = None
    def create_train_data(self):
        """
        Build the training data from the corpus CSV.
        :return: one-hot text vectors, multi-hot targets, and the label maps
        """
        if not os.path.exists(self._corpus_path):
            raise FileNotFoundError("corpus file does not exist: %s" % self._corpus_path)
        label_data = pd.read_csv(self._corpus_path)
        label_data.drop_duplicates(['corpus'], inplace=True)
        label_data = shuffle(label_data)
        corpus = label_data['corpus'].values
        category = label_data['label'].values
        ct.append_vocab(text=corpus, need_cut=True)
        ct.build_dictionary(tfidf_limit=1e-6, vocab_file=vocab_file)
        # Sanity check: the dictionary size becomes the network input dimension.
        print('vocabulary size:', len(ct.dictionary))
        global EMBED_DIM
        EMBED_DIM = len(ct.dictionary)
        x_vector = ct.vector_corpus(corpus=corpus, dim=EMBED_DIM, use_tfidf=False, return_type='one_hot')
        # Build the label <-> index maps; a row may carry several ';'-separated labels.
        label2encode, label2decode = {}, {}
        category = [str(c).split(';') for c in category]
        for c in category:
            for w in c:
                if w not in label2encode:
                    label2encode[w] = len(label2encode)
        label2decode = dict(zip(label2encode.values(), label2encode.keys()))
        y = []
        label_len = len(label2encode)
        print('classes:', label_len)
        for c in category:
            y1 = [0] * label_len
            for w in c:
                y1[label2encode[w]] = 1
            y.append(y1)
        joblib.dump((label2encode, label2decode), self._vocab_label)
        return x_vector, y, label2encode, label2decode
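    # Illustration of the target encoding above (hypothetical labels):
    # with label2encode == {'设备': 0, '型号': 1, '规格': 2}, a row labelled
    # "设备;型号" becomes the multi-hot vector [1, 1, 0].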
    @staticmethod
    def make_nn(input_size, output_size):
        # Three-layer MLP that halves, then quarters, the input dimension.
        # Sigmoid gives each class an independent score, matching the multi-hot
        # targets and BCELoss in train(); a Softmax here would force the
        # co-occurring labels to compete.
        return t.nn.Sequential(
            t.nn.Linear(input_size, input_size // 2),
            t.nn.ReLU(inplace=True),
            t.nn.Linear(input_size // 2, input_size // 4),
            t.nn.ReLU(inplace=True),
            t.nn.Linear(input_size // 4, output_size),
            t.nn.Sigmoid(),
        )
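    # For example, make_nn(300, 4) yields
    # Linear(300, 150) -> ReLU -> Linear(150, 75) -> ReLU -> Linear(75, 4) -> Sigmoid,
    # so each of the four classes gets a score in [0, 1].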
    def train(self):
        x_vector, y, le, _ = self.create_train_data()
        print('classes:', len(le))
        mlp = self.make_nn(EMBED_DIM, len(le))
        optimizer = t.optim.SGD(mlp.parameters(), lr=self._lr, momentum=self._momentum)
        lossfunc = t.nn.BCELoss()
        # Full-batch training: the whole corpus is fed through the net each epoch.
        x_vector = t.tensor(x_vector).float()
        y = t.tensor(y).float()
        for epoch in range(self._epochs):
            outputs = mlp(x_vector)
            optimizer.zero_grad()
            acc(outputs, y)
            loss = lossfunc(outputs, y)
            loss.backward()
            optimizer.step()
            print('epoch:', epoch, 'loss:', loss.item())
            if loss.item() < 0.01:
                print('early stop: loss target reached')
                break
        t.save(mlp.state_dict(), self._model_path)
    def predict(self, corpus):
        global EMBED_DIM
        if self._model is None:
            # First call: load the dictionary, label maps and trained weights.
            ct.load_dictionary(vocab_file=vocab_file)
            EMBED_DIM = len(ct.dictionary)
            le, self.label2decode = joblib.load(self._vocab_label)
            self._model = self.make_nn(EMBED_DIM, len(self.label2decode))
            self._model.load_state_dict(t.load(self._model_path))
        x_vector = ct.vector_corpus(corpus, dim=EMBED_DIM, return_type='one_hot', use_tfidf=False)
        x = t.tensor(x_vector).float()
        x = x.view(x.size(0), -1)
        y = self._model(x)
        ret = []
        for r in y.data.numpy():
            # Keep every label whose score clears the threshold (multi-label output).
            row_label = [self.label2decode[i] for i, w in enumerate(r) if w >= 0.20]
            ret.append(row_label)
        return ret
    def val(self):
        global EMBED_DIM
        ct.load_dictionary(vocab_file=vocab_file)
        EMBED_DIM = len(ct.dictionary)
        le, de = joblib.load(self._vocab_label)  # the label maps live in vocab_label, not vocab_file
        label_data = pd.read_csv(self._corpus_path)
        label_data.drop_duplicates(['corpus'], inplace=True)
        label_data = shuffle(label_data)
        corpus = label_data['corpus'].values
        category = label_data['label'].values
        x_vector = ct.vector_corpus(corpus, dim=EMBED_DIM, return_type='one_hot', use_tfidf=False)
        mlp = self.make_nn(EMBED_DIM, len(de))
        mlp.load_state_dict(t.load(self._model_path))
        x = t.tensor(x_vector).float()
        x = x.view(x.size(0), -1)
        y = mlp(x)
        ret = []
        for r in y.data.numpy():
            row_label = [de[i] for i, w in enumerate(r) if w >= 0.35]
            ret.append(row_label)
        print(list(zip(category, ret)))
        # A prediction is correct only when it reproduces the full label set;
        # comparing sorted lists avoids double-counting single-label matches.
        count = 0
        for expected, predicted in zip(category, ret):
            if sorted(str(expected).split(';')) == sorted(predicted):
                count += 1
            else:
                print(expected, predicted)
        print('val accuracy:', count / len(category))


def acc(outputs, y):
    """Print the fraction of samples whose thresholded predictions match the targets exactly."""
    predicted = (outputs > 0.25).data.numpy()
    expected = (y > 0.5).data.numpy()
    count = 0
    for i in range(len(expected)):
        if (expected[i] == predicted[i]).all():
            count += 1
    print('train accuracy:', count / len(expected))