# coding:utf-8
import datetime
import os
import uuid

import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

from docs.config import convertField
from machine_models.databases import File, loading_train_data
from machine_models.databases.mysql_helper import Model
from machine_models.tools import label2encode


def many_recall_score(y_test, y_pred):
    """
    Multi-label recall: the share of true labels that the model also predicted.
    :param y_test: binary indicator matrix of true labels
    :param y_pred: binary indicator matrix of predicted labels
    :return: recall over all samples
    """
    correct_count = 0
    total_count = 0
    for test_result, pred_result in zip(y_test, y_pred):
        # Count every true label, and every true label that was recovered.
        total_count += test_result.sum()
        correct_count += pred_result[test_result > 0].sum()
    return correct_count / total_count
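# A small worked example (assumed 3-label indicator arrays, not taken from the project's data):
# with y_test = np.array([[1, 0, 1], [0, 1, 0]]) and y_pred = np.array([[1, 0, 0], [0, 1, 0]]),
# total_count is 3 true labels and correct_count is 2 recovered labels, so the recall is 2 / 3.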


def recall_score(y_test, y_pred):
    """
    Single-label recall: with one label per sample, micro-averaged recall equals
    the share of samples predicted correctly (element-wise accuracy).
    :param y_test: array of true class indices
    :param y_pred: array of predicted class indices
    :return: recall over all samples
    """
    return (y_test == y_pred).sum() / y_test.size


def train_ones_label(x_train, y_train):
    """
    Single-label training: fit one LinearSVC on class indices.
    :param x_train: TF-IDF feature matrix
    :param y_train: array of class indices
    :return: the fitted classifier
    """
    seed = int(datetime.datetime.now().timestamp())
    model = LinearSVC(random_state=seed)
    model.fit(x_train, y_train)
    return model


def train_many_labels(x_train, y_train):
    """
    Multi-label training: wrap LinearSVC in a one-vs-rest classifier.
    :param x_train: TF-IDF feature matrix
    :param y_train: binary indicator matrix of labels
    :return: the fitted classifier
    """
    seed = int(datetime.datetime.now().timestamp())
    model = LinearSVC(random_state=seed)
    clf = OneVsRestClassifier(model, n_jobs=-1)  # build a multi-label classifier from binary ones
    clf.fit(x_train, y_train)  # train the model
    return clf
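# Shape assumption for the two helpers above (a sketch, not taken from the project's data):
# train_ones_label expects y_train as a 1-D array of class indices, e.g. np.array([0, 2, 1]),
# while train_many_labels expects a 2-D binary indicator matrix,
# e.g. np.array([[1, 0, 1], [0, 1, 0], [0, 1, 0]]).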


def train(project_id, focus_field, tfidf_vec, label_type: int, model_dir: str):
    """
    Train, evaluate, store and upload a model for one project.
    :param project_id: project whose training data is loaded
    :param focus_field: fields of interest, mapped through convertField
    :param tfidf_vec: fitted TF-IDF vectorizer used to transform the text
    :param label_type: 1 for single-label, anything else for multi-label
    :param model_dir: local directory where the model file is written
    :return: a Model record on success, False if training fails
    """
    # Map the fields of interest to their internal names.
    focus_field = [convertField[field] for field in focus_field if field in convertField]
    # Load the training data.
    train_data, train_label, count = loading_train_data(project_id, focus_field)
    # Vectorize the training texts.
    train_vec = tfidf_vec.transform(train_data)
    # Encode the labels; label_vec is a binary indicator matrix, le the fitted encoder.
    le, label_vec = label2encode(train_label)
    if label_type == 1:
        # Single-label: keep only the index of the first active label per sample.
        single_label = []
        for label in label_vec:
            for ind, tag in enumerate(label):
                if tag == 1:
                    single_label.append(ind)
                    break
        label_vec = single_label
    x_train, x_test, y_train, y_test = train_test_split(train_vec, label_vec, test_size=0.2, shuffle=True)
    model_path = os.path.join(model_dir, "model.model")
    try:
        if label_type == 1:
            # Single-label training
            y_test = np.array(y_test)
            clf = train_ones_label(x_train, y_train)
            y_pred = clf.predict(x_test)
            # Model evaluation
            score = (y_test == y_pred).sum() / y_test.size
            recall = recall_score(y_test, y_pred)
        else:
            # Multi-label training
            clf = train_many_labels(x_train, y_train)
            y_pred = clf.predict(x_test)
            # Model evaluation
            score = (y_test == y_pred).sum() / y_test.size
            recall = many_recall_score(y_test, y_pred)
    except Exception:
        return False
    # Persist the classifier together with the label encoder.
    joblib.dump((clf, le), model_path)
    # Upload the model file under a random key.
    model_url = str(uuid.uuid4())
    with open(model_path, "rb") as f:
        File.upload_bytes_file(model_url, f.read())
    # F1 is the harmonic mean of precision (score) and recall.
    f1_score = ((score * recall) / (score + recall)) * 2 if score and recall else 0
    # Build the database record for this training run.
    add_model = Model(sampleData=count, recallRate=recall, precision=score, accuracyRate=f1_score,
                      modelFile=model_url)
    return add_model
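

# A minimal usage sketch (hypothetical values; assumes the TF-IDF vectorizer was fitted
# elsewhere on the project's corpus and that model_dir is writable):
#
#   record = train(project_id=1, focus_field=["title"], tfidf_vec=tfidf_vec,
#                  label_type=1, model_dir="/tmp/models")
#   if record is not False:
#       ...  # persist the returned Model record via the project's MySQL helper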