# coding:utf-8
import datetime
import os
import uuid

import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

from docs.config import convertField
from machine_models.databases import File, loading_train_data
from machine_models.databases.mysql_helper import Model
from machine_models.tools import label2encode


def many_recall_score(y_test, y_pred):
    """
    Compute recall for multi-label predictions: the share of true labels
    that the model also predicted.
    :param y_test: array of binary label vectors (ground truth)
    :param y_pred: array of binary label vectors (predictions)
    :return: recall in [0, 1]
    """
    correct_count = 0
    total_count = 0
    for test_result, pred_result in zip(y_test, y_pred):
        total_count += test_result.sum()
        correct_count += pred_result[test_result > 0].sum()
    return correct_count / total_count


def recall_score(y_test, y_pred):
    """
    Compute the score for single-label predictions (the share of correctly
    predicted samples over the test set).
    :param y_test: array of true class indices
    :param y_pred: array of predicted class indices
    :return: score in [0, 1]
    """
    return (y_test == y_pred).sum() / y_test.size


def train_ones_label(x_train, y_train):
    """
    Single-label training: fit one linear SVM over all classes.
    :return: fitted LinearSVC model
    """
    seed = int(datetime.datetime.now().timestamp())
    model = LinearSVC(random_state=seed)
    model.fit(x_train, y_train)
    return model


def train_many_labels(x_train, y_train):
    """
    Multi-label training: wrap a linear SVM in a one-vs-rest classifier.
    :param x_train: training feature matrix
    :param y_train: binary label matrix
    :return: fitted OneVsRestClassifier
    """
    seed = int(datetime.datetime.now().timestamp())
    model = LinearSVC(random_state=seed)
    clf = OneVsRestClassifier(model, n_jobs=-1)  # build a multi-label classifier from the binary base estimator
    clf.fit(x_train, y_train)  # train the model
    return clf


def train(project_id, focus_field, tfidf_vec, label_type: int, model_dir: str):
    """
    Train a model for one project, evaluate it, upload the model file and
    return a database record object.
    :param project_id: id of the project whose data is used for training
    :param focus_field: raw field names to train on
    :param tfidf_vec: fitted TF-IDF vectorizer used to transform the text
    :param label_type: 1 for single-label training, anything else for multi-label
    :param model_dir: local directory where the model file is written
    :return: a Model record on success, False on failure
    """
    # Map the fields of interest to their internal names.
    focus_field = [convertField[field] for field in focus_field if field in convertField]
    # Load the training data.
    train_data, train_label, count = loading_train_data(project_id, focus_field)
    # Vectorize the training data.
    train_vec = tfidf_vec.transform(train_data)
    # Encode labels as binary vectors.
    le, label_vec = label2encode(train_label)
    if label_type == 1:
        # Single-label case: keep only the index of the first active label per sample.
        single_label = []
        for label in label_vec:
            for ind, tag in enumerate(label):
                if tag == 1:
                    single_label.append(ind)
                    break
        label_vec = single_label
    x_train, x_test, y_train, y_test = train_test_split(train_vec, label_vec, test_size=0.2, shuffle=True)
    model_path = os.path.join(model_dir, "model.model")
    try:
        if label_type == 1:
            # Single-label training.
            y_test = np.array(y_test)
            clf = train_ones_label(x_train, y_train)
            y_pred = clf.predict(x_test)
            # Model evaluation.
            score = (y_test == y_pred).sum() / y_test.size
            recall = recall_score(y_test, y_pred)
        else:
            # Multi-label training.
            clf = train_many_labels(x_train, y_train)
            y_pred = clf.predict(x_test)
            # Model evaluation.
            score = (y_test == y_pred).sum() / y_test.size
            recall = many_recall_score(y_test, y_pred)
    except Exception:
        return False
    # Persist the model together with the label encoder.
    joblib.dump((clf, le), model_path)
    # Upload the model file under a generated id.
    model_url = str(uuid.uuid4())
    with open(model_path, "rb") as f:
        File.upload_bytes_file(model_url, f.read())
    # Harmonic mean of score and recall (F1); 0 if either metric is 0.
    f1_score = 2 * score * recall / (score + recall) if score and recall else 0
    # Build the database record object.
    add_model = Model(sampleData=count, recallRate=recall, precision=score,
                      accuracyRate=f1_score, modelFile=model_url)
    return add_model
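

# --- Usage sketch (assumption, not part of the original module) ---
# Minimal example of how train() might be invoked. The vectorizer path,
# project id, and field names below are hypothetical placeholders; storing
# the fitted TF-IDF vectorizer with joblib is an assumption made here
# because joblib is already used for the classifier in this module.
if __name__ == "__main__":
    tfidf_vec = joblib.load("models/tfidf.model")      # hypothetical vectorizer path
    result = train(
        project_id=1,                                   # hypothetical project id
        focus_field=["title", "content"],               # hypothetical raw field names (must be keys of convertField)
        tfidf_vec=tfidf_vec,
        label_type=1,                                   # 1 = single-label, otherwise multi-label
        model_dir="models",
    )
    if result is False:
        print("training failed")
    else:
        print("training succeeded, record:", result)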