lijunliang 1 year ago
commit
e5608bafff

BIN
ai-0.0.4-py3-none-any.whl


+ 43 - 0
create_dict.py

@@ -0,0 +1,43 @@
+# coding:utf-8
+from machine_models.tools import link_db
+from machine_models.tools import chinese2vector
+from machine_models.tools import tfidf
+import joblib
+
+if __name__ == '__main__':
+    m_config = {
+        "db": "re4art",
+        "col": "bidding_china_4_9",
+        "host": "192.168.3.207:27092",
+    }
+    with open("stopwords.txt", "r") as f:
+        stop_words = [word.strip() for word in f.readlines()]
+
+    client, col = link_db(m_config)
+    corpus = []
+    with open("./target.csv", "r") as f:
+        read_data = f.read()
+        read_data = read_data.replace("\n", " ")
+    other = chinese2vector(read_data, remove_word=["x"], stopwords=stop_words)
+    print(other)
+    corpus.append(other)
+    count = 0
+    for row in col.find({}).sort("_id", 1):
+        # detail = row.get("detail", "")
+        # title = row.get("title", "")
+        count += 1
+        print(count)
+        # corpus = chinese2vector(title + detail.lower(), remove_word=["x", "m"], stopwords=stop_words)
+        # col.update_one({"_id": row["_id"]}, {"$set": {"cut_detail": corpus}})
+        cut_detail = row.get("cut_detail", "")
+        corpus.append(cut_detail)
+        # if len(contents) > 10000:
+        #     cut_ret = chinese2vectors(contents, remove_word=["x"], stop_words=stop_words)
+        #     corpus.extend(cut_ret)
+        #     contents = []
+    # if contents:
+    #     cut_ret = chinese2vectors(contents, remove_word=["x"], stop_words=stop_words)
+    #     corpus.extend(cut_ret)
+    #     contents = []
+    tfidf_vec, tfidf_ret = tfidf(analyzer="word", space_words=corpus)
+    joblib.dump(tfidf_vec, "docs/model/dictionary")
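
A minimal sketch of how the dumped dictionary is reused downstream (machine_models/__init__.py below loads it the same way; the token string is a made-up example):

    import joblib

    # load the fitted TfidfVectorizer dumped by create_dict.py
    tfidf_vec = joblib.load("docs/model/dictionary")
    # transform expects space-separated tokens, as produced by chinese2vector
    vec = tfidf_vec.transform(["招标 公告 项目"])
    print(vec.shape)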

BIN
docs/__pycache__/config.cpython-37.pyc


+ 54 - 0
docs/config.py

@@ -0,0 +1,54 @@
+# coding:utf-8
+
+mysql_config = {
+    "db": "machineLearning",
+    "ip": "192.168.3.109",
+    "port": "4000",
+    "user": "root",
+    "pwd": "Tidb#20220214",
+    "charset": "utf8"
+}
+
+source_mongo_config = {
+    "host": "192.168.3.207:27001,192.168.3.206:27002",
+    "user": "jyDevGroup",
+    "password": "jy@DevGroup",
+    "db": "qfw_data",
+    "col": "bidding"
+}
+
+catch_mongo_config = {
+    "host": "192.168.3.207:27092",
+    "user": "",
+    "password": "",
+    "db": "re4art",
+    "col": "catch_mongo_test"
+}
+
+oss_file_config = {
+    "access_key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
+    "access_key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
+    "endpoint": "oss-cn-beijing.aliyuncs.com",
+    "bucket_name": "jy-datafile",
+}
+
+oss_txt_config = {
+    "access_key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
+    "access_key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
+    "endpoint": "oss-cn-beijing.aliyuncs.com",
+    "bucket_name": "topjy",
+}
+
+convertField = {
+    "标题": "cut_title",
+    "正文": "cut_detail",
+    "采购单位": "cut_buyer",
+    "中标单位": "cut_winner",
+    "标的物": "cut_purchasing",
+    "附件": "cut_attach_text"
+}
+
+stopWordsPath = "./docs/stopwords.txt"
+baseDir = "./docs/"
+dictionaryPath = "./docs/dictionary"
+dictionaryUrl = "111111"
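
convertField maps the caller-facing field names to the cached cut_* fields; a small illustration (the requested list is a made-up example, mirroring how predict_model.py and train_model.py use the mapping):

    from docs.config import convertField

    requested = ["标题", "正文"]
    focus_field = [convertField[f] for f in requested if f in convertField]
    # focus_field == ["cut_title", "cut_detail"]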

BIN
docs/model/dictionary


BIN
docs/model/model.model


+ 627 - 0
docs/stopwords.txt

@@ -0,0 +1,627 @@
+项目
+公告
+-
+招标
+)
+(
+2019
+公示
+、
+中标
+年
+结果
+的
+中心
+公开
+成交
+竞争性
+限公司
+关于
+(
+[
+]
+)
+及
+建设
+有
+合同
+和
+中国
+:
+等
+更
+谈判
+“
+平台
+”
+询价
+单一来源
+安全
+有限公司
+1
+_
+磋
+商公告
+二次
+公司
+编号
+【
+】
+与
+询价公告
+公安局
+集团
+管理
+字
+选人
+化
+年度
+磋商
+县
+段
+项目
+文件
+2019
+联系
+单位
+时间
+信息
+提供
+进行
+功能
+支持
+中心
+名称
+内容
+公告
+政府
+品牌
+方式
+满足
+00
+工作
+限公司
+合同
+质疑
+供应
+具有
+地址
+必须
+交易
+工程
+成交
+编号
+建设
+应商
+现场
+相关
+具备
+05
+10
+资格
+地点
+需求
+30
+中标
+公司
+根据
+检测
+管理
+通过
+注册
+条件
+平台
+公共
+中国
+安全
+企业
+31
+网上
+截止
+报名
+网络
+包括
+修改
+资料
+资质
+06
+用户
+不予
+国家
+本次
+规定
+附件
+咨询
+此条
+使用
+磋商
+生产
+金额
+保证
+cn
+数据
+文件
+记录
+项目
+投标
+2019
+招标
+时间
+供应商
+有限公司
+进行
+满足
+功能
+内容
+以上
+具有
+00
+质疑
+必须
+联系人
+公共资源
+05
+10 
+30
+联系方式
+现场
+名称
+公司
+根据
+地址
+其他
+使用
+报名
+联系电话
+工作
+31
+条件
+包括
+06
+不予
+中心
+规定
+本次
+同时
+此条
+方式
+cn
+有关
+记录
+可以
+上虞
+所有
+http
+www
+完成
+如下
+保证金
+实际
+或者
+gov 
+不得
+三家
+12
+接受
+17
+问题
+文档
+人员
+11
+只有
+下载
+15
+情况
+34
+指定
+以下
+获取
+结果
+在线
+直接
+隐藏
+全部
+能力
+唯一
+按照
+&#
+集团
+参与
+CA
+倾向性
+竞争性
+查看
+com 
+有效
+400
+能够
+14
+开远市
+24
+统一
+组织
+存在
+列入
+综合
+承诺函
+失信
+16
+即可
+及时
+不少
+独立
+联合体
+010
+一个
+是否
+此参数
+20
+000
+形式
+0606
+58851111
+同一
+以及
+13
+没有
+输入
+拓久
+之日起
+亮显
+为了
+处理
+正式
+18
+影响
+唯一性
+IEBOARD
+视通
+东威
+标的
+09
+主要
+最高
+所在
+方有权
+鸿合
+指向性
+其它
+中原
+需要
+知识点
+限制性
+VGA
+分公司
+人民
+2018
+各个
+了解
+业绩
+社会
+为准
+法律法规
+上午
+日立
+小组
+下列
+ggzy
+发现
+三年
+原因
+有限责任
+参考
+无效
+携带
+欢迎
+删除
+IQ
+高科
+湖山
+联系
+无法
+无条件
+其中
+至少
+后期
+2.5
+确定
+并且
+含有
+海康
+不变
+内江
+01
+偏离
+结束
+用途
+必需
+除外
+快捷键
+此项
+220kV
+提出
+备注
+3.1
+4.2
+否则
+通用
+三个
+08
+期间
+日内
+45
+天内
+予以
+由此
+尺寸
+日至
+工作日内
+及其
+差别待遇
+28
+第二
+终止
+明确
+我司
+爱普生
+最终
+印天
+白光
+巨龙
+所以
+网点
+类型
+第一
+不再
+最大
+提升
+4.1
+一切
+品目
+联动
+物理
+任意
+条例
+成立
+变化
+明基
+松下
+赛尔
+独有
+带式
+送货
+一份
+公开
+限制
+文号
+产生
+04
+之间
+方向
+KYRW
+第三
+场地
+效果
+50
+理光
+鲅鱼圈区
+锐取
+airitilibrary
+学术
+第二十二条
+开始
+现将
+威胁
+符合国家
+大华
+一级
+霍邱
+一年
+方须
+医共体
+RJ45
+扣发
+总价
+我们
+包件
+华能
+从而
+健全
+限于
+因此
+多点
+办法
+可自相
+接到
+不能
+ccpc
+届满
+下同
+已经
+最新
+扩展
+落实
+稳定
+特点
+黑名单
+班班
+现对
+真实
+creditchina
+工期
+文件夹
+不足
+PCIe
+准确
+校正
+每个
+便于
+以次充好
+投机取巧
+因无
+歧视性
+包号
+项下
+5.3
+正本
+放弃
+原则上
+得到
+之前
+详细
+统计
+代理商
+整个
+跟踪
+后方
+时至
+提高
+目标
+长乐
+有意
+迷你
+退款
+蓝色
+富民县
+两侧
+板面
+西路
+或是
+拖动
+随时
+互动
+设定
+有赛
+练习
+RGB
+瀚驰
+漫游
+排斥
+小于
+专区
+登陆
+发售
+大厅
+联合
+大街
+成员名单
+关闭
+产地
+中途
+水货
+以便
+开具
+退还
+答疑
+按规定
+不间断
+务必
+初验
+成熟
+版权
+澄清
+原则
+方仅
+本级
+快捷
+历史
+多种
+主体
+对应
+银信
+目的
+信服
+划分
+龙井
+IP
+html
+之一
+重点
+女士
+定点
+计算
+上行政区域
+专用
+汇款
+容量
+适应
+全额
+单上
+常用
+实质性
+批次
+歧视
+精品
+有希沃
+演示
+转载
+非常
+语文
+逼真
+降低
+时间表
+秦淮
+天得
+路天恒
+为鸿合
+出入口
+禁止
+答题
+虚拟
+主页
+紫旭
+分局
+采办
+地区
+加装
+之外
+保有
+剩余时间
+GDC
+对接
+时限
+关系
+获得
+尚未
+户名
+超级
+进一步
+另行
+涉及
+每天
+到场
+平衡
+一台
+DRGs
+固定
+3T
+SSD
+保留
+GHz
+推荐
+自带
+登记表
+转包
+RIS
+临床
+指导
+雪亮
+对于
+现行
+方面
+估算
+产业
+立体声
+社库
+全文
+标包
+发出
+海棠

+ 122 - 0
machine_models/__init__.py

@@ -0,0 +1,122 @@
+# coding:utf-8
+from docs.config import dictionaryPath
+from docs.config import dictionaryUrl
+from machine_models.databases import File
+from machine_models.train_model import train
+from machine_models.databases.mysql_helper import Model
+from machine_models.databases import session
+from machine_models.databases.mysql_helper import Project
+from machine_models.predict_model import predict
+from util.file_operations import generate_directory, del_directory
+from docs.config import baseDir
+import os
+import joblib
+import uuid
+import datetime
+
+# Load the dictionary file once at import time
+if not os.path.exists(dictionaryPath):
+    status = File.download_file(dictionaryUrl, dictionaryPath)
+    if not status:
+        raise ValueError("Failed to download the dictionary file")
+tfidf_vec = joblib.load(dictionaryPath)
+
+def train_fail(project_id, user_id):
+    '''
+    Record a failed training run
+    :param project_id:
+    :param user_id:
+    :return:
+    '''
+    fail_model = Model(state=2, projectId=project_id, createperson=user_id,
+                       createTime=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
+    session.add(fail_model)
+    session.commit()
+    return False
+
+
+def train_model(request_params: dict):
+    # Clear cached ORM objects on the session
+    session.expire_all()
+    session.commit()
+
+    # Read the training-job parameters
+    project_id = request_params.get("id")
+    user_id = request_params.get("userId", "")
+    label_type = request_params.get("type", 1)
+    fields = request_params.get("fields", "")
+    model_dir = ""
+    try:
+        # Missing project id
+        if not project_id:
+            return train_fail(project_id, user_id)
+        model_dir = os.path.join(baseDir, str(uuid.uuid4()))
+        dir_status = generate_directory(model_dir)
+        # Directory creation failed
+        if not dir_status:
+            return train_fail(project_id, user_id)
+        # Start training
+        model_detail = train(project_id, fields.split(","), tfidf_vec, label_type, model_dir)
+        # Training failed
+        if not model_detail:
+            return train_fail(project_id, user_id)
+        # Record the successful run
+        model_detail.projectId = project_id
+        model_detail.state = 0
+        model_detail.createperson = user_id
+        model_detail.createTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        session.add(model_detail)
+        session.commit()
+        # Remove the files generated by this run
+        del_directory(model_dir)
+        return True
+    except Exception:
+        # Remove the files generated by this run, then record a single failure
+        if model_dir and os.path.exists(model_dir):
+            del_directory(model_dir)
+        return train_fail(project_id, user_id)
+
+
+def predict_model(request_params):
+    # Clear the local session cache
+    session.expire_all()
+    session.commit()
+    # Read the prediction parameters
+    project_id = request_params.get("id", -1)
+    id_list = request_params.get("id_list", [])
+    model_id = request_params.get("model_id", -1)
+    project_info = session.query(Project).filter_by(id=project_id).first()
+
+    # Look up the project
+    if not project_info:
+        return {"error_code": 0, "error_message": f"Project not found --> {project_id}"}
+    focus_field, target_label, label_type = project_info.focusField.split(
+        ","), project_info.labels.split(","), project_info.type
+
+    # Look up the model
+    model_info = session.query(Model).filter_by(id=model_id).first()
+    print(type(model_id), "-->", model_id)
+    if not model_info:
+        return {"error_code": 0, "error_message": f"Model not found --> {model_id}"}
+
+    # Load the model, downloading it from OSS if it is not cached locally
+    model_url = model_info.modelFile
+    model_dir = os.path.join(baseDir, model_url)
+    model_path = os.path.join(model_dir, "model.model")
+    if not os.path.exists(model_path):
+        dir_status = generate_directory(model_dir)
+        if not dir_status:
+            return {"error_code": 0, "error_message": "Failed to create the model directory, check storage"}
+        status = File.download_file(model_url, model_path)
+        if not status:
+            return {"error_code": 0, "error_message": f"Failed to load the model from OSS --> {model_id}"}
+    try:
+        data = predict(id_list, tfidf_vec, label_type, focus_field, target_label, model_path)
+    except Exception as e:
+        print(e)
+        return {"error_code": 0, "error_message": "Prediction failed"}
+    # Clean up the local model cache
+    if os.path.exists(model_dir):
+        del_directory(model_dir)
+    return {"error_code": 1, "data": data}

BIN
machine_models/__pycache__/__init__.cpython-37.pyc


BIN
machine_models/__pycache__/predict_model.cpython-37.pyc


BIN
machine_models/__pycache__/tools.cpython-37.pyc


BIN
machine_models/__pycache__/train_model.cpython-37.pyc


+ 144 - 0
machine_models/databases/__init__.py

@@ -0,0 +1,144 @@
+# coding:utf-8
+'''
+Data loading and caching
+'''
+from sqlalchemy.orm.session import sessionmaker
+from machine_models.databases.mysql_helper import init_db
+from machine_models.databases.mysql_helper import Model
+from machine_models.databases.mysql_helper import AnnotatedData
+from machine_models.databases.mongo_helper import MongoConnect
+from docs.config import mysql_config
+from docs.config import source_mongo_config
+from docs.config import catch_mongo_config
+from util.fs_client import FileServeClient
+from machine_models.tools import chinese2vector
+from docs.config import stopWordsPath
+from bson import ObjectId
+from docs.config import oss_file_config
+from docs.config import oss_txt_config
+from util.oss_file import OssServeClient
+
+# Initialize connections
+Fs = FileServeClient(oss_txt_config)
+File = OssServeClient(oss_file_config)
+engine = init_db(mysql_config)
+Connect = sessionmaker(bind=engine)
+session = Connect()
+source_mongo = MongoConnect(source_mongo_config)
+catch_mongo = MongoConnect(catch_mongo_config)
+
+# Load the stopwords
+with open(stopWordsPath, "r") as f:
+    stop_words = [word.strip() for word in f.readlines()]
+
+
+def get_info(m_id, focus_field: list, need_doc: bool = False):
+    """
+    Fetch the focus fields of a document
+    :param m_id:
+    :param focus_field:
+    :param need_doc: also return the raw document
+    :return:
+    """
+    select_fields = ["title", "detail", "href", "buyer", "winner", "purchasing", "attach_text", "cut_title",
+                     "cut_detail", "cut_buyer", "cut_winner", "cut_purchasing", "cut_attach_text"]
+    fields = {field: 1 for field in select_fields}
+    c_info = catch_mongo.get_by_mid(ObjectId(m_id.strip()), fields)
+    if c_info:
+        # Extract the field content
+        content, add_field = select_field(c_info, focus_field)
+        # Cache the newly cut fields
+        if add_field:
+            catch_mongo.update(c_info["_id"], add_field)
+        doc = c_info if need_doc else {}
+        return content, doc
+    s_info = source_mongo.get_by_mid(ObjectId(m_id.strip()), fields)
+    if s_info:
+        # Extract the field content
+        content, add_field = select_field(s_info, focus_field)
+        # Cache the document together with the cut fields
+        s_info.update(add_field)
+        catch_mongo.insert(s_info)
+        doc = s_info if need_doc else {}
+        return content, doc
+    return "", {}
+
+
+def select_field(info, focus_field):
+    """
+    Select and merge the requested fields
+    :param info:
+    :param focus_field:
+    :return:
+    """
+    content = ""  # merged, tokenized text
+    add_field = {}  # newly cut fields to cache
+    for field in focus_field:
+        content += " "
+        if field in info:
+            content += info[field]
+        else:
+            original_field = field.split("_", 1)[-1]
+            if original_field in info:
+                add_field[field] = get_content(original_field, info.get(original_field, ""))
+                content += add_field[field]
+    return content, add_field
+
+
+def get_content(field: str, value) -> str:
+    """
+    Build tokenized text from a raw field value
+    :param field: field name
+    :param value: field value
+    :return: merged, tokenized text
+    """
+    content = ""  # body text
+    if value and field == "attach_text":  # attachments are handled separately
+        for ind, attach in value.items():
+            for topic, topic_detail in attach.items():
+                attach_url = topic_detail.get("attach_url", "")
+                # Load the attachment text from OSS
+                state, attach_txt = Fs.download_text_content(attach_url)
+                if state:
+                    content += attach_txt
+    else:
+        # Generic handling
+        if isinstance(value, str):
+            content = value if value else ""
+        else:
+            return ""
+    return chinese2vector(content, remove_word=["x"], stopwords=stop_words)
+
+
+def loading_train_data(project_id, focus_field):
+    """
+    Load the training data
+    :param project_id:
+    :param focus_field:
+    :return:
+    """
+    train_data = []
+    labels = []
+    result = session.query(AnnotatedData).filter_by(projectId=project_id).order_by(AnnotatedData.id).all()
+    for row in result:
+        label, m_id = row.label, row.infoId
+        many_label = [tag.strip() for tag in label.split(",") if tag.strip()]
+        if not many_label:
+            continue
+        content, doc = get_info(m_id, focus_field)
+        # Add the training text
+        if content.strip():
+            train_data.append(content)
+            labels.append(many_label)
+    return train_data, labels, len(labels)
+
+
+def loading_predict_data(m_id: str, focus_field: list):
+    """
+    Load the data to predict on
+    :param m_id:
+    :param focus_field:
+    :return:
+    """
+    content, doc = get_info(m_id, focus_field, need_doc=True)
+    return content, doc
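
A hedged usage sketch (assumes the MongoDB instances from docs/config.py are reachable; the id is a placeholder):

    from machine_models.databases import loading_predict_data

    # returns the merged, tokenized focus-field text plus the raw document
    content, doc = loading_predict_data("62a0c3a1e4b0f2d3c4b5a697", ["cut_title", "cut_detail"])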

BIN
machine_models/databases/__pycache__/__init__.cpython-37.pyc


BIN
machine_models/databases/__pycache__/mongo_helper.cpython-37.pyc


BIN
machine_models/databases/__pycache__/mysql_helper.cpython-37.pyc


+ 76 - 0
machine_models/databases/mongo_helper.py

@@ -0,0 +1,76 @@
+# coding:utf-8
+
+"""
+mongodb 数据库连接文件
+"""
+
+from pymongo import MongoClient
+import urllib.parse as parse
+from loguru import logger
+from pymongo.errors import CursorNotFound
+
+
+class MongoConnect(object):
+    def __init__(self, config):
+        self.__host = config.get("host", "")
+        self.__user = config.get("user", "")
+        self.__password = config.get("password", "")
+        self.__database = config.get("db", "")
+        self.__col = config.get("col", "")
+        self.__charset = config.get("charset", "")
+        self.client, self.col = self.connect()
+
+    def connect(self):
+        """
+        Connect to the database
+        :return:
+        """
+        # Escape special characters (quote into locals so a reconnect does not re-quote)
+        user = parse.quote_plus(self.__user)
+        password = parse.quote_plus(self.__password)
+
+        # Connect, with or without credentials
+        if user:
+            client = MongoClient(
+                "mongodb://{}:{}@{}".format(user, password, self.__host),
+                unicode_decode_error_handler='ignore')
+        else:
+            client = MongoClient(
+                "mongodb://{}".format(self.__host),
+                unicode_decode_error_handler='ignore')
+        col = client[self.__database][self.__col]
+        return client, col
+
+    def get_by_mid(self, m_id, fields):
+        info = {}
+        for i in range(2):
+            try:
+                info = self.col.find_one({"_id": m_id}, fields)
+                break
+            except CursorNotFound as e:
+                logger.warning(e)
+                self.client, self.col = self.connect()
+
+        return info
+
+    def insert(self, row):
+        info = {}
+        for i in range(2):
+            try:
+                info = self.col.insert_one(row)
+                break
+            except CursorNotFound as e:
+                logger.warning(e)
+                self.client, self.col = self.connect()
+        return info
+
+    def update(self, m_id, row):
+        info = {}
+        for i in range(2):
+            try:
+                info = self.col.update_one({"_id": m_id}, {"$set": row})
+                break
+            except CursorNotFound as e:
+                logger.warning(e)
+                self.client, self.col = self.connect()
+        return info
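
For illustration, connecting with one of the configs from docs/config.py (the ObjectId is a placeholder):

    from bson import ObjectId
    from docs.config import catch_mongo_config
    from machine_models.databases.mongo_helper import MongoConnect

    mongo = MongoConnect(catch_mongo_config)
    doc = mongo.get_by_mid(ObjectId("62a0c3a1e4b0f2d3c4b5a697"), {"title": 1})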

+ 67 - 0
machine_models/databases/mysql_helper.py

@@ -0,0 +1,67 @@
+# coding:utf-8
+from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy import Column, String, Integer, Float, DATETIME
+
+Base = declarative_base()
+
+
+class AnnotatedData(Base):
+    # Annotated training data table
+    __tablename__ = 'annotatedData'
+    id = Column(Integer, primary_key=True)
+    tenantID = Column(Integer, comment="tenant ID")
+    projectId = Column(Integer, comment="project ID")
+    infoId = Column(String(100), comment="document ID")
+    label = Column(String(255), comment="label tags")
+    createTime = Column(DATETIME, comment="creation time")
+
+
+class Model(Base):
+    # Model table
+    __tablename__ = 'model'
+    id = Column(Integer, primary_key=True)
+    createperson = Column(String(100), comment="creator")
+    createTime = Column(DATETIME, comment='creation time')
+    sampleData = Column(Integer, comment='training sample count')
+    recallRate = Column(Float(11, 2), comment='recall')
+    precision = Column(Float(11, 2), comment='precision')
+    accuracyRate = Column(Float(11, 2), comment='accuracy')
+    state = Column(Integer(), comment='default-model flag: 0 no, 1 yes')
+    modelFile = Column(String(255), comment='model file (OSS storage)')
+    projectId = Column(Integer, comment='project ID')
+
+
+class Project(Base):
+    # Project table
+    __tablename__ = 'project'
+    id = Column(Integer, primary_key=True)
+    name = Column(String(255), comment="project name")
+    labels = Column(String(255), comment='label set')
+    type = Column(Integer, comment='label type (1 = single-label)')
+    userId = Column(Integer, comment='user id')
+    model = Column(Integer, comment='model ID')
+    focusField = Column(String(255), comment='focus fields')
+    createTime = Column(DATETIME, comment='creation time')
+    totalCount = Column(Integer, comment="total count")
+
+
+def init_db(mysql_config):
+    """
+    Initialize the database engine
+    :return:
+    """
+    db = mysql_config.get("db")
+    ip = mysql_config.get("ip")
+    port = mysql_config.get("port")
+    user = mysql_config.get("user")
+    pwd = mysql_config.get("pwd")
+    charset = mysql_config.get("charset")
+    engine = create_engine(
+        f"mysql+pymysql://{user}:{pwd}@{ip}:{port}/{db}?charset={charset}",
+        max_overflow=0,  # max connections created beyond pool_size
+        pool_size=5,  # connection pool size
+        pool_timeout=30,  # seconds to wait for a free connection before raising
+        pool_recycle=-1  # seconds before a pooled connection is recycled (-1 = never)
+    )
+    return engine
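
The engine is consumed through a sessionmaker, as machine_models/databases/__init__.py does; a minimal sketch:

    from sqlalchemy.orm.session import sessionmaker
    from docs.config import mysql_config
    from machine_models.databases.mysql_helper import init_db, Project

    engine = init_db(mysql_config)
    session = sessionmaker(bind=engine)()
    project = session.query(Project).filter_by(id=1).first()  # id is a placeholder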

+ 45 - 0
machine_models/predict_model.py

@@ -0,0 +1,45 @@
+# coding:utf-8
+
+from machine_models.databases import loading_predict_data
+import joblib
+from machine_models.tools import encode2label
+from docs.config import convertField
+
+
+def predict(id_list, tfidf_vec, label_type, focus_field, target_label, model_path):
+    '''
+    Prediction entry point
+    :param id_list: list of document ids
+    :param tfidf_vec: fitted tf-idf vectorizer
+    :param label_type: label type (1 = single-label)
+    :param focus_field: focus fields
+    :param target_label: target labels
+    :param model_path: model_path
+    :return:
+    '''
+    model, le = joblib.load(model_path)
+    # Run the prediction
+    focus_field = [convertField[field] for field in focus_field if field in convertField]
+    predict_result = []
+    for m_id in id_list:
+        content, doc = loading_predict_data(m_id, focus_field)
+        if not doc:
+            predict_result.append({"id": m_id, "title": "",
+                                   "url": "", "labels": ""})
+            continue
+        content_vec = tfidf_vec.transform([content])
+        # Single-label
+        if label_type == 1:
+            predict_y = model.predict(content_vec)
+            target = le.classes_[predict_y[0]] if len(predict_y) > 0 else ""
+            predict_result.append({"id": m_id, "title": doc.get("title", ""),
+                                   "url": doc.get("href", ""), "labels": target})
+
+        else:
+            # Multi-label
+            predict_y = model.predict(content_vec)
+            result = encode2label(le, predict_y, target_label)
+            target = result[0] if result else ""
+            predict_result.append({"id": m_id, "title": doc.get("title", ""),
+                                   "url": doc.get("href", ""), "labels": target})
+    return predict_result

+ 120 - 0
machine_models/tools.py

@@ -0,0 +1,120 @@
+# coding:utf-8
+
+import jieba.posseg as psg
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.preprocessing import MultiLabelBinarizer
+from util.htmlutil.htmltag import Clean
+import jieba
+import multiprocessing
+
+jieba.enable_parallel(multiprocessing.cpu_count())
+
+
+def chinese2vectors(chinese: list, remove_word: list, stop_words: list) -> list:
+    """
+    Tokenize Chinese text into space-separated words (batch)
+    :param chinese:
+    :param remove_word: POS tags to drop, e.g. x, n, eng
+    :param stop_words: stopwords
+    :return:
+    """
+    if not remove_word:
+        remove_word = ["x"]
+    if not stop_words:
+        stop_words = []
+    space_words = []
+    for row in chinese:
+        cut_ret = [word for word, x in psg.lcut(Clean(row)) if x not in remove_word and word not in stop_words]
+        space_words.append(" ".join(cut_ret))
+    return space_words
+
+
+def chinese2vector(chinese: str, remove_word: list, stopwords: list) -> str:
+    """
+    Tokenize Chinese text into space-separated words (single document)
+    :param chinese:
+    :param remove_word: POS tags to drop, e.g. x, n, eng
+    :param stopwords: stopwords
+    :return:
+    """
+    if not stopwords:
+        stopwords = []
+    if not remove_word:
+        remove_word = ["x"]
+    cut_ret = [word for word, x in psg.lcut(Clean(chinese)) if x not in remove_word and word not in stopwords]
+    cut_ret = " ".join(cut_ret)
+    return cut_ret
+
+
+def tfidf(analyzer, space_words) -> tuple:
+    '''
+    tf-idf encoding
+    :param analyzer:
+    :param space_words:
+    :return:
+    '''
+    tfidf_vec = TfidfVectorizer(analyzer=analyzer)
+    tfidf_ret = tfidf_vec.fit_transform(space_words)
+    return tfidf_vec, tfidf_ret
+
+
+def one2hot(space_words) -> tuple:
+    '''
+    one-hot encoding
+    :param space_words:
+    :return:
+    '''
+    oht = OneHotEncoder()
+    oht_ret = oht.fit_transform(space_words)
+    return oht, oht_ret
+
+
+def combine_row(target_one: list, target_two: list) -> list:
+    """
+    Concatenate two row lists element-wise
+    :param target_one:
+    :param target_two:
+    :return:
+    """
+    if len(target_one) != len(target_two):
+        raise ValueError("The two lists have different lengths")
+    for ind, row in enumerate(target_two):
+        target_one[ind] += row
+    return target_one
+
+
+def label2encode(labels: list) -> tuple:
+    """
+    Vectorize labels with LabelEncoder + MultiLabelBinarizer
+    :param labels:
+    :return:
+    """
+    le = LabelEncoder()
+    train_labels = []
+    for row in labels:
+        train_labels += row
+    le.fit(train_labels)
+    le_ret = [le.transform(row) for row in labels]
+    le_ret = MultiLabelBinarizer().fit_transform(le_ret)
+    return le, le_ret
+
+
+def encode2label(le, predict_results, target_label: list) -> list:
+    """
+    Decode prediction vectors back to label strings
+    :param le: fitted label encoder
+    :param predict_results: prediction results
+    :param target_label: wanted classes
+    :return:
+    """
+    detail_labels = []
+    for i, label in enumerate(predict_results):
+        if label.sum() > 0:
+            indices = [ind for (ind, x) in enumerate(label) if x > 0]
+            label_str = ','.join([tag for tag in le.inverse_transform(indices) if tag in target_label])
+            detail_labels.append(label_str)
+    return detail_labels
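
Putting the pieces together, roughly as create_dict.py does (the sample texts are placeholders and no stopwords are applied):

    from machine_models.tools import chinese2vector, tfidf

    corpus = [chinese2vector(text, remove_word=["x"], stopwords=[])
              for text in ["某项目公开招标公告", "中标结果公示"]]
    tfidf_vec, tfidf_ret = tfidf(analyzer="word", space_words=corpus)
    print(tfidf_ret.shape)  # (2, vocabulary size)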

+ 127 - 0
machine_models/train_model.py

@@ -0,0 +1,127 @@
+# coding:utf-8
+
+from sklearn.svm import LinearSVC
+import os
+import joblib
+from machine_models.databases import loading_train_data
+from sklearn.model_selection import train_test_split
+from machine_models.tools import label2encode
+from sklearn.multiclass import OneVsRestClassifier
+import datetime
+from docs.config import convertField
+from machine_models.databases import File
+import numpy as np
+import uuid
+from machine_models.databases.mysql_helper import Model
+
+
+def many_recall_score(y_test, y_pred):
+    '''
+    Multi-label recall
+    :param y_test:
+    :param y_pred:
+    :return:
+    '''
+    correct_count = 0
+    total_count = 0
+    for values in zip(y_test, y_pred):
+        test_result = values[0]
+        pred_result = values[1]
+        total_count += test_result.sum()
+        correct_count += pred_result[test_result > 0].sum()
+    return correct_count / total_count
+
+
+def recall_score(y_test, y_pred):
+    '''
+    Single-label recall (computed here as overall accuracy)
+    :param y_test:
+    :param y_pred:
+    :return:
+    '''
+    return (y_test == y_pred).sum() / y_test.size
+
+
+def train_ones_label(x_train, y_train):
+    '''
+    Single-label training
+    :return:
+    '''
+    seed = int(datetime.datetime.now().timestamp())
+    model = LinearSVC(random_state=seed)
+    model.fit(x_train, y_train)
+    return model
+
+
+def train_many_labels(x_train, y_train):
+    '''
+    Multi-label training
+    :param x_train:
+    :param y_train:
+    :return:
+    '''
+    seed = int(datetime.datetime.now().timestamp())
+    model = LinearSVC(random_state=seed)
+    clf = OneVsRestClassifier(model, n_jobs=-1)  # build a multi-label classifier from binary ones
+    clf.fit(x_train, y_train)  # train
+    return clf
+
+
+def train(project_id, focus_field, tfidf_vec, label_type: int, model_dir: str):
+    """
+    Train a model
+    :param project_id:
+    :param focus_field:
+    :param tfidf_vec:
+    :param label_type:
+    :param model_dir:
+    :return:
+    """
+    # Focus fields
+    focus_field = [convertField[field] for field in focus_field if field in convertField]
+    # Load the data
+    train_data, train_label, count = loading_train_data(project_id, focus_field)
+    # Vectorize the training data
+    train_vec = tfidf_vec.transform(train_data)
+    # Encode the labels
+    le, label_vec = label2encode(train_label)
+    if label_type == 1:
+        single_label = []
+        for label in label_vec:
+            for ind, tag in enumerate(label):
+                if tag == 1:
+                    single_label.append(ind)
+                    break
+        label_vec = single_label
+    x_train, x_test, y_train, y_test = train_test_split(train_vec, label_vec, test_size=0.2, shuffle=True)
+    model_path = os.path.join(model_dir, "model.model")
+    try:
+        if label_type == 1:
+            # Single-label training
+            y_test = np.array(y_test)
+            clf = train_ones_label(x_train, y_train)
+            y_pred = clf.predict(x_test)
+            # Evaluate
+            score = (y_test == y_pred).sum() / y_test.size
+            recall = recall_score(y_test, y_pred)
+        else:
+            # Multi-label training
+            clf = train_many_labels(x_train, y_train)
+            y_pred = clf.predict(x_test)
+            # Evaluate
+            score = (y_test == y_pred).sum() / y_test.size
+            recall = many_recall_score(y_test, y_pred)
+    except Exception:
+        return False
+    # Save the model
+    joblib.dump((clf, le), model_path)
+
+    # Upload the model
+    model_url = str(uuid.uuid4())
+    with open(model_path, "rb") as f:
+        File.upload_bytes_file(model_url, f.read())
+    f1_score = ((score * recall) / (score + recall)) * 2 if score and recall else 0
+    # Build the database record
+    add_model = Model(sampleData=count, recallRate=recall, precision=score, accuracyRate=f1_score,
+                      modelFile=model_url)
+    return add_model

+ 31 - 0
predict_server.py

@@ -0,0 +1,31 @@
+# coding:utf-8
+"""
+ 预测客户端
+"""
+import tornado.ioloop
+import tornado.web
+import json
+from machine_models import predict_model
+from loguru import logger
+
+logger.add('./logs/predict_{time}.log', rotation='00:00')
+
+
+class MainHandler(tornado.web.RequestHandler):
+    def post(self):
+        request_params = self.request.body.decode('utf-8')
+        try:
+            request_dict = json.loads(request_params)
+            predict_result = predict_model(request_dict)
+            response_data = json.dumps(predict_result)
+        except Exception as e:
+            logger.warning(e)
+            response_data = json.dumps({"error_code": 0})
+        self.write(response_data)
+
+
+if __name__ == '__main__':
+    application = tornado.web.Application([(r"/jy_machining/predict", MainHandler), ])
+    application.listen(8686)
+    print('server start')
+    tornado.ioloop.IOLoop.instance().start()
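
A hedged client-side sketch for exercising the endpoint (assumes the server is running locally on port 8686 and the requests package is installed; ids are placeholders):

    import json
    import requests

    payload = {"id": 1, "model_id": 3, "id_list": ["62a0c3a1e4b0f2d3c4b5a697"]}
    resp = requests.post("http://127.0.0.1:8686/jy_machining/predict",
                         data=json.dumps(payload))
    print(resp.json())  # {"error_code": 1, "data": [...]} on success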

+ 53 - 0
train_server.py

@@ -0,0 +1,53 @@
+# coding:utf-8
+'''
+Training server (nsq consumer)
+'''
+import nsq
+import json
+from machine_models import train_model
+from loguru import logger
+from queue import Queue
+import time
+from threading import Thread
+
+logger.add('./logs/runtime_{time}.log', rotation='00:00')
+queueSave = Queue(maxsize=10000)  # task queue
+
+
+def train_start():
+    # Poll the task queue and run training jobs
+    global queueSave
+    while True:
+        if not queueSave.empty():
+            params = queueSave.get()
+            train_model(params)
+            continue
+        time.sleep(5)
+
+
+def handler(message):
+    '''
+    nsq queue callback
+    :param message:
+    :return:
+    '''
+    global queueSave
+    try:
+        body = message.body
+        body = json.loads(body)
+        queueSave.put(body)
+    except Exception as e:
+        logger.warning("start--> {}", e)
+    return True
+
+
+r = nsq.Reader(message_handler=handler, nsqd_tcp_addresses=['192.168.3.13:4150'], topic='machine_train',
+               channel='NO.1',
+               lookupd_poll_interval=5,
+               lookupd_connect_timeout=10000,
+               lookupd_request_timeout=10000)
+if __name__ == '__main__':
+    train_thread = Thread(target=train_start)
+    train_thread.start()
+    nsq.run()
+    train_thread.join()
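
The handler expects the nsq message body to be the same JSON that train_model consumes; a sketch of a valid message (all values are placeholders):

    import json

    message_body = json.dumps({
        "id": 1,             # project id
        "userId": "u1",
        "type": 1,           # 1 = single-label
        "fields": "标题,正文"
    })
    # publish message_body to topic 'machine_train' on 192.168.3.13:4150,
    # e.g. via nsqd's HTTP /pub endpoint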

BIN
util/__pycache__/file_operations.cpython-37.pyc


BIN
util/__pycache__/fs_client.cpython-37.pyc


BIN
util/__pycache__/oss_file.cpython-37.pyc


+ 62 - 0
util/file_operations.py

@@ -0,0 +1,62 @@
+# coding:utf-8
+'''
+File operations
+'''
+import os
+import shutil
+from shutil import copyfile
+
+
+def save_file(file: bytes, filename):
+    '''
+    Save bytes to a file
+    :param file:
+    :param filename:
+    :return:
+    '''
+    try:
+        with open(filename, "wb") as fw:
+            fw.write(file)
+        return True
+    except Exception:
+        return False
+
+
+def generate_directory(dir_path: str) -> bool:
+    '''
+    Create a directory (world-writable)
+    :param dir_path: directory path
+    :return:
+    '''
+    try:
+        if not os.path.exists(dir_path):
+            os.makedirs(dir_path)
+            os.chmod(dir_path, 0o777)
+    except Exception:
+        return False
+    return True
+
+
+def del_directory(dir_path: str):
+    '''
+    Delete a directory tree
+    :param dir_path: directory path
+    :return:
+    '''
+    if os.path.exists(dir_path):
+        shutil.rmtree(dir_path)
+
+
+def file_copy(source_path: str, target_path: str):
+    """
+    Copy a file to the target path
+    :param source_path: source file path
+    :param target_path: target path
+    :return:
+    """
+    try:
+        copyfile(source_path, target_path)
+    except IOError:
+        return False
+    return True

+ 54 - 0
util/fs_client.py

@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+import oss2
+
+
+class FileServeClient(object):
+    def __init__(self, config):
+        '''
+        Text storage client
+        Currently backed by the Aliyun OSS object storage service
+        Note: all reads and writes are keyed by object-name, so keep the object-name
+        '''
+        self.auth = None
+        self.bucket = None
+        self._access_key_id = config.get("access_key_id", "")
+        self._access_key_secret = config.get("access_key_secret", "")
+        self._endpoint = config.get("endpoint", "")
+        self._bucket_name = config.get("bucket_name", "")
+        self.do_auth()
+
+    def do_auth(self):
+        '''
+        Authenticate
+        '''
+        auth = oss2.Auth(self._access_key_id, self._access_key_secret)
+        bucket = oss2.Bucket(auth, self._endpoint, self._bucket_name)
+        self.auth = auth
+        self.bucket = bucket
+
+    def upload_text_file(self, object_name: str, file_content: str) -> tuple:
+        '''
+        Upload text
+        '''
+        result = self.bucket.put_object(object_name, bytes(file_content, encoding='utf-8'))
+        status, request_id = result.status, result.request_id
+        return status, request_id
+
+    def download_text_content(self, object_name) -> tuple:
+        '''
+        Download text content, verifying the CRC
+        '''
+        object_stream = self.bucket.get_object(object_name)
+        content = object_stream.read()
+        if object_stream.client_crc == object_stream.server_crc:
+            return True, str(content, encoding='utf-8')
+        else:
+            return False, ''
+
+    def delete_object(self, object_name: str) -> tuple:
+        '''
+        Delete an object
+        '''
+        result = self.bucket.delete_object(object_name)
+        status, request_id = result.status, result.request_id
+        return status, request_id

BIN
util/htmlutil/__pycache__/htmltag.cpython-37.pyc


+ 78 - 0
util/htmlutil/htmltag.py

@@ -0,0 +1,78 @@
+# coding:utf-8
+import re
+
+br_reg = re.compile('<br[/]*>', re.I)
+table_reg = re.compile('<([/]*table[^>]*)>', re.I)
+tablebody_reg = re.compile('<([/]*tbody[^>]*)>', re.I)
+input_reg = re.compile(r'<[/]*input[^>].*?value="(.*?)"[/]>', re.I)
+tr_reg = re.compile('<([/]*tr[^>]*)>', re.I)
+th_reg = re.compile('<([/]*th[^>]*)>', re.I)
+td_reg = re.compile('<([/]*td[^>]*)>', re.I)
+p_reg = re.compile('<[/]?p>', re.I)
+othertag_reg = re.compile('<[^>]+>', re.I | re.M)
+other_symbol_reg = re.compile('[\t| ]*')
+seg_first_space_reg = re.compile('\n+\\s*', re.M)
+mul_crcf_reg = re.compile('\n+', re.M)
+brackets_reg = re.compile('\\s+')
+table_fk_reg = re.compile('(\\[table[^\\]]*\\])(.*?)(\\[/table\\])', re.M | re.S | re.I)
+
+
+# Clean HTML tags
+def Clean(html: str):
+    html = br_reg.sub('\n', html)
+    html = table_reg.sub('', html)
+    html = tablebody_reg.sub('', html)
+    html = tr_reg.sub('\n', html)
+    html = td_reg.sub(' ', html)
+    html = p_reg.sub('\n', html)
+    html = othertag_reg.sub('', html)
+    html = other_symbol_reg.sub('', html)
+    html = seg_first_space_reg.sub('\n', html)
+    html = mul_crcf_reg.sub('\n', html)
+    return html
+
+
+def ClearSpace(txt: str):
+    return brackets_reg.sub('', txt)
+
+
+# Clean HTML tags but keep table markup
+def CleanKeepTable(html: str):
+    html = br_reg.sub('\n', html)
+    html = table_reg.sub(subFunc4Match, html)
+    html = tablebody_reg.sub(subFunc4Match, html)
+    html = tr_reg.sub(subFunc4Match, html)
+    html = td_reg.sub(subFunc4Match, html)
+    html = th_reg.sub(subFunc4Match, html)
+    html = p_reg.sub('\n', html)
+    html = othertag_reg.sub('', html)
+    # html = other_symbol_reg.sub('',html)
+    html = seg_first_space_reg.sub('\n', html)
+    # print("-->", html)
+    html = table_fk_reg.sub(lambda x: x.group(1) + mul_crcf_reg.sub(' ', x.group(2)) + x.group(3), html)
+    html = mul_crcf_reg.sub('\n', html)
+    # restore the bracket-escaped table tags
+    html = html.replace('[', '<').replace(']', '>')
+    html = html.replace('<table', '\n<table').replace('</table>', '</table>\n')
+    return html
+
+
+def subFunc4Match(strmatch):
+    try:
+        if strmatch:
+            return '[%s]' % (strmatch.group(1))
+        else:
+            return ""
+    except Exception as e:
+        print(e)
+
+
+def extract_input_value(html):
+    input_reg = re.compile(r'<[/]*input[^>].*?value="(.*?)"[/]>', re.I)
+    input_r = re.compile(r'<[/]*input[^>].*?[/]>', re.I)
+    result = input_r.findall(html)
+    for input_detail in result:
+        ret = input_reg.findall(input_detail)
+        if ret:
+            html = html.replace(input_detail, f"</td><td>{ret[0]}")
+    return html
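
Clean in isolation, for illustration (the HTML snippet is a made-up example):

    from util.htmlutil.htmltag import Clean

    html = "<p>项目公告</p><table><tr><td>编号</td><td>001</td></tr></table>"
    print(Clean(html))  # tags stripped, block tags become newlines, whitespace collapsed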

+ 70 - 0
util/oss_file.py

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+import shutil
+import oss2
+
+
+class OssServeClient(object):
+    def __init__(self, config):
+        '''
+        File storage client
+        Currently backed by the Aliyun OSS object storage service
+        Note: all reads and writes are keyed by object-name, so keep the object-name
+        '''
+        self.auth = None
+        self.bucket = None
+        self._access_key_id = config.get("access_key_id", "")
+        self._access_key_secret = config.get("access_key_secret", "")
+        self._endpoint = config.get("endpoint", "")
+        self._bucket_name = config.get("bucket_name", "")
+        self.do_auth()
+
+    def do_auth(self):
+        '''
+        Authenticate
+        '''
+        auth = oss2.Auth(self._access_key_id, self._access_key_secret)
+        bucket = oss2.Bucket(auth, self._endpoint, self._bucket_name)
+        self.auth = auth
+        self.bucket = bucket
+
+    def delete_object(self, object_name: str) -> tuple:
+        '''
+        Delete an object
+        '''
+        result = self.bucket.delete_object(object_name)
+        status, request_id = result.status, result.request_id
+        return status, request_id
+
+    def upload_bytes_file(self, object_name: str, file_content: bytes):
+        '''
+        Upload a file
+        :param object_name: fid
+        :param file_content: file bytes
+        :return:
+        '''
+        result = self.bucket.put_object(object_name, file_content)
+        status, request_id = result.status, result.request_id
+        return status, request_id
+
+    def download_file(self, object_name, save_path):
+        '''
+        Download a file to local disk
+        :param object_name: fid
+        :param save_path: save path
+        :return:
+        '''
+        object_stream = self.bucket.get_object_to_file(object_name, save_path)
+        return object_stream.status == 200
+
+    def download_file_stream(self, object_name, filename):
+        '''
+        Download a file as a stream
+        :param object_name: fid
+        :param filename: local file path
+        :return:
+        '''
+        object_stream = self.bucket.get_object(object_name)
+        with open(filename, 'wb') as file:
+            shutil.copyfileobj(object_stream, file)
+        return object_stream.status, filename