package extract
import (
"data_ai/clean"
"data_ai/prompt"
"data_ai/ul"
log "github.com/donnie4w/go-logger/logger"
qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
"strings"
"sync"
"time"
"unicode/utf8"
)
// ExtractFieldInfo runs large-model field recognition over the bidding
// documents whose _id lies in the range (sid, eid] ($gt sid, $lte eid).
//
// The extraction table is consulted first (ConfrimExtractInfo) to learn which
// ids are eligible; a document is processed only when its "infoformat" is at
// most 1 and its id is present in that result. For each eligible document the
// model output (ResolveInfo) is stored under "ai_zhipu" and, when
// ScientificUnit finds an amount in "detail", that amount is stored under
// "e_bidamount"; both are written to ul.Ext_Name with a single $set.
//
// Side effects: the package-level ul.FlashModel is overwritten on every call,
// depending on how many eligible ids were found compared with ul.MaxUdp.
// At most ul.Reading documents are processed concurrently.
func ExtractFieldInfo(sid string, eid string) {
	q := map[string]interface{}{
		"_id": map[string]interface{}{
			"$gt":  ul.StringTOBsonId(sid),
			"$lte": ul.StringTOBsonId(eid),
		},
	}
	// Query the extraction table first to determine the set of documents the
	// large model has to handle.
	dict := ConfrimExtractInfo(q)
	log.Debug("查询语句...", q, "~", len(dict))
	if len(dict) >= ul.MaxUdp { // choose the model according to the volume
		log.Debug("数量超过限制临时使用:glm-4-flashx")
		ul.FlashModel = "glm-4-flashx"
	} else {
		ul.FlashModel = "glm-4-flash"
	}

	// pool_mgo is a counting semaphore that bounds the number of in-flight
	// workers; wg_mgo lets us wait for the stragglers after the scan ends.
	pool_mgo := make(chan bool, ul.Reading)
	var wg_mgo sync.WaitGroup
	sess := ul.BidMgo.GetMgoConn()
	defer ul.BidMgo.DestoryMongoConn(sess)

	total := 0
	it := sess.DB(ul.BidMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
		if total%1000 == 0 {
			log.Debug("cur ai index ", total)
		}
		tmpid := ul.BsonTOStringId(tmp["_id"])
		if qu.IntAll(tmp["infoformat"]) > 1 || dict[tmpid] == nil {
			// A fresh map is required for every Next call so that fields of
			// the previous document do not leak into the next one.
			tmp = make(map[string]interface{})
			continue
		}
		pool_mgo <- true
		wg_mgo.Add(1)
		go func(tmp map[string]interface{}, u_id string) {
			defer func() {
				<-pool_mgo
				wg_mgo.Done()
			}()
			info := map[string]interface{}{}
			// Large-model result.
			if ai_zhipu := ResolveInfo(tmp, u_id); len(ai_zhipu) > 0 {
				info["ai_zhipu"] = ai_zhipu
			}
			// Amount written in scientific notation; this marker may be
			// redundant with the model output.
			s, f := ScientificUnit(qu.ObjToString(tmp["detail"]))
			if s != "" && f > 0.0 {
				info["e_bidamount"] = f
			}
			// Persist only when there is something to write.
			if len(info) > 0 && u_id != "" {
				ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
					"$set": info,
				})
			}
		}(tmp, tmpid)
		tmp = make(map[string]interface{})
	}
	// Release the cursor and surface an iteration error instead of silently
	// treating a failed scan as a complete one.
	if err := it.Close(); err != nil {
		log.Debug("ai iter close error ", err)
	}
	wg_mgo.Wait()
	log.Debug("ai is over ...", sid, "~", eid)
}
// 获取处理数据...
func ResolveInfo(v map[string]interface{}, tmpid string) map[string]interface{} {
title := qu.ObjToString(v["title"])
old_detail := getDetailText(v, tmpid) //获取正文文本
//是否表格
isTable := false
if strings.Contains(old_detail, "
") {
isTable = false //可以屏蔽表格的识别内容
}
//过滤信息
if NotInProgressInfo(title, old_detail, v) {
return map[string]interface{}{}
}
//识别结构,短文本结构
f_data, shorText := map[string]interface{}{}, false
if utf8.RuneCountInString(old_detail) < 100 {
shorText = true
}
//文本格式转换
new_detail := ul.HttpConvertToMarkdown(old_detail)
//特殊文本转换
new_detail = CleanText(new_detail)
//短文本判断是否有效性
if shorText {
if info := prompt.AcquireJudgeShortInfo(new_detail); info["结果"] != "是" {
return map[string]interface{}{}
}
}
//获取外围字段数据-拆分合并字段
f_info_1 := prompt.AcquireExtractFieldInfoFirst(new_detail)
f_info_2 := prompt.AcquireExtractFieldInfoSecond(new_detail)
f_info_3 := prompt.AcquireExtractFieldInfoThird(new_detail)
f_info := MergeInfo([]map[string]interface{}{f_info_1, f_info_2, f_info_3})
//非短文本以下识别-纯测试
if !shorText {
//获取分包信息
if pkg := prompt.AcquireNewMultiplePackageInfo(new_detail, isTable); len(pkg) > 0 {
f_info["s_pkg"] = pkg
}
//获取分类字段数据
s_toptype, s_subtype := prompt.AcquireClassInfo(new_detail, title, qu.ObjToString(v["toptype"]))
f_info["s_toptype"] = s_toptype
f_info["s_subtype"] = s_subtype
//调用标的物识别
if !ul.IsTool && !ul.IsLocal {
if s_purchasinglist := getPurList(v, old_detail, f_info); len(s_purchasinglist) > 0 {
f_info["s_purchasinglist"] = s_purchasinglist
}
}
}
//字段清洗
fns := GetFnsInfo(v) //获取附件名字
f_data = clean.CleanFieldInfo(f_info, fns, isTable)
//采购单位二级校验
CheckOutBuyerInfo(f_data)
//标题提取采购单位
if qu.ObjToString(f_data["s_buyer"]) == "" {
if zp_buyer := prompt.AcquireBuyerInfo(title); zp_buyer["实体单位"] != nil {
if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
f_data["s_buyer"] = ns_buyer
}
}
}
//强制逻辑判断-
ForcedLogicDecideInfo(f_data)
//返回数据
return f_data
}
/*
************************************************************
************************************************************
************************************************************
RunDeepSeek is the polling entry point for the newer deepseek model.

As written, only the first four statements execute: it logs, loads one
document with a hard-coded id from ul.Ext_Name, runs ExtractDeepSeekInfo on
it, logs the result and returns. Everything after that return is
unreachable.
NOTE(review): the early return looks like a leftover debugging shortcut —
confirm before removing it, since doing so enables the full collection scan.
*/
func RunDeepSeek() {
log.Debug("执行轮询定时···deepseek···", ul.Ext_Name)
// Single-document run against a hard-coded id.
tmp_data := ul.PyMgo.FindById(ul.Ext_Name, "67c50d2088dabe81a67a2468")
ttt := ExtractDeepSeekInfo(tmp_data)
log.Debug(ttt)
return
// ---- unreachable from here on (follows an unconditional return) ----
// The loop scans every document of ul.Ext_Name, skips those that already
// carry "ai_updatetime", and processes the rest with at most ul.Reading
// concurrent workers.
for {
log.Debug("开始处理线程数···", ul.Reading)
pool_mgo := make(chan bool, ul.Reading)
wg_mgo := &sync.WaitGroup{}
sess := ul.PyMgo.GetMgoConn()
// This defer sits inside the for loop, so it would run when RunDeepSeek
// returns, not at the end of each round.
defer ul.PyMgo.DestoryMongoConn(sess)
q, total, isok := map[string]interface{}{}, 0, 0
it := sess.DB(ul.PyMgo.DbName).C(ul.Ext_Name).Find(&q).Iter()
for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
if total%100 == 0 {
log.Debug("cur ai index ", total)
}
// isok is incremented before the skip check below, so it counts every
// visited document, including the skipped ones.
isok++
if tmp["ai_updatetime"] != nil {
tmp = make(map[string]interface{})
continue
}
// pool_mgo acts as a semaphore: the send blocks while ul.Reading workers
// are in flight.
pool_mgo <- true
wg_mgo.Add(1)
go func(tmp map[string]interface{}) {
defer func() {
<-pool_mgo
wg_mgo.Done()
}()
u_id := ul.BsonTOStringId(tmp["_id"])
// Extract the deepseek data and write it back.
data := ExtractDeepSeekInfo(tmp)
update_info := make(map[string]interface{}, 0)
if len(data) > 0 && u_id != "" {
tmp["ai_zhipu"] = data
ul.ChooseCheckDataAI(tmp, &update_info)
if update_info["com_package"] == nil { // build single-package info
com_package := ul.CreatSingleFieldInfo(tmp, update_info)
update_info["com_package"] = com_package
}
update_info["ai_zhipu"] = data
}
// "ai_updatetime" is set and the update is issued even when data is empty,
// which is what makes later scans skip this document.
update_info["ai_updatetime"] = time.Now().Unix()
ul.PyMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
"$set": update_info,
})
}(tmp)
tmp = make(map[string]interface{})
}
wg_mgo.Wait()
log.Debug("deepseek完毕······", isok)
// The break leaves the outer loop after one round, so the Sleep below
// never runs.
break
time.Sleep(time.Second * 1800)
}
}
// ExtractDeepSeekInfo recognises the structured fields of one document with
// the deepseek model and returns the cleaned result.
//
// An empty map is returned when NotInProgressInfo rejects the document, or
// when the body has fewer than 100 runes and the short-text judgement does
// not answer "是". The text sent to the model is the title, a newline and the
// body, converted to markdown and passed through CleanText.
func ExtractDeepSeekInfo(tmp map[string]interface{}) map[string]interface{} {
	// Basic information.
	docID := ul.BsonTOStringId(tmp["_id"])
	heading := qu.ObjToString(tmp["title"])
	body := getDetailText(tmp, docID)

	// Documents that must not be processed yield an empty result.
	if NotInProgressInfo(heading, body, tmp) {
		return map[string]interface{}{}
	}

	// Short bodies are measured on the raw text, before any conversion.
	isShort := utf8.RuneCountInString(body) < 100

	// Format conversion followed by the special-text cleanup.
	markdown := CleanText(ul.HttpConvertToMarkdown(heading + "\n" + body))

	// A short text is kept only if the model confirms it is valid.
	if isShort {
		verdict := prompt.AcquireJudgeDeepSeekShortInfo(markdown)
		if verdict["结果"] != "是" {
			return map[string]interface{}{}
		}
	}

	// General fields -> cleaning -> second-level buyer check -> forced logic.
	cleaned := clean.CleanDeepSeekInfo(prompt.AcquireExtractFieldDeepSeekInfo(markdown), tmp)
	CheckOutDeepSeekBuyerInfo(cleaned)
	ForcedLogicDecideInfo(cleaned)
	return cleaned
}
// RunResetUpdateFieldInfo is not enabled for now: its whole body is commented
// out, so calling it does nothing and arr, name and s_name are ignored.
//
// The disabled code below, kept for reference, looks up each id of arr in
// collection name, requests the model up to 5 times per id, stores the
// result under "zhipu" in collection s_name, and then calls itself again
// with the ids that still failed (an unbounded retry).
func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
//log.Debug("开始重置更新...", len(arr))
//reset := []string{}
//for k, v := range arr {
//	log.Debug("...", k, "...", v)
//	data := ul.SourceMgo.FindById(name, v)
//	content := PromptFieldText(qu.ObjToString(data["detail"]))
//	zp, ok := map[string]interface{}{}, 0
//	for {
//		ok++
//		if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
//			break
//		}
//		if ok >= 5 {
//			log.Debug("请求数据失败...", v)
//			reset = append(reset, v)
//			break
//		}
//	}
//	ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
//		"$set": map[string]interface{}{
//			"zhipu": zp,
//		},
//	})
//}
//if len(reset) > 0 { //retry without limit
//	RunResetUpdateFieldInfo(reset, name, s_name)
//}
//log.Debug("本轮重置更新结束......")
}