package extract import ( "data_ai/clean" "data_ai/prompt" "data_ai/ul" log "github.com/donnie4w/go-logger/logger" qu "jygit.jydev.jianyu360.cn/data_processing/common_utils" "strings" "sync" "time" "unicode/utf8" ) // 识别结构化字段 func ExtractFieldInfo(sid string, eid string) { q := map[string]interface{}{ "_id": map[string]interface{}{ "$gt": ul.StringTOBsonId(sid), "$lte": ul.StringTOBsonId(eid), }, } //先查询抽取表-确定大模型需要识别到范围 dict := ConfrimExtractInfo(q) log.Debug("查询语句...", q, "~", len(dict)) if len(dict) >= ul.MaxUdp { //根据数量限制使用具体模型 log.Debug("数量超过限制临时使用:glm-4-flashx") ul.FlashModel = "glm-4-flashx" } else { ul.FlashModel = "glm-4-flash" } pool_mgo := make(chan bool, ul.Reading) wg_mgo := &sync.WaitGroup{} sess := ul.BidMgo.GetMgoConn() defer ul.BidMgo.DestoryMongoConn(sess) total, isok := 0, 0 it := sess.DB(ul.BidMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter() for tmp := make(map[string]interface{}); it.Next(&tmp); total++ { if total%1000 == 0 { log.Debug("cur ai index ", total) } tmpid := ul.BsonTOStringId(tmp["_id"]) infoformat := qu.IntAll(tmp["infoformat"]) if infoformat > 1 || dict[tmpid] == nil { tmp = make(map[string]interface{}) continue } isok++ pool_mgo <- true wg_mgo.Add(1) go func(tmp map[string]interface{}) { defer func() { <-pool_mgo wg_mgo.Done() }() info := map[string]interface{}{} u_id := ul.BsonTOStringId(tmp["_id"]) //大模型数据··· ai_zhipu := ResolveInfo(tmp, u_id) if len(ai_zhipu) > 0 { info["ai_zhipu"] = ai_zhipu } //科学计数法标记···数据标记···会冗余 s, f := ScientificUnit(qu.ObjToString(tmp["detail"])) if s != "" && f > 0.0 { info["e_bidamount"] = f } //更新方法 if len(info) > 0 && u_id != "" { ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{ "$set": info, }) } }(tmp) tmp = make(map[string]interface{}) } wg_mgo.Wait() log.Debug("ai is over ...", sid, "~", eid) } // 获取处理数据... func ResolveInfo(v map[string]interface{}, tmpid string) map[string]interface{} { title := qu.ObjToString(v["title"]) old_detail := getDetailText(v, tmpid) //获取正文文本 //是否表格 isTable := false if strings.Contains(old_detail, "") { isTable = false //可以屏蔽表格的识别内容 } //过滤信息 if NotInProgressInfo(title, old_detail, v) { return map[string]interface{}{} } //识别结构,短文本结构 f_data, shorText := map[string]interface{}{}, false if utf8.RuneCountInString(old_detail) < 100 { shorText = true } //文本格式转换 new_detail := ul.HttpConvertToMarkdown(old_detail) //特殊文本转换 new_detail = CleanText(new_detail) //短文本判断是否有效性 if shorText { if info := prompt.AcquireJudgeShortInfo(new_detail); info["结果"] != "是" { return map[string]interface{}{} } } //获取外围字段数据-拆分合并字段 f_info_1 := prompt.AcquireExtractFieldInfoFirst(new_detail) f_info_2 := prompt.AcquireExtractFieldInfoSecond(new_detail) f_info_3 := prompt.AcquireExtractFieldInfoThird(new_detail) f_info := MergeInfo([]map[string]interface{}{f_info_1, f_info_2, f_info_3}) //非短文本以下识别-纯测试 if !shorText { //获取分包信息 if pkg := prompt.AcquireNewMultiplePackageInfo(new_detail, isTable); len(pkg) > 0 { f_info["s_pkg"] = pkg } //获取分类字段数据 s_toptype, s_subtype := prompt.AcquireClassInfo(new_detail, title, qu.ObjToString(v["toptype"])) f_info["s_toptype"] = s_toptype f_info["s_subtype"] = s_subtype //调用标的物识别 if !ul.IsTool && !ul.IsLocal { if s_purchasinglist := getPurList(v, old_detail, f_info); len(s_purchasinglist) > 0 { f_info["s_purchasinglist"] = s_purchasinglist } } } //字段清洗 fns := GetFnsInfo(v) //获取附件名字 f_data = clean.CleanFieldInfo(f_info, fns, isTable) //采购单位二级校验 CheckOutBuyerInfo(f_data) //标题提取采购单位 if qu.ObjToString(f_data["s_buyer"]) == "" { if zp_buyer := prompt.AcquireBuyerInfo(title); zp_buyer["实体单位"] != nil { if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" { f_data["s_buyer"] = ns_buyer } } } //强制逻辑判断- ForcedLogicDecideInfo(f_data) //返回数据 return f_data } /* ************************************************************ ************************************************************ ************************************************************ 支持新模型-deepseek的轮询查询 */ func RunDeepSeek() { log.Debug("执行轮询定时···deepseek···", ul.Ext_Name) tmp_data := ul.PyMgo.FindById(ul.Ext_Name, "67c50d2088dabe81a67a2468") ttt := ExtractDeepSeekInfo(tmp_data) log.Debug(ttt) return for { log.Debug("开始处理线程数···", ul.Reading) pool_mgo := make(chan bool, ul.Reading) wg_mgo := &sync.WaitGroup{} sess := ul.PyMgo.GetMgoConn() defer ul.PyMgo.DestoryMongoConn(sess) q, total, isok := map[string]interface{}{}, 0, 0 it := sess.DB(ul.PyMgo.DbName).C(ul.Ext_Name).Find(&q).Iter() for tmp := make(map[string]interface{}); it.Next(&tmp); total++ { if total%100 == 0 { log.Debug("cur ai index ", total) } isok++ if tmp["ai_updatetime"] != nil { tmp = make(map[string]interface{}) continue } pool_mgo <- true wg_mgo.Add(1) go func(tmp map[string]interface{}) { defer func() { <-pool_mgo wg_mgo.Done() }() u_id := ul.BsonTOStringId(tmp["_id"]) //抽取deepseek数据···并更新 data := ExtractDeepSeekInfo(tmp) update_info := make(map[string]interface{}, 0) if len(data) > 0 && u_id != "" { tmp["ai_zhipu"] = data ul.ChooseCheckDataAI(tmp, &update_info) if update_info["com_package"] == nil { //构建单包信息··· com_package := ul.CreatSingleFieldInfo(tmp, update_info) update_info["com_package"] = com_package } update_info["ai_zhipu"] = data } update_info["ai_updatetime"] = time.Now().Unix() ul.PyMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{ "$set": update_info, }) }(tmp) tmp = make(map[string]interface{}) } wg_mgo.Wait() log.Debug("deepseek完毕······", isok) break time.Sleep(time.Second * 1800) } } // deepseek模型识别字段 func ExtractDeepSeekInfo(tmp map[string]interface{}) map[string]interface{} { //基础信息 tmpid := ul.BsonTOStringId(tmp["_id"]) title := qu.ObjToString(tmp["title"]) old_detail := getDetailText(tmp, tmpid) //获取正文文本 //过滤信息 if NotInProgressInfo(title, old_detail, tmp) { return map[string]interface{}{} } //识别结构,短文本结构,不想进行分类识别 shorText := false if utf8.RuneCountInString(old_detail) < 100 { shorText = true } //文本格式转换 new_detail := ul.HttpConvertToMarkdown(title + "\n" + old_detail) //特殊文本转换 new_detail = CleanText(new_detail) //短文本判断是否有效性 if shorText { if info := prompt.AcquireJudgeDeepSeekShortInfo(new_detail); info["结果"] != "是" { return map[string]interface{}{} } } //获取通用该字段 f_info := prompt.AcquireExtractFieldDeepSeekInfo(new_detail) //******字段清洗****** f_data := clean.CleanDeepSeekInfo(f_info, tmp) //******二级校验****** CheckOutDeepSeekBuyerInfo(f_data) //******强制判断****** ForcedLogicDecideInfo(f_data) return f_data } // 暂时不启用...无限重试 func RunResetUpdateFieldInfo(arr []string, name string, s_name string) { //log.Debug("开始重置更新...", len(arr)) //reset := []string{} //for k, v := range arr { // log.Debug("...", k, "...", v) // data := ul.SourceMgo.FindById(name, v) // content := PromptFieldText(qu.ObjToString(data["detail"])) // zp, ok := map[string]interface{}{}, 0 // for { // ok++ // if zp = ai.PostZhiPuAI(content); len(zp) > 0 { // break // } // if ok >= 5 { // log.Debug("请求数据失败...", v) // reset = append(reset, v) // break // } // } // ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{ // "$set": map[string]interface{}{ // "zhipu": zp, // }, // }) //} //if len(reset) > 0 { //无限尝试 // RunResetUpdateFieldInfo(reset, name, s_name) //} //log.Debug("本轮重置更新结束......") }