package extract import ( "data_ai/clean" "data_ai/prompt" "data_ai/ul" log "github.com/donnie4w/go-logger/logger" qu "jygit.jydev.jianyu360.cn/data_processing/common_utils" "strings" "sync" "unicode/utf8" ) // 识别结构化字段 func ExtractFieldInfo(sid string, eid string) { q := map[string]interface{}{ "_id": map[string]interface{}{ "$gt": ul.StringTOBsonId(sid), "$lte": ul.StringTOBsonId(eid), }, } //先查询抽取表-确定大模型需要识别到范围 dict := ConfrimExtractInfo(q) log.Debug("查询语句...", q, "~", len(dict)) if len(dict) >= ul.MaxUdp { //根据数量限制使用具体模型 log.Debug("数量超过限制临时使用:glm-4-flashx") ul.FlashModel = "glm-4-flashx" } else { ul.FlashModel = "glm-4-flash" } pool_mgo := make(chan bool, ul.Reading) wg_mgo := &sync.WaitGroup{} sess := ul.SourceMgo.GetMgoConn() defer ul.SourceMgo.DestoryMongoConn(sess) total, isok := 0, 0 it := sess.DB(ul.SourceMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter() for tmp := make(map[string]interface{}); it.Next(&tmp); total++ { if total%200 == 0 { log.Debug("cur ai index ", total) } tmpid := ul.BsonTOStringId(tmp["_id"]) infoformat := qu.IntAll(tmp["infoformat"]) if infoformat > 1 || dict[tmpid] == nil { tmp = make(map[string]interface{}) continue } isok++ pool_mgo <- true wg_mgo.Add(1) go func(tmp map[string]interface{}) { defer func() { <-pool_mgo wg_mgo.Done() }() u_id := ul.BsonTOStringId(tmp["_id"]) data := ResolveInfo(tmp) if len(data) > 0 || u_id == "" { ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{ "$set": map[string]interface{}{"ai_zhipu": data}, }) } }(tmp) tmp = make(map[string]interface{}) } wg_mgo.Wait() log.Debug("ai is over ...", sid, "~", eid) } // 获取处理数据... func ResolveInfo(v map[string]interface{}) map[string]interface{} { tmpid := ul.BsonTOStringId(v["_id"]) detail := getDetailText(v, tmpid) //获取正文文本 title := qu.ObjToString(v["title"]) dl := utf8.RuneCountInString(detail) //文本长度 //过滤数据··· if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil || dl < 20 { return map[string]interface{}{} } //识别结构,短文本结构 f_data, shorText := map[string]interface{}{}, false if dl < 100 { shorText = true } //文本格式转换 detail = ul.HttpConvertToMarkdown(detail) //获取外围字段数据 f_info := prompt.AcquireExtractFieldInfo(detail, shorText) //非短文本 if !shorText { //获取分包信息 if pkg := prompt.AcquireNewMultiplePackageInfo(detail); len(pkg) > 0 { f_info["s_pkg"] = pkg } //获取分类字段数据 s_toptype, s_subtype := prompt.AcquireClassInfo(detail, title, qu.ObjToString(v["toptype"])) f_info["s_toptype"] = s_toptype f_info["s_subtype"] = s_subtype } //调用标的物识别 if p_list := getPurList(v, detail, f_info); len(p_list) > 0 { f_info["purchasinglist"] = p_list } //字段清洗 fns := getpnsinfo(v) //获取附件名字 f_data = clean.CleanFieldInfo(f_info, fns) //对于某些字段进行二级校验 if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" { if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil { if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" { f_data["s_buyer"] = ns_buyer } } } return f_data } func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} { dict := map[string]interface{}{} sess := ul.SourceMgo.GetMgoConn() defer ul.SourceMgo.DestoryMongoConn(sess) total := 0 it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter() for tmp := make(map[string]interface{}); it.Next(&tmp); total++ { if total%1000 == 0 { log.Debug("cur index ", total) } if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别 tmpid := ul.BsonTOStringId(tmp["_id"]) dict[tmpid] = tmpid } tmp = make(map[string]interface{}) } return dict } // 获取附件名字信息 func getpnsinfo(tmp map[string]interface{}) []string { arr := []string{} if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil { if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil { for _, v := range *attachments { if info := qu.ObjToMap(v); info != nil { if filename := qu.ObjToString((*info)["filename"]); filename != "" { arr = append(arr, filename) } } } } } return arr } func getDetailText(v map[string]interface{}, tmpid string) string { detail := qu.ObjToString(v["detail"]) if ul.IsTool { detail = qu.ObjToString(v["details"]) filetext := qu.ObjToString(v["filetext"]) if utf8.RuneCountInString(detail) < 100 && filetext != "" { detail = filetext } } else { //if bs := ul.OssGetObject(tmpid); bs != "" { // detail = bs //} } return detail } // 获取标的物-过滤产权-拟建 func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) []map[string]interface{} { if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" { return []map[string]interface{}{} } p_data := map[string]interface{}{} p_data["detail"] = detail p_data["site"] = v["site"] p_data["attach_text"] = v["attach_text"] p_data["toptype"] = v["toptype"] if f_info["s_toptype"] != nil { p_data["toptype"] = f_info["s_toptype"] } if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 { if qu.IntAll(p_info["status"]) == 200 { p_list := ul.IsMarkInterfaceMap(p_info["purchasinglist"]) return p_list } } return []map[string]interface{}{} } // 暂时不启用...无限重试 func RunResetUpdateFieldInfo(arr []string, name string, s_name string) { //log.Debug("开始重置更新...", len(arr)) //reset := []string{} //for k, v := range arr { // log.Debug("...", k, "...", v) // data := ul.SourceMgo.FindById(name, v) // content := PromptFieldText(qu.ObjToString(data["detail"])) // zp, ok := map[string]interface{}{}, 0 // for { // ok++ // if zp = ai.PostZhiPuAI(content); len(zp) > 0 { // break // } // if ok >= 5 { // log.Debug("请求数据失败...", v) // reset = append(reset, v) // break // } // } // ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{ // "$set": map[string]interface{}{ // "zhipu": zp, // }, // }) //} //if len(reset) > 0 { //无限尝试 // RunResetUpdateFieldInfo(reset, name, s_name) //} //log.Debug("本轮重置更新结束......") }