package extract

import (
	"fmt"
	"sync"
	"unicode/utf8"

	"data_ai/clean"
	"data_ai/prompt"
	"data_ai/ul"

	log "github.com/donnie4w/go-logger/logger"
	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
)

// ExtractFieldInfo extracts structured fields for the documents of a collection.
//
// It queries the collection `name` with an empty filter, skips every document
// whose "infoformat" is not 1, and runs ResolveInfo on the remaining ones in
// goroutines (at most 50 in flight at a time). Each result is saved into the
// collection `s_name` under the key "zhipu", together with the source
// document's "_id", "href" and "num" and a generated "jyhref" link.
// The function returns only after all started goroutines have finished.
func ExtractFieldInfo(name string, s_name string) {
	// Buffered channel used as a semaphore bounding the number of workers.
	slots := make(chan bool, 50)
	var pending sync.WaitGroup

	// NOTE(review): the second return value of Find is discarded here, as in
	// the original code — confirm whether it signals a failure worth handling.
	docs, _ := ul.SourceMgo.Find(name, map[string]interface{}{}, nil, nil)
	for idx, doc := range docs {
		// Progress trace every 100 documents.
		if idx%100 == 0 {
			log.Debug(idx, "~", ul.BsonTOStringId(doc["_id"]))
		}
		if qu.IntAll(doc["infoformat"]) != 1 {
			continue
		}

		// Blocks while 50 workers are already running.
		slots <- true
		pending.Add(1)
		go func(doc map[string]interface{}) {
			// Deferred calls run last-in-first-out: the slot is released
			// first, then the wait group is signalled.
			defer pending.Done()
			defer func() { <-slots }()

			id := ul.BsonTOStringId(doc["_id"])
			resolved := ResolveInfo(doc)

			// Final result stored for this document.
			record := map[string]interface{}{
				"_id":    doc["_id"],
				"href":   doc["href"],
				"jyhref": fmt.Sprintf(ul.Url, qu.CommonEncodeArticle("content", id)),
				"zhipu":  resolved,
				"num":    doc["num"],
			}
			ul.SourceMgo.Save(s_name, record)
		}(doc)
	}

	pending.Wait()
	log.Debug("is over ...")
}

// ResolveInfo builds the extracted-field map for a single document.
//
// It reads "detail", "title" and "toptype" from v. When "detail" is shorter
// than 100 runes an empty map is returned without calling any prompt helper.
// Otherwise the result of clean.CleanFieldInfo is enriched with the
// classification ("s_toptype"/"s_subtype", only when the top type is
// non-empty) and the package information ("ispkg"/"pkg"), and finally passed
// through clean.CleanFinallyInfo.
func ResolveInfo(v map[string]interface{}) map[string]interface{} {
	detail := qu.ObjToString(v["detail"])
	title := qu.ObjToString(v["title"])
	if utf8.RuneCountInString(detail) < 100 {
		return map[string]interface{}{}
	}

	// Package detection: fetch the per-package information only when the
	// text is judged to be split into packages.
	pkg := map[string]interface{}{}
	ispkg := prompt.AcquireIsPackageInfo(detail)
	if ispkg {
		pkg = prompt.AcquireMultiplePackageInfo(detail)
	}

	// Fetch the peripheral field data, then clean it.
	info := prompt.AcquireExtractFieldInfo(detail)
	data := clean.CleanFieldInfo(info, pkg)

	// Classification fields: documents already tagged "拟建" keep that label
	// for both levels; everything else is classified from detail and title.
	topType, subType := "拟建", "拟建"
	if qu.ObjToString(v["toptype"]) != "拟建" {
		topType, subType = prompt.AcquireClassInfo(detail, title)
	}
	if topType != "" {
		data["s_toptype"] = topType
		data["s_subtype"] = subType
	}

	// Temporary: record the package information alongside the fields.
	data["ispkg"] = ispkg
	data["pkg"] = pkg

	// Final logic validation.
	data = clean.CleanFinallyInfo(data)
	return data
}

// RunResetUpdateFieldInfo is not enabled for now (unlimited retry).
//
// The whole body is commented out, so calling it currently does nothing.
// The disabled code is kept below for reference.
func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
	//log.Debug("开始重置更新...", len(arr))
	//reset := []string{}
	//for k, v := range arr {
	//	log.Debug("...", k, "...", v)
	//	data := ul.SourceMgo.FindById(name, v)
	//	content := PromptFieldText(qu.ObjToString(data["detail"]))
	//	zp, ok := map[string]interface{}{}, 0
	//	for {
	//		ok++
	//		if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
	//			break
	//		}
	//		if ok >= 5 {
	//			log.Debug("请求数据失败...", v)
	//			reset = append(reset, v)
	//			break
	//		}
	//	}
	//	ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
	//		"$set": map[string]interface{}{
	//			"zhipu": zp,
	//		},
	//	})
	//}
	//if len(reset) > 0 { // retry indefinitely
	//	RunResetUpdateFieldInfo(reset, name, s_name)
	//}
	//log.Debug("本轮重置更新结束......")
}