123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- package extract
- import (
- "data_ai/clean"
- "data_ai/prompt"
- "data_ai/ul"
- "fmt"
- log "github.com/donnie4w/go-logger/logger"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "sync"
- "unicode/utf8"
- )
- // 抽取结构字段
- func ExtractFieldInfo(name string, s_name string) {
- pool_mgo := make(chan bool, 50)
- wg_mgo := &sync.WaitGroup{}
- dataArr, _ := ul.SourceMgo.Find(name, map[string]interface{}{}, nil, nil)
- for k, v := range dataArr {
- if k%100 == 0 {
- log.Debug(k, "~", ul.BsonTOStringId(v["_id"]))
- }
- infoformat := qu.IntAll(v["infoformat"])
- if infoformat != 1 {
- continue
- }
- pool_mgo <- true
- wg_mgo.Add(1)
- go func(v map[string]interface{}) {
- defer func() {
- <-pool_mgo
- wg_mgo.Done()
- }()
- tmpid := ul.BsonTOStringId(v["_id"])
- data := ResolveInfo(v)
- //最终结果...
- ul.SourceMgo.Save(s_name, map[string]interface{}{
- "_id": v["_id"],
- "href": v["href"],
- "jyhref": fmt.Sprintf(ul.Url, qu.CommonEncodeArticle("content", tmpid)),
- "zhipu": data,
- "num": v["num"],
- })
- }(v)
- }
- wg_mgo.Wait()
- log.Debug("is over ...")
- }
- // 获取处理数据...
- func ResolveInfo(v map[string]interface{}) map[string]interface{} {
- detail := qu.ObjToString(v["detail"])
- title := qu.ObjToString(v["title"])
- if utf8.RuneCountInString(detail) < 100 {
- return map[string]interface{}{}
- }
- //分包判断,获取信息
- ispkg, pkg := false, map[string]interface{}{}
- if ispkg = prompt.AcquireIsPackageInfo(detail); ispkg {
- pkg = prompt.AcquireMultiplePackageInfo(detail)
- }
- //获取外围字段数据
- info := prompt.AcquireExtractFieldInfo(detail)
- //外围字段清洗
- data := clean.CleanFieldInfo(info, pkg)
- //获取分类字段数据
- s_toptype, s_subtype := "", ""
- if qu.ObjToString(v["toptype"]) == "拟建" {
- s_toptype, s_subtype = "拟建", "拟建"
- } else {
- s_toptype, s_subtype = prompt.AcquireClassInfo(detail, title)
- }
- if s_toptype != "" {
- data["s_toptype"] = s_toptype
- data["s_subtype"] = s_subtype
- }
- //临时···记录分包信息
- data["ispkg"] = ispkg
- data["pkg"] = pkg
- //最终逻辑校验
- data = clean.CleanFinallyInfo(data)
- return data
- }
// RunResetUpdateFieldInfo is currently disabled and does nothing: its whole
// body is commented out, so arr, name and s_name are all unused.
//
// The disabled implementation, kept below for reference, walks the ids in
// arr, loads each document from collection name, requests the AI result up
// to five times, writes it to the "zhipu" field in collection s_name, and
// then calls itself again with the ids that still failed (unbounded retry).
func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
	// log.Debug("开始重置更新...", len(arr))
	// reset := []string{}
	// for k, v := range arr {
	// 	log.Debug("...", k, "...", v)
	// 	data := ul.SourceMgo.FindById(name, v)
	// 	content := PromptFieldText(qu.ObjToString(data["detail"]))
	// 	zp, ok := map[string]interface{}{}, 0
	// 	for {
	// 		ok++
	// 		if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
	// 			break
	// 		}
	// 		if ok >= 5 {
	// 			log.Debug("请求数据失败...", v)
	// 			reset = append(reset, v)
	// 			break
	// 		}
	// 	}
	// 	ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
	// 		"$set": map[string]interface{}{
	// 			"zhipu": zp,
	// 		},
	// 	})
	// }
	// if len(reset) > 0 { // retry without limit
	// 	RunResetUpdateFieldInfo(reset, name, s_name)
	// }
	// log.Debug("本轮重置更新结束......")
}
|