extract.go 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "sync"
  9. "unicode/utf8"
  10. )
  11. // 识别结构化字段
  12. func ExtractFieldInfo(sid string, eid string) {
  13. q := map[string]interface{}{
  14. "_id": map[string]interface{}{
  15. "$gt": ul.StringTOBsonId(sid),
  16. "$lte": ul.StringTOBsonId(eid),
  17. },
  18. }
  19. //先查询抽取表-确定大模型需要识别到范围
  20. dict := ConfrimExtractInfo(q)
  21. log.Debug("查询语句...", q, "~", len(dict))
  22. pool_mgo := make(chan bool, 90)
  23. wg_mgo := &sync.WaitGroup{}
  24. sess := ul.SourceMgo.GetMgoConn()
  25. defer ul.SourceMgo.DestoryMongoConn(sess)
  26. total, isok := 0, 0
  27. it := sess.DB(ul.SourceMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
  28. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  29. if total%5000 == 0 {
  30. log.Debug("cur index ", total)
  31. }
  32. tmpid := ul.BsonTOStringId(tmp["_id"])
  33. infoformat := qu.IntAll(tmp["infoformat"])
  34. if infoformat != 1 || dict[tmpid] == nil {
  35. tmp = make(map[string]interface{})
  36. continue
  37. }
  38. isok++
  39. pool_mgo <- true
  40. wg_mgo.Add(1)
  41. go func(tmp map[string]interface{}) {
  42. defer func() {
  43. <-pool_mgo
  44. wg_mgo.Done()
  45. }()
  46. u_id := ul.BsonTOStringId(tmp["_id"])
  47. data := ResolveInfo(tmp)
  48. if len(data) > 0 || u_id == "" {
  49. ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
  50. "$set": map[string]interface{}{"ai_zhipu": data},
  51. })
  52. }
  53. }(tmp)
  54. tmp = make(map[string]interface{})
  55. }
  56. wg_mgo.Wait()
  57. log.Debug("ai is over ...", sid, "~", eid)
  58. }
  59. // 获取处理数据...
  60. func ResolveInfo(v map[string]interface{}) map[string]interface{} {
  61. detail := qu.ObjToString(v["detail"])
  62. title := qu.ObjToString(v["title"])
  63. f_data := map[string]interface{}{}
  64. if utf8.RuneCountInString(detail) < 100 {
  65. return f_data
  66. }
  67. //获取外围字段数据
  68. f_info := prompt.AcquireExtractFieldInfo(detail)
  69. //分包判断-获取信息
  70. ispkg, pkg := false, map[string]interface{}{}
  71. if ispkg = prompt.AcquireIsPackageInfo(detail); ispkg {
  72. f_info["ispkg"] = ispkg
  73. if pkg = prompt.AcquireMultiplePackageInfo(detail); len(pkg) > 0 {
  74. f_info["s_pkg"] = pkg
  75. }
  76. }
  77. //获取分类字段数据
  78. s_toptype, s_subtype := "", ""
  79. if qu.ObjToString(v["toptype"]) == "拟建" {
  80. s_toptype, s_subtype = "拟建", "拟建"
  81. } else if qu.ObjToString(v["toptype"]) == "产权" {
  82. s_toptype, s_subtype = "产权", "产权"
  83. } else {
  84. s_toptype, s_subtype = prompt.AcquireClassInfo(detail, title)
  85. }
  86. f_info["s_toptype"] = s_toptype
  87. f_info["s_subtype"] = s_subtype
  88. //字段清洗
  89. f_data = clean.CleanFieldInfo(f_info)
  90. return f_data
  91. }
  92. func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
  93. dict := map[string]interface{}{}
  94. sess := ul.SourceMgo.GetMgoConn()
  95. defer ul.SourceMgo.DestoryMongoConn(sess)
  96. total := 0
  97. it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
  98. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  99. if total%1000 == 0 {
  100. log.Debug("cur index ", total)
  101. }
  102. if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别
  103. tmpid := ul.BsonTOStringId(tmp["_id"])
  104. dict[tmpid] = tmpid
  105. }
  106. tmp = make(map[string]interface{})
  107. }
  108. return dict
  109. }
  110. // 暂时不启用...无限重试
  111. func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
  112. //log.Debug("开始重置更新...", len(arr))
  113. //reset := []string{}
  114. //for k, v := range arr {
  115. // log.Debug("...", k, "...", v)
  116. // data := ul.SourceMgo.FindById(name, v)
  117. // content := PromptFieldText(qu.ObjToString(data["detail"]))
  118. // zp, ok := map[string]interface{}{}, 0
  119. // for {
  120. // ok++
  121. // if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
  122. // break
  123. // }
  124. // if ok >= 5 {
  125. // log.Debug("请求数据失败...", v)
  126. // reset = append(reset, v)
  127. // break
  128. // }
  129. // }
  130. // ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
  131. // "$set": map[string]interface{}{
  132. // "zhipu": zp,
  133. // },
  134. // })
  135. //}
  136. //if len(reset) > 0 { //无限尝试
  137. // RunResetUpdateFieldInfo(reset, name, s_name)
  138. //}
  139. //log.Debug("本轮重置更新结束......")
  140. }