extract.go 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "sync"
  9. "unicode/utf8"
  10. )
  11. // 识别结构化字段
  12. func ExtractFieldInfo(sid string, eid string) {
  13. q := map[string]interface{}{
  14. "_id": map[string]interface{}{
  15. "$gt": ul.StringTOBsonId(sid),
  16. "$lte": ul.StringTOBsonId(eid),
  17. },
  18. }
  19. //先查询抽取表-确定大模型需要识别到范围
  20. dict := ConfrimExtractInfo(q)
  21. log.Debug("查询语句...", q, "~", len(dict))
  22. if len(dict) >= ul.MaxUdp { //根据数量限制使用具体模型
  23. log.Debug("数量超过限制临时使用:glm-4-flashx")
  24. ul.FlashModel = "glm-4-flashx"
  25. } else {
  26. ul.FlashModel = "glm-4-flash"
  27. }
  28. pool_mgo := make(chan bool, ul.Reading)
  29. wg_mgo := &sync.WaitGroup{}
  30. sess := ul.SourceMgo.GetMgoConn()
  31. defer ul.SourceMgo.DestoryMongoConn(sess)
  32. total, isok := 0, 0
  33. it := sess.DB(ul.SourceMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
  34. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  35. if total%200 == 0 {
  36. log.Debug("cur ai index ", total)
  37. }
  38. tmpid := ul.BsonTOStringId(tmp["_id"])
  39. infoformat := qu.IntAll(tmp["infoformat"])
  40. if infoformat > 1 || dict[tmpid] == nil {
  41. tmp = make(map[string]interface{})
  42. continue
  43. }
  44. isok++
  45. pool_mgo <- true
  46. wg_mgo.Add(1)
  47. go func(tmp map[string]interface{}) {
  48. defer func() {
  49. <-pool_mgo
  50. wg_mgo.Done()
  51. }()
  52. u_id := ul.BsonTOStringId(tmp["_id"])
  53. data := ResolveInfo(tmp)
  54. if len(data) > 0 || u_id == "" {
  55. ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
  56. "$set": map[string]interface{}{"ai_zhipu": data},
  57. })
  58. }
  59. }(tmp)
  60. tmp = make(map[string]interface{})
  61. }
  62. wg_mgo.Wait()
  63. log.Debug("ai is over ...", sid, "~", eid)
  64. }
  65. // 获取处理数据...
  66. func ResolveInfo(v map[string]interface{}) map[string]interface{} {
  67. tmpid := ul.BsonTOStringId(v["_id"])
  68. title := qu.ObjToString(v["title"])
  69. detail := getDetailText(v, tmpid) //获取正文文本
  70. if NotInProgressInfo(title, detail, v) { //过滤信息
  71. return map[string]interface{}{}
  72. }
  73. //识别结构,短文本结构
  74. f_data, shorText := map[string]interface{}{}, false
  75. if utf8.RuneCountInString(detail) < 100 {
  76. shorText = true
  77. }
  78. //文本格式转换
  79. detail = ul.HttpConvertToMarkdown(detail)
  80. //短文本判断是否有效性
  81. if shorText {
  82. if info := prompt.AcquireJudgeShortInfo(detail); info["结果"] != "是" {
  83. return map[string]interface{}{}
  84. }
  85. }
  86. //获取外围字段数据-拆分合并字段
  87. f_info_1 := prompt.AcquireExtractFieldInfoFirst(detail)
  88. f_info_2 := prompt.AcquireExtractFieldInfoSecond(detail)
  89. f_info_3 := prompt.AcquireExtractFieldInfoThird(detail)
  90. f_info := MergeInfo([]map[string]interface{}{f_info_1, f_info_2, f_info_3})
  91. //非短文本以下识别
  92. if !shorText {
  93. //获取分包信息
  94. if pkg := prompt.AcquireNewMultiplePackageInfo(detail); len(pkg) > 0 {
  95. f_info["s_pkg"] = pkg
  96. }
  97. //获取分类字段数据
  98. s_toptype, s_subtype := prompt.AcquireClassInfo(detail, title, qu.ObjToString(v["toptype"]))
  99. f_info["s_toptype"] = s_toptype
  100. f_info["s_subtype"] = s_subtype
  101. //调用标的物识别
  102. if p_list := getPurList(v, detail, f_info); len(p_list) > 0 {
  103. f_info["purchasinglist"] = p_list
  104. }
  105. }
  106. //字段清洗
  107. fns := getpnsinfo(v) //获取附件名字
  108. f_data = clean.CleanFieldInfo(f_info, fns)
  109. //采购单位二级校验
  110. CheckOutBuyerInfo(f_data)
  111. //强制逻辑判断-
  112. ForcedLogicDecideInfo(f_data)
  113. //返回数据
  114. return f_data
  115. }
  116. // 暂时不启用...无限重试
  117. func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
  118. //log.Debug("开始重置更新...", len(arr))
  119. //reset := []string{}
  120. //for k, v := range arr {
  121. // log.Debug("...", k, "...", v)
  122. // data := ul.SourceMgo.FindById(name, v)
  123. // content := PromptFieldText(qu.ObjToString(data["detail"]))
  124. // zp, ok := map[string]interface{}{}, 0
  125. // for {
  126. // ok++
  127. // if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
  128. // break
  129. // }
  130. // if ok >= 5 {
  131. // log.Debug("请求数据失败...", v)
  132. // reset = append(reset, v)
  133. // break
  134. // }
  135. // }
  136. // ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
  137. // "$set": map[string]interface{}{
  138. // "zhipu": zp,
  139. // },
  140. // })
  141. //}
  142. //if len(reset) > 0 { //无限尝试
  143. // RunResetUpdateFieldInfo(reset, name, s_name)
  144. //}
  145. //log.Debug("本轮重置更新结束......")
  146. }