extract.go 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "strings"
  9. "sync"
  10. "unicode/utf8"
  11. )
  12. // 识别结构化字段
  13. func ExtractFieldInfo(sid string, eid string) {
  14. q := map[string]interface{}{
  15. "_id": map[string]interface{}{
  16. "$gt": ul.StringTOBsonId(sid),
  17. "$lte": ul.StringTOBsonId(eid),
  18. },
  19. }
  20. //先查询抽取表-确定大模型需要识别到范围
  21. dict := ConfrimExtractInfo(q)
  22. log.Debug("查询语句...", q, "~", len(dict))
  23. if len(dict) >= ul.MaxUdp { //根据数量限制使用具体模型
  24. log.Debug("数量超过限制临时使用:glm-4-flashx")
  25. ul.FlashModel = "glm-4-flashx"
  26. } else {
  27. ul.FlashModel = "glm-4-flash"
  28. }
  29. pool_mgo := make(chan bool, ul.Reading)
  30. wg_mgo := &sync.WaitGroup{}
  31. sess := ul.BidMgo.GetMgoConn()
  32. defer ul.BidMgo.DestoryMongoConn(sess)
  33. total, isok := 0, 0
  34. it := sess.DB(ul.BidMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
  35. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  36. if total%200 == 0 {
  37. log.Debug("cur ai index ", total)
  38. }
  39. tmpid := ul.BsonTOStringId(tmp["_id"])
  40. infoformat := qu.IntAll(tmp["infoformat"])
  41. if infoformat > 1 || dict[tmpid] == nil {
  42. tmp = make(map[string]interface{})
  43. continue
  44. }
  45. isok++
  46. pool_mgo <- true
  47. wg_mgo.Add(1)
  48. go func(tmp map[string]interface{}) {
  49. defer func() {
  50. <-pool_mgo
  51. wg_mgo.Done()
  52. }()
  53. u_id := ul.BsonTOStringId(tmp["_id"])
  54. data := ResolveInfo(tmp)
  55. if len(data) > 0 || u_id == "" {
  56. ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
  57. "$set": map[string]interface{}{"ai_zhipu": data},
  58. })
  59. }
  60. }(tmp)
  61. tmp = make(map[string]interface{})
  62. }
  63. wg_mgo.Wait()
  64. log.Debug("ai is over ...", sid, "~", eid)
  65. }
  66. // 获取处理数据...
  67. func ResolveInfo(v map[string]interface{}) map[string]interface{} {
  68. tmpid := ul.BsonTOStringId(v["_id"])
  69. title := qu.ObjToString(v["title"])
  70. old_detail := getDetailText(v, tmpid) //获取正文文本
  71. isTable := false //是否表格
  72. if strings.Contains(old_detail, "<table>") {
  73. isTable = true
  74. }
  75. if NotInProgressInfo(title, old_detail, v) { //过滤信息
  76. return map[string]interface{}{}
  77. }
  78. //识别结构,短文本结构
  79. f_data, shorText := map[string]interface{}{}, false
  80. if utf8.RuneCountInString(old_detail) < 100 {
  81. shorText = true
  82. }
  83. //文本格式转换
  84. new_detail := ul.HttpConvertToMarkdown(old_detail)
  85. //短文本判断是否有效性
  86. if shorText {
  87. if info := prompt.AcquireJudgeShortInfo(new_detail); info["结果"] != "是" {
  88. return map[string]interface{}{}
  89. }
  90. }
  91. //获取外围字段数据-拆分合并字段
  92. f_info_1 := prompt.AcquireExtractFieldInfoFirst(new_detail)
  93. f_info_2 := prompt.AcquireExtractFieldInfoSecond(new_detail)
  94. f_info_3 := prompt.AcquireExtractFieldInfoThird(new_detail)
  95. f_info := MergeInfo([]map[string]interface{}{f_info_1, f_info_2, f_info_3})
  96. //非短文本以下识别-纯测试
  97. if !shorText {
  98. //获取分包信息
  99. if pkg := prompt.AcquireNewMultiplePackageInfo(new_detail, isTable); len(pkg) > 0 {
  100. f_info["s_pkg"] = pkg
  101. }
  102. //获取分类字段数据
  103. s_toptype, s_subtype := prompt.AcquireClassInfo(new_detail, title, qu.ObjToString(v["toptype"]))
  104. f_info["s_toptype"] = s_toptype
  105. f_info["s_subtype"] = s_subtype
  106. //调用标的物识别
  107. if !ul.IsTool {
  108. if s_purchasinglist := getPurList(v, old_detail, f_info); len(s_purchasinglist) > 0 {
  109. f_info["s_purchasinglist"] = s_purchasinglist
  110. }
  111. }
  112. }
  113. //字段清洗
  114. fns := getpnsinfo(v) //获取附件名字
  115. f_data = clean.CleanFieldInfo(f_info, fns, isTable)
  116. //采购单位二级校验
  117. CheckOutBuyerInfo(f_data)
  118. //强制逻辑判断-
  119. ForcedLogicDecideInfo(f_data)
  120. //返回数据
  121. return f_data
  122. }
  // RunResetUpdateFieldInfo is disabled for now: its whole body is commented
  // out, so a call does nothing and none of the arguments are read.
  //
  // The inactive draft kept below looked up each id of arr in name, requested
  // the model up to 5 times per id, stored whatever it obtained under "zhipu"
  // in s_name, and finally recursed on the ids that never succeeded, which
  // amounts to retrying without bound.
  func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
  	// Disabled draft, retained for reference only:
  	//
  	//	log.Debug("开始重置更新...", len(arr))
  	//	reset := []string{}
  	//	for k, v := range arr {
  	//		log.Debug("...", k, "...", v)
  	//		data := ul.SourceMgo.FindById(name, v)
  	//		content := PromptFieldText(qu.ObjToString(data["detail"]))
  	//		zp, ok := map[string]interface{}{}, 0
  	//		for {
  	//			ok++
  	//			if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
  	//				break
  	//			}
  	//			if ok >= 5 {
  	//				log.Debug("请求数据失败...", v)
  	//				reset = append(reset, v)
  	//				break
  	//			}
  	//		}
  	//		ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
  	//			"$set": map[string]interface{}{
  	//				"zhipu": zp,
  	//			},
  	//		})
  	//	}
  	//	if len(reset) > 0 { // retry indefinitely
  	//		RunResetUpdateFieldInfo(reset, name, s_name)
  	//	}
  	//	log.Debug("本轮重置更新结束......")
  }