extract.go 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "strings"
  9. "sync"
  10. "unicode/utf8"
  11. )
  12. // 识别结构化字段
  13. func ExtractFieldInfo(sid string, eid string) {
  14. q := map[string]interface{}{
  15. "_id": map[string]interface{}{
  16. "$gt": ul.StringTOBsonId(sid),
  17. "$lte": ul.StringTOBsonId(eid),
  18. },
  19. }
  20. //先查询抽取表-确定大模型需要识别到范围
  21. dict := ConfrimExtractInfo(q)
  22. log.Debug("查询语句...", q, "~", len(dict))
  23. if len(dict) >= ul.MaxUdp { //根据数量限制使用具体模型
  24. ul.FlashModel = "glm-4-flashx"
  25. } else {
  26. ul.FlashModel = "glm-4-flash"
  27. }
  28. pool_mgo := make(chan bool, ul.Reading)
  29. wg_mgo := &sync.WaitGroup{}
  30. sess := ul.SourceMgo.GetMgoConn()
  31. defer ul.SourceMgo.DestoryMongoConn(sess)
  32. total, isok := 0, 0
  33. it := sess.DB(ul.SourceMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
  34. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  35. if total%200 == 0 {
  36. log.Debug("cur ai index ", total)
  37. }
  38. tmpid := ul.BsonTOStringId(tmp["_id"])
  39. infoformat := qu.IntAll(tmp["infoformat"])
  40. if infoformat != 1 || dict[tmpid] == nil {
  41. tmp = make(map[string]interface{})
  42. continue
  43. }
  44. isok++
  45. pool_mgo <- true
  46. wg_mgo.Add(1)
  47. go func(tmp map[string]interface{}) {
  48. defer func() {
  49. <-pool_mgo
  50. wg_mgo.Done()
  51. }()
  52. u_id := ul.BsonTOStringId(tmp["_id"])
  53. data := ResolveInfo(tmp)
  54. if len(data) > 0 || u_id == "" {
  55. ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
  56. "$set": map[string]interface{}{"ai_zhipu": data},
  57. })
  58. }
  59. }(tmp)
  60. tmp = make(map[string]interface{})
  61. }
  62. wg_mgo.Wait()
  63. log.Debug("ai is over ...", sid, "~", eid)
  64. }
  65. // 获取处理数据...
  66. func ResolveInfo(v map[string]interface{}) map[string]interface{} {
  67. detail := qu.ObjToString(v["detail"])
  68. filetext := qu.ObjToString(v["filetext"]) //此处为附件信息···
  69. title := qu.ObjToString(v["title"])
  70. if strings.Contains(title, "开标记录") { //开标记录舍弃
  71. return map[string]interface{}{}
  72. }
  73. if v["jyfb_data"] != nil { //剑鱼发布舍弃qi
  74. return map[string]interface{}{}
  75. }
  76. fns := getpnsinfo(v) //获取附件名字
  77. f_data := map[string]interface{}{}
  78. if ul.IsTool && utf8.RuneCountInString(detail) < 100 {
  79. detail = filetext
  80. }
  81. if utf8.RuneCountInString(detail) < 100 {
  82. return f_data
  83. }
  84. detail = ul.HttpConvertToMarkdown(detail)
  85. //获取外围字段数据
  86. f_info := prompt.AcquireExtractFieldInfo(detail)
  87. //获取分包信息
  88. pkg := prompt.AcquireNewMultiplePackageInfo(detail)
  89. if len(pkg) > 0 {
  90. f_info["s_pkg"] = pkg
  91. }
  92. //分包判断-获取分包方法舍弃
  93. //ispkg, pkg := false, map[string]interface{}{}
  94. //if ispkg = prompt.AcquireIsPackageInfo(detail); ispkg {
  95. // f_info["ispkg"] = ispkg
  96. // if pkg = prompt.AcquireMultiplePackageInfo(detail); len(pkg) > 0 {
  97. // f_info["s_pkg"] = pkg
  98. // }
  99. //}
  100. //获取分类字段数据
  101. s_toptype, s_subtype := "", ""
  102. if qu.ObjToString(v["toptype"]) == "拟建" {
  103. s_toptype, s_subtype = "拟建", "拟建"
  104. } else if qu.ObjToString(v["toptype"]) == "产权" {
  105. s_toptype, s_subtype = "产权", "产权"
  106. } else if qu.ObjToString(v["toptype"]) == "采购意向" {
  107. s_toptype, s_subtype = "采购意向", "采购意向"
  108. } else {
  109. s_toptype, s_subtype = prompt.AcquireClassInfo(detail, title)
  110. }
  111. f_info["s_toptype"] = s_toptype
  112. f_info["s_subtype"] = s_subtype
  113. //字段清洗
  114. f_data = clean.CleanFieldInfo(f_info, fns)
  115. //对于某些字段进行二级校验
  116. if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
  117. if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
  118. if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
  119. f_data["s_buyer"] = ns_buyer
  120. }
  121. }
  122. }
  123. return f_data
  124. }
  125. func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
  126. dict := map[string]interface{}{}
  127. sess := ul.SourceMgo.GetMgoConn()
  128. defer ul.SourceMgo.DestoryMongoConn(sess)
  129. total := 0
  130. it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
  131. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  132. if total%1000 == 0 {
  133. log.Debug("cur index ", total)
  134. }
  135. if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别
  136. tmpid := ul.BsonTOStringId(tmp["_id"])
  137. dict[tmpid] = tmpid
  138. }
  139. tmp = make(map[string]interface{})
  140. }
  141. return dict
  142. }
  143. // 获取附件名字信息
  144. func getpnsinfo(tmp map[string]interface{}) []string {
  145. arr := []string{}
  146. if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil {
  147. if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil {
  148. for _, v := range *attachments {
  149. if info := qu.ObjToMap(v); info != nil {
  150. if filename := qu.ObjToString((*info)["filename"]); filename != "" {
  151. arr = append(arr, filename)
  152. }
  153. }
  154. }
  155. }
  156. }
  157. return arr
  158. }
  // RunResetUpdateFieldInfo is currently disabled and does nothing; arr, name and
  // s_name are ignored.
  //
  // The disabled implementation, kept below for reference, re-requested the model
  // for every id in arr (up to 5 attempts each), stored the answer under "zhipu"
  // in collection s_name, and then called itself again with the ids that had
  // failed — retrying without any upper bound.
  func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
  	// Disabled implementation:
  	//
  	//	log.Debug("开始重置更新...", len(arr))
  	//	reset := []string{}
  	//	for k, v := range arr {
  	//		log.Debug("...", k, "...", v)
  	//		data := ul.SourceMgo.FindById(name, v)
  	//		content := PromptFieldText(qu.ObjToString(data["detail"]))
  	//		zp, ok := map[string]interface{}{}, 0
  	//		for {
  	//			ok++
  	//			if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
  	//				break
  	//			}
  	//			if ok >= 5 {
  	//				log.Debug("请求数据失败...", v)
  	//				reset = append(reset, v)
  	//				break
  	//			}
  	//		}
  	//		ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
  	//			"$set": map[string]interface{}{
  	//				"zhipu": zp,
  	//			},
  	//		})
  	//	}
  	//	if len(reset) > 0 { // retry without limit
  	//		RunResetUpdateFieldInfo(reset, name, s_name)
  	//	}
  	//	log.Debug("本轮重置更新结束......")
  }