extract.go 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "sync"
  9. "unicode/utf8"
  10. )
  11. // 识别结构化字段
  12. func ExtractFieldInfo(sid string, eid string, name string) {
  13. q := map[string]interface{}{
  14. "_id": map[string]interface{}{
  15. "$gt": ul.StringTOBsonId(sid),
  16. "$lte": ul.StringTOBsonId(eid),
  17. },
  18. }
  19. pool_mgo := make(chan bool, 50)
  20. wg_mgo := &sync.WaitGroup{}
  21. dataArr, _ := ul.SourceMgo.Find(name, q, nil, nil)
  22. for k, v := range dataArr {
  23. if k%100 == 0 {
  24. log.Debug(k, "~", ul.BsonTOStringId(v["_id"]))
  25. }
  26. infoformat := qu.IntAll(v["infoformat"])
  27. if infoformat != 1 {
  28. continue
  29. }
  30. pool_mgo <- true
  31. wg_mgo.Add(1)
  32. go func(v map[string]interface{}) {
  33. defer func() {
  34. <-pool_mgo
  35. wg_mgo.Done()
  36. }()
  37. tmpid := ul.BsonTOStringId(v["_id"])
  38. data := ResolveInfo(v)
  39. if len(data) > 0 || tmpid == "" {
  40. ul.SourceMgo.UpdateById(name, tmpid, map[string]interface{}{
  41. "$set": map[string]interface{}{"ai_zhipu": data},
  42. })
  43. }
  44. }(v)
  45. }
  46. wg_mgo.Wait()
  47. log.Debug("is over ...", sid, "~", eid)
  48. }
  49. // 获取处理数据...
  50. func ResolveInfo(v map[string]interface{}) map[string]interface{} {
  51. detail := qu.ObjToString(v["detail"])
  52. title := qu.ObjToString(v["title"])
  53. f_data := map[string]interface{}{}
  54. if utf8.RuneCountInString(detail) < 100 {
  55. return f_data
  56. }
  57. //获取外围字段数据
  58. f_info := prompt.AcquireExtractFieldInfo(detail)
  59. //分包判断-获取信息
  60. ispkg, pkg := false, map[string]interface{}{}
  61. if ispkg = prompt.AcquireIsPackageInfo(detail); ispkg {
  62. f_info["ispkg"] = ispkg
  63. if pkg = prompt.AcquireMultiplePackageInfo(detail); len(pkg) > 0 {
  64. f_info["s_pkg"] = pkg
  65. }
  66. }
  67. //获取分类字段数据
  68. s_toptype, s_subtype := "", ""
  69. if qu.ObjToString(v["toptype"]) == "拟建" {
  70. s_toptype, s_subtype = "拟建", "拟建"
  71. } else {
  72. s_toptype, s_subtype = prompt.AcquireClassInfo(detail, title)
  73. }
  74. f_info["s_toptype"] = s_toptype
  75. f_info["s_subtype"] = s_subtype
  76. //字段清洗
  77. f_data = clean.CleanFieldInfo(f_info)
  78. return f_data
  79. }
  // RunResetUpdateFieldInfo is disabled for now: the body holds no executable
  // statements, so a call does nothing and arr, name and s_name are ignored.
  //
  // The retired implementation is preserved inside the body as a comment. It
  // retried failed requests without any upper bound by calling itself again
  // with the ids that had failed.
  func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
  	// Retired implementation, kept for reference only:
  	//
  	//	log.Debug("开始重置更新...", len(arr))
  	//	reset := []string{}
  	//	for k, v := range arr {
  	//		log.Debug("...", k, "...", v)
  	//		data := ul.SourceMgo.FindById(name, v)
  	//		content := PromptFieldText(qu.ObjToString(data["detail"]))
  	//		zp, ok := map[string]interface{}{}, 0
  	//		for {
  	//			ok++
  	//			if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
  	//				break
  	//			}
  	//			if ok >= 5 {
  	//				log.Debug("请求数据失败...", v)
  	//				reset = append(reset, v)
  	//				break
  	//			}
  	//		}
  	//		ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
  	//			"$set": map[string]interface{}{
  	//				"zhipu": zp,
  	//			},
  	//		})
  	//	}
  	//	if len(reset) > 0 { // retry without limit
  	//		RunResetUpdateFieldInfo(reset, name, s_name)
  	//	}
  	//	log.Debug("本轮重置更新结束......")
  }