extract.go 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. "fmt"
  7. log "github.com/donnie4w/go-logger/logger"
  8. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  9. "sync"
  10. "unicode/utf8"
  11. )
  12. // 抽取结构字段
  13. func ExtractFieldInfo(name string, s_name string) {
  14. pool_mgo := make(chan bool, 50)
  15. wg_mgo := &sync.WaitGroup{}
  16. dataArr, _ := ul.SourceMgo.Find(name, map[string]interface{}{}, nil, nil)
  17. for k, v := range dataArr {
  18. if k%100 == 0 {
  19. log.Debug(k, "~", ul.BsonTOStringId(v["_id"]))
  20. }
  21. infoformat := qu.IntAll(v["infoformat"])
  22. if infoformat != 1 {
  23. continue
  24. }
  25. pool_mgo <- true
  26. wg_mgo.Add(1)
  27. go func(v map[string]interface{}) {
  28. defer func() {
  29. <-pool_mgo
  30. wg_mgo.Done()
  31. }()
  32. tmpid := ul.BsonTOStringId(v["_id"])
  33. data := ResolveInfo(v)
  34. //最终结果...
  35. ul.SourceMgo.Save(s_name, map[string]interface{}{
  36. "_id": v["_id"],
  37. "href": v["href"],
  38. "jyhref": fmt.Sprintf(ul.Url, qu.CommonEncodeArticle("content", tmpid)),
  39. "zhipu": data,
  40. "num": v["num"],
  41. })
  42. }(v)
  43. }
  44. wg_mgo.Wait()
  45. log.Debug("is over ...")
  46. }
  47. // 获取处理数据...
  48. func ResolveInfo(v map[string]interface{}) map[string]interface{} {
  49. detail := qu.ObjToString(v["detail"])
  50. title := qu.ObjToString(v["title"])
  51. if utf8.RuneCountInString(detail) < 100 {
  52. return map[string]interface{}{}
  53. }
  54. //分包判断,获取信息
  55. ispkg, pkg := false, map[string]interface{}{}
  56. if ispkg = prompt.AcquireIsPackageInfo(detail); ispkg {
  57. pkg = prompt.AcquireMultiplePackageInfo(detail)
  58. }
  59. //获取外围字段数据
  60. info := prompt.AcquireExtractFieldInfo(detail)
  61. //外围字段清洗
  62. data := clean.CleanFieldInfo(info, pkg)
  63. //获取分类字段数据
  64. s_toptype, s_subtype := "", ""
  65. if qu.ObjToString(v["toptype"]) == "拟建" {
  66. s_toptype, s_subtype = "拟建", "拟建"
  67. } else {
  68. s_toptype, s_subtype = prompt.AcquireClassInfo(detail, title)
  69. }
  70. if s_toptype != "" {
  71. data["s_toptype"] = s_toptype
  72. data["s_subtype"] = s_subtype
  73. }
  74. //临时···记录分包信息
  75. data["ispkg"] = ispkg
  76. data["pkg"] = pkg
  77. //最终逻辑校验
  78. data = clean.CleanFinallyInfo(data)
  79. return data
  80. }
  81. // 暂时不启用...无限重试
  82. func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
  83. //log.Debug("开始重置更新...", len(arr))
  84. //reset := []string{}
  85. //for k, v := range arr {
  86. // log.Debug("...", k, "...", v)
  87. // data := ul.SourceMgo.FindById(name, v)
  88. // content := PromptFieldText(qu.ObjToString(data["detail"]))
  89. // zp, ok := map[string]interface{}{}, 0
  90. // for {
  91. // ok++
  92. // if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
  93. // break
  94. // }
  95. // if ok >= 5 {
  96. // log.Debug("请求数据失败...", v)
  97. // reset = append(reset, v)
  98. // break
  99. // }
  100. // }
  101. // ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
  102. // "$set": map[string]interface{}{
  103. // "zhipu": zp,
  104. // },
  105. // })
  106. //}
  107. //if len(reset) > 0 { //无限尝试
  108. // RunResetUpdateFieldInfo(reset, name, s_name)
  109. //}
  110. //log.Debug("本轮重置更新结束......")
  111. }