extract.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "strings"
  9. "sync"
  10. "unicode/utf8"
  11. )
  12. // 识别结构化字段
  13. func ExtractFieldInfo(sid string, eid string) {
  14. q := map[string]interface{}{
  15. "_id": map[string]interface{}{
  16. "$gt": ul.StringTOBsonId(sid),
  17. "$lte": ul.StringTOBsonId(eid),
  18. },
  19. }
  20. //先查询抽取表-确定大模型需要识别到范围
  21. dict := ConfrimExtractInfo(q)
  22. log.Debug("查询语句...", q, "~", len(dict))
  23. if len(dict) >= ul.MaxUdp { //根据数量限制使用具体模型
  24. log.Debug("数量超过限制临时使用:glm-4-flashx")
  25. ul.FlashModel = "glm-4-flashx"
  26. } else {
  27. ul.FlashModel = "glm-4-flash"
  28. }
  29. pool_mgo := make(chan bool, ul.Reading)
  30. wg_mgo := &sync.WaitGroup{}
  31. sess := ul.SourceMgo.GetMgoConn()
  32. defer ul.SourceMgo.DestoryMongoConn(sess)
  33. total, isok := 0, 0
  34. it := sess.DB(ul.SourceMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
  35. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  36. if total%200 == 0 {
  37. log.Debug("cur ai index ", total)
  38. }
  39. tmpid := ul.BsonTOStringId(tmp["_id"])
  40. infoformat := qu.IntAll(tmp["infoformat"])
  41. if infoformat > 1 || dict[tmpid] == nil {
  42. tmp = make(map[string]interface{})
  43. continue
  44. }
  45. isok++
  46. pool_mgo <- true
  47. wg_mgo.Add(1)
  48. go func(tmp map[string]interface{}) {
  49. defer func() {
  50. <-pool_mgo
  51. wg_mgo.Done()
  52. }()
  53. u_id := ul.BsonTOStringId(tmp["_id"])
  54. data := ResolveInfo(tmp)
  55. if len(data) > 0 || u_id == "" {
  56. ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
  57. "$set": map[string]interface{}{"ai_zhipu": data},
  58. })
  59. }
  60. }(tmp)
  61. tmp = make(map[string]interface{})
  62. }
  63. wg_mgo.Wait()
  64. log.Debug("ai is over ...", sid, "~", eid)
  65. }
  66. // 获取处理数据...
  67. func ResolveInfo(v map[string]interface{}) map[string]interface{} {
  68. tmpid := ul.BsonTOStringId(v["_id"])
  69. detail := getDetailText(v, tmpid) //获取正文文本
  70. title := qu.ObjToString(v["title"])
  71. dl := utf8.RuneCountInString(detail) //文本长度
  72. //过滤数据···
  73. if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
  74. return map[string]interface{}{}
  75. }
  76. if dl < 20 {
  77. return map[string]interface{}{}
  78. }
  79. //识别结构,短文本结构
  80. f_data, shorText := map[string]interface{}{}, false
  81. if dl < 100 {
  82. shorText = true
  83. }
  84. //文本格式转换
  85. detail = ul.HttpConvertToMarkdown(detail)
  86. //短文本判断是否有效性
  87. if shorText {
  88. if info := prompt.AcquireJudgeShortInfo(detail); info["结果"] != "是" {
  89. return map[string]interface{}{}
  90. }
  91. }
  92. //获取外围字段数据-拆分合并字段
  93. f_info_1 := prompt.AcquireExtractFieldInfoFirst(detail)
  94. f_info_2 := prompt.AcquireExtractFieldInfoFirst(detail)
  95. f_info := MergeInfo([]map[string]interface{}{f_info_1, f_info_2})
  96. if !shorText {
  97. //获取分包信息
  98. if pkg := prompt.AcquireNewMultiplePackageInfo(detail); len(pkg) > 0 {
  99. f_info["s_pkg"] = pkg
  100. }
  101. //获取分类字段数据
  102. s_toptype, s_subtype := prompt.AcquireClassInfo(detail, title, qu.ObjToString(v["toptype"]))
  103. f_info["s_toptype"] = s_toptype
  104. f_info["s_subtype"] = s_subtype
  105. //调用标的物识别
  106. if p_list := getPurList(v, detail, f_info); len(p_list) > 0 {
  107. f_info["purchasinglist"] = p_list
  108. }
  109. }
  110. //字段清洗
  111. fns := getpnsinfo(v) //获取附件名字
  112. f_data = clean.CleanFieldInfo(f_info, fns)
  113. //采购单位二级校验
  114. CheckOutBuyerInfo(f_data)
  115. //强制逻辑判断-
  116. ForcedLogicDecideInfo(f_data)
  117. return f_data
  118. }
  119. // 确认抽取范围
  120. func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
  121. dict := map[string]interface{}{}
  122. sess := ul.SourceMgo.GetMgoConn()
  123. defer ul.SourceMgo.DestoryMongoConn(sess)
  124. total := 0
  125. it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
  126. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  127. if total%1000 == 0 {
  128. log.Debug("cur index ", total)
  129. }
  130. if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别
  131. tmpid := ul.BsonTOStringId(tmp["_id"])
  132. dict[tmpid] = tmpid
  133. }
  134. tmp = make(map[string]interface{})
  135. }
  136. return dict
  137. }
  138. // 获取附件名字信息
  139. func getpnsinfo(tmp map[string]interface{}) []string {
  140. arr := []string{}
  141. if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil {
  142. if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil {
  143. for _, v := range *attachments {
  144. if info := qu.ObjToMap(v); info != nil {
  145. if filename := qu.ObjToString((*info)["filename"]); filename != "" {
  146. arr = append(arr, filename)
  147. }
  148. }
  149. }
  150. }
  151. }
  152. return arr
  153. }
  154. // 获取正文数据
  155. func getDetailText(v map[string]interface{}, tmpid string) string {
  156. detail := qu.ObjToString(v["detail"])
  157. if ul.IsTool {
  158. detail = qu.ObjToString(v["details"])
  159. filetext := qu.ObjToString(v["filetext"])
  160. if utf8.RuneCountInString(detail) < 100 && filetext != "" {
  161. detail = filetext
  162. }
  163. } else {
  164. //if bs := ul.OssGetObject(tmpid); bs != "" {
  165. // detail = bs
  166. //}
  167. }
  168. return detail
  169. }
  170. // 获取标的物-过滤产权-拟建
  171. func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) []map[string]interface{} {
  172. if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" {
  173. return []map[string]interface{}{}
  174. }
  175. p_data := map[string]interface{}{}
  176. p_data["detail"] = qu.ObjToString(v["title"]) + "\n" + detail
  177. p_data["site"] = v["site"]
  178. p_data["attach_text"] = v["attach_text"]
  179. p_data["toptype"] = v["toptype"]
  180. if f_info["s_toptype"] != nil {
  181. p_data["toptype"] = f_info["s_toptype"]
  182. }
  183. if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 {
  184. if qu.IntAll(p_info["status"]) == 200 {
  185. p_list := ul.IsMarkInterfaceMap(p_info["purchasinglist"])
  186. return p_list
  187. }
  188. }
  189. return []map[string]interface{}{}
  190. }
  191. // 二次校验采购单位
  192. func CheckOutBuyerInfo(f_data map[string]interface{}) {
  193. if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
  194. if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
  195. if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
  196. f_data["s_buyer"] = ns_buyer
  197. }
  198. }
  199. }
  200. }
  // MergeInfo flattens infos into one newly allocated map.
  //
  // Maps are copied in slice order, so when a key occurs in several maps the
  // value from the later map wins. The inputs are not modified and the result
  // is never nil, even for a nil or empty infos.
  func MergeInfo(infos []map[string]interface{}) map[string]interface{} {
  	merged := make(map[string]interface{})
  	for i := 0; i < len(infos); i++ {
  		for key, value := range infos[i] {
  			merged[key] = value
  		}
  	}
  	return merged
  }
  211. // 强制逻辑判断数据
  212. func ForcedLogicDecideInfo(f_data map[string]interface{}) {
  213. //原则大模型
  214. //多单位不能一致
  215. s_buyer := qu.ObjToString(f_data["s_buyer"])
  216. s_winner := qu.ObjToString(f_data["s_winner"])
  217. if s_buyer == s_winner && s_buyer != "" {
  218. /*
  219. 1、若单位名称-不含公司保留采购单位
  220. 2、若单位名称-含公司保留中标单位
  221. */
  222. if strings.Contains(s_buyer, "公司") {
  223. f_data["s_buyer"] = ""
  224. } else {
  225. f_data["s_winner"] = ""
  226. }
  227. }
  228. //代理机构
  229. if s_agency := qu.ObjToString(f_data["s_agency"]); s_agency != "" {
  230. if s_agency == s_buyer || s_agency == s_winner {
  231. f_data["s_agency"] = ""
  232. }
  233. }
  234. }
  // RunResetUpdateFieldInfo is currently disabled and does nothing; all three
  // parameters are ignored.
  //
  // The retained, commented-out implementation re-requested the model result
  // for every id in arr (up to five attempts each), stored it under "zhipu" in
  // collection s_name, and called itself again with the ids that still failed,
  // retrying without limit.
  func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
  	// Disabled implementation, kept for reference:
  	//
  	//log.Debug("开始重置更新...", len(arr))
  	//reset := []string{}
  	//for k, v := range arr {
  	//	log.Debug("...", k, "...", v)
  	//	data := ul.SourceMgo.FindById(name, v)
  	//	content := PromptFieldText(qu.ObjToString(data["detail"]))
  	//	zp, ok := map[string]interface{}{}, 0
  	//	for {
  	//		ok++
  	//		if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
  	//			break
  	//		}
  	//		if ok >= 5 {
  	//			log.Debug("请求数据失败...", v)
  	//			reset = append(reset, v)
  	//			break
  	//		}
  	//	}
  	//	ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
  	//		"$set": map[string]interface{}{
  	//			"zhipu": zp,
  	//		},
  	//	})
  	//}
  	//if len(reset) > 0 { // retry without limit
  	//	RunResetUpdateFieldInfo(reset, name, s_name)
  	//}
  	//log.Debug("本轮重置更新结束......")
  }