extract.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "strings"
  9. "sync"
  10. "time"
  11. "unicode/utf8"
  12. )
  13. // 识别结构化字段
  14. func ExtractFieldInfo(sid string, eid string) {
  15. q := map[string]interface{}{
  16. "_id": map[string]interface{}{
  17. "$gt": ul.StringTOBsonId(sid),
  18. "$lte": ul.StringTOBsonId(eid),
  19. },
  20. }
  21. //先查询抽取表-确定大模型需要识别到范围
  22. dict := ConfrimExtractInfo(q)
  23. log.Debug("查询语句...", q, "~", len(dict))
  24. if len(dict) >= ul.MaxUdp { //根据数量限制使用具体模型
  25. log.Debug("数量超过限制临时使用:glm-4-flashx")
  26. ul.FlashModel = "glm-4-flashx"
  27. } else {
  28. ul.FlashModel = "glm-4-flash"
  29. }
  30. pool_mgo := make(chan bool, ul.Reading)
  31. wg_mgo := &sync.WaitGroup{}
  32. sess := ul.BidMgo.GetMgoConn()
  33. defer ul.BidMgo.DestoryMongoConn(sess)
  34. total, isok := 0, 0
  35. it := sess.DB(ul.BidMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
  36. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  37. if total%1000 == 0 {
  38. log.Debug("cur ai index ", total)
  39. }
  40. tmpid := ul.BsonTOStringId(tmp["_id"])
  41. infoformat := qu.IntAll(tmp["infoformat"])
  42. if infoformat > 1 || dict[tmpid] == nil {
  43. tmp = make(map[string]interface{})
  44. continue
  45. }
  46. isok++
  47. pool_mgo <- true
  48. wg_mgo.Add(1)
  49. go func(tmp map[string]interface{}) {
  50. defer func() {
  51. <-pool_mgo
  52. wg_mgo.Done()
  53. }()
  54. info := map[string]interface{}{}
  55. u_id := ul.BsonTOStringId(tmp["_id"])
  56. //大模型数据···
  57. ai_zhipu := ResolveInfo(tmp, u_id)
  58. if len(ai_zhipu) > 0 {
  59. info["ai_zhipu"] = ai_zhipu
  60. }
  61. //科学计数法标记···数据标记···会冗余
  62. s, f := ScientificUnit(qu.ObjToString(tmp["detail"]))
  63. if s != "" && f > 0.0 {
  64. info["e_bidamount"] = f
  65. }
  66. //更新方法
  67. if len(info) > 0 && u_id != "" {
  68. ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
  69. "$set": info,
  70. })
  71. }
  72. }(tmp)
  73. tmp = make(map[string]interface{})
  74. }
  75. wg_mgo.Wait()
  76. log.Debug("ai is over ...", sid, "~", eid)
  77. }
  78. // 获取处理数据...
  79. func ResolveInfo(v map[string]interface{}, tmpid string) map[string]interface{} {
  80. title := qu.ObjToString(v["title"])
  81. old_detail := getDetailText(v, tmpid) //获取正文文本
  82. //是否表格
  83. isTable := false
  84. if strings.Contains(old_detail, "<table>") {
  85. isTable = false //可以屏蔽表格的识别内容
  86. }
  87. //过滤信息
  88. if NotInProgressInfo(title, old_detail, v) {
  89. return map[string]interface{}{}
  90. }
  91. //识别结构,短文本结构
  92. f_data, shorText := map[string]interface{}{}, false
  93. if utf8.RuneCountInString(old_detail) < 100 {
  94. shorText = true
  95. }
  96. //文本格式转换
  97. new_detail := ul.HttpConvertToMarkdown(old_detail)
  98. //特殊文本转换
  99. new_detail = CleanText(new_detail)
  100. //短文本判断是否有效性
  101. if shorText {
  102. if info := prompt.AcquireJudgeShortInfo(new_detail); info["结果"] != "是" {
  103. return map[string]interface{}{}
  104. }
  105. }
  106. //获取外围字段数据-拆分合并字段
  107. f_info_1 := prompt.AcquireExtractFieldInfoFirst(new_detail)
  108. f_info_2 := prompt.AcquireExtractFieldInfoSecond(new_detail)
  109. f_info_3 := prompt.AcquireExtractFieldInfoThird(new_detail)
  110. f_info := MergeInfo([]map[string]interface{}{f_info_1, f_info_2, f_info_3})
  111. //非短文本以下识别-纯测试
  112. if !shorText {
  113. //获取分包信息
  114. if pkg := prompt.AcquireNewMultiplePackageInfo(new_detail, isTable); len(pkg) > 0 {
  115. f_info["s_pkg"] = pkg
  116. }
  117. //获取分类字段数据
  118. s_toptype, s_subtype := prompt.AcquireClassInfo(new_detail, title, qu.ObjToString(v["toptype"]))
  119. f_info["s_toptype"] = s_toptype
  120. f_info["s_subtype"] = s_subtype
  121. //调用标的物识别
  122. if !ul.IsTool && !ul.IsLocal {
  123. if s_purchasinglist := getPurList(v, old_detail, f_info); len(s_purchasinglist) > 0 {
  124. f_info["s_purchasinglist"] = s_purchasinglist
  125. }
  126. }
  127. }
  128. //字段清洗
  129. fns := GetFnsInfo(v) //获取附件名字
  130. f_data = clean.CleanFieldInfo(f_info, fns, isTable)
  131. //采购单位二级校验
  132. CheckOutBuyerInfo(f_data)
  133. //标题提取采购单位
  134. if qu.ObjToString(f_data["s_buyer"]) == "" {
  135. if zp_buyer := prompt.AcquireBuyerInfo(title); zp_buyer["实体单位"] != nil {
  136. if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
  137. f_data["s_buyer"] = ns_buyer
  138. }
  139. }
  140. }
  141. //强制逻辑判断-
  142. ForcedLogicDecideInfo(f_data)
  143. //返回数据
  144. return f_data
  145. }
  146. /*
  147. ************************************************************
  148. ************************************************************
  149. ************************************************************
  150. 支持新模型-deepseek的轮询查询
  151. */
  152. func RunDeepSeek() {
  153. log.Debug("执行轮询定时···deepseek···", ul.Ext_Name)
  154. tmp_data := ul.PyMgo.FindById(ul.Ext_Name, "67c50d2088dabe81a67a2468")
  155. ttt := ExtractDeepSeekInfo(tmp_data)
  156. log.Debug(ttt)
  157. return
  158. for {
  159. log.Debug("开始处理线程数···", ul.Reading)
  160. pool_mgo := make(chan bool, ul.Reading)
  161. wg_mgo := &sync.WaitGroup{}
  162. sess := ul.PyMgo.GetMgoConn()
  163. defer ul.PyMgo.DestoryMongoConn(sess)
  164. q, total, isok := map[string]interface{}{}, 0, 0
  165. it := sess.DB(ul.PyMgo.DbName).C(ul.Ext_Name).Find(&q).Iter()
  166. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  167. if total%100 == 0 {
  168. log.Debug("cur ai index ", total)
  169. }
  170. isok++
  171. if tmp["ai_updatetime"] != nil {
  172. tmp = make(map[string]interface{})
  173. continue
  174. }
  175. pool_mgo <- true
  176. wg_mgo.Add(1)
  177. go func(tmp map[string]interface{}) {
  178. defer func() {
  179. <-pool_mgo
  180. wg_mgo.Done()
  181. }()
  182. u_id := ul.BsonTOStringId(tmp["_id"])
  183. //抽取deepseek数据···并更新
  184. data := ExtractDeepSeekInfo(tmp)
  185. update_info := make(map[string]interface{}, 0)
  186. if len(data) > 0 && u_id != "" {
  187. tmp["ai_zhipu"] = data
  188. ul.ChooseCheckDataAI(tmp, &update_info)
  189. if update_info["com_package"] == nil { //构建单包信息···
  190. com_package := ul.CreatSingleFieldInfo(tmp, update_info)
  191. update_info["com_package"] = com_package
  192. }
  193. update_info["ai_zhipu"] = data
  194. }
  195. update_info["ai_updatetime"] = time.Now().Unix()
  196. ul.PyMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
  197. "$set": update_info,
  198. })
  199. }(tmp)
  200. tmp = make(map[string]interface{})
  201. }
  202. wg_mgo.Wait()
  203. log.Debug("deepseek完毕······", isok)
  204. break
  205. time.Sleep(time.Second * 1800)
  206. }
  207. }
  208. // deepseek模型识别字段
  209. func ExtractDeepSeekInfo(tmp map[string]interface{}) map[string]interface{} {
  210. //基础信息
  211. tmpid := ul.BsonTOStringId(tmp["_id"])
  212. title := qu.ObjToString(tmp["title"])
  213. old_detail := getDetailText(tmp, tmpid) //获取正文文本
  214. //过滤信息
  215. if NotInProgressInfo(title, old_detail, tmp) {
  216. return map[string]interface{}{}
  217. }
  218. //识别结构,短文本结构,不想进行分类识别
  219. shorText := false
  220. if utf8.RuneCountInString(old_detail) < 100 {
  221. shorText = true
  222. }
  223. //文本格式转换
  224. new_detail := ul.HttpConvertToMarkdown(title + "\n" + old_detail)
  225. //特殊文本转换
  226. new_detail = CleanText(new_detail)
  227. //短文本判断是否有效性
  228. if shorText {
  229. if info := prompt.AcquireJudgeDeepSeekShortInfo(new_detail); info["结果"] != "是" {
  230. return map[string]interface{}{}
  231. }
  232. }
  233. //获取通用该字段
  234. f_info := prompt.AcquireExtractFieldDeepSeekInfo(new_detail)
  235. //******字段清洗******
  236. f_data := clean.CleanDeepSeekInfo(f_info, tmp)
  237. //******二级校验******
  238. CheckOutDeepSeekBuyerInfo(f_data)
  239. //******强制判断******
  240. ForcedLogicDecideInfo(f_data)
  241. return f_data
  242. }
  243. // 暂时不启用...无限重试
  244. func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
  245. //log.Debug("开始重置更新...", len(arr))
  246. //reset := []string{}
  247. //for k, v := range arr {
  248. // log.Debug("...", k, "...", v)
  249. // data := ul.SourceMgo.FindById(name, v)
  250. // content := PromptFieldText(qu.ObjToString(data["detail"]))
  251. // zp, ok := map[string]interface{}{}, 0
  252. // for {
  253. // ok++
  254. // if zp = ai.PostZhiPuAI(content); len(zp) > 0 {
  255. // break
  256. // }
  257. // if ok >= 5 {
  258. // log.Debug("请求数据失败...", v)
  259. // reset = append(reset, v)
  260. // break
  261. // }
  262. // }
  263. // ul.SourceMgo.UpdateById(s_name, v, map[string]interface{}{
  264. // "$set": map[string]interface{}{
  265. // "zhipu": zp,
  266. // },
  267. // })
  268. //}
  269. //if len(reset) > 0 { //无限尝试
  270. // RunResetUpdateFieldInfo(reset, name, s_name)
  271. //}
  272. //log.Debug("本轮重置更新结束......")
  273. }