extension.go 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "regexp"
  9. "strings"
  10. "unicode/utf8"
  11. )
  // Patterns used to reduce announcement bodies to comparable plain text.
  var (
  	// Reg matches every run of characters that is not an ASCII digit, an
  	// ASCII letter, or a character in the range U+4E00..U+9FA5.
  	Reg = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+")

  	// Filter matches angle-bracket tags and single whitespace characters,
  	// including U+3000, U+2003 and U+00A0.
  	Filter = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]")

  	// SpecialTextReg matches stock phrases that refer the reader to the
  	// original web page or to an attachment.
  	SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
  )

  // FilterDetail removes tags and whitespace from con (Filter) and then drops
  // every remaining character that is not a digit, an ASCII letter or a
  // character in U+4E00..U+9FA5 (Reg). The result is the bare text content.
  func FilterDetail(con string) string {
  	withoutMarkup := Filter.ReplaceAllString(con, "")
  	return Reg.ReplaceAllString(withoutMarkup, "")
  }
  18. // 确认抽取范围
  19. func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
  20. dict := map[string]interface{}{}
  21. sess := ul.SourceMgo.GetMgoConn()
  22. defer ul.SourceMgo.DestoryMongoConn(sess)
  23. total := 0
  24. it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
  25. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  26. if total%1000 == 0 {
  27. log.Debug("cur index ", total)
  28. }
  29. if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别
  30. tmpid := ul.BsonTOStringId(tmp["_id"])
  31. dict[tmpid] = tmpid
  32. }
  33. tmp = make(map[string]interface{})
  34. }
  35. return dict
  36. }
  37. // 获取附件名字信息
  38. func getpnsinfo(tmp map[string]interface{}) []string {
  39. arr := []string{}
  40. if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil {
  41. if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil {
  42. for _, v := range *attachments {
  43. if info := qu.ObjToMap(v); info != nil {
  44. if filename := qu.ObjToString((*info)["filename"]); filename != "" {
  45. arr = append(arr, filename)
  46. }
  47. }
  48. }
  49. }
  50. }
  51. return arr
  52. }
  53. // 获取正文数据
  54. func getDetailText(v map[string]interface{}, tmpid string) string {
  55. detail := qu.ObjToString(v["detail"])
  56. if ul.IsTool {
  57. if details := qu.ObjToString(v["details"]); details != "" {
  58. detail = details
  59. }
  60. filetext := qu.ObjToString(v["filetext"])
  61. if utf8.RuneCountInString(detail) < 100 && filetext != "" {
  62. detail = filetext
  63. }
  64. } else {
  65. //if bs := ul.OssGetObject(tmpid); bs != "" {
  66. // detail = bs
  67. //}
  68. }
  69. return detail
  70. }
  71. // 获取标的物-过滤产权-拟建
  72. func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) map[string]interface{} {
  73. if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" {
  74. return map[string]interface{}{}
  75. }
  76. p_data := map[string]interface{}{}
  77. p_data["detail"] = qu.ObjToString(v["title"]) + "\n" + detail
  78. p_data["site"] = v["site"]
  79. p_data["attach_text"] = v["attach_text"]
  80. p_data["toptype"] = v["toptype"]
  81. if f_info["s_toptype"] != nil {
  82. p_data["toptype"] = f_info["s_toptype"]
  83. }
  84. //结果有 标的物+标的物label+剑鱼码
  85. if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 {
  86. if qu.IntAll(p_info["status"]) == 200 {
  87. //消息体
  88. message := qu.ObjToMap(p_info["message"])
  89. if message != nil {
  90. return *message
  91. }
  92. return map[string]interface{}{}
  93. }
  94. }
  95. return map[string]interface{}{}
  96. }
  97. /*
  98. ****************************************
  99. ****************************************
  100. ****************************************
  101. */
  102. // 过滤信息规则···
  103. func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool {
  104. if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
  105. return true
  106. }
  107. detail = FilterDetail(detail) //只保留文本内容
  108. dl := utf8.RuneCountInString(detail) //文本长度
  109. if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) {
  110. return true
  111. }
  112. return false
  113. }
  114. // 二次校验采购单位
  115. func CheckOutBuyerInfo(f_data map[string]interface{}) {
  116. if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
  117. if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
  118. if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
  119. f_data["s_buyer"] = ns_buyer
  120. }
  121. }
  122. }
  123. }
  // MergeInfo flattens infos into one new map. Maps are applied in slice
  // order, so a key present in several of them keeps the value from the last
  // one. The result is always non-nil, even for a nil or empty input.
  func MergeInfo(infos []map[string]interface{}) map[string]interface{} {
  	merged := make(map[string]interface{})
  	for i := range infos {
  		for key, val := range infos[i] {
  			merged[key] = val
  		}
  	}
  	return merged
  }
  134. // 强制逻辑判断数据
  135. func ForcedLogicDecideInfo(f_data map[string]interface{}) {
  136. //多单位不能一致,原则大模型
  137. s_buyer := qu.ObjToString(f_data["s_buyer"])
  138. s_winner := qu.ObjToString(f_data["s_winner"])
  139. if s_buyer == s_winner && s_buyer != "" {
  140. /*
  141. 1、若单位名称-不含公司保留采购单位
  142. 2、若单位名称-含公司保留中标单位
  143. */
  144. if strings.Contains(s_buyer, "公司") {
  145. f_data["s_buyer"] = ""
  146. } else {
  147. f_data["s_winner"] = ""
  148. }
  149. }
  150. //代理机构
  151. if s_agency := qu.ObjToString(f_data["s_agency"]); s_agency != "" {
  152. if s_agency == s_buyer || s_agency == s_winner {
  153. f_data["s_agency"] = ""
  154. }
  155. }
  156. }