extension.go 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "regexp"
  9. "strings"
  10. "unicode/utf8"
  11. )
  12. func FilterDetail(con string) string {
  13. return Reg.ReplaceAllString(Filter.ReplaceAllString(con, ""), "")
  14. }
  15. var Reg = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+")
  16. var Filter = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]")
  17. var SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
  18. // 确认抽取范围
  19. func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
  20. dict := map[string]interface{}{}
  21. sess := ul.SourceMgo.GetMgoConn()
  22. defer ul.SourceMgo.DestoryMongoConn(sess)
  23. total := 0
  24. it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
  25. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  26. if total%1000 == 0 {
  27. log.Debug("cur index ", total)
  28. }
  29. if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别
  30. tmpid := ul.BsonTOStringId(tmp["_id"])
  31. dict[tmpid] = tmpid
  32. }
  33. tmp = make(map[string]interface{})
  34. }
  35. return dict
  36. }
  37. // 获取附件名字信息
  38. func getpnsinfo(tmp map[string]interface{}) []string {
  39. arr := []string{}
  40. if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil {
  41. if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil {
  42. for _, v := range *attachments {
  43. if info := qu.ObjToMap(v); info != nil {
  44. if filename := qu.ObjToString((*info)["filename"]); filename != "" {
  45. arr = append(arr, filename)
  46. }
  47. }
  48. }
  49. }
  50. }
  51. return arr
  52. }
  53. // 获取正文数据
  54. func getDetailText(v map[string]interface{}, tmpid string) string {
  55. detail := qu.ObjToString(v["detail"])
  56. if ul.IsTool {
  57. if details := qu.ObjToString(v["details"]); details != "" {
  58. detail = details
  59. }
  60. filetext := qu.ObjToString(v["filetext"])
  61. if utf8.RuneCountInString(detail) < 100 && filetext != "" {
  62. detail = filetext
  63. }
  64. } else {
  65. //if bs := ul.OssGetObject(tmpid); bs != "" {
  66. // detail = bs
  67. //}
  68. }
  69. return detail
  70. }
  71. // 获取标的物-过滤产权-拟建
  72. func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) []map[string]interface{} {
  73. if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" {
  74. return []map[string]interface{}{}
  75. }
  76. p_data := map[string]interface{}{}
  77. p_data["detail"] = qu.ObjToString(v["title"]) + "\n" + detail
  78. p_data["site"] = v["site"]
  79. p_data["attach_text"] = v["attach_text"]
  80. p_data["toptype"] = v["toptype"]
  81. if f_info["s_toptype"] != nil {
  82. p_data["toptype"] = f_info["s_toptype"]
  83. }
  84. if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 {
  85. if qu.IntAll(p_info["status"]) == 200 {
  86. p_list := ul.IsMarkInterfaceMap(p_info["purchasinglist"])
  87. return p_list
  88. }
  89. }
  90. return []map[string]interface{}{}
  91. }
  92. /*
  93. ****************************************
  94. ****************************************
  95. ****************************************
  96. */
  97. // 过滤信息规则···
  98. func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool {
  99. if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
  100. return true
  101. }
  102. detail = FilterDetail(detail) //只保留文本内容
  103. dl := utf8.RuneCountInString(detail) //文本长度
  104. if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) {
  105. return true
  106. }
  107. return false
  108. }
  109. // 二次校验采购单位
  110. func CheckOutBuyerInfo(f_data map[string]interface{}) {
  111. if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
  112. if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
  113. if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
  114. f_data["s_buyer"] = ns_buyer
  115. }
  116. }
  117. }
  118. }
  119. // 合并字段
  120. func MergeInfo(infos []map[string]interface{}) map[string]interface{} {
  121. info := map[string]interface{}{}
  122. for _, v := range infos {
  123. for k1, v1 := range v {
  124. info[k1] = v1
  125. }
  126. }
  127. return info
  128. }
  129. // 强制逻辑判断数据
  130. func ForcedLogicDecideInfo(f_data map[string]interface{}) {
  131. //多单位不能一致,原则大模型
  132. s_buyer := qu.ObjToString(f_data["s_buyer"])
  133. s_winner := qu.ObjToString(f_data["s_winner"])
  134. if s_buyer == s_winner && s_buyer != "" {
  135. /*
  136. 1、若单位名称-不含公司保留采购单位
  137. 2、若单位名称-含公司保留中标单位
  138. */
  139. if strings.Contains(s_buyer, "公司") {
  140. f_data["s_buyer"] = ""
  141. } else {
  142. f_data["s_winner"] = ""
  143. }
  144. }
  145. //代理机构
  146. if s_agency := qu.ObjToString(f_data["s_agency"]); s_agency != "" {
  147. if s_agency == s_buyer || s_agency == s_winner {
  148. f_data["s_agency"] = ""
  149. }
  150. }
  151. }