extension.go 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. package extract
  2. import (
  3. "data_ai/clean"
  4. "data_ai/prompt"
  5. "data_ai/ul"
  6. log "github.com/donnie4w/go-logger/logger"
  7. "github.com/shopspring/decimal"
  8. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  9. "regexp"
  10. "strings"
  11. "unicode/utf8"
  12. )
  var (
  	// Reg matches runs of characters that are neither ASCII letters/digits
  	// nor in the range U+4E00..U+9FA5.
  	Reg = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+")

  	// Filter matches a "<...>" tag-like span or a single whitespace
  	// character (including U+3000, U+2003 and U+00A0).
  	Filter = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]")
  )

  // FilterDetail reduces con to its bare text: tag-like spans and whitespace
  // are removed first, then every remaining character outside Reg's allowed
  // set is dropped.
  func FilterDetail(con string) string {
  	withoutMarkup := Filter.ReplaceAllString(con, "")
  	return Reg.ReplaceAllString(withoutMarkup, "")
  }
  var (
  	// SpecialTextReg matches phrases that refer the reader to the original
  	// web page or to an attachment.
  	SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")

  	// CleanReg0 matches an "e+" / "E+" exponent marker anywhere in the text.
  	CleanReg0 = regexp.MustCompile("([eE][\\+])")

  	// CleanReg1 matches a colon followed by a number such as "1.25E+07".
  	// Group 3 is the mantissa and group 5 the exponent, which must be one or
  	// more zeros followed by a digit 6-9.
  	// NOTE(review): the character class lists ':' twice; one of them may have
  	// been meant to be a different colon character - confirm.
  	CleanReg1 = regexp.MustCompile("([::](([1-9][.][0-9]+)([eE][\\+])([0]+[6-9])))")

  	// CleanReg2 matches the same number shape without the leading colon.
  	// Group 3 is the mantissa and group 5 the exponent, here with optional
  	// leading zeros before the digit 6-9.
  	CleanReg2 = regexp.MustCompile("((([1-9][.][0-9]+)([eE][\\+])([0]*[6-9])))")
  )
  22. // 确认抽取范围
  23. func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
  24. dict := map[string]interface{}{}
  25. sess := ul.SourceMgo.GetMgoConn()
  26. defer ul.SourceMgo.DestoryMongoConn(sess)
  27. total := 0
  28. it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
  29. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  30. if total%1000 == 0 {
  31. log.Debug("cur index ", total)
  32. }
  33. if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别
  34. tmpid := ul.BsonTOStringId(tmp["_id"])
  35. dict[tmpid] = tmpid
  36. }
  37. tmp = make(map[string]interface{})
  38. }
  39. return dict
  40. }
  41. // 获取附件名字信息
  42. func GetFnsInfo(tmp map[string]interface{}) []string {
  43. arr := []string{}
  44. if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil {
  45. if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil {
  46. for _, v := range *attachments {
  47. if info := qu.ObjToMap(v); info != nil {
  48. if filename := qu.ObjToString((*info)["filename"]); filename != "" {
  49. arr = append(arr, filename)
  50. }
  51. }
  52. }
  53. }
  54. }
  55. return arr
  56. }
  57. // 获取正文数据
  58. func getDetailText(v map[string]interface{}, tmpid string) string {
  59. //按照最新文本请求的数据···
  60. //detail := ul.PostDetailContentHtmlText("detail", tmpid)
  61. //if detail != "" {
  62. // return detail
  63. //}
  64. detail := qu.ObjToString(v["detail"])
  65. if ul.IsTool {
  66. if details := qu.ObjToString(v["details"]); details != "" {
  67. detail = details
  68. }
  69. filetext := qu.ObjToString(v["filetext"])
  70. if utf8.RuneCountInString(detail) < 100 && filetext != "" {
  71. detail = filetext
  72. }
  73. }
  74. return detail
  75. }
  76. // 获取标的物-过滤产权-拟建
  77. func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) map[string]interface{} {
  78. if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" {
  79. return map[string]interface{}{}
  80. }
  81. p_data := map[string]interface{}{}
  82. p_data["detail"] = qu.ObjToString(v["title"]) + "\n" + detail
  83. p_data["site"] = v["site"]
  84. p_data["attach_text"] = v["attach_text"]
  85. p_data["toptype"] = v["toptype"]
  86. if f_info["s_toptype"] != nil {
  87. p_data["toptype"] = f_info["s_toptype"]
  88. }
  89. //结果有 标的物+标的物label+剑鱼码
  90. if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 {
  91. if qu.IntAll(p_info["status"]) == 200 {
  92. //消息体
  93. message := qu.ObjToMap(p_info["message"])
  94. if message != nil {
  95. return *message
  96. }
  97. return map[string]interface{}{}
  98. }
  99. }
  100. return map[string]interface{}{}
  101. }
  102. /*
  103. ****************************************
  104. ****************************************
  105. ****************************************
  106. */
  107. // 过滤信息规则···
  108. func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool {
  109. if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
  110. return true
  111. }
  112. detail = FilterDetail(detail) //只保留文本内容
  113. dl := utf8.RuneCountInString(detail) //文本长度
  114. if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) {
  115. return true
  116. }
  117. return false
  118. }
  119. // 二次校验采购单位
  120. func CheckOutBuyerInfo(f_data map[string]interface{}) {
  121. if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
  122. if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
  123. if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
  124. f_data["s_buyer"] = ns_buyer
  125. }
  126. }
  127. }
  128. }
  129. // 二次校验采购单位
  130. func CheckOutDeepSeekBuyerInfo(f_data map[string]interface{}) {
  131. if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
  132. if zp_buyer := prompt.AcquireDeepSeekBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
  133. if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
  134. f_data["s_buyer"] = ns_buyer
  135. }
  136. }
  137. }
  138. }
  // MergeInfo flattens infos into one newly allocated map.
  //
  // Maps are applied in slice order, so a key present in several maps ends up
  // with the value from the last one. The inputs are not modified and the
  // result is never nil.
  func MergeInfo(infos []map[string]interface{}) map[string]interface{} {
  	merged := make(map[string]interface{})
  	for i := range infos {
  		for key, value := range infos[i] {
  			merged[key] = value
  		}
  	}
  	return merged
  }
  149. // 强制逻辑判断数据
  150. func ForcedLogicDecideInfo(f_data map[string]interface{}) {
  151. //多单位不能一致,原则大模型
  152. s_buyer := qu.ObjToString(f_data["s_buyer"])
  153. s_winner := qu.ObjToString(f_data["s_winner"])
  154. if s_buyer == s_winner && s_buyer != "" {
  155. /*
  156. 1、若单位名称-不含公司保留采购单位
  157. 2、若单位名称-含公司保留中标单位
  158. */
  159. if strings.Contains(s_buyer, "公司") {
  160. f_data["s_buyer"] = ""
  161. } else {
  162. f_data["s_winner"] = ""
  163. }
  164. }
  165. //代理机构
  166. if s_agency := qu.ObjToString(f_data["s_agency"]); s_agency != "" {
  167. if s_agency == s_buyer || s_agency == s_winner {
  168. f_data["s_agency"] = ""
  169. }
  170. }
  171. }
  172. // 科学计数法标记
  173. func ScientificUnit(detail string) (string, float64) {
  174. if !CleanReg0.MatchString(detail) {
  175. return "", 0.0
  176. }
  177. x, u := "", ""
  178. //符合条件1···修最后一个金额
  179. if arr := CleanReg1.FindAllString(detail, -1); len(arr) > 0 {
  180. str1 := arr[len(arr)-1]
  181. x = CleanReg1.ReplaceAllString(str1, "${3}")
  182. u = CleanReg1.ReplaceAllString(str1, "${5}")
  183. } else {
  184. //符合条件2···修第一个金额
  185. if str2 := CleanReg2.FindString(detail); str2 != "" {
  186. x = CleanReg2.ReplaceAllString(str2, "${3}")
  187. u = CleanReg2.ReplaceAllString(str2, "${5}")
  188. } else {
  189. return "", 0.0
  190. }
  191. }
  192. ut := qu.IntAll(u)
  193. if ut >= 10 {
  194. return "", 0.0
  195. }
  196. if xf := qu.Float64All(x); xf > 0.0 {
  197. a := decimal.NewFromFloat(xf)
  198. b := decimal.New(1, int32(ut))
  199. v := a.Mul(b)
  200. fv, _ := v.Float64()
  201. sv := v.String()
  202. if fv > 10000000000 {
  203. return "", 0.0
  204. }
  205. return sv, fv
  206. }
  207. return "", 0.0
  208. }