package extract import ( "data_ai/clean" "data_ai/prompt" "data_ai/ul" log "github.com/donnie4w/go-logger/logger" qu "jygit.jydev.jianyu360.cn/data_processing/common_utils" "regexp" "strings" "unicode/utf8" ) func FilterDetail(con string) string { return Reg.ReplaceAllString(Filter.ReplaceAllString(con, ""), "") } var Reg = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+") var Filter = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]") var SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)") // 确认抽取范围 func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} { dict := map[string]interface{}{} sess := ul.SourceMgo.GetMgoConn() defer ul.SourceMgo.DestoryMongoConn(sess) total := 0 it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter() for tmp := make(map[string]interface{}); it.Next(&tmp); total++ { if total%1000 == 0 { log.Debug("cur index ", total) } if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别 tmpid := ul.BsonTOStringId(tmp["_id"]) dict[tmpid] = tmpid } tmp = make(map[string]interface{}) } return dict } // 获取附件名字信息 func getpnsinfo(tmp map[string]interface{}) []string { arr := []string{} if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil { if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil { for _, v := range *attachments { if info := qu.ObjToMap(v); info != nil { if filename := qu.ObjToString((*info)["filename"]); filename != "" { arr = append(arr, filename) } } } } } return arr } // 获取正文数据 func getDetailText(v map[string]interface{}, tmpid string) string { detail := qu.ObjToString(v["detail"]) if ul.IsTool { if details := qu.ObjToString(v["details"]); details != "" { detail = details } filetext := qu.ObjToString(v["filetext"]) if utf8.RuneCountInString(detail) < 100 && filetext != "" { detail = filetext } } else { //if bs := ul.OssGetObject(tmpid); bs != "" { // detail = bs //} } return detail } // 获取标的物-过滤产权-拟建 func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) []map[string]interface{} { if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" { return []map[string]interface{}{} } p_data := map[string]interface{}{} p_data["detail"] = qu.ObjToString(v["title"]) + "\n" + detail p_data["site"] = v["site"] p_data["attach_text"] = v["attach_text"] p_data["toptype"] = v["toptype"] if f_info["s_toptype"] != nil { p_data["toptype"] = f_info["s_toptype"] } if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 { if qu.IntAll(p_info["status"]) == 200 { p_list := ul.IsMarkInterfaceMap(p_info["purchasinglist"]) return p_list } } return []map[string]interface{}{} } /* **************************************** **************************************** **************************************** */ // 过滤信息规则··· func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool { if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil { return true } detail = FilterDetail(detail) //只保留文本内容 dl := utf8.RuneCountInString(detail) //文本长度 if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) { return true } return false } // 二次校验采购单位 func CheckOutBuyerInfo(f_data map[string]interface{}) { if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" { if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil { if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" { f_data["s_buyer"] = ns_buyer } } } } // 合并字段 func MergeInfo(infos []map[string]interface{}) map[string]interface{} { info := map[string]interface{}{} for _, v := range infos { for k1, v1 := range v { info[k1] = v1 } } return info } // 强制逻辑判断数据 func ForcedLogicDecideInfo(f_data map[string]interface{}) { //多单位不能一致,原则大模型 s_buyer := qu.ObjToString(f_data["s_buyer"]) s_winner := qu.ObjToString(f_data["s_winner"]) if s_buyer == s_winner && s_buyer != "" { /* 1、若单位名称-不含公司保留采购单位 2、若单位名称-含公司保留中标单位 */ if strings.Contains(s_buyer, "公司") { f_data["s_buyer"] = "" } else { f_data["s_winner"] = "" } } //代理机构 if s_agency := qu.ObjToString(f_data["s_agency"]); s_agency != "" { if s_agency == s_buyer || s_agency == s_winner { f_data["s_agency"] = "" } } }