package main import ( "fmt" "log" "math" qutil "qfw/util" "qfw/util/mongodb" "strconv" "strings" "sync" "time" ) type Info struct { id string title string area string city string subtype string buyer string agency string //代理机构 winner string //中标单位 projectname string projectcode string publishtime int64 comeintime int64 bidopentime int64 //开标时间 agencyaddr string//开标地点 detail string//招标内容 site string//站点 ContainSpecialWord bool } var datelimit = float64(432000) var mm int type datamap struct { lock sync.Mutex //锁 days int //保留几天数据 data map[string][]*Info keymap []string keys map[string]bool } func NewDatamap(days int, lastid string) *datamap { datelimit = qutil.Float64All(days * 86400) dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, map[string]bool{}} if lastid == "" { return dm } //初始化加载数据 sess := mgo.GetMgoConn() defer mgo.DestoryMongoConn(sess) it := sess.DB(mgo.DbName).C(extract).Find(mongodb.ObjToMQ(`{"_id":{"$lte":"`+lastid+`"}}`, true)).Sort("-_id").Iter() now1 := int64(0) n, continuSum := 0, 0 for tmp := make(map[string]interface{}); it.Next(&tmp); n++ { //|| qutil.ObjToString(tmp["subtype"]) == "变更" //变更的数据打开 if qutil.IntAll(tmp["repeat"]) == 1 { continuSum++ } else { cm := tmp["comeintime"] //时间单位 //cm := tmp["publishtime"] comeintime := qutil.Int64All(cm) if comeintime == 0 { id := qutil.BsonIdToSId(tmp["_id"])[0:8] comeintime, _ = strconv.ParseInt(id, 16, 64) } if now1 == 0 { now1 = comeintime } if qutil.Float64All(now1-comeintime) < datelimit { info := NewInfo(tmp) dkey := qutil.FormatDateWithObj(&cm, qutil.Date_yyyyMMdd) k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area) data := dm.data[k] if data == nil { data = []*Info{} //log.Println(k) } data = append(data, info) dm.data[k] = data dm.keys[dkey] = true } else { break } } if n%5000 == 0 { log.Println("current n:", n, continuSum) } tmp = make(map[string]interface{}) } log.Println("load data:", n) return dm } func NewInfo(tmp map[string]interface{}) *Info { subtype := qutil.ObjToString(tmp["subtype"]) area := qutil.ObjToString(tmp["area"]) if area == "A" { area = "全国" } info := &Info{} info.id = qutil.BsonIdToSId(tmp["_id"]) info.title = qutil.ObjToString(tmp["title"]) info.area = area info.subtype = subtype info.buyer = qutil.ObjToString(tmp["buyer"]) info.projectname = qutil.ObjToString(tmp["projectname"]) //info.ContainSpecialWord = FilterRegexp.MatchString(info.projectname) || FilterRegexp.MatchString(info.title) info.ContainSpecialWord = FilterRegTitle.MatchString(info.title) info.projectcode = qutil.ObjToString(tmp["projectcode"]) info.city = qutil.ObjToString(tmp["city"]) info.agency = qutil.ObjToString(tmp["agency"]) //info.winner = qutil.ObjToString(tmp["winner"]) info.publishtime = qutil.Int64All(tmp["publishtime"]) info.bidopentime = qutil.Int64All(tmp["bidopentime"]) info.agencyaddr = qutil.ObjToString(tmp["agencyaddr"]) info.detail = qutil.ObjToString(tmp["detail"]) info.site = qutil.ObjToString(tmp["site"]) return info } func (d *datamap) check(info *Info) (b bool, id string) { d.lock.Lock() defer d.lock.Unlock() keys := []string{} for k, _ := range d.keys { keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area)) if info.area != "全国" { //这个后续可以不要 keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国")) } } L: for _, k := range keys { data := d.data[k] if len(data) > 0 { //对比 for _, v := range data { //正常重复 if v.id == info.id { return false, v.id } if math.Abs(qutil.Float64All(v.publishtime-info.publishtime)) > datelimit { continue } if v.agency != "" && info.agency != "" && v.agency != info.agency { continue } if info.subtype==v.subtype { if info.subtype == "变更" { //以下为新增方法 , 变更数据判重处理 v为原数据 info为目标数据 if info.publishtime=10&&v.projectcode!=""{ continue } //同城判定有效 first_judge:= false if (v.projectcode != ""&&v.projectcode==info.projectcode&&v.projectname != ""&&v.projectname==info.projectname)|| (v.title != ""&&v.title==info.title&&v.bidopentime != 0&&v.bidopentime==info.bidopentime&&v.detail != ""&&v.detail==info.detail) { first_judge = true } //3/6等判断 n := 0 if v.title != "" && v.title == info.title { n++ } if v.projectname != "" && v.projectname == info.projectname { n++ } if v.projectcode != "" && v.projectcode == info.projectcode { n++ } if v.bidopentime != 0 && v.bidopentime == info.bidopentime { n++ } if v.agencyaddr != "" && v.agencyaddr == info.agencyaddr { n++ } if v.detail != "" && v.detail == info.detail { n++ } t:= judgeCityType(v.area,info.area,v.city,info.city) if n>=3||first_judge==true { if t==2 { //同城 b = true id = v.id log.Print("同城满足的",info.id) break L } } }else {//非变更数据判重处理 n:=0 //三要素 m:=0 //二要素 x:=0 //四要素 if info.buyer != "" &&v.buyer == info.buyer { n++ x++ } if info.projectname != ""&&v.projectname == info.projectname { n++ m++ x++ } if info.projectcode != ""&&v.projectcode == info.projectcode { n++ m++ x++ } if info.title != ""&&v.title == info.title { x++ } t:= judgeCityType(v.area,info.area,v.city,info.city) c_1 :=conditionTitle(v.title,info.title) //标题满足 c_2 :=conditionNum(v.projectcode,info.projectcode) //编号满足 c_3 :=conditionTAB(v.title,info.title,v.buyer,info.buyer) //标题+采购单位 //同站点判断 if info.site != "" && v.site == info.site { if n>1||c_1||c_2 { b = true id = v.id log.Println("站点满足过滤") break L } }else { if info.ContainSpecialWord&&info.title!=v.title&&v.title!="" { continue } if v.projectcode != info.projectcode&&len([]rune(info.projectcode)) >=10&&v.projectcode!=""{ continue } //先决条件满足三要素 if n==3{ b = true id = v.id break L } //城市判断 if t==0||t==1 { //最少一个全国 if c_1 && (c_2||n>1) { b = true id = v.id break L } if c_2&&x>2{ b = true id = v.id break L } }else if t==2 { // 省-市 if c_1||c_2||n>1 { b = true id = v.id break L } }else if t==3 {// !省 !市 if (c_1&&n>1)||(c_2&&x>2){ b = true id = v.id break L } }else if t==4 {// 省 !市 if m>1||(c_1&&m>0)||(c_2&&x>1)||(c_3&&n>1){ b = true id = v.id break L } }else { } } } } ////非变更数据判重处理 //n := 0 //if v.buyer != "" && v.buyer == info.buyer { // n++ //} //if v.projectname != "" && v.projectname == info.projectname { // n++ //} //if !info.ContainSpecialWord && n > 1 { // b = true // id = v.id // break L //} else if v.projectcode != "" && v.projectcode == info.projectcode { // n++ //} //if !info.ContainSpecialWord && n > 1 || n > 2 { // b = true // id = v.id // break L //} ////标题长度大于10且相等即为重复 //// if len([]rune(info.title)) > 10 && v.title == info.title { //// b = true //// id = v.id //// break L //// } ////标题长度大于10且包含关系+buyer/projectname/projectcode/city(全国/A的只判断包含关系即可)相等即为重复 //if len([]rune(info.title)) > 10 && len([]rune(v.title)) > 10 && (strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) { // if info.area == "全国" || n > 0 || info.city == v.city { // b = true // id = v.id // break L // } //} } } } //往预存数据 d 添加 if !b { ct, _ := strconv.ParseInt(info.id[:8], 16, 64) dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd) k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area) data := d.data[k] if data == nil { data = []*Info{info} d.data[k] = data if !d.keys[dkey] { d.keys[dkey] = true d.update(ct) } } else { data = append(data, info) d.data[k] = data } } return } //判断是否同城等情况 func judgeCityType(v string, info string,v_c string,info_c string) (t int) { t=0 if (v=="全国"||v=="")&&(info=="全国"||info=="") {//均为全国 t=0 }else if v!="全国"&&info!="全国"&&v!=""&&info!=""&& v_c!="全国"&&info_c!="全国"&&v_c!=""&&info_c!=""{//均非全国 if v==info &&v_c==info_c { //同省同城 t=2 }else if v!=info&&v_c!=info_c{//非同省非同城 t=3 }else {//同省非同城 t=4 } }else {//有且一个全国 t=1 } return t } //条件一 标题 func conditionTitle(t1 string, t2 string) bool { if len([]rune(t1))>10 && len([]rune(t2))>10&& (strings.Contains(t1, t2)||strings.Contains(t2, t1)) { return true } return false } //条件二 项目编号 func conditionNum(c1 string ,c2 string) bool { if c1 == c2&&len([]rune(c1)) >=10 { return true } return false } //条件三 采购单位+标题 func conditionTAB(t1 string ,t2 string,b1 string,b2 string) bool { if t1==t2&&b1==b2 { return true } return false } func (d *datamap) update(t int64) { //每天0点清除历史数据 d.keymap = d.GetLatelyFiveDay(t) m := map[string]bool{} for _, v := range d.keymap { m[v] = true } all, all1 := 0, 0 for k, v := range d.data { all += len(v) if !m[k[:8]] { delete(d.data, k) } } for k, _ := range d.keys { if !m[k] { delete(d.keys, k) } } for _, v := range d.data { all1 += len(v) } //log.Println("更新前后数据:", all, all1) } func (d *datamap) GetLatelyFiveDay(t int64) []string { array := make([]string, d.days) now := time.Unix(t, 0) for i := 0; i < d.days; i++ { array[i] = now.Format(qutil.Date_yyyyMMdd) now = now.AddDate(0, 0, -1) } return array }