package main import ( "fmt" "log" "math" qutil "qfw/util" "qfw/util/mongodb" "strconv" "strings" "sync" "time" ) type Info struct { id string title string area string city string subtype string buyer string agency string //代理机构 winner string //中标单位 projectname string projectcode string publishtime int64 comeintime int64 ContainSpecialWord bool } var datelimit = float64(432000) type datamap struct { lock sync.Mutex //锁 days int //保留几天数据 data map[string][]*Info keymap []string keys map[string]bool } func NewDatamap(days int, lastid string) *datamap { datelimit = qutil.Float64All(days * 86400) dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, map[string]bool{}} if lastid == "" { return dm } //初始化加载数据 sess := mgo.GetMgoConn() defer mgo.DestoryMongoConn(sess) it := sess.DB(mgo.DbName).C(extract).Find(mongodb.ObjToMQ(`{"_id":{"$lte":"`+lastid+`"}}`, true)).Sort("-_id").Iter() now1 := int64(0) n, continuSum := 0, 0 for tmp := make(map[string]interface{}); it.Next(tmp); n++ { // if qutil.IntAll(tmp["repeat"]) == 1 || qutil.ObjToString(tmp["subtype"]) == "变更" { continuSum++ } else { cm := tmp["comeintime"] //cm := tmp["publishtime"] comeintime := qutil.Int64All(cm) if comeintime == 0 { id := qutil.BsonIdToSId(tmp["_id"])[0:8] comeintime, _ = strconv.ParseInt(id, 16, 64) } if now1 == 0 { now1 = comeintime } if qutil.Float64All(now1-comeintime) < datelimit { info := NewInfo(tmp) dkey := qutil.FormatDateWithObj(&cm, qutil.Date_yyyyMMdd) k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area) data := dm.data[k] if data == nil { data = []*Info{} //log.Println(k) } data = append(data, info) dm.data[k] = data dm.keys[dkey] = true } else { break } } if n%5000 == 0 { log.Println("current n:", n, continuSum) } tmp = make(map[string]interface{}) } log.Println("load data:", n) return dm } func NewInfo(tmp map[string]interface{}) *Info { subtype := qutil.ObjToString(tmp["subtype"]) area := qutil.ObjToString(tmp["area"]) if area == "A" { area = "全国" } info := &Info{} info.id = qutil.BsonIdToSId(tmp["_id"]) info.title = qutil.ObjToString(tmp["title"]) info.area = area info.subtype = subtype info.buyer = qutil.ObjToString(tmp["buyer"]) info.projectname = qutil.ObjToString(tmp["projectname"]) info.ContainSpecialWord = FilterRegexp.MatchString(info.projectname) || FilterRegexp.MatchString(info.title) info.projectcode = qutil.ObjToString(tmp["projectcode"]) info.city = qutil.ObjToString(tmp["city"]) info.agency = qutil.ObjToString(tmp["agency"]) //info.winner = qutil.ObjToString(tmp["winner"]) info.publishtime = qutil.Int64All(tmp["publishtime"]) return info } func (d *datamap) check(info *Info) (b bool, id string) { d.lock.Lock() defer d.lock.Unlock() keys := []string{} for k, _ := range d.keys { keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area)) if info.area != "全国" { //这个后续可以不要 keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国")) } } L: for _, k := range keys { data := d.data[k] if len(data) > 0 { //对比 for _, v := range data { if v.id == info.id { return false, v.id } if math.Abs(qutil.Float64All(v.publishtime-info.publishtime)) > datelimit { continue } if v.agency != "" && info.agency != "" && v.agency != info.agency { continue } n := 0 if v.buyer != "" && v.buyer == info.buyer { n++ } if v.projectname != "" && v.projectname == info.projectname { n++ } if !info.ContainSpecialWord && n > 1 { b = true id = v.id break L } else if v.projectcode != "" && v.projectcode == info.projectcode { n++ } if !info.ContainSpecialWord && n > 1 || n > 2 { b = true id = v.id break L } //标题长度大于10且相等即为重复 // if len([]rune(info.title)) > 10 && v.title == info.title { // b = true // id = v.id // break L // } //标题长度大于10且包含关系+buyer/projectname/projectcode/city(全国/A的只判断包含关系即可)相等即为重复 if len([]rune(info.title)) > 10 && len([]rune(v.title)) > 10 && (strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) { if info.area == "全国" || n > 0 || info.city == v.city { b = true id = v.id break L } } } } } if !b { ct, _ := strconv.ParseInt(info.id[:8], 16, 64) dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd) k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area) data := d.data[k] if data == nil { data = []*Info{info} d.data[k] = data if !d.keys[dkey] { d.keys[dkey] = true d.update(ct) } } else { data = append(data, info) d.data[k] = data } } return } func (d *datamap) update(t int64) { //每天0点清除历史数据 d.keymap = d.GetLatelyFiveDay(t) m := map[string]bool{} for _, v := range d.keymap { m[v] = true } all, all1 := 0, 0 for k, v := range d.data { all += len(v) if !m[k[:8]] { delete(d.data, k) } } for k, _ := range d.keys { if !m[k] { delete(d.keys, k) } } for _, v := range d.data { all1 += len(v) } log.Println("更新前后数据:", all, all1) } func (d *datamap) GetLatelyFiveDay(t int64) []string { array := make([]string, d.days) now := time.Unix(t, 0) for i := 0; i < d.days; i++ { array[i] = now.Format(qutil.Date_yyyyMMdd) now = now.AddDate(0, 0, -1) } return array }