package main import ( "go.mongodb.org/mongo-driver/bson" "log" "mongodb" qu "qfw/util" "regexp" "strings" "time" ) var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"} var packreg, _ = regexp.Compile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)`) var listSource []*dataSource type dataSource struct { _id, id, title string projectname, projectcode, contractcode string buyer, agency, s_winner string budget, bidamount float64 isrepeat bool repeat_id_source string repeat_id map[string]string repeatText string } func task4(coll, startTime, endTime, sortType string) { log.Printf("表名:%s,开始时间:%s,结束时间:%s,排序方式:%s", coll, startTime, endTime, sortType) stime, _ := time.Parse(qu.Date_Short_Layout, startTime) etime, _ := time.Parse(qu.Date_Short_Layout, endTime) query := bson.M{} query["$and"] = []interface{}{ bson.M{"publishtime": bson.M{"$gte": stime.Unix()}}, bson.M{"publishtime": bson.M{"$lte": etime.Unix()}}, } sort := "publishtime" if sortType == "-1" { sort = "-publishtime" } log.Println(query, sort) sess := Mgo.GetMgoConn() defer Mgo.DestoryMongoConn(sess) it := sess.DB(Mgo.DbName).C(coll).Find(query).Sort(sort).Iter() index := 0 for tmp := make(map[string]interface{}); it.Next(&tmp); index++ { d := &dataSource{ _id: mongodb.BsonIdToSId(tmp["_id"]), id: qu.ObjToString(tmp["id"]), title: strings.ToLower(qu.ObjToString(tmp["title"])), projectname: strings.ToLower(qu.ObjToString(tmp["projectname"])), projectcode: strings.ToLower(qu.ObjToString(tmp["projectcode"])), contractcode: strings.ToLower(qu.ObjToString(tmp["contractcode"])), buyer: strings.ToLower(qu.ObjToString(tmp["buyer"])), agency: strings.ToLower(qu.ObjToString(tmp["agency"])), s_winner: strings.ToLower(qu.ObjToString(tmp["s_winner"])), budget: qu.Float64All(tmp["budget"]), bidamount: qu.Float64All(tmp["bidamount"]), repeat_id: map[string]string{}, } //log.Println(tmp["_id"], d.id) if index%10000 == 0 { log.Println("加载数据:", index) } listSource = append(listSource, d) tmp = map[string]interface{}{} } log.Println("数据加载完成") dataItem() dd := 0 for i := 0; i < len(listSource); i++ { a := listSource[i] if a.isrepeat { dd++ } //更新数据 Mgo.UpdateById(coll, a._id, map[string]interface{}{"$set": map[string]interface{}{ "repeatid": a.repeat_id_source, //和那条数据重复id "repeat": a.isrepeat, //本条数据是否本判重 "repeatid_ids": a.repeat_id, //和我重复的数据都有哪些 "repeattext": a.repeatText, //本数据被判重的原因 }}) if i%1000 == 0 { log.Println("已更新", i) } } log.Println(dd) } var listSize = 20000 func dataItem() { for i := 0; i < len(listSource); i++ { a := listSource[i] // if a.isrepeat { // continue // } b := &dataSource{} for j := i + 1; j < len(listSource); j++ { b = listSource[j] // if b.isrepeat { // continue // } a, b = panchong(*a, *b) listSource[j] = b listSource[i] = a // if b.isrepeat { // log.Println("sss", a.id, b.isrepeat, b.repeat_id) // } } if i%500 == 0 { log.Println("已处理:", i) } } } func panchong(a, b dataSource) (c, d *dataSource) { switch { case a.title == b.title: //标题相等 if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) { if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && buyer && s_winner" //log.Println("1111", a.id, b.id, b.isrepeat) } } else { r := key_list(a, b) if r { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && budget && key_list" } } } else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode { r := key_list(a, b) if r { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && projectcode && key_list" } } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) { if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && bidamount && buyer && s_winner" //log.Println("1111", a.id, b.id, b.isrepeat) } } else { r := key_list(a, b) if r { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && bidamount && key_list" } } } else { // } case a.title != b.title: //标题不相等 //项目名称包含及相等 if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) { isp := packreg.MatchString(a.title) //有分包 if isp { //项目名称相等 if a.projectname == b.projectname { if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { // } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname && bidamount" } else if a.bidamount != b.bidamount { // } else { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget" } } } else { //项目名称包含 if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { // } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) { if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode" } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner" } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) { if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer" } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency" } else { // } } } else if a.bidamount != b.bidamount { // } else { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget" } else { // } } } } else { //无分包 //项目名称相等 if a.projectname == b.projectname { if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { // } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && bidamount" } else if a.bidamount != b.bidamount { // } else { if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && projectcode" } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && s_winner" } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) { if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer" } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && budget && agency" } else { // } } } } else { //项目名称包含 if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { // } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) { if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount" } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner" } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) { if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer" } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency" } else { // } } else { // } } else if a.bidamount != b.bidamount { // } else { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == a.budget && (a.budget > 0 || b.budget > 0) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget" } } } } } default: } return &a, &b } // zhb_key_list 判断 // "budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode" func key_list(a, b dataSource) bool { for i := 0; i < len(zhb_key_list); i++ { key := zhb_key_list[i] switch key { case "budget": if a.budget == b.budget && (a.budget > 0 || b.budget > 0) { continue } else { return false } case "buyer": if a.buyer == b.buyer && pankong(a.buyer) && pankong(b.buyer) { continue } else { return false } case "agency": if a.agency == b.agency && pankong(a.agency) && pankong(b.agency) { continue } else { return false } case "s_winner": if a.s_winner == b.s_winner && pankong(a.s_winner) && pankong(b.s_winner) { continue } else { return false } case "bidamount": if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) { continue } else { return false } case "projectcode": if a.projectcode == b.projectcode && pankong(a.projectcode) && pankong(b.projectcode) { continue } else { return false } case "contractcode": if a.contractcode == b.contractcode && pankong(a.contractcode) && pankong(b.contractcode) { continue } else { return false } } } return true } func pankong(a string) bool { if a != "" { return true } else { return false } }