// main package main import ( "flag" "fmt" "log" "mongodb" "os" qu "qfw/util" "regexp" "strings" //"time" ) var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"} var packreg *regexp.Regexp var Mgo *mongodb.MongodbSim var listSource []*dataSource type dataSource struct { _id, id, title string projectname, projectcode, contractcode string buyer, agency, s_winner string budget, bidamount float64 budget_isnull,bidamount_isnull bool isrepeat bool repeat_id_source string repeat_id map[string]string repeatText string publishtime int64 } //var addr, dbname, table, startTime, endTime, sortType *string var addr, dbname, table, sortType *string func init() { //addr = flag.String("addr", "192.168.3.167:27080", "数据库名称") addr = flag.String("addr", "192.168.3.166:27082", "数据库名称") //addr = flag.String("addr", "127.0.0.1:27167", "数据库名称") //dbname = flag.String("dbname", "qfw", "数据库名称") dbname = flag.String("dbname", "zhaolongyue", "数据库名称") //table = flag.String("table", "0210test", "表名称") table = flag.String("table", "Htgx0425_data", "表名称") sortType = flag.String("sort", "1", "sort--请输入排序方式,1正序、-1倒序") flag.Parse() Mgo = &mongodb.MongodbSim{ MongodbAddr: *addr, Size: 3, DbName: *dbname, } Mgo.InitPool() packreg, _ = regexp.Compile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十][包标段])`) //packreg, _ = regexp.Compile(`([包标段][::]?[a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十]|[a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十][包标段]){1,}`) //packreg, _ = regexp.MustCompile("([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)") } //创建mgo索引 //func createMgoIndex(){ // mongoDBDialInfo := &mgo.DialInfo{ // Addrs: []string{addr}, // Timeout: 60 * time.Second, // Database: dbname, // } // session, err := mgo.DialWithInfo(mongoDBDialInfo) // if err != nil { // log.Fatalf("CreateSession failed:%\n", err) // } // coll := session.DB(dbname).C(table) // err = coll.EnsureIndexKey("publishtime") // fmt.Println("创建索引~publishtime",err) // // //查询所有的已存在索引 // //indexs, err := coll.Indexes() // //fmt.Println("indexs--------------:", indexs) //} func main() { log.Printf("表名:%s,排序方式:%s", *table, *sortType) if *addr == "" || *dbname == "" || *table == "" || *sortType == "" { log.Println("参数输入有误") fmt.Printf("数据库地址:%s\n数据库名称:%s\n表名:%s\n排序方式:%s\n", *addr, *dbname, *table, *sortType) os.Exit(0) } //stime, _ := time.Parse(qu.Date_Short_Layout, *startTime) //etime, _ := time.Parse(qu.Date_Short_Layout, *endTime) //query := map[string]interface{}{} //query["$and"] = []interface{}{ // map[string]interface{}{ // "publishtime":map[string]interface{}{ // "$gte":stime.Unix(), // }, // }, // map[string]interface{}{ // "publishtime":map[string]interface{}{ // "$lte":etime.Unix(), // }, // }, // //bson.M{"publishtime": bson.M{"$gte": stime.Unix()}}, // //bson.M{"publishtime": bson.M{"$lte": etime.Unix()}}, //} sort := "publishtime" if *sortType == "-1" { sort = "-publishtime" } //log.Println(sort) sess := Mgo.GetMgoConn() defer Mgo.DestoryMongoConn(sess) //it := sess.DB(Mgo.DbName).C(*table).Find(query).Sort(sort).Iter() it := sess.DB(Mgo.DbName).C(*table).Find(nil).Sort(sort).Iter() //对标题、项目名称等中英文符号、空格等进行处理 var filterReg = regexp.MustCompile("[`~!@#$^&*()=|{}':;,\\[\\].<>/?!¥…()—【】‘;:”“。,、?%+_-]") //var filterReg = regexp.MustCompile("[`~!@#$^&*()=|{}':;,\\[\\].<>/?!¥…()—【】‘;:”“。,、?%+_--]") index := 0 for tmp := make(map[string]interface{}); it.Next(&tmp); index++ { d := &dataSource{ _id: mongodb.BsonIdToSId(tmp["_id"]), id: qu.ObjToString(tmp["id"]), title: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["title"])), ""), projectname: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["projectname"])), ""), projectcode: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["projectcode"])), ""), contractcode: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["contractcode"])), ""), buyer: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["buyer"])), ""), agency: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["agency"])), ""), s_winner: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["s_winner"])), ""), budget: qu.Float64All(tmp["budget"]), bidamount: qu.Float64All(tmp["bidamount"]), publishtime: qu.Int64All(tmp["publishtime"]), repeat_id: map[string]string{}, } if tmp["budget"]==nil{ d.budget_isnull=true } if tmp["bidamount"]==nil{ d.bidamount_isnull=true } //log.Println(tmp["_id"],tmp["title"],tmp["projectname"]) if index%10000 == 0 { log.Println("加载数据:", index) } listSource = append(listSource, d) tmp = map[string]interface{}{} } log.Println("数据加载完成",len(listSource)) dataItem() dd := 0 for i := 0; i < len(listSource); i++ { a := listSource[i] if a.isrepeat { dd++ } //更新数据 if len(a.repeat_id) ==0{ Mgo.UpdateById(*table, a._id, map[string]interface{}{"$set": map[string]interface{}{ //重复数据看repeatid "repeatid": a.repeat_id_source, //和哪条数据重复id "repeat": a.isrepeat, //本条数据是否重复数据 "repeattext": a.repeatText, //本数据被判重的原因 }}) }else { if len(a.repeat_id) > 0{ arr:=[]string{} for k,_:=range a.repeat_id{ arr = append(arr,k) } Mgo.UpdateById(*table, a._id, map[string]interface{}{"$set": map[string]interface{}{ //原始数据看repeatid_ids_str "repeatid": a.repeat_id_source, //和哪条数据重复id "repeat": a.isrepeat, //本条数据是否重复数据 //"repeatid_ids": a.repeat_id, //和我重复的数据都有哪些 "repeatid_ids_str": strings.Join(arr,","), "repeattext": a.repeatText, //本数据被判重的原因 }})} } if i%1000 == 0 { log.Println("已更新:", i) } } log.Println("重复数据量:",dd) } var listSize = 20000 func dataItem() { for i := 0; i < len(listSource); i++ { a := listSource[i] // if a.isrepeat { // continue // } b := &dataSource{} for j := i + 1; j < len(listSource); j++ { b = listSource[j] if *sortType == "1" { if publishtime_b_a(*a,*b){ // if b.isrepeat { // continue // } a, b = panchong(*a, *b) listSource[j] = b listSource[i] = a // if b.isrepeat { // log.Println("sss", a.id, b.isrepeat, b.repeat_id) // } } }else{ if publishtime_a_b(*a,*b){ // if b.isrepeat { // continue // } a, b = panchong(*a, *b) listSource[j] = b listSource[i] = a // if b.isrepeat { // log.Println("sss", a.id, b.isrepeat, b.repeat_id) // } } } } if i%500 == 0 { log.Println("已处理:", i) } } } func panchong(a, b dataSource) (c, d *dataSource) { switch { case a.title == b.title: //标题相等 if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount { if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && bidamount && buyer && s_winner" }else{ r := key_list(a, b) if r { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && bidamount && buyer && key_list" } } } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && bidamount && s_winner" }else { r := key_list(a, b) if r { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && bidamount && key_list" } } }else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode { r := key_list(a, b) if r { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && projectcode && key_list" } }else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget { if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && budget && buyer && s_winner && bidamount" //log.Println("1111", a.id, b.id, b.isrepeat) } } } else { r := key_list(a, b) if r { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题相等 && budget && key_list" } } } else { // } case a.title != b.title: //标题不相等 //项目名称包含及相等 if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) { isp := packreg.MatchString(a.title) //有分包 if isp { //项目名称相等 if a.projectname == b.projectname { if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { // } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner{ }else{ b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname && bidamount" } //b.repeat_id_source = a.id //a.repeat_id[b.id] = "" //b.isrepeat = true //b.repeatText = "标题不相等-->有分包 && projectname && bidamount" } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount { // } else { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget && (a.budget >=0 || b.budget >= 0) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget" } } } else { //项目名称包含 if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { // } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount { if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode" } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner" } else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget { if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer" } else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency" } else { // } } } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount { // } else { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget" } else { // } } } } else { //无分包 //项目名称相等 if a.projectname == b.projectname { if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { // } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && bidamount" } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount { // } else { if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner { }else if !a.budget_isnull && !b.budget_isnull && a.budget != b.budget{ }else{ b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && projectcode" } //if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { // b.repeat_id_source = a.id // a.repeat_id[b.id] = "" // b.isrepeat = true // b.repeatText = "标题不相等-->无分包 && projectname && projectcode && s_winner" //} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget { // b.repeat_id_source = a.id // a.repeat_id[b.id] = "" // b.isrepeat = true // b.repeatText = "标题不相等-->无分包 && projectname && projectcode && budget" //} } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && s_winner" //r := key_list(a, b) //if r { // b.repeat_id_source = a.id // a.repeat_id[b.id] = "" // b.isrepeat = true // b.repeatText = "标题不相等-->无分包 && projectname && s_winner && key_list" //} } else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget { if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer" } else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname && budget && agency" } else { // } } } } else { //项目名称包含 if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode { // } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount { if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount && projectcode" } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount && s_winner" } else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget { if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer" } else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency) { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency" } else { // } } else { // } } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount { // } else { if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget { b.repeat_id_source = a.id a.repeat_id[b.id] = "" b.isrepeat = true b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget" } } } } } default: } return &a, &b } //zhb_key_list 判断 //"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode" func key_list(a, b dataSource) bool { for i := 0; i < len(zhb_key_list); i++ { key := zhb_key_list[i] switch key { case "budget": if !a.budget_isnull && !b.budget_isnull && a.budget != b.budget { return false } else { continue } case "buyer": if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) { continue } else { return false } case "agency": if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency) { continue } else { return false } case "s_winner": if a.s_winner != b.s_winner && pankong(a.s_winner) && pankong(b.s_winner) { return false } else { continue } case "bidamount": if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount { return false } else { continue } case "projectcode": if a.projectcode != b.projectcode && pankong(a.projectcode) && pankong(b.projectcode) { return false } else { continue } case "contractcode": if a.contractcode != b.contractcode && pankong(a.contractcode) && pankong(b.contractcode) { return false } else { continue } } } return true } //发布时间判断 //正序 func publishtime_b_a(a,b dataSource) bool{ return b.publishtime-a.publishtime < 86400 * 31 * 12 } //倒序 func publishtime_a_b(a,b dataSource) bool { return a.publishtime-b.publishtime < 86400 * 31 * 12 } // func pankong(a string) bool { if a != "" { return true } else { return false } }