123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554 |
- // main
- package main
- import (
- "flag"
- "fmt"
- "log"
- "mongodb"
- "os"
- qu "qfw/util"
- "regexp"
- "strings"
- //"time"
- )
- var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"}
- var packreg *regexp.Regexp
- var Mgo *mongodb.MongodbSim
- var listSource []*dataSource
// dataSource is the in-memory form of one stored record together with the
// duplicate-detection verdict that is written back for it.
type dataSource struct {
	// Identity.
	_id   string // string form of the document's Mongo _id; key for the update
	id    string // value of the document's "id" field; used in repeat links
	title string // lower-cased title with punctuation stripped

	// Normalised text fields (lower-cased, punctuation stripped).
	projectname  string
	projectcode  string
	contractcode string
	buyer        string
	agency       string
	s_winner     string

	// Amounts; the *_isnull flags are true when the field was absent.
	budget           float64
	bidamount        float64
	budget_isnull    bool
	bidamount_isnull bool

	// Verdict.
	isrepeat         bool              // true once this record is judged a duplicate
	repeat_id_source string            // id of the record this one duplicates
	repeat_id        map[string]string // ids (as keys) of records judged duplicates of this one
	repeatText       string            // name of the rule that produced the verdict

	publishtime int64 // publish time as loaded from the "publishtime" field
}
// Command-line flag values; each pointer is assigned in init.
var (
	addr     *string // -addr
	dbname   *string // -dbname
	table    *string // -table
	sortType *string // -sort: "1" ascending, "-1" descending by publishtime
)
- func init() {
- //addr = flag.String("addr", "192.168.3.167:27080", "数据库名称")
- addr = flag.String("addr", "192.168.3.166:27082", "数据库名称")
- //addr = flag.String("addr", "127.0.0.1:27167", "数据库名称")
- //dbname = flag.String("dbname", "qfw", "数据库名称")
- dbname = flag.String("dbname", "zhaolongyue", "数据库名称")
- //table = flag.String("table", "0210test", "表名称")
- table = flag.String("table", "Htgx0425_data", "表名称")
- sortType = flag.String("sort", "1", "sort--请输入排序方式,1正序、-1倒序")
- flag.Parse()
- Mgo = &mongodb.MongodbSim{
- MongodbAddr: *addr,
- Size: 3,
- DbName: *dbname,
- }
- Mgo.InitPool()
- packreg, _ = regexp.Compile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十][包标段])`)
- //packreg, _ = regexp.Compile(`([包标段][::]?[a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十]|[a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十][包标段]){1,}`)
- //packreg, _ = regexp.MustCompile("([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)")
- }
// Create the mgo index on publishtime (disabled; kept for reference).
//func createMgoIndex(){
//	mongoDBDialInfo := &mgo.DialInfo{
//		Addrs:    []string{addr},
//		Timeout:  60 * time.Second,
//		Database: dbname,
//	}
//	session, err := mgo.DialWithInfo(mongoDBDialInfo)
//	if err != nil {
//		log.Fatalf("CreateSession failed:%v\n", err)
//	}
//	coll := session.DB(dbname).C(table)
//	err = coll.EnsureIndexKey("publishtime")
//	fmt.Println("创建索引~publishtime",err)
//
//	// List every index that already exists.
//	//indexs, err := coll.Indexes()
//	//fmt.Println("indexs--------------:", indexs)
//}
- func main() {
- log.Printf("表名:%s,排序方式:%s", *table, *sortType)
- if *addr == "" || *dbname == "" || *table == "" || *sortType == "" {
- log.Println("参数输入有误")
- fmt.Printf("数据库地址:%s\n数据库名称:%s\n表名:%s\n排序方式:%s\n", *addr, *dbname, *table, *sortType)
- os.Exit(0)
- }
- //stime, _ := time.Parse(qu.Date_Short_Layout, *startTime)
- //etime, _ := time.Parse(qu.Date_Short_Layout, *endTime)
- //query := map[string]interface{}{}
- //query["$and"] = []interface{}{
- // map[string]interface{}{
- // "publishtime":map[string]interface{}{
- // "$gte":stime.Unix(),
- // },
- // },
- // map[string]interface{}{
- // "publishtime":map[string]interface{}{
- // "$lte":etime.Unix(),
- // },
- // },
- // //bson.M{"publishtime": bson.M{"$gte": stime.Unix()}},
- // //bson.M{"publishtime": bson.M{"$lte": etime.Unix()}},
- //}
- sort := "publishtime"
- if *sortType == "-1" {
- sort = "-publishtime"
- }
- //log.Println(sort)
- sess := Mgo.GetMgoConn()
- defer Mgo.DestoryMongoConn(sess)
- //it := sess.DB(Mgo.DbName).C(*table).Find(query).Sort(sort).Iter()
- it := sess.DB(Mgo.DbName).C(*table).Find(nil).Sort(sort).Iter()
- //对标题、项目名称等中英文符号、空格等进行处理
- var filterReg = regexp.MustCompile("[`~!@#$^&*()=|{}':;,\\[\\].<>/?!¥…()—【】‘;:”“。,、?%+_-]")
- //var filterReg = regexp.MustCompile("[`~!@#$^&*()=|{}':;,\\[\\].<>/?!¥…()—【】‘;:”“。,、?%+_--]")
- index := 0
- for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
- d := &dataSource{
- _id: mongodb.BsonIdToSId(tmp["_id"]),
- id: qu.ObjToString(tmp["id"]),
- title: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["title"])), ""),
- projectname: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["projectname"])), ""),
- projectcode: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["projectcode"])), ""),
- contractcode: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["contractcode"])), ""),
- buyer: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["buyer"])), ""),
- agency: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["agency"])), ""),
- s_winner: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["s_winner"])), ""),
- budget: qu.Float64All(tmp["budget"]),
- bidamount: qu.Float64All(tmp["bidamount"]),
- publishtime: qu.Int64All(tmp["publishtime"]),
- repeat_id: map[string]string{},
- }
- if tmp["budget"]==nil{
- d.budget_isnull=true
- }
- if tmp["bidamount"]==nil{
- d.bidamount_isnull=true
- }
- //log.Println(tmp["_id"],tmp["title"],tmp["projectname"])
- if index%10000 == 0 {
- log.Println("加载数据:", index)
- }
- listSource = append(listSource, d)
- tmp = map[string]interface{}{}
- }
- log.Println("数据加载完成",len(listSource))
- dataItem()
- dd := 0
- for i := 0; i < len(listSource); i++ {
- a := listSource[i]
- if a.isrepeat {
- dd++
- }
- //更新数据
- if len(a.repeat_id) ==0{
- Mgo.UpdateById(*table, a._id,
- map[string]interface{}{"$set": map[string]interface{}{
- //重复数据看repeatid
- "repeatid": a.repeat_id_source, //和哪条数据重复id
- "repeat": a.isrepeat, //本条数据是否重复数据
- "repeattext": a.repeatText, //本数据被判重的原因
- }})
- }else {
- if len(a.repeat_id) > 0{
- arr:=[]string{}
- for k,_:=range a.repeat_id{
- arr = append(arr,k)
- }
- Mgo.UpdateById(*table, a._id,
- map[string]interface{}{"$set": map[string]interface{}{
- //原始数据看repeatid_ids_str
- "repeatid": a.repeat_id_source, //和哪条数据重复id
- "repeat": a.isrepeat, //本条数据是否重复数据
- //"repeatid_ids": a.repeat_id, //和我重复的数据都有哪些
- "repeatid_ids_str": strings.Join(arr,","),
- "repeattext": a.repeatText, //本数据被判重的原因
- }})}
- }
- if i%1000 == 0 {
- log.Println("已更新:", i)
- }
- }
- log.Println("重复数据量:",dd)
- }
// listSize is not referenced anywhere in the visible source.
// NOTE(review): presumably a former batch or window size — confirm before removing.
var listSize int = 20000
- func dataItem() {
- for i := 0; i < len(listSource); i++ {
- a := listSource[i]
- // if a.isrepeat {
- // continue
- // }
- b := &dataSource{}
- for j := i + 1; j < len(listSource); j++ {
- b = listSource[j]
- if *sortType == "1" {
- if publishtime_b_a(*a,*b){
- // if b.isrepeat {
- // continue
- // }
- a, b = panchong(*a, *b)
- listSource[j] = b
- listSource[i] = a
- // if b.isrepeat {
- // log.Println("sss", a.id, b.isrepeat, b.repeat_id)
- // }
- }
- }else{
- if publishtime_a_b(*a,*b){
- // if b.isrepeat {
- // continue
- // }
- a, b = panchong(*a, *b)
- listSource[j] = b
- listSource[i] = a
- // if b.isrepeat {
- // log.Println("sss", a.id, b.isrepeat, b.repeat_id)
- // }
- }
- }
- }
- if i%500 == 0 {
- log.Println("已处理:", i)
- }
- }
- }
- func panchong(a, b dataSource) (c, d *dataSource) {
- switch {
- case a.title == b.title: //标题相等
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount {
- if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && bidamount && buyer && s_winner"
- }else{
- r := key_list(a, b)
- if r {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && bidamount && buyer && key_list"
- }
- }
- } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && bidamount && s_winner"
- }else {
- r := key_list(a, b)
- if r {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && bidamount && key_list"
- }
- }
- }else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
- r := key_list(a, b)
- if r {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && projectcode && key_list"
- }
- }else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
- if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && budget && buyer && s_winner && bidamount"
- //log.Println("1111", a.id, b.id, b.isrepeat)
- }
- }
- } else {
- r := key_list(a, b)
- if r {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && budget && key_list"
- }
- }
- } else {
- //
- }
- case a.title != b.title: //标题不相等
- //项目名称包含及相等
- if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) {
- isp := packreg.MatchString(a.title)
- //有分包
- if isp {
- //项目名称相等
- if a.projectname == b.projectname {
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- //
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner{
- }else{
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
- }
- //b.repeat_id_source = a.id
- //a.repeat_id[b.id] = ""
- //b.isrepeat = true
- //b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
- //
- } else {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget && (a.budget >=0 || b.budget >= 0) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget"
- }
- }
- } else { //项目名称包含
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- //
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount {
- if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode"
- } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner"
- } else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
- if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer"
- } else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency"
- } else {
- //
- }
- }
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
- //
- } else {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget"
- } else {
- //
- }
- }
- }
- } else { //无分包
- //项目名称相等
- if a.projectname == b.projectname {
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- //
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && bidamount"
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
- //
- } else {
- if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner {
- }else if !a.budget_isnull && !b.budget_isnull && a.budget != b.budget{
- }else{
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && projectcode"
- }
- //if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- // b.repeat_id_source = a.id
- // a.repeat_id[b.id] = ""
- // b.isrepeat = true
- // b.repeatText = "标题不相等-->无分包 && projectname && projectcode && s_winner"
- //} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
- // b.repeat_id_source = a.id
- // a.repeat_id[b.id] = ""
- // b.isrepeat = true
- // b.repeatText = "标题不相等-->无分包 && projectname && projectcode && budget"
- //}
- } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && s_winner"
- //r := key_list(a, b)
- //if r {
- // b.repeat_id_source = a.id
- // a.repeat_id[b.id] = ""
- // b.isrepeat = true
- // b.repeatText = "标题不相等-->无分包 && projectname && s_winner && key_list"
- //}
- } else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
- if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer"
- } else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && budget && agency"
- } else {
- //
- }
- }
- }
- } else { //项目名称包含
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- //
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount {
- if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount && projectcode"
- } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount && s_winner"
- } else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
- if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer"
- } else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency"
- } else {
- //
- }
- } else {
- //
- }
- } else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
- //
- } else {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget"
- }
- }
- }
- }
- }
- default:
- }
- return &a, &b
- }
- //zhb_key_list 判断
- //"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"
- func key_list(a, b dataSource) bool {
- for i := 0; i < len(zhb_key_list); i++ {
- key := zhb_key_list[i]
- switch key {
- case "budget":
- if !a.budget_isnull && !b.budget_isnull && a.budget != b.budget {
- return false
- } else {
- continue
- }
- case "buyer":
- if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer) {
- continue
- } else {
- return false
- }
- case "agency":
- if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency) {
- continue
- } else {
- return false
- }
- case "s_winner":
- if a.s_winner != b.s_winner && pankong(a.s_winner) && pankong(b.s_winner) {
- return false
- } else {
- continue
- }
- case "bidamount":
- if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
- return false
- } else {
- continue
- }
- case "projectcode":
- if a.projectcode != b.projectcode && pankong(a.projectcode) && pankong(b.projectcode) {
- return false
- } else {
- continue
- }
- case "contractcode":
- if a.contractcode != b.contractcode && pankong(a.contractcode) && pankong(b.contractcode) {
- return false
- } else {
- continue
- }
- }
- }
- return true
- }
- //发布时间判断
- //正序
- func publishtime_b_a(a,b dataSource) bool{
- return b.publishtime-a.publishtime < 86400 * 31 * 12
- }
- //倒序
- func publishtime_a_b(a,b dataSource) bool {
- return a.publishtime-b.publishtime < 86400 * 31 * 12
- }
// pankong (判空, emptiness check) reports whether a is a non-empty string.
func pankong(a string) bool {
	return len(a) > 0
}
|