123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- package main
- import (
- "go.mongodb.org/mongo-driver/bson"
- "log"
- "mongodb"
- qu "qfw/util"
- "regexp"
- "strings"
- "time"
- )
// zhb_key_list names the fields key_list requires to match, in the order they
// are checked.
var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"}

// packreg matches a lot/package marker: one ASCII letter or digit, circled
// digit, Roman numeral or Chinese numeral immediately followed by 包, 标 or 段
// and an optional colon.
//
// MustCompile is used so that an invalid pattern fails loudly at start-up
// instead of leaving a nil *regexp.Regexp behind.
var packreg = regexp.MustCompile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)`)
// listSource holds the records loaded by task4; dataItem compares its elements
// pairwise.
var listSource []*dataSource

// dataSource is the in-memory view of one record used for duplicate detection.
type dataSource struct {
	// _id is the record's Mongo _id rendered as a string and is the key task4
	// updates by; id is the value written into other records' repeat fields.
	_id   string
	id    string
	title string

	// Text fields compared by the duplicate rules. task4 lower-cases these
	// (and title) when loading.
	projectname  string
	projectcode  string
	contractcode string
	buyer        string
	agency       string
	s_winner     string

	// Amounts; the rules compare them for exact equality.
	budget    float64
	bidamount float64

	// Verdict fields, filled in by panchong.
	isrepeat         bool              // true once this record has been judged a duplicate
	repeat_id_source string            // id of the record this one duplicates
	repeat_id        map[string]string // ids of records judged duplicates of this one; values are always ""
	repeatText       string            // name of the rule that produced the verdict
}
- func task4(coll, startTime, endTime, sortType string) {
- log.Printf("表名:%s,开始时间:%s,结束时间:%s,排序方式:%s", coll, startTime, endTime, sortType)
- stime, _ := time.Parse(qu.Date_Short_Layout, startTime)
- etime, _ := time.Parse(qu.Date_Short_Layout, endTime)
- query := bson.M{}
- query["$and"] = []interface{}{
- bson.M{"publishtime": bson.M{"$gte": stime.Unix()}},
- bson.M{"publishtime": bson.M{"$lte": etime.Unix()}},
- }
- sort := "publishtime"
- if sortType == "-1" {
- sort = "-publishtime"
- }
- log.Println(query, sort)
- sess := Mgo.GetMgoConn()
- defer Mgo.DestoryMongoConn(sess)
- it := sess.DB(Mgo.DbName).C(coll).Find(query).Sort(sort).Iter()
- index := 0
- for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
- d := &dataSource{
- _id: mongodb.BsonIdToSId(tmp["_id"]),
- id: qu.ObjToString(tmp["id"]),
- title: strings.ToLower(qu.ObjToString(tmp["title"])),
- projectname: strings.ToLower(qu.ObjToString(tmp["projectname"])),
- projectcode: strings.ToLower(qu.ObjToString(tmp["projectcode"])),
- contractcode: strings.ToLower(qu.ObjToString(tmp["contractcode"])),
- buyer: strings.ToLower(qu.ObjToString(tmp["buyer"])),
- agency: strings.ToLower(qu.ObjToString(tmp["agency"])),
- s_winner: strings.ToLower(qu.ObjToString(tmp["s_winner"])),
- budget: qu.Float64All(tmp["budget"]),
- bidamount: qu.Float64All(tmp["bidamount"]),
- repeat_id: map[string]string{},
- }
- //log.Println(tmp["_id"], d.id)
- if index%10000 == 0 {
- log.Println("加载数据:", index)
- }
- listSource = append(listSource, d)
- tmp = map[string]interface{}{}
- }
- log.Println("数据加载完成")
- dataItem()
- dd := 0
- for i := 0; i < len(listSource); i++ {
- a := listSource[i]
- if a.isrepeat {
- dd++
- }
- //更新数据
- Mgo.UpdateById(coll, a._id,
- map[string]interface{}{"$set": map[string]interface{}{
- "repeatid": a.repeat_id_source, //和那条数据重复id
- "repeat": a.isrepeat, //本条数据是否本判重
- "repeatid_ids": a.repeat_id, //和我重复的数据都有哪些
- "repeattext": a.repeatText, //本数据被判重的原因
- }})
- if i%1000 == 0 {
- log.Println("已更新", i)
- }
- }
- log.Println(dd)
- }
- var listSize = 20000
- func dataItem() {
- for i := 0; i < len(listSource); i++ {
- a := listSource[i]
- // if a.isrepeat {
- // continue
- // }
- b := &dataSource{}
- for j := i + 1; j < len(listSource); j++ {
- b = listSource[j]
- // if b.isrepeat {
- // continue
- // }
- a, b = panchong(*a, *b)
- listSource[j] = b
- listSource[i] = a
- // if b.isrepeat {
- // log.Println("sss", a.id, b.isrepeat, b.repeat_id)
- // }
- }
- if i%500 == 0 {
- log.Println("已处理:", i)
- }
- }
- }
- func panchong(a, b dataSource) (c, d *dataSource) {
- switch {
- case a.title == b.title: //标题相等
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
- if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && buyer && s_winner"
- //log.Println("1111", a.id, b.id, b.isrepeat)
- }
- } else {
- r := key_list(a, b)
- if r {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && budget && key_list"
- }
- }
- } else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
- r := key_list(a, b)
- if r {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && projectcode && key_list"
- }
- } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
- if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && bidamount && buyer && s_winner"
- //log.Println("1111", a.id, b.id, b.isrepeat)
- }
- } else {
- r := key_list(a, b)
- if r {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题相等 && bidamount && key_list"
- }
- }
- } else {
- //
- }
- case a.title != b.title: //标题不相等
- //项目名称包含及相等
- if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) {
- isp := packreg.MatchString(a.title)
- //有分包
- if isp {
- //项目名称相等
- if a.projectname == b.projectname {
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- //
- } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
- } else if a.bidamount != b.bidamount {
- //
- } else {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget"
- }
- }
- } else { //项目名称包含
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- //
- } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
- if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode"
- } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner"
- } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
- if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer"
- } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency"
- } else {
- //
- }
- }
- } else if a.bidamount != b.bidamount {
- //
- } else {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget"
- } else {
- //
- }
- }
- }
- } else { //无分包
- //项目名称相等
- if a.projectname == b.projectname {
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- //
- } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && bidamount"
- } else if a.bidamount != b.bidamount {
- //
- } else {
- if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && projectcode"
- } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && s_winner"
- } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
- if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer"
- } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname && budget && agency"
- } else {
- //
- }
- }
- }
- } else { //项目名称包含
- if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
- //
- } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
- if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount"
- } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner"
- } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
- if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer"
- } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency"
- } else {
- //
- }
- } else {
- //
- }
- } else if a.bidamount != b.bidamount {
- //
- } else {
- if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == a.budget && (a.budget > 0 || b.budget > 0) {
- b.repeat_id_source = a.id
- a.repeat_id[b.id] = ""
- b.isrepeat = true
- b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget"
- }
- }
- }
- }
- }
- default:
- }
- return &a, &b
- }
- // zhb_key_list 判断
- // "budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"
- func key_list(a, b dataSource) bool {
- for i := 0; i < len(zhb_key_list); i++ {
- key := zhb_key_list[i]
- switch key {
- case "budget":
- if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
- continue
- } else {
- return false
- }
- case "buyer":
- if a.buyer == b.buyer && pankong(a.buyer) && pankong(b.buyer) {
- continue
- } else {
- return false
- }
- case "agency":
- if a.agency == b.agency && pankong(a.agency) && pankong(b.agency) {
- continue
- } else {
- return false
- }
- case "s_winner":
- if a.s_winner == b.s_winner && pankong(a.s_winner) && pankong(b.s_winner) {
- continue
- } else {
- return false
- }
- case "bidamount":
- if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
- continue
- } else {
- return false
- }
- case "projectcode":
- if a.projectcode == b.projectcode && pankong(a.projectcode) && pankong(b.projectcode) {
- continue
- } else {
- return false
- }
- case "contractcode":
- if a.contractcode == b.contractcode && pankong(a.contractcode) && pankong(b.contractcode) {
- continue
- } else {
- return false
- }
- }
- }
- return true
- }
// pankong reports whether a is a non-empty string.
func pankong(a string) bool {
	return a != ""
}
|