|
@@ -0,0 +1,384 @@
|
|
|
+package main
|
|
|
+
|
|
|
+import (
|
|
|
+ "go.mongodb.org/mongo-driver/bson"
|
|
|
+ "log"
|
|
|
+ "mongodb"
|
|
|
+ qu "qfw/util"
|
|
|
+ "regexp"
|
|
|
+ "strings"
|
|
|
+ "time"
|
|
|
+)
|
|
|
+
|
|
|
// zhb_key_list names the dataSource fields that key_list compares, in the
// order in which they are checked.
var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"}

// packreg matches a package/lot marker: one ASCII letter or digit, circled
// digit, Roman numeral or Chinese numeral immediately followed by 包, 标 or 段
// and an optional colon.
//
// MustCompile is used so that an invalid pattern fails at start-up instead of
// leaving packreg nil and panicking on the first MatchString call.
var packreg = regexp.MustCompile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)`)
|
|
|
+var listSource []*dataSource
|
|
|
+
|
|
|
// dataSource is the in-memory form of one record taking part in the
// duplicate check. task4 lower-cases the text fields when it fills them in.
type dataSource struct {
	_id   string // string form of the document _id (from mongodb.BsonIdToSId); used as the update key
	id    string // value of the document's "id" field; this is what duplicate references store
	title string

	projectname  string
	projectcode  string
	contractcode string

	buyer    string
	agency   string
	s_winner string

	budget    float64
	bidamount float64

	isrepeat         bool              // true once the record is judged a duplicate of another one
	repeat_id_source string            // id of the record this one was judged to duplicate
	repeat_id        map[string]string // ids of records judged duplicates of this one; values are always ""
	repeatText       string            // label of the rule that produced the duplicate verdict
}
|
|
|
+
|
|
|
+func task4(coll, startTime, endTime, sortType string) {
|
|
|
+ log.Printf("表名:%s,开始时间:%s,结束时间:%s,排序方式:%s", coll, startTime, endTime, sortType)
|
|
|
+ stime, _ := time.Parse(qu.Date_Short_Layout, startTime)
|
|
|
+ etime, _ := time.Parse(qu.Date_Short_Layout, endTime)
|
|
|
+ query := bson.M{}
|
|
|
+ query["$and"] = []interface{}{
|
|
|
+ bson.M{"publishtime": bson.M{"$gte": stime.Unix()}},
|
|
|
+ bson.M{"publishtime": bson.M{"$lte": etime.Unix()}},
|
|
|
+ }
|
|
|
+ sort := "publishtime"
|
|
|
+ if sortType == "-1" {
|
|
|
+ sort = "-publishtime"
|
|
|
+ }
|
|
|
+ log.Println(query, sort)
|
|
|
+ sess := Mgo.GetMgoConn()
|
|
|
+ defer Mgo.DestoryMongoConn(sess)
|
|
|
+ it := sess.DB(Mgo.DbName).C(coll).Find(query).Sort(sort).Iter()
|
|
|
+ index := 0
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
|
|
|
+ d := &dataSource{
|
|
|
+ _id: mongodb.BsonIdToSId(tmp["_id"]),
|
|
|
+ id: qu.ObjToString(tmp["id"]),
|
|
|
+ title: strings.ToLower(qu.ObjToString(tmp["title"])),
|
|
|
+ projectname: strings.ToLower(qu.ObjToString(tmp["projectname"])),
|
|
|
+ projectcode: strings.ToLower(qu.ObjToString(tmp["projectcode"])),
|
|
|
+ contractcode: strings.ToLower(qu.ObjToString(tmp["contractcode"])),
|
|
|
+ buyer: strings.ToLower(qu.ObjToString(tmp["buyer"])),
|
|
|
+ agency: strings.ToLower(qu.ObjToString(tmp["agency"])),
|
|
|
+ s_winner: strings.ToLower(qu.ObjToString(tmp["s_winner"])),
|
|
|
+ budget: qu.Float64All(tmp["budget"]),
|
|
|
+ bidamount: qu.Float64All(tmp["bidamount"]),
|
|
|
+ repeat_id: map[string]string{},
|
|
|
+ }
|
|
|
+ //log.Println(tmp["_id"], d.id)
|
|
|
+ if index%10000 == 0 {
|
|
|
+ log.Println("加载数据:", index)
|
|
|
+ }
|
|
|
+ listSource = append(listSource, d)
|
|
|
+ tmp = map[string]interface{}{}
|
|
|
+ }
|
|
|
+ log.Println("数据加载完成")
|
|
|
+ dataItem()
|
|
|
+ dd := 0
|
|
|
+ for i := 0; i < len(listSource); i++ {
|
|
|
+ a := listSource[i]
|
|
|
+ if a.isrepeat {
|
|
|
+ dd++
|
|
|
+ }
|
|
|
+ //更新数据
|
|
|
+ Mgo.UpdateById(coll, a._id,
|
|
|
+ map[string]interface{}{"$set": map[string]interface{}{
|
|
|
+ "repeatid": a.repeat_id_source, //和那条数据重复id
|
|
|
+ "repeat": a.isrepeat, //本条数据是否本判重
|
|
|
+ "repeatid_ids": a.repeat_id, //和我重复的数据都有哪些
|
|
|
+ "repeattext": a.repeatText, //本数据被判重的原因
|
|
|
+ }})
|
|
|
+ if i%1000 == 0 {
|
|
|
+ log.Println("已更新", i)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ log.Println(dd)
|
|
|
+}
|
|
|
+
|
|
|
// listSize is a package-level size setting of 20000. None of the code
// visible alongside it reads the value.
var (
	listSize = 20000
)
|
|
|
+
|
|
|
+func dataItem() {
|
|
|
+ for i := 0; i < len(listSource); i++ {
|
|
|
+ a := listSource[i]
|
|
|
+ // if a.isrepeat {
|
|
|
+ // continue
|
|
|
+ // }
|
|
|
+ b := &dataSource{}
|
|
|
+ for j := i + 1; j < len(listSource); j++ {
|
|
|
+ b = listSource[j]
|
|
|
+ // if b.isrepeat {
|
|
|
+ // continue
|
|
|
+ // }
|
|
|
+ a, b = panchong(*a, *b)
|
|
|
+ listSource[j] = b
|
|
|
+ listSource[i] = a
|
|
|
+ // if b.isrepeat {
|
|
|
+ // log.Println("sss", a.id, b.isrepeat, b.repeat_id)
|
|
|
+ // }
|
|
|
+ }
|
|
|
+ if i%500 == 0 {
|
|
|
+ log.Println("已处理:", i)
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+func panchong(a, b dataSource) (c, d *dataSource) {
|
|
|
+ switch {
|
|
|
+ case a.title == b.title: //标题相等
|
|
|
+ if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
|
|
|
+
|
|
|
+ } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
|
|
|
+ if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
|
|
|
+ if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题相等 && buyer && s_winner"
|
|
|
+ //log.Println("1111", a.id, b.id, b.isrepeat)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ r := key_list(a, b)
|
|
|
+ if r {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题相等 && budget && key_list"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
|
|
|
+ r := key_list(a, b)
|
|
|
+ if r {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题相等 && projectcode && key_list"
|
|
|
+ }
|
|
|
+ } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
|
|
|
+ if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
|
|
|
+ if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题相等 && bidamount && buyer && s_winner"
|
|
|
+ //log.Println("1111", a.id, b.id, b.isrepeat)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ r := key_list(a, b)
|
|
|
+ if r {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题相等 && bidamount && key_list"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ //
|
|
|
+ }
|
|
|
+ case a.title != b.title: //标题不相等
|
|
|
+ //项目名称包含及相等
|
|
|
+ if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) {
|
|
|
+ isp := packreg.MatchString(a.title)
|
|
|
+ //有分包
|
|
|
+ if isp {
|
|
|
+ //项目名称相等
|
|
|
+ if a.projectname == b.projectname {
|
|
|
+ if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
|
|
|
+ //
|
|
|
+ } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
|
|
|
+ } else if a.bidamount != b.bidamount {
|
|
|
+ //
|
|
|
+ } else {
|
|
|
+ if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else { //项目名称包含
|
|
|
+ if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
|
|
|
+ //
|
|
|
+ } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
|
|
|
+ if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode"
|
|
|
+ } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner"
|
|
|
+ } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
|
|
|
+ if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer"
|
|
|
+ } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency"
|
|
|
+ } else {
|
|
|
+ //
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if a.bidamount != b.bidamount {
|
|
|
+ //
|
|
|
+ } else {
|
|
|
+ if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget"
|
|
|
+ } else {
|
|
|
+ //
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else { //无分包
|
|
|
+ //项目名称相等
|
|
|
+ if a.projectname == b.projectname {
|
|
|
+ if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
|
|
|
+ //
|
|
|
+ } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname && bidamount"
|
|
|
+ } else if a.bidamount != b.bidamount {
|
|
|
+ //
|
|
|
+ } else {
|
|
|
+ if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname && projectcode"
|
|
|
+ } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname && s_winner"
|
|
|
+ } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
|
|
|
+ if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer"
|
|
|
+ } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname && budget && agency"
|
|
|
+ } else {
|
|
|
+ //
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else { //项目名称包含
|
|
|
+ if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
|
|
|
+ //
|
|
|
+ } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
|
|
|
+ if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount"
|
|
|
+ } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner"
|
|
|
+ } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
|
|
|
+ if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer"
|
|
|
+ } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency"
|
|
|
+ } else {
|
|
|
+ //
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ //
|
|
|
+ }
|
|
|
+ } else if a.bidamount != b.bidamount {
|
|
|
+ //
|
|
|
+ } else {
|
|
|
+ if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == a.budget && (a.budget > 0 || b.budget > 0) {
|
|
|
+ b.repeat_id_source = a.id
|
|
|
+ a.repeat_id[b.id] = ""
|
|
|
+ b.isrepeat = true
|
|
|
+ b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ default:
|
|
|
+
|
|
|
+ }
|
|
|
+ return &a, &b
|
|
|
+}
|
|
|
+
|
|
|
+// zhb_key_list 判断
|
|
|
+// "budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"
|
|
|
+func key_list(a, b dataSource) bool {
|
|
|
+ for i := 0; i < len(zhb_key_list); i++ {
|
|
|
+ key := zhb_key_list[i]
|
|
|
+ switch key {
|
|
|
+ case "budget":
|
|
|
+ if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
|
|
|
+ continue
|
|
|
+ } else {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ case "buyer":
|
|
|
+ if a.buyer == b.buyer && pankong(a.buyer) && pankong(b.buyer) {
|
|
|
+ continue
|
|
|
+ } else {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ case "agency":
|
|
|
+ if a.agency == b.agency && pankong(a.agency) && pankong(b.agency) {
|
|
|
+ continue
|
|
|
+ } else {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ case "s_winner":
|
|
|
+ if a.s_winner == b.s_winner && pankong(a.s_winner) && pankong(b.s_winner) {
|
|
|
+ continue
|
|
|
+ } else {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ case "bidamount":
|
|
|
+ if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
|
|
|
+ continue
|
|
|
+ } else {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ case "projectcode":
|
|
|
+ if a.projectcode == b.projectcode && pankong(a.projectcode) && pankong(b.projectcode) {
|
|
|
+ continue
|
|
|
+ } else {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ case "contractcode":
|
|
|
+ if a.contractcode == b.contractcode && pankong(a.contractcode) && pankong(b.contractcode) {
|
|
|
+ continue
|
|
|
+ } else {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return true
|
|
|
+}
|
|
|
+
|
|
|
// pankong reports whether a is a non-empty string. Whitespace counts as
// content; only the empty string yields false.
func pankong(a string) bool {
	return len(a) > 0
}
|