apple 5 éve
szülő
commit
90c6c2b83f
3 módosított fájl, 112 hozzáadás és 69 törlés
  1. 5 5
      udpfilterdup/src/config.json
  2. 100 60
      udpfilterdup/src/datamap.go
  3. 7 4
      udpfilterdup/src/main.go

+ 5 - 5
udpfilterdup/src/config.json

@@ -1,12 +1,12 @@
 {
-    "udpport": ":11995",
+    "udpport": ":11888",
     "dupdays": 5,
     "mongodb": {
         "addr": "192.168.3.207:27092",
         "pool": 5,
         "db": "extract_kf",
-        "extract": "zk_Copy_of_zk_bidding_0506",
-        "extract_back": "zk_Copy_of_zk_bidding_0506",
+        "extract": "zk_bidding_0506",
+        "extract_back": "zk_bidding_0506",
         "site": {
             "dbname": "extract_kf",
             "coll": "site"
@@ -24,10 +24,10 @@
     "lowHeavy":true,
     "timingTask":false,
     "timingSpanDay": 3,
-    "timingPubScope": 1080,
+    "timingPubScope": 720,
     "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
     "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
     "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批|期)",
     "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
     "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
-}
+}

+ 100 - 60
udpfilterdup/src/datamap.go

@@ -3,6 +3,7 @@ package main
 import (
 	"fmt"
 	"log"
+	"math"
 	qutil "qfw/util"
 	"regexp"
 	"strings"
@@ -72,7 +73,7 @@ func TimedTaskDatamap(days int,lasttime int64) *datamap {
 	n, continuSum := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
 		//qutil.IntAll(tmp["dataging"]) == 1
-		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1||qutil.IntAll(tmp["dataging"]) == 1  {
+		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 {
 
 		} else {
 			pt := tmp["publishtime"]
@@ -106,7 +107,7 @@ func TimedTaskDatamap(days int,lasttime int64) *datamap {
 			}
 		}
 		if n%50000 == 0 {
-			log.Println("current 数据池:", n, continuSum)
+			log.Println("当前数据池:", n, continuSum)
 		}
 		tmp = make(map[string]interface{})
 	}
@@ -132,12 +133,16 @@ func NewDatamap(days int, lastid string) *datamap {
 		"$lte": StringTOBsonId(lastid),
 	}}
 	log.Println("query", query)
-	it := sess.DB(mgo.DbName).C(extract).Find(query).Sort("-_id").Iter()
+	sortName := "-_id"
+	if Is_Sort {
+		sortName = "-publishtime"
+	}
+	it := sess.DB(mgo.DbName).C(extract).Find(query).Sort(sortName).Iter()
 	now1 := int64(0)
 	n, continuSum := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 {
-			continuSum++
+		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1{
+
 		} else {
 			pt := tmp["comeintime"]
 			if Is_Sort {
@@ -151,6 +156,7 @@ func NewDatamap(days int, lastid string) *datamap {
 				now1 = pt_time
 			}
 			if qutil.Float64All(now1-pt_time) < datelimit {
+				continuSum++
 				info := NewInfo(tmp)
 				dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
 				k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
@@ -178,11 +184,11 @@ func NewDatamap(days int, lastid string) *datamap {
 			}
 		}
 		if n%5000 == 0 {
-			log.Println("current n:", n, continuSum)
+			log.Println("当前 n:", n,"数量:" ,continuSum)
 		}
 		tmp = make(map[string]interface{})
 	}
-	log.Println("load data:", n)
+	log.Println("load data:", n,"总数:",continuSum)
 	return dm
 }
 
@@ -278,14 +284,22 @@ L:
 				//前置条件1 - 站点相关
 				if info.site != "" && info.site == v.site {
 					if info.href != "" && info.href == v.href {
-						reason = "href相同"
+						reason = "同站点-href相同"
 						b = true
 						source = v
 						reasons = reason
 						break L
 					}
 					if info.href != "" && info.href != v.href {
-						reason = "href不同-"
+						if v.title==info.title && isTheSameDay(info.publishtime,v.publishtime){
+							reason = "同站点-href不同-标题相同"
+							b = true
+							source = v
+							reasons = reason
+							break L
+						}else {
+							continue
+						}
 					}
 				}
 
@@ -309,15 +323,20 @@ L:
 						if strings.Contains(letter1,"重新招标")|| strings.Contains(letter2,"重新招标"){
 							letter1,letter2=dealWithSpecialPhrases(letter1,letter2)
 						}
-						if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
-							continue
-						}else {
-							reason = reason + "标题关键词且包含关系"
+						if letter1==letter2 {
+							reason = reason + "标题关键词相等关系"
 							if !againRepeat(v, info) {//继续二级金额判断
 								b = true
 								source = v
 								reasons = reason
 								break L
+							}else {
+								if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
+									//无包含关系-即不相等
+									continue
+								}else {
+									//有包含关系走要素判重逻辑
+								}
 							}
 						}
 					}
@@ -413,8 +432,25 @@ L:
 }
 //替换原始数据池
 func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
-	ct := newData.comeintime
+	//删除数据池的老数据
+	ct_old := oldData.comeintime
 	if Is_Sort||TimingTask {
+		ct_old = oldData.publishtime
+	}
+	dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
+	k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
+	data_old := d.data[k_old]
+	for k, v := range data_old {
+		if v.id == oldData.id {//删除对应当前的老数据
+			data_old = append(data_old[:k], data_old[k+1:]...)
+			break
+		}
+	}
+	d.data[k_old] = data_old
+
+	//添加新的
+	ct := newData.comeintime
+	if Is_Sort ||TimingTask{
 		ct = newData.publishtime
 	}
 	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
@@ -426,44 +462,12 @@ func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
 		d.data[k] = data
 		if !d.keys[dkey] {
 			d.keys[dkey] = true
+			d.update(ct)
 		}
 	} else {
- 		//遍历替换
- 		isReplace := false
-		for k, v := range data {
-			if v.id == oldData.id {
-				data[k] = newData //同天_type_area 替换
-				isReplace = true
-				break
-			}
-		}
-		if !isReplace {
-			//添加新数据 删除老数据
-			data = append(data,newData)
-			ct_old := oldData.comeintime
-			if Is_Sort||TimingTask {
-				ct_old = oldData.publishtime
-			}
-			dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
-			k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
-			data_old := d.data[k_old]
-			if len(data_old)==1 {
-				delete(d.data ,k_old)
-			} else {
-				for k, v := range data_old {
-					if v.id == oldData.id {
-						//删除对应当前的
-						data_old = append(data_old[:k], data_old[k+1:]...)
-						break
-					}
-				}
-				d.data[k_old] = data_old
-			}
-		}else {
-			d.data[k] = data
-		}
+		data = append(data, newData)
+		d.data[k] = data
 	}
-
 	//添加省
 	isAreaExist :=false
 	for _,v:= range d.areakeys {
@@ -477,6 +481,7 @@ func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
 		d.areakeys = areaArr
 	}
 
+
 	d.lock.Unlock()
 }
 
@@ -554,7 +559,7 @@ func dealWithSpecialPhrases(str1 string,str2 string) (string,string) {
 	return newStr1,newStr2
 }
 //关键词数量v
-func dealWithSpecialWordNumber(info *Info,  v*Info) int {
+func dealWithSpecialWordNumber(info*Info,v*Info) int {
 	okNum:=0
 	if  info.titleSpecialWord || info.specialWord {
 		okNum++
@@ -615,7 +620,7 @@ func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
 			reason = reason + "---招标类:预算"
 			return true,reason
 		}
-		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
+		if info.bidopentime != 0 && info.bidopentime==v.bidopentime{//开标时间
 			reason = reason + "---招标类:开标时间"
 			return true,reason
 		}
@@ -639,7 +644,7 @@ func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
 			reason = reason + "---合同类:预算"
 			return true,reason
 		}
-		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
+		if info.bidopentime != 0 && info.bidopentime==v.bidopentime{//开标时间
 			reason = reason + "---合同类:开标时间"
 			return true,reason
 		}
@@ -661,7 +666,7 @@ func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
 			reason = reason + "---类别空-招标类:预算"
 			return true,reason
 		}
-		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
+		if info.bidopentime != 0 && info.bidopentime==v.bidopentime{//开标时间
 			reason = reason + "---类别空-招标类:开标时间"
 			return true,reason
 		}
@@ -834,7 +839,7 @@ func tenderRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 		p4 = true
 	}
 	if v.bidopentime != 0 && v.bidopentime == info.bidopentime {
-		ss = ss + "p9-开标时间-"
+		ss = ss + "p9-开标时间相同-"
 		p9 = true
 	}
 	if v.bidopenaddress != "" && v.bidopenaddress == info.bidopenaddress {
@@ -904,16 +909,16 @@ func tenderRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 func tenderRepeat_C(v *Info, info *Info) bool {
 
 	if v.budget != 0 && info.budget != 0 && v.budget != info.budget {
-
 		return true
 	}
 	//原始地址...
 	if v.buyer != "" && info.buyer != "" && v.buyer != info.buyer {
 		return true
 	}
-	//if v.bidopentime != 0 && info.bidopentime != 0 && v.bidopentime != info.bidopentime {
-	//	return true
-	//}
+
+	if v.bidopentime != 0 && info.bidopentime != 0 && isBidopentimeInterval(info.bidopentime,v.bidopentime) {
+		return true
+	}
 	//if v.bidopenaddress != "" && info.bidopenaddress != "" && v.bidopenaddress != info.bidopenaddress {
 	//	return true
 	//}
@@ -1093,7 +1098,10 @@ func againRepeat(v *Info, info *Info) bool {
 				return true
 			}
 		} else {
-
+			//预算金额满足条件
+			if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+				return true
+			}
 		}
 	}
 
@@ -1126,8 +1134,40 @@ func isBidWinningAmount(f1 float64 ,f2 float64) bool {
 }
 
 
+// isBidopentimeInterval reports whether two bid-opening timestamps count as different: true when they format to different yyyyMMdd days, or to the same day but more than 21600 apart; false when either is 0.
+func isBidopentimeInterval(i1 int64 ,i2 int64) bool {
+	if i1==0||i2==0 { // a zero timestamp is treated as "unknown", so no difference is reported
+		return false
+	}
+	// Different day, or same day but more than six hours apart, counts as "not equal" and returns true (21600 = 6h, assuming Unix seconds — TODO confirm).
+	timeOne,timeTwo:=i1,i2 // local copies so their addresses can be passed to the formatter
+	day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
+	day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
+	if day1==day2 {
+		// Same day: check whether the gap exceeds six hours.
+		if math.Abs(float64(i1-i2)) >21600.0 {
+			return true
+		}else {
+			return false
+		}
+	}else {
+		return true
+	}
+}
 
-
+// isTheSameDay reports whether two non-zero timestamps format to the same yyyyMMdd day string; it returns false when either timestamp is 0.
+func isTheSameDay(i1 int64 ,i2 int64) bool {
+	if i1==0||i2==0 { // a zero timestamp never matches
+		return false
+	}
+	timeOne,timeTwo:=i1,i2 // local copies so their addresses can be passed to the formatter
+	day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
+	day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
+	if day1==day2 {
+		return true
+	}
+	return false
+}
 
 
 

+ 7 - 4
udpfilterdup/src/main.go

@@ -122,7 +122,6 @@ func main() {
 
 //测试组人员使用
 func mainT() {
-
 	if TimingTask {
 		log.Println("定时任务测试开始")
 		go timedTaskDay()
@@ -207,6 +206,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	sortName :="_id"
 	if Is_Sort {
 		sortName = "publishtime"
+		log.Println("排序")
 	}
 	it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort(sortName).Iter()
 	updateExtract := [][]map[string]interface{}{}
@@ -218,9 +218,12 @@ func task(data []byte, mapInfo map[string]interface{}) {
 		if n%10000 == 0 {
 			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
 		}
-		if util.IntAll(tmp["repeat"]) == 1 || util.IntAll(tmp["repeat"]) == -1 {
+		if util.IntAll(tmp["repeat"]) == 1 || util.IntAll(tmp["repeat"]) == -1||
+			util.IntAll(tmp["dataging"]) == 1 ||util.IntAll(tmp["dataging"]) == -1{
 			tmp = make(map[string]interface{})
-			repeateN++
+			if util.IntAll(tmp["repeat"]) == 1 {
+				repeateN++
+			}
 			continue
 		}
 		pool <- true
@@ -350,7 +353,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 						repeat_id = info.id
 					}
 				}
-				if repeateN%90==0&&repeateN>0 {
+				if repeateN%120==0&&repeateN>0 {
 					fmt.Println("最终结果","目标id:",repeat_idMap["_id"])
 				}