Procházet zdrojové kódy

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

maxiaoshan před 5 roky
rodič
revize
6d241b3251

+ 2 - 2
udpfilterdup/src/config.json

@@ -5,8 +5,8 @@
         "addr": "192.168.3.207:27092",
         "pool": 5,
         "db": "extract_kf",
-        "extract": "zk_bidding_0506",
-        "extract_back": "zk_bidding_0506",
+        "extract": "zk_Copy_bidding_copy",
+        "extract_back": "zk_Copy_bidding_copy",
         "site": {
             "dbname": "extract_kf",
             "coll": "site"

+ 84 - 20
udpfilterdup/src/datamap.go

@@ -121,7 +121,7 @@ func TimedTaskDatamap(days int,lasttime int64) *datamap {
 
 
 func NewDatamap(days int, lastid string) *datamap {
-	datelimit = qutil.Float64All(days * 86400)
+	datelimit = qutil.Float64All(days * 86400 * 2)
 	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{},[]string{}, map[string]bool{}}
 	if lastid == "" {
 		return dm
@@ -239,7 +239,6 @@ func NewInfo(tmp map[string]interface{}) *Info {
 
 //判重方法
 func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
-
 	reason := ""
 	keys := []string{}
 	d.lock.Lock()
@@ -256,6 +255,8 @@ func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
 		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
 
 	}
+
+
 	d.lock.Unlock()
 L:
 	for _, k := range keys {
@@ -268,6 +269,9 @@ L:
 				if v.id == info.id { //正常重复
 					return false, v, ""
 				}
+
+
+
 				if info.site != "" {//站点临时赋值
 					sitelock.Lock()
 					dict := SiteMap[info.site]
@@ -291,12 +295,14 @@ L:
 						break L
 					}
 					if info.href != "" && info.href != v.href {
-						if v.title==info.title && isTheSameDay(info.publishtime,v.publishtime){
-							reason = "同站点-href不同-标题相同"
-							b = true
-							source = v
-							reasons = reason
-							break L
+						if v.title==info.title&&len([]rune(info.title)) >10 && isTheSameDay(info.publishtime,v.publishtime){
+							if !againHrefRepeat(v, info) {//进行同站点二次判断
+								reason = "同站点-href不同-标题相同等"
+								b = true
+								source = v
+								reasons = reason
+								break L
+							}
 						}else {
 							continue
 						}
@@ -325,18 +331,16 @@ L:
 						}
 						if letter1==letter2 {
 							reason = reason + "标题关键词相等关系"
-							if !againRepeat(v, info) {//继续二级金额判断
+							if !againRepeat(v, info) {//进行二级金额判断
 								b = true
 								source = v
 								reasons = reason
 								break L
-							}else {
-								if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
-									//无包含关系-即不相等
-									continue
-								}else {
-									//有包含关系走要素判重逻辑
-								}
+							}
+						}else {
+							if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
+								//无包含关系-即不相等
+								continue
 							}
 						}
 					}
@@ -487,13 +491,19 @@ func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
 
 
 func (d *datamap) update(t int64) {
-	//每天0点清除历史数据
-	d.keymap = d.GetLatelyFiveDay(t)
+
+	if TimingTask {
+		d.keymap = d.GetLatelyFiveDay(t)
+	}else {
+		//d.keymap = d.GetLatelyFiveDay(t)//测试数据采用
+		d.keymap = d.GetLatelyFiveDayDouble(t)
+	}
 	m := map[string]bool{}
 	for _, v := range d.keymap {
 		m[v] = true
 	}
 	all, all1 := 0, 0
+
 	for k, v := range d.data {
 		all += len(v)
 		if !m[k[:8]] {
@@ -521,6 +531,16 @@ func (d *datamap) GetLatelyFiveDay(t int64) []string  {
 	return array
 }
 
+func (d *datamap) GetLatelyFiveDayDouble(t int64) []string  {// incremental mode, doubled window: returns d.days*2 consecutive day keys formatted with qutil.Date_yyyyMMdd, newest first
+	array := make([]string, d.days*2) // one slot per day in the doubled window
+	now := time.Now() // NOTE(review): anchored at the current wall clock; parameter t is never read in this function - confirm that is intended
+	for i := 0; i < d.days*2; i++ {
+		array[i] = now.Format(qutil.Date_yyyyMMdd) // index 0 holds today's key
+		now = now.AddDate(0, 0, -1) // step back one calendar day for the next slot
+	}
+	return array
+}
+
 /*
 **************************
 ******** 以下为判重 ********
@@ -960,7 +980,7 @@ func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 		p11 = true
 	}
 
-	if (p1 && p2 && p3) || (p1 && p2 && p5) || (p1 && p2 && p6) ||
+	if 	(p1 && p2 && p3) || (p1 && p2 && p5) || (p1 && p2 && p6) ||
 		(p1 && p3 && p5) || (p1 && p3 && p6) || (p1 && p5 && p6) ||
 		(p2 && p3 && p5) || (p2 && p3 && p6) || (p2 && p3 && p11) ||
 		(p2 && p5 && p6) || (p2 && p5 && p11) || (p2 && p6 && p11) ||
@@ -1077,7 +1097,51 @@ func contractRepeat_C(v *Info, info *Info) bool {
 	return false
 }
 
-//再次金额判断
+// againHrefRepeat is the same-site second-pass check: it returns true when v and info have an equal buyer and one of the subtype-specific comparisons below fires, and false otherwise; the caller in check only flags a duplicate when this returns false.
+func againHrefRepeat(v *Info, info *Info) bool {
+	if v.buyer == info.buyer { // every comparison below applies only when the buyers are equal
+		if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
+			info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
+			info.subtype == "变更" || info.subtype == "其他" {
+			// tender-type notices: both budgets are non-zero and differ
+			if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+				return true
+			}
+		} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
+			// award-type results: bid-amount check on two non-zero amounts, or two non-empty winners that differ after deleteExtraSpace
+			// NOTE(review): isBidWinningAmount is defined elsewhere; presumably it reports that the two amounts do not match - confirm
+			if (isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0) ||
+				(deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "") {
+				return true
+			}
+		} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
+			// contract-type notices: budget, bid amount / winner, contract number and project code are each checked in turn
+			if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+				return true
+			}
+			if (isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0) ||
+				(deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "") {
+				return true
+			}
+			if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
+				return true
+			}
+			if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
+				return true
+			}
+		} else { // any other subtype: same budget comparison as the first branch
+			if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+				return true
+			}
+		}
+	}
+
+	return false // buyers differ, or none of the comparisons above fired
+}
+
+
+
+
+//关键词再次金额判断
 func againRepeat(v *Info, info *Info) bool {
 	//相同采购单位下
 	//if info.buyer != "" && v.buyer == info.buyer {

+ 10 - 4
udpfilterdup/src/main.go

@@ -234,6 +234,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
+
 			if !LowHeavy { //是否进行低质量数据判重
 				if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
 					updateExtract = append(updateExtract, []map[string]interface{}{
@@ -353,9 +354,9 @@ func task(data []byte, mapInfo map[string]interface{}) {
 						repeat_id = info.id
 					}
 				}
-				if repeateN%120==0&&repeateN>0 {
-					fmt.Println("最终结果","目标id:",repeat_idMap["_id"])
-				}
+				//if repeateN%150==0&&repeateN>0 {
+				//	fmt.Println("最终结果","目标id:",repeat_idMap["_id"])
+				//}
 
 
 
@@ -959,6 +960,11 @@ func movedata() {
 		}
 	}
 	log.Println("save to", extract_back, " ok index", index)
-	delnum := mgo.Delete(extract, q)
+	qv := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$lt": time.Date(year, month, day, 0, 0, 0, 0, time.Local).Add(-time.Duration(dupdays) * 24 * time.Hour*2).Unix(),
+		},
+	}
+	delnum := mgo.Delete(extract, qv)
 	log.Println("remove from ", extract, delnum)
 }