Sfoglia il codice sorgente

增量-历史-记录repeat_ids

apple 5 anni fa
parent
commit
3d3644336b

+ 3 - 3
udpfilterdup/src/config.json

@@ -5,8 +5,8 @@
         "addr": "192.168.3.207:27092",
         "pool": 5,
         "db": "extract_kf",
-        "extract": "zk_zk_test",
-        "extract_back": "zk_zk_test",
+        "extract": "zk_move",
+        "extract_back": "zk_move",
         "site": {
             "dbname": "extract_kf",
             "coll": "site"
@@ -24,7 +24,7 @@
     "timingTask":false,
     "timingSpanDay": 3,
     "timingPubScope": 720,
-    "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
+    "specialwords": "(重招|重新招标|勘察|施工|监理|总承包|土石方|可研)",
     "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
     "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批|期)",
     "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",

+ 28 - 1
udpfilterdup/src/dataMethod.go

@@ -76,6 +76,33 @@ func againRepeat(v *Info, info *Info) bool {
 	return false
 }
 
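+//// Site-level re-check (站点再次判断). NOTE: the againSite function below is entirely commented out and therefore inactive.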
+//func againSite(v *Info, info *Info) bool {
+//
+//	if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+//		return true
+//	}
+//	if isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0{
+//		return true
+//	}
+//	if deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "" {
+//		return true
+//	}
+//	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
+//		return true
+//	}
+//	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
+//		return true
+//	}
+//
+//	return false
+//}
+
+
+
+
+
+
 //删除中标单位字符串中多余的空格(含tab)
 func deleteExtraSpace(s string) string {
 	//删除字符串中的多余空格,有多个空格时,仅保留一个空格
@@ -112,7 +139,7 @@ func isBidopentimeInterval(i1 int64 ,i2 int64) bool {
 	day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
 	day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
 	if day1==day2 {
-		//是否间隔超过小时
+		//是否间隔超过十二小时
 		if math.Abs(float64(i1-i2)) >43200.0 {
 			return true
 		}else {

+ 15 - 4
udpfilterdup/src/datamap.go

@@ -37,6 +37,7 @@ type Info struct {
 	specialWord      bool                   //再次判断的特殊词
 	mergemap         map[string]interface{} //合并记录
 	is_site          bool                   //是否站点城市
+	repeat_ids        []string               //记录所有重复id
 
 }
 
@@ -231,6 +232,11 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	if info.mergemap == nil {
 		info.mergemap = make(map[string]interface{}, 0)
 	}
+	if info.repeat_ids == nil {
+		info.repeat_ids = make([]string, 0)
+	}
+
+
 
 	info.is_site = false
 
@@ -305,7 +311,7 @@ L:
 						break L
 					}
 					if info.href != "" && info.href != v.href { //待优化
-						if v.title==info.title&&len([]rune(info.title)) >10 && isTheSameDay(info.publishtime,v.publishtime){
+						if v.title==info.title{
 							if !againRepeat(v, info) {//进行同站点二次判断
 								reason = "同站点-href不同-标题相同等"
 								b = true
@@ -316,7 +322,9 @@ L:
 								continue
 							}
 						}else {
-							continue
+							if againRepeat(v, info) {//进行同站点二次判断
+								continue
+							}
 						}
 					}
 				}
@@ -324,7 +332,7 @@ L:
 				specialNum:= dealWithSpecialWordNumber(info,v)
 				//前置条件 - 标题相关,有且一个关键词
 				if specialNum==1 {
-					if info.title != v.title && v.title != "" && info.title != "" {
+					if againRepeat(v, info) {
 						continue
 					}
 				}
@@ -352,7 +360,10 @@ L:
 						}else {
 							if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
 								//无包含关系-即不相等
-								continue
+								if againRepeat(v, info) {
+									continue
+								}
+
 							}
 						}
 					}

+ 34 - 0
udpfilterdup/src/main.go

@@ -292,6 +292,23 @@ func task(data []byte, mapInfo map[string]interface{}) {
 					updateID["_id"] = info.id
 				}
 
+				repeat_ids:=source.repeat_ids
+				repeat_ids =  append(repeat_ids,info.id)
+				source.repeat_ids = repeat_ids
+				//替换数据池-更新
+				DM.replacePoolData(source)
+				updateExtract = append(updateExtract, []map[string]interface{}{//重复数据打标签
+					map[string]interface{}{
+						"_id": StringTOBsonId(source.id),
+					},
+					map[string]interface{}{
+						"$set": map[string]interface{}{
+							"repeat_ids": repeat_ids,
+						},
+					},
+				})
+
+
 				updateExtract = append(updateExtract, []map[string]interface{}{//重复数据打标签
 					updateID,
 					map[string]interface{}{
@@ -561,6 +578,23 @@ func historyTaskDay() {
 					if b { //有重复,生成更新语句,更新抽取和更新招标
 						repeateN++
 						//重复数据打标签
+						repeat_ids:=source.repeat_ids
+						repeat_ids =  append(repeat_ids,info.id)
+						source.repeat_ids = repeat_ids
+						//替换数据池-更新
+						DM.replacePoolData(source)
+						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{//重复数据打标签
+							map[string]interface{}{
+								"_id": StringTOBsonId(source.id),
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat_ids": repeat_ids,
+								},
+							},
+						})
+
+
 						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
 							map[string]interface{}{
 								"_id": tmp["_id"],