Pārlūkot izejas kodu

新判重功能的调试

apple 5 gadi atpakaļ
vecāks
revīzija
4b25de0efc
4 mainīti faili ar 69 papildinājumiem un 48 dzēšanām
  1. 1 1
      udpfilterdup/src/config.json
  2. 18 12
      udpfilterdup/src/datamap.go
  3. 42 33
      udpfilterdup/src/main.go
  4. 8 2
      udps/main.go

+ 1 - 1
udpfilterdup/src/config.json

@@ -5,7 +5,7 @@
         "addr": "192.168.3.207:27082",
         "pool": 15,
         "db": "extract_kf",
-        "extract": "a_testbidding",
+        "extract": "bidding_20190910_01",
         "extract_copy": "a_testbidding_copy",
         "bidding": "bidding_126"
     },

+ 18 - 12
udpfilterdup/src/datamap.go

@@ -232,6 +232,11 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.specialWord = FilterRegTitle.MatchString(info.title)
 	info.titleSpecialWord = FilterRegTitle_1.MatchString(info.title)||FilterRegTitle_2.MatchString(info.title)
 	info.mergemap = *qutil.ObjToMap(tmp["merge_map"])
+	if info.mergemap==nil {
+		info.mergemap = make(map[string]interface{},0)
+	}
+
+
 	info.accurateTime = qutil.Int64All(tmp["publishtime"])
 	if info.accurateTime ==0 {
 		info.accurateTime = qutil.Int64All(tmp["comeintime"])
@@ -261,6 +266,7 @@ L:
 		data := d.data[k]
 		if len(data) > 0 { //对比v   找到同类型,同省或全国的数据作对比
 			for _, v := range data {
+				reason = ""
 				if v.id == info.id {//正常重复
 					return false, v,""
 				}
@@ -300,15 +306,15 @@ L:
 					}
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
-						reason = fmt.Sprintf(reason,"同机构,")
+						reason = reason + "同机构,"
 						if info.agency=="" {
-							reason = fmt.Sprintf(reason,"指定范围,")
+							reason = reason + "指定范围,"
 							//指定该范围内数据判重  jsondata
 							if v.agency=="" {
 								continue
 							}
 						}else {
-							reason = fmt.Sprintf(reason,"非指定范围,")
+							reason = reason + "非指定范围,"
 							if quickHeavyMethodTwo(v,info) {
 								b = true
 								source = v
@@ -317,15 +323,15 @@ L:
 							}
 						}
 					}else {
-						reason = fmt.Sprintf(reason,"非同机构,")
+						reason = reason + "非同机构,"
 						if info.agency=="" {
-							reason = fmt.Sprintf(reason,"指定范围,")
+							reason = reason + "指定范围,"
 							//指定该范围内数据判重  jsondata
 							if v.agency=="" {
 								continue
 							}
 						}else {
-							reason = fmt.Sprintf(reason,"非指定范围,")
+							reason = reason + "非指定范围,"
 							if quickHeavyMethodOne(v,info) {
 								b = true
 								source = v
@@ -389,15 +395,15 @@ L:
 				if info.subtype==v.subtype {
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
-						reason = fmt.Sprintf(reason,"同机构,")
+						reason = reason + "同机构,"
 						if info.agency=="" {
-							reason = fmt.Sprintf(reason,"指定范围,")
+							reason = reason + "指定范围,"
 							//指定该范围内数据判重  jsondata
 							if v.agency=="" {
 								continue
 							}
 						}else {
-							reason = fmt.Sprintf(reason,"非指定范围,")
+							reason = reason + "非指定范围,"
 							if quickHeavyMethodTwo(v,info) {
 								b = true
 								source = v
@@ -406,15 +412,15 @@ L:
 							}
 						}
 					}else {
-						reason = fmt.Sprintf(reason,"非同机构,")
+						reason = reason + "非同机构,"
 						if info.agency=="" {
-							reason = fmt.Sprintf(reason,"指定范围,")
+							reason = reason + "指定范围,"
 							//指定该范围内数据判重  jsondata
 							if v.agency=="" {
 								continue
 							}
 						}else {
-							reason = fmt.Sprintf(reason,"非指定范围,")
+							reason = reason + "非指定范围,"
 							if quickHeavyMethodOne(v,info) {
 								b = true
 								source = v

+ 42 - 33
udpfilterdup/src/main.go

@@ -36,9 +36,10 @@ var (
 	dupdays      = 5                      //初始化判重范围
 	DM           *datamap                 //
 	HM           *historymap                 //判重数据
-	lastid       = "5da3f2c5a5cb26b9b79847fe"
+	lastid       = "5d767728a5cb26b9b7748868"
 	//5da3f2c5a5cb26b9b79847fc
-
+	//ObjectId("5d767728a5cb26b9b7748868")
+	//5da3f2c5a5cb26b9b79847fe
 	//正则筛选相关
 	FilterRegTitle = regexp.MustCompile("^_$")
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
@@ -247,7 +248,7 @@ func mainTest()  {
 
 
 func main() {
-	return
+
 	go checkMapJob()
 
 	updport := Sysconfig["udpport"].(string)
@@ -349,15 +350,15 @@ func task(data []byte, mapInfo map[string]interface{}) {
 			}else  {
 				//判重原因 reason
 				// tmp["_id"] 对比id   id原始id
+				mapLock.Lock()
 				b, source,reason := DM.check(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
 					repeateN++
-					mapLock.Lock()
-
 					var mergeArr []int64 	//更改合并数组记录
 					var newData *Info		//更换新的数据池数据
 
 					var id_map  = map[string]interface{}{}
+					repeat_id := ""
 					//合并操作--评功权重打分-合并完替换原始数据池
 					basic_bool := basicDataScore(source,info)
 					if basic_bool {
@@ -365,48 +366,55 @@ func task(data []byte, mapInfo map[string]interface{}) {
 						newData,mergeArr= mergeDataFields(source,info)
 						DM.replaceSourceData(newData,source.id) //替换
 						id_map["_id"]= util.StringTOBsonId(source.id)
-
+						repeat_id = source.id
 						//对比的数据打判重标签
-						updateExtract = append(updateExtract, []map[string]interface{}{
-							map[string]interface{}{
-								"_id": tmp["_id"],
-							},
-							map[string]interface{}{
-								"$set": map[string]interface{}{
-									"repeat":   1,
-									"repeatid": source.id,
-								},
-							},
-						})
-
-
+						//updateExtract = append(updateExtract, []map[string]interface{}{
+						//	map[string]interface{}{
+						//		"_id": tmp["_id"],
+						//	},
+						//	map[string]interface{}{
+						//		"$set": map[string]interface{}{
+						//			"repeat":   1,
+						//			"repeatid": source.id,
+						//		},
+						//	},
+						//})
+						//if len(updateExtract) > 500 {
+						//	mgo.UpdateBulk(extract, updateExtract...)
+						//	updateExtract = [][]map[string]interface{}{}
+						//}
 
 					}else {
 						//已对比数据为标准 ,数据池的数据打判重标签
 						newData,mergeArr= mergeDataFields(info,source)
 						DM.replaceSourceData(newData,source.id)//替换
 						id_map["_id"]= util.StringTOBsonId(info.id)
-
+						repeat_id = info.id
 						//数据池的数据打判重标签
-						updateExtract = append(updateExtract, []map[string]interface{}{
-							map[string]interface{}{
-								"_id": util.StringTOBsonId(source.id),
-							},
-							map[string]interface{}{
-								"$set": map[string]interface{}{
-									"repeat":   1,
-									"repeatid": info.id,
-								},
-							},
-						})
+						//updateExtract = append(updateExtract, []map[string]interface{}{
+						//	map[string]interface{}{
+						//		"_id": util.StringTOBsonId(source.id),
+						//	},
+						//	map[string]interface{}{
+						//		"$set": map[string]interface{}{
+						//			"repeat":   1,
+						//			"repeatid": info.id,
+						//		},
+						//	},
+						//})
+						//
+						//if len(updateExtract) > 500 {
+						//	mgo.UpdateBulk(extract, updateExtract...)
+						//	updateExtract = [][]map[string]interface{}{}
+						//}
 
 					}
-
-
 					//
 					var update_map  = map[string]interface{}{
 						"$set": map[string]interface{}{
 							"reason":reason,
+							"repeat":"1",
+							"repeatid":repeat_id,
 							"merge":newData.mergemap,
 						},
 					}
@@ -471,6 +479,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 
 				} else {
 					//IS.Add("new")
+					mapLock.Unlock()
 				}
 			}
 		}(tmp)

+ 8 - 2
udps/main.go

@@ -27,8 +27,14 @@ func main() {
 	5da3f2c5a5cb26b9b79847fc
 	5db2735ba5cb26b9b7c99c6f   76万
 	*/
-	flag.StringVar(&sid, "sid", "", "开始id")
-	flag.StringVar(&eid, "eid", "", "结束id")
+
+	/*
+		9W
+	5d767728a5cb26b9b7748868
+	ObjectId("5d77c881a5cb26b9b7de209d")
+	*/
+	flag.StringVar(&sid, "sid", "5d767728a5cb26b9b7748868", "开始id")
+	flag.StringVar(&eid, "eid", "5d77c881a5cb26b9b7de209d", "结束id")
 	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
 	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")