瀏覽代碼

正式—判重

apple 5 年之前
父節點
當前提交
e6a6657b27
共有 3 個文件被更改,包括 78 次插入和 24 次刪除
  1. 1 1
      udpfilterdup/src/config.json
  2. 19 2
      udpfilterdup/src/datamap.go
  3. 58 21
      udpfilterdup/src/main.go

+ 1 - 1
udpfilterdup/src/config.json

@@ -17,7 +17,7 @@
     },
     "nextNode": [],
     "isMerger": false,
-    "threads": 1,
+    "threads": 4,
     "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
     "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批)",
     "specialtitle_2": "项目([0-9a-zA-Z一二三四五六七八九十零123456789])",

+ 19 - 2
udpfilterdup/src/datamap.go

@@ -13,6 +13,7 @@ import (
 type Info struct {
 	id          string	//id
 	title       string  //标题
+
 	area        string  //省份
 	city        string  //城市
 	subtype     string  //信息类型
@@ -27,6 +28,7 @@ type Info struct {
 	publishtime int64   //发布时间
 	bidopentime int64   //开标时间
 	agencyaddr  string  //开标地点
+
 	site        string  //站点
 	href        string  //正文的url
 
@@ -34,6 +36,7 @@ type Info struct {
 	titleSpecialWord bool                   //标题特殊词
 	specialWord      bool                   //再次判断的特殊词
 	mergemap         map[string]interface{} //合并记录
+	is_site     bool   //是否站点城市
 
 }
 
@@ -211,6 +214,10 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	if info.mergemap == nil {
 		info.mergemap = make(map[string]interface{}, 0)
 	}
+
+	info.is_site = false
+
+
 	return info
 }
 //判重方法
@@ -244,10 +251,12 @@ L:
 						sitelock.Unlock()
 						if dict != nil {
 							if info.area == "全国" && dict["area"] != "" {
+								info.is_site = true
 								info.area = qutil.ObjToString(dict["area"])
 								info.city = qutil.ObjToString(dict["city"])
 							} else {
 								if info.city == "" && dict["city"] != "" {
+									info.is_site = true
 									info.area = qutil.ObjToString(dict["area"])
 									info.city = qutil.ObjToString(dict["city"])
 								}
@@ -292,6 +301,10 @@ L:
 						}
 					}
 
+
+
+
+
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
 						reason = reason + "同机构-"
@@ -838,7 +851,9 @@ func tenderRepeat_C(v *Info, info *Info) bool {
 		return true
 	}
 	//原始地址...
-
+	if v.buyer != "" && info.buyer != "" && v.buyer != info.buyer {
+		return true
+	}
 	if v.bidopentime != 0 && info.bidopentime != 0 && v.bidopentime != info.bidopentime {
 		return true
 	}
@@ -941,7 +956,9 @@ func winningRepeat_C(v *Info, info *Info) bool {
 		return true
 	}
 	//原始地址...
-
+	if v.buyer != "" && info.buyer != "" && v.buyer != info.buyer {
+		return true
+	}
 	return false
 }
 

+ 58 - 21
udpfilterdup/src/main.go

@@ -102,15 +102,15 @@ func main() {
 }
 
 //测试组人员使用
-func mainT() {
+func mainTT() {
 	/*
 	ObjectId("5da3f31aa5cb26b9b798d3aa")
 	ObjectId("5da418c4a5cb26b9b7e3e9a6")
 	ObjectId("5df5071ce9d1f601e495fa54")
 	ObjectId("5e09c05f0cf41612e0626abc")
 	*/
-	//sid = "5df5071ce9d1f601e495fa50"
-	//eid = "5e09c05f0cf41612e0626abc"
+	//sid = "5da3f31aa5cb26b9b798d3aa"
+	//eid = "5da418c4a5cb26b9b7e3e9a6"
 
 	mapinfo := map[string]interface{}{}
 	if sid == "" || eid == "" {
@@ -282,7 +282,10 @@ func task(data []byte, mapInfo map[string]interface{}) {
 
 							//更新合并后的数据
 							for _, value := range mergeArr {
-								if value == 1 {
+								if value == 0 {
+									merge_map["$set"].(map[string]interface{})["area"] = newData.area
+									merge_map["$set"].(map[string]interface{})["city"] = newData.city
+								} else if value == 1 {
 									merge_map["$set"].(map[string]interface{})["area"] = newData.area
 									merge_map["$set"].(map[string]interface{})["city"] = newData.city
 								} else if value == 2 {
@@ -301,12 +304,12 @@ func task(data []byte, mapInfo map[string]interface{}) {
 									merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
 								} else if value == 9 {
 									merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+								}else if value == 10 {
+									merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
+								}else if value == 11 {
+									merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
 								}else {
 								}
-
-								if value==0 {
-
-								}
 							}
 							//模板数据更新
 							updateExtract = append(updateExtract, []map[string]interface{}{
@@ -553,7 +556,10 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 
 								//更新合并后的数据
 								for _, value := range mergeArr {
-									if value == 1 {
+									if value == 0 {
+										merge_map["$set"].(map[string]interface{})["area"] = newData.area
+										merge_map["$set"].(map[string]interface{})["city"] = newData.city
+									} else if value == 1 {
 										merge_map["$set"].(map[string]interface{})["area"] = newData.area
 										merge_map["$set"].(map[string]interface{})["city"] = newData.city
 									} else if value == 2 {
@@ -572,12 +578,12 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 										merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
 									} else if value == 9 {
 										merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+									}else if value == 10 {
+										merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
+									}else if value == 11 {
+										merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
 									}else {
 									}
-
-									if value==0 {
-
-									}
 								}
 								//模板数据更新
 								updateExtract = append(updateExtract, []map[string]interface{}{
@@ -643,19 +649,32 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 //合并字段-并更新merge字段的值
 func mergeDataFields(source *Info, info *Info) (*Info, []int64,bool) {
 
-	//定义一个新的map[string]interface{}{}
 	merge_recordMap := make(map[string]interface{},0)
 	mergeArr := make([]int64, 0)
 	//是否替换数据了-记录原始的数据
 	is_replace :=false
 	//1、城市
-	if (source.area == "" || source.area == "全国") && info.area != "全国" && info.area != "" {
-		merge_recordMap["area"] = source.area
-		merge_recordMap["city"] = source.city
-		source.area = info.area
-		source.city = info.city
-		mergeArr = append(mergeArr, 1)
-		is_replace = true
+	if source.area == "" || source.area == "全国"{
+		//为空
+		if info.area != "全国" && info.area != "" {
+			merge_recordMap["area"] = source.area
+			merge_recordMap["city"] = source.city
+			source.area = info.area
+			source.city = info.city
+			mergeArr = append(mergeArr, 1)
+			is_replace = true
+		}
+	}else {
+		//不为空-查看站点相关-有值必替换
+		if source.is_site {
+			//是站点替换的城市
+			merge_recordMap["site_area"] = source.area
+			merge_recordMap["site_city"] = source.city
+			mergeArr = append(mergeArr, 0)
+			is_replace = true
+			source.is_site = false
+
+		}
 	}
 	//2、项目名称
 	if source.projectname == "" && info.projectname != "" {
@@ -715,6 +734,24 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64,bool) {
 		is_replace = true
 	}
 
+	//10、发布时间
+	if source.publishtime == 0 && info.publishtime != 0 {
+		merge_recordMap["publishtime"] = source.publishtime
+		source.publishtime = info.publishtime
+		mergeArr = append(mergeArr, 10)
+		is_replace = true
+	}
+	//11、代理机构
+	if source.agency == "" && info.agency != "" {
+		merge_recordMap["agency"] = source.agency
+		source.agency = info.agency
+		mergeArr = append(mergeArr, 11)
+		is_replace = true
+	}
+
+
+
+
 	if is_replace {//有过替换更新
 		//总次数+1
 		source.mergemap["total_num"] = util.Int64All(source.mergemap["total_num"])+1