瀏覽代碼

测试-抽查-修改-判重

apple 5 年之前
父節點
當前提交
0d474b782e
共有 4 個文件被更改,包括 161 次插入和 106 次删除
  1. + 2 - 2
      udpfilterdup/src/config.json
  2. + 71 - 34
      udpfilterdup/src/datamap.go
  3. + 84 - 67
      udpfilterdup/src/main.go
  4. + 4 - 3
      udps/main.go

+ 2 - 2
udpfilterdup/src/config.json

@@ -5,8 +5,8 @@
         "addr": "192.168.3.207:27082",
         "pool": 15,
         "db": "extract_kf",
-        "extract": "bidding_20190910_01",
-        "extract_copy": "a_testbidding_copy",
+        "extract": "a_testbidding_new",
+        "extract_copy": "a_testbidding",
         "bidding": "bidding_126"
     },
     "jkmail": {

+ 71 - 34
udpfilterdup/src/datamap.go

@@ -279,31 +279,23 @@ L:
 				if info.subtype==v.subtype {
 					//站点配置--
 					if info.site!="" {
-						dict := SiteMap[info.site].(map[string]string)
+						dict := SiteMap[info.site]
+
 						if dict!=nil{
 							//临时改变--具体值
 							if info.area=="全国" &&dict["area"]!="" {
-								info.area = dict["area"]
-								info.city = dict["city"]
+								info.area = qutil.ObjToString(dict["area"])
+								info.city = qutil.ObjToString(dict["city"])
 							}else {
 								if info.city=="" &&dict["city"]!="" {
-									info.area = dict["area"]
-									info.city = dict["city"]
+									info.area = qutil.ObjToString(dict["area"])
+									info.city = qutil.ObjToString(dict["city"])
 								}
 							}
 						}
 					}
 
-					//前置条件2个不重复  一个重复
-					if info.titleSpecialWord&&info.title!=v.title&&v.title!="" {
-						continue
-					}
-					if info.buyer != "" &&v.buyer == info.buyer {
-						//满足标题
-						if len([]rune(v.title)) >= 10 && len([]rune(info.title)) >= 10 && v.title != info.title && (info.specialWord || v.specialWord) {
-							continue
-						}
-					}
+					//前置条件1  	站点相关
 					if info.site!=""&&info.site==v.site{
 						if info.href!=""&&info.href==v.href {
 							reason = "href相同"
@@ -312,7 +304,34 @@ L:
 							reasons = reason
 							break L
 						}
+						if info.href!=""&&info.href!=v.href {
+							continue
+						}
 					}
+
+					//前置条件2  标题相关 - 有且一个关键词
+					if ((info.titleSpecialWord&&!v.titleSpecialWord)||(info.specialWord&&!v.specialWord))&&
+						info.title!=v.title&&v.title!=""&&info.title!="" {
+						continue
+					}
+
+					//前置条件3 	标题相关 - 均含有关键词
+					if ((info.titleSpecialWord&&v.titleSpecialWord)||(info.specialWord&&v.specialWord))&&
+						len([]rune(v.title))>10 && len([]rune(info.title))>10&&v.title!=""&&info.title!=""{
+						if !(strings.Contains(v.title, info.title)||strings.Contains(info.title, v.title)) {
+							continue //无包含关系
+						}
+						if strings.Contains(v.title, info.title)||strings.Contains(info.title, v.title) {
+							reason = "标题关键词且包含关系"
+							b = true
+							source = v
+							reasons = reason
+							break L
+						}
+					}
+
+
+
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
 						reason = reason + "同机构-"
@@ -398,25 +417,27 @@ L:
 				if info.subtype==v.subtype {
 					//站点配置--
 					if info.site!="" {
-						dict := SiteMap[info.site].(map[string]string)
+						dict := SiteMap[info.site]
 						if dict!=nil{
 							//临时改变--具体值
 							if info.area=="全国" &&dict["area"]!="" {
-								info.area = dict["area"]
-								info.city = dict["city"]
+								info.area = qutil.ObjToString(dict["area"])
+								info.city = qutil.ObjToString(dict["city"])
 							}else {
 								if info.city=="" &&dict["city"]!="" {
-									info.area = dict["area"]
-									info.city = dict["city"]
+									info.area = qutil.ObjToString(dict["area"])
+									info.city = qutil.ObjToString(dict["city"])
 								}
 							}
 						}
 					}
 
-					//前置条件2个不重复  一个重复
-					if info.titleSpecialWord&&info.title!=v.title&&v.title!="" {
+					//前置条件  一个不重复  一个重复
+					if ((info.titleSpecialWord&&!v.titleSpecialWord)||(info.specialWord&&!v.specialWord))&&
+						info.title!=v.title&&v.title!="" {
 						continue
 					}
+
 					if info.buyer != "" &&v.buyer == info.buyer {
 						//满足标题
 						if len([]rune(v.title)) >= 10 && len([]rune(info.title)) >= 10 && v.title != info.title && (info.specialWord || v.specialWord) {
@@ -610,7 +631,17 @@ func quickHeavyMethodOne(v *Info ,info *Info) bool {
 			return false
 		}
 	}else {
-
+		//招标结果
+		if tenderRepeat_A(v,info) {
+			if tenderRepeat_C(v,info) {
+				return false
+			}else {
+				reason = reason+"---类别空-招标类"
+				return true
+			}
+		}else {
+			return false
+		}
 	}
 
 	return false
@@ -662,7 +693,17 @@ func quickHeavyMethodTwo(v *Info ,info *Info) bool {
 				return false
 			}
 		}else {
-
+			//招标结果
+			if tenderRepeat_B(v,info) {
+				if tenderRepeat_C(v,info) { //有不同
+					return false
+				}else {
+					reason = reason+"---类别空-招标类"
+					return true
+				}
+			}else{
+				return false
+			}
 		}
 	}
 
@@ -679,6 +720,10 @@ func quickHeavyMethodTwo(v *Info ,info *Info) bool {
 			return false
 		}
 	}
+
+
+
+
 	return false
 }
 
@@ -720,7 +765,7 @@ func tenderRepeat_A(v *Info ,info *Info) bool {
 	}
 
 	if (p1&&p2&&p3)||(p1&&p2&&p4)||(p1&&p2&&p9)||
-		(p1&&p2&&p10)||(p1&&p3&&p9)||(p1&&p3&&p10)||
+		(p1&&p2&&p10)||(p1&&p2&&p11)||(p1&&p3&&p9)||(p1&&p3&&p10)||
 		(p1&&p4&&p9)||(p1&&p4&&p10)||(p2&&p3&&p4)||
 		(p2&&p3&&p9)||(p2&&p3&&p10)||(p2&&p3&&p11)||
 		(p2&&p4&&p9)||(p2&&p4&&p10)||(p2&&p4&&p11)||
@@ -785,11 +830,6 @@ func tenderRepeat_C(v *Info ,info *Info) bool {
 	if v.agencyaddr!=""&&info.agencyaddr!=""&&v.agencyaddr!=info.agencyaddr {
 		return true
 	}
-	if info.specialWord||v.specialWord||info.titleSpecialWord||v.titleSpecialWord{
-		return true
-	}
-
-
 
 	return false
 }
@@ -800,7 +840,7 @@ func winningRepeat_A(v *Info ,info *Info) bool {
 	var ss string
 	p1,p2,p3,p5,p6,p11 := false,false,false,false,false,false
 	if v.projectname!=""&&v.projectname==info.projectname {
-		ss = ss+"p1(标题)-"
+		ss = ss+"p1(项目名称)-"
 		p1 = true
 	}
 	if v.buyer!=""&&v.buyer==info.buyer {
@@ -884,9 +924,6 @@ func winningRepeat_C(v *Info ,info *Info) bool {
 	}
 	//原始地址...
 
-	if info.specialWord||v.specialWord||info.titleSpecialWord||v.titleSpecialWord{
-		return true
-	}
 
 	return false
 }

+ 84 - 67
udpfilterdup/src/main.go

@@ -6,6 +6,7 @@ package main
 
 import (
 	"encoding/json"
+	"flag"
 	"fmt"
 	"gopkg.in/mgo.v2/bson"
 	"log"
@@ -36,8 +37,8 @@ var (
 	dupdays      = 5                      //初始化判重范围
 	DM           *datamap                 //
 	HM           *historymap                 //判重数据
-	lastid       = "5d767728a5cb26b9b7748868"
-	//ObjectId("5d767728a5cb26b9b7748868")
+	lastid       = "5da3f2c5a5cb26b9b79847fc"
+	//5da3f2c5a5cb26b9b79847fc
 	//正则筛选相关
 	FilterRegTitle = regexp.MustCompile("^_$")
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
@@ -46,12 +47,12 @@ var (
 
 
 
-	SiteMap  map[string]interface{} //站点map
+	SiteMap  map[string]map[string]interface{} //站点map
 )
 
 func init() {
-	//flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
-	//flag.Parse()
+	flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
+	flag.Parse()
 	//172.17.145.163:27080
 	util.ReadConfig(&Sysconfig)
 	nextNode = util.ObjArrToMapArr(Sysconfig["nextNode"].([]interface{}))
@@ -87,7 +88,7 @@ func init() {
 	siteMgo.InitPool()
 
 
-	SiteMap = make(map[string]interface{},0)
+	SiteMap = make(map[string]map[string]interface{},0)
 
 	start := int(time.Now().Unix())
 	//站点配置
@@ -95,12 +96,14 @@ func init() {
 	defer sess_site.Close()
 	res_site := sess_site.DB("zhaolongyue").C("site").Find(nil).Sort("_id").Iter()
 	for site_dict := make(map[string]interface{}); res_site.Next(&site_dict); {
-			data_map := map[string]string{
+			data_map := map[string]interface{}{
 				"area":util.ObjToString(site_dict["area"]),
 				"city":util.ObjToString(site_dict["city"]),
 				"district":util.ObjToString(site_dict["district"]),
+				"subdepttype":util.ObjToString(site_dict["subdepttype"]),
+				"level":util.ObjToString(site_dict["level"]),
 			}
-		SiteMap[site_dict["site"].(string)]= data_map
+		SiteMap[util.ObjToString(site_dict["site"])]= data_map
 	}
 	
 	fmt.Printf("用时:%d秒,%d个",int(time.Now().Unix())-start,len(SiteMap))
@@ -142,8 +145,7 @@ func mainTest()  {
 	//	//	arr = append(arr,dict)
 	//	//}
 	//}
-
-
+	//
 
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
@@ -207,11 +209,11 @@ func mainTest()  {
 		}
 
 	}
-	//打印 1:0情况    66989
+	//打印 1:0情况    ;
 	mm:=0
 	for _,v:=range arr1 {
 		mm++
-		if mm%222==0 {
+		if mm%200==0 {
 			log.Println(v)
 		}
 	}
@@ -220,11 +222,11 @@ func mainTest()  {
 	log.Println("分割线---------------")
 
 
-	//打印 0:1情况  8729
+	//打印 0:1情况
 	nn:=0
 	for _,v:=range arr2 {
 		nn++
-		if nn%30==0 {
+		if nn%200==0 {
 			log.Println(v)
 		}
 	}
@@ -265,11 +267,11 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 		} else if mapInfo != nil {
 
 			//更新流程
-			go historyTask(data,mapInfo)
+			//go historyTask(data,mapInfo)
 
 
 			//判重流程
-			//go task(data, mapInfo)
+			go task(data, mapInfo)
 
 			key, _ := mapInfo["key"].(string)
 			if key == "" {
@@ -330,7 +332,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 					},
 					map[string]interface{}{
 						"$set": map[string]interface{}{
-							"repeat":   -1,
+							"repeat":-1,
 						},
 					},
 				})
@@ -346,63 +348,64 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				b, source,reason := DM.check(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
 					repeateN++
-					var mergeArr  = []int64{} 	//更改合并数组记录
-					var newData  = &Info{}		//更换新的数据池数据
+					//var mergeArr  = []int64{} 	//更改合并数组记录
+					//var newData  = &Info{}		//更换新的数据池数据
 					var id_map  = map[string]interface{}{}
-					repeat_id := ""
-
+					repeat_id := source.id
+					id_map["_id"]= util.StringTOBsonId(info.id)
 					//合并操作--评功权重打分-合并完替换原始数据池
-					basic_bool := basicDataScore(source,info)
-					if basic_bool {
-						//已原始数据为标准-对比数据打判重标签
-						newData,mergeArr= mergeDataFields(source,info)
-						DM.replaceSourceData(newData,source.id) //替换
-						id_map["_id"]= util.StringTOBsonId(source.id)
-						repeat_id = source.id
-					}else {
-						//已对比数据为标准 ,数据池的数据打判重标签
-						newData,mergeArr= mergeDataFields(info,source)
-						DM.replaceSourceData(newData,source.id)//替换
-						id_map["_id"]= util.StringTOBsonId(info.id)
-						repeat_id = info.id
-					}
+					//basic_bool := basicDataScore(source,info)
+					//if basic_bool {
+					//	//已原始数据为标准-对比数据打判重标签
+					//	newData,mergeArr= mergeDataFields(source,info)
+					//	DM.replaceSourceData(newData,source.id) //替换
+					//	id_map["_id"]= util.StringTOBsonId(source.id)
+					//	repeat_id = source.id
+					//}else {
+					//	//已对比数据为标准 ,数据池的数据打判重标签
+					//	newData,mergeArr= mergeDataFields(info,source)
+					//	DM.replaceSourceData(newData,source.id)//替换
+					//	id_map["_id"]= util.StringTOBsonId(info.id)
+					//	repeat_id = info.id
+					//}
 
 					var update_map  = map[string]interface{}{
 						"$set": map[string]interface{}{
-							"reason":reason,
-							"repeat":"1",
+							"repeat_reason":reason,
+							"repeat":1,
 							"repeatid":repeat_id,
 						},
 					}
 
 					//合并记录
-					if len(newData.mergemap)>0 {
-						update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
-					}
-
-					//更新合并后的数据
-					for _,value :=range mergeArr {
-						if value==1 {
-							update_map["$set"].(map[string]interface{})["area"] = newData.area
-							update_map["$set"].(map[string]interface{})["city"] = newData.city
-						}else if value==2 {
-							update_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-						}else if value==3 {
-							update_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-						}else if value==4 {
-							update_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-						}else if value==5 {
-							update_map["$set"].(map[string]interface{})["budget"] = newData.budget
-						}else if value==6 {
-							update_map["$set"].(map[string]interface{})["winner"] = newData.winner
-						}else if value==7 {
-							update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-						}else if value==8 {
-							update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-						}else {
-
-						}
-					}
+					//if len(newData.mergemap)>0 {
+					//	update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
+					//	//fmt.Println("合并长度:",len(newData.mergemap))
+					//}
+					//
+					////更新合并后的数据
+					//for _,value :=range mergeArr {
+					//	if value==1 {
+					//		update_map["$set"].(map[string]interface{})["area"] = newData.area
+					//		update_map["$set"].(map[string]interface{})["city"] = newData.city
+					//	}else if value==2 {
+					//		update_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
+					//	}else if value==3 {
+					//		update_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
+					//	}else if value==4 {
+					//		update_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
+					//	}else if value==5 {
+					//		update_map["$set"].(map[string]interface{})["budget"] = newData.budget
+					//	}else if value==6 {
+					//		update_map["$set"].(map[string]interface{})["winner"] = newData.winner
+					//	}else if value==7 {
+					//		update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
+					//	}else if value==8 {
+					//		update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
+					//	}else {
+					//
+					//	}
+					//}
 
 					//构建数据库更新用到的
 					updateExtract = append(updateExtract, []map[string]interface{}{
@@ -552,7 +555,7 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 							map[string]interface{}{
 								"$set": map[string]interface{}{
 									"repeat":   0,
-									"repeatid": "-1",
+									"repeatid": -2,
 								},
 							},
 						})
@@ -586,8 +589,8 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 
 						var update_map  = map[string]interface{}{
 							"$set": map[string]interface{}{
-								"reason":reason,
-								"repeat":"1",
+								"repeat_reason":reason,
+								"repeat":1,
 								"repeatid":repeat_id,
 							},
 						}
@@ -595,6 +598,7 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 						//合并记录
 						if len(newData.mergemap)>0 {
 							update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
+							//fmt.Println("合并长度:",len(newData.mergemap))
 						}
 
 						//更新合并后的数据
@@ -706,6 +710,8 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 		source.area = info.area
 		source.city = info.city
 		mergeArr = append(mergeArr,1)
+
+		//fmt.Println("合并-城市")
 	}
 	//2、项目名称
 	if source.projectname==""&&info.projectname!=""{
@@ -720,6 +726,7 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 
 		source.projectname = info.projectname
 		mergeArr = append(mergeArr,2)
+		//fmt.Println("合并-项目名称")
 	}
 	//3、项目编号
 	if source.projectcode==""&&info.projectcode!=""{
@@ -734,6 +741,7 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 
 		source.projectcode = info.projectcode
 		mergeArr = append(mergeArr,3)
+		//fmt.Println("合并-项目标号")
 	}
 	//4、采购单位
 	if source.buyer==""&&info.buyer!=""{
@@ -748,6 +756,7 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 
 		source.buyer = info.buyer
 		mergeArr = append(mergeArr,4)
+		//fmt.Println("合并-采购单位")
 	}
 	//5、预算
 	if source.budget==0&&info.budget!=0{
@@ -762,6 +771,7 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 
 		source.budget = info.budget
 		mergeArr = append(mergeArr,5)
+		//fmt.Println("合并-预算")
 	}
 	//6、中标单位
 	if source.winner==""&&info.winner!=""{
@@ -776,6 +786,7 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 
 		source.winner = info.winner
 		mergeArr = append(mergeArr,6)
+		//fmt.Println("合并-中标单位")
 	}
 	//7、中标金额
 	if source.bidamount==0&&info.bidamount!=0{
@@ -790,6 +801,7 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 
 		source.bidamount = info.bidamount
 		mergeArr = append(mergeArr,7)
+		//fmt.Println("合并-中标金额")
 	}
 	//8、开天时间-地点
 	if source.bidopentime==0&&info.bidopentime!=0{
@@ -804,6 +816,7 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 
 		source.bidopentime = info.bidopentime
 		mergeArr = append(mergeArr,8)
+		//fmt.Println("合并-开标时间")
 	}
 
 	//以上合并过于简单,待进一步优化
@@ -813,6 +826,10 @@ func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
 
 //权重评估
 func basicDataScore(v *Info, info *Info) bool  {
+
+
+
+	//网站评估
 	m,n:=0,0
 	if v.projectname!="" {m++}
 	if v.buyer!="" {m++}

+ 4 - 3
udps/main.go

@@ -33,13 +33,14 @@ func main() {
 	5d767728a5cb26b9b7748868
 	ObjectId("5d77c881a5cb26b9b7de209d")
 
-
+ObjectId("5da3f2c5a5cb26b9b79847fc")
+	ObjectId("5db2735ba5cb26b9b7c99c6f")
 	//历史中间一段数据
 	ObjectId("5d771e90a5cb26b9b7be7976")
 	ObjectId("5d775be4a5cb26b9b759b5eb")
 	*/
-	flag.StringVar(&sid, "sid", "5d771e90a5cb26b9b7be7976", "开始id")
-	flag.StringVar(&eid, "eid", "5d775be4a5cb26b9b759b5eb", "结束id")
+	flag.StringVar(&sid, "sid", "", "开始id")
+	flag.StringVar(&eid, "eid", "", "结束id")
 	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
 	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")