apple 5 жил өмнө
parent
commit
bdea93e3b3

+ 38 - 3
udpfilterdup/src/datamap.go

@@ -34,6 +34,7 @@ type Info struct {
 	href			   string		//正文的url
 	titleSpecialWord bool 			//标题特殊词
 	specialWord bool	 			//再次判断的特殊词
+	mergemap           map[string]interface{}   //合并记录
 }
 
 var datelimit = float64(432000)
@@ -62,7 +63,6 @@ func NewDatamap(days int, lastid string) *datamap {
 	now1 := int64(0)
 	n, continuSum := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-		//|| qutil.ObjToString(tmp["subtype"]) == "变更"
 		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1{
 			continuSum++
 		} else {
@@ -78,7 +78,9 @@ func NewDatamap(days int, lastid string) *datamap {
 			}
 			if qutil.Float64All(now1-comeintime) < datelimit {
 				info := NewInfo(tmp)
+				//时间字符串
 				dkey := qutil.FormatDateWithObj(&cm, qutil.Date_yyyyMMdd)
+				//拼接的一个时间字符串 xxxx_类型_省份
 				k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
 				data := dm.data[k]
 				if data == nil {
@@ -131,16 +133,20 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.detail		= qutil.ObjToString(tmp["detail"])
 	info.site	 = qutil.ObjToString(tmp["site"])
 	info.href	 = qutil.ObjToString(tmp["href"])
+	info.mergemap = *qutil.ObjToMap(tmp["merge_map"])
+
+
 	return info
 }
 // 486 396 315
-
 func (d *datamap) check(info *Info) (b bool,  source *Info,reasons string) {
 	reason = ""
 	d.lock.Lock()
 	defer d.lock.Unlock()
 	keys := []string{}
+	//不同时间段
 	for k, _ := range d.keys {
+		//...代码
 		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
 		if info.area != "全国" { //这个后续可以不要
 			keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
@@ -149,7 +155,7 @@ func (d *datamap) check(info *Info) (b bool,  source *Info,reasons string) {
 L:
 	for _, k := range keys {
 		data := d.data[k]
-		if len(data) > 0 { //对比v
+		if len(data) > 0 { //对比v   找到同类型,同省或全国的数据作对比
 			for _, v := range data {
 				if v.id == info.id {//正常重复
 					return false, v,""
@@ -239,6 +245,35 @@ L:
 }
 
 
+func (d *datamap) replaceSourceData(replaceData *Info , replaceId string) {
+	ct, _ := strconv.ParseInt(replaceId[:8], 16, 64)
+	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
+	k := fmt.Sprintf("%s_%s_%s", dkey, replaceData.subtype, replaceData.area)
+	data := d.data[k]
+	if data == nil {
+		data = []*Info{replaceData}
+		d.data[k] = data
+		if !d.keys[dkey] {
+			d.keys[dkey] = true
+			d.update(ct)
+		}
+	} else {
+		//遍历替换
+		for k,v:=range data{
+			if v.id==replaceId{
+				data[k] = replaceData
+				break
+			}
+		}
+
+
+
+		d.data[k] = data
+	}
+
+}
+
+
 //判重方法1
 func quickHeavyMethodOne(v *Info ,info *Info) bool {
 

+ 225 - 33
udpfilterdup/src/main.go

@@ -6,7 +6,6 @@ package main
 
 import (
 	"encoding/json"
-	"flag"
 	"fmt"
 	"gopkg.in/mgo.v2/bson"
 	"log"
@@ -33,8 +32,9 @@ var (
 	nextNode     []map[string]interface{} //下节点数组
 	dupdays      = 5                      //初始化判重范围
 	DM           *datamap                 //判重数据
-	lastid       = ""
+	lastid       = "5da3f2c5a5cb26b9b79847fe"
 	//5da3f2c5a5cb26b9b79847fc
+
 	//正则筛选相关
 	FilterRegTitle = regexp.MustCompile("^_$")
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
@@ -46,8 +46,8 @@ var (
 )
 
 func init() {
-	flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
-	flag.Parse()
+	//flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
+	//flag.Parse()
 	//172.17.145.163:27080
 	util.ReadConfig(&Sysconfig)
 	nextNode = util.ObjArrToMapArr(Sysconfig["nextNode"].([]interface{}))
@@ -66,12 +66,14 @@ func init() {
 
 
 	//测试临时注释
-	//dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
-	////加载数据
-	//DM = NewDatamap(dupdays, lastid)
-	//FilterRegTitle = regexp.MustCompile(util.ObjToString(Sysconfig["specialwords"]))
-	//FilterRegTitle_1 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_1"]))
-	//FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
+	dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
+	//加载数据
+	DM = NewDatamap(dupdays, lastid)
+	fmt.Println(DM.keys)
+	fmt.Println(DM.data)
+	FilterRegTitle = regexp.MustCompile(util.ObjToString(Sysconfig["specialwords"]))
+	FilterRegTitle_1 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_1"]))
+	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
 
 
 
@@ -90,7 +92,7 @@ func init() {
 }
 
 //新增一个方法 判断
-func main()  {
+func mainTest()  {
 
 	//log.Println("1")
 	//代码copy数据
@@ -221,7 +223,7 @@ func main()  {
 
 
 
-func mainTest() {
+func main() {
 	go checkMapJob()
 
 	updport := Sysconfig["udpport"].(string)
@@ -243,6 +245,14 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 		if err != nil {
 			udpclient.WriteUdp([]byte("err:"+err.Error()), mu.OP_NOOP, ra)
 		} else if mapInfo != nil {
+
+			//更新流程
+
+
+
+
+
+			//判重流程
 			go task(data, mapInfo)
 			key, _ := mapInfo["key"].(string)
 			if key == "" {
@@ -321,34 +331,68 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				// tmp["_id"] 对比id   id原始id
 				b, source,reason := DM.check(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
-					log.Println("判重具体原因:",reason)
 					repeateN++
 					mapLock.Lock()
-					updateExtract = append(updateExtract, []map[string]interface{}{
-						map[string]interface{}{
-							"_id": tmp["_id"],
-						},
-						map[string]interface{}{
-							"$set": map[string]interface{}{
-								"repeat":   1,
-								"repeatid": source.id,
-								"reason":reason,
-
-								//......
-							},
-						},
-					})
-					//合并操作
-					//评功权重打分
+
+					var mergeArr []int64 	//更改合并数组记录
+					var newData *Info		//更换新的数据池数据
+
+					//数据库更新1
+					var id_map  = map[string]interface{}{
+
+					}
+					//合并操作--评功权重打分-合并完替换原始数据池
 					if basicDataScore(source,info) {
-						//已原始数据为标准
+						//已原始数据为标准-
+						newData,mergeArr= mergeDataFields(source,info)
+						DM.replaceSourceData(newData,source.id)
+						id_map["_id"]= util.StringTOBsonId(source.id)
 					}else {
 						//对比数据为标准
+						newData,mergeArr= mergeDataFields(info,source)
+						DM.replaceSourceData(newData,source.id)
+						id_map["_id"]= util.StringTOBsonId(info.id)
 					}
 
+					//数据库更新2
+					var update_map  = map[string]interface{}{
+						"$set": map[string]interface{}{
+							"repeat":   1,
+							"repeatid": newData.id,
+							"reason":reason,
+							"merge":newData.mergemap,
+						},
+					}
 
+					//更新合并后的数据
+					for _,value :=range mergeArr {
+						if value==1 {
+							update_map["$set"].(map[string]interface{})["area"] = newData.area
+							update_map["$set"].(map[string]interface{})["city"] = newData.city
+						}else if value==2 {
+							update_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
+						}else if value==3 {
+							update_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
+						}else if value==4 {
+							update_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
+						}else if value==5 {
+							update_map["$set"].(map[string]interface{})["budget"] = newData.budget
+						}else if value==6 {
+							update_map["$set"].(map[string]interface{})["winner"] = newData.winner
+						}else if value==7 {
+							update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
+						}else if value==8 {
+							update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
+						}else {
+
+						}
+					}
 
-
+					//构建数据库更新用到的
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						id_map,
+						update_map,
+					})
 					if len(updateExtract) > 500 {
 						mgo.UpdateBulk(extract, updateExtract...)
 						updateExtract = [][]map[string]interface{}{}
@@ -391,6 +435,147 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	}
 }
 
+
+//合并字段
+func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
+
+	var mergeArr []int64
+	mergeArr = make([]int64,0)
+	//1、城市
+	if (source.area==""||source.area=="全国")&&info.area!="全国"&&info.area!=""{
+		var arrA []string
+		if source.mergemap["area"]==nil {
+			arrA = make([]string, 0)
+		}else {
+			arrA = source.mergemap["area"].([]string)
+		}
+		arrA = append(arrA,source.area)
+		source.mergemap["area"] = arrA
+
+
+		var arrC []string
+		if source.mergemap["city"]==nil {
+			arrC = make([]string, 0)
+		}else {
+			arrC = source.mergemap["city"].([]string)
+		}
+		arrC = append(arrC,source.city)
+		source.mergemap["city"] = arrC
+
+		source.area = info.area
+		source.city = info.city
+		mergeArr = append(mergeArr,1)
+	}
+	//2、项目名称
+	if source.projectname==""&&info.projectname!=""{
+		var arr []string
+		if source.mergemap["projectname"]==nil {
+			arr = make([]string, 0)
+		}else {
+			arr = source.mergemap["projectname"].([]string)
+		}
+		arr = append(arr,source.projectname)
+		source.mergemap["projectname"] = arr
+
+		source.projectname = info.projectname
+		mergeArr = append(mergeArr,2)
+	}
+	//3、项目编号
+	if source.projectcode==""&&info.projectcode!=""{
+		var arr []string
+		if source.mergemap["projectcode"]==nil {
+			arr = make([]string, 0)
+		}else {
+			arr = source.mergemap["projectcode"].([]string)
+		}
+		arr = append(arr,source.projectcode)
+		source.mergemap["projectcode"] = arr
+
+		source.projectcode = info.projectcode
+		mergeArr = append(mergeArr,3)
+	}
+	//4、采购单位
+	if source.buyer==""&&info.buyer!=""{
+		var arr []string
+		if source.mergemap["buyer"]==nil {
+			arr = make([]string, 0)
+		}else {
+			arr = source.mergemap["buyer"].([]string)
+		}
+		arr = append(arr,source.buyer)
+		source.mergemap["buyer"] = arr
+
+		source.buyer = info.buyer
+		mergeArr = append(mergeArr,4)
+	}
+	//5、预算
+	if source.budget==0&&info.budget!=0{
+		var arr []float64
+		if source.mergemap["budget"]==nil {
+			arr = make([]float64, 0)
+		}else {
+			arr = source.mergemap["budget"].([]float64)
+		}
+		arr = append(arr,source.budget)
+		source.mergemap["budget"] = arr
+
+		source.budget = info.budget
+		mergeArr = append(mergeArr,5)
+	}
+	//6、中标单位
+	if source.winner==""&&info.winner!=""{
+		var arr []string
+		if source.mergemap["winner"]==nil {
+			arr = make([]string, 0)
+		}else {
+			arr = source.mergemap["winner"].([]string)
+		}
+		arr = append(arr,source.winner)
+		source.mergemap["winner"] = arr
+
+		source.winner = info.winner
+		mergeArr = append(mergeArr,6)
+	}
+	//7、中标金额
+	if source.bidamount==0&&info.bidamount!=0{
+		var arr []float64
+		if source.mergemap["bidamount"]==nil {
+			arr = make([]float64, 0)
+		}else {
+			arr = source.mergemap["bidamount"].([]float64)
+		}
+		arr = append(arr,source.bidamount)
+		source.mergemap["bidamount"] = arr
+
+		source.bidamount = info.bidamount
+		mergeArr = append(mergeArr,7)
+	}
+	//8、开天时间-地点
+	if source.bidopentime==0&&info.bidopentime!=0{
+		var arr []int64
+		if source.mergemap["bidopentime"]==nil {
+			arr = make([]int64, 0)
+		}else {
+			arr = source.mergemap["bidopentime"].([]int64)
+		}
+		arr = append(arr,source.bidopentime)
+		source.mergemap["bidopentime"] = arr
+
+		source.bidopentime = info.bidopentime
+		mergeArr = append(mergeArr,8)
+	}
+
+
+
+
+
+	//以上合并过于简单,待进一步优化
+
+	return source,mergeArr
+}
+
+
+//权重评估
 func basicDataScore(v *Info, info *Info) bool  {
 	m,n:=0,0
 	if v.projectname!="" {m++}
@@ -415,10 +600,17 @@ func basicDataScore(v *Info, info *Info) bool  {
 	if info.agency!="" {n=m+2}
 	if info.city!="" {n=m+2}
 
-	if m>=n {
+	if m>n {
 		return true
+	}else if m==n {
+		if v.comeintime>=info.comeintime {
+			return true
+		}else {
+			return false
+		}
+	}else {
+		return false
 	}
-	return false
 }