Ver Fonte

判重优化

apple há 5 anos
pai
commit
9ce6731fd3
4 ficheiros alterados com 120 adições e 85 exclusões
  1. 11 2
      udpfilterdup/src/config.json
  2. 83 47
      udpfilterdup/src/datamap.go
  3. 24 27
      udpfilterdup/src/main.go
  4. 2 9
      udps/main.go

+ 11 - 2
udpfilterdup/src/config.json

@@ -35,5 +35,14 @@
             "addr": "招标网"
         }
     ],
-    "specialwords": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
-}
+    "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包)",
+    "specialtitle_2": "项目([0-9a-zA-Z一二三四五六七八九十零123456789])",
+
+
+    "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+
+
+
+}
+

+ 83 - 47
udpfilterdup/src/datamap.go

@@ -21,23 +21,22 @@ type Info struct {
 	buyer              string
 	agency             string //代理机构
 	winner             string //中标单位
+	budget             float64 //预算金额
+	bidamount		   float64//中标金额
 	projectname        string
 	projectcode        string
 	publishtime        int64
 	comeintime         int64
-
 	bidopentime        int64 //开标时间
 	agencyaddr		   string//开标地点
 	detail		   	   string//招标内容
 	site			   string//站点
-
-	ContainSpecialWord bool
+	titleSpecialWord bool //标题特殊次
+	specialWord bool	  //再次判断的特殊次
 }
 
 var datelimit = float64(432000)
 
-var mm int
-
 type datamap struct {
 	lock   sync.Mutex //锁
 	days   int        //保留几天数据
@@ -111,13 +110,18 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.subtype = subtype
 	info.buyer = qutil.ObjToString(tmp["buyer"])
 	info.projectname = qutil.ObjToString(tmp["projectname"])
-	//info.ContainSpecialWord = FilterRegexp.MatchString(info.projectname) || FilterRegexp.MatchString(info.title)
 
-	info.ContainSpecialWord = FilterRegTitle.MatchString(info.title)
+	info.specialWord = FilterRegTitle.MatchString(info.title)
+	info.titleSpecialWord = FilterRegTitle_1.MatchString(info.title)||FilterRegTitle_2.MatchString(info.title)
+
 	info.projectcode = qutil.ObjToString(tmp["projectcode"])
 	info.city = qutil.ObjToString(tmp["city"])
 	info.agency = qutil.ObjToString(tmp["agency"])
-	//info.winner = qutil.ObjToString(tmp["winner"])
+	info.winner = qutil.ObjToString(tmp["winner"])
+	info.budget = qutil.Float64All(tmp["budget"])
+	info.bidamount = qutil.Float64All(tmp["bidamount"])
+
+
 	info.publishtime = qutil.Int64All(tmp["publishtime"])
 
 	info.bidopentime  = qutil.Int64All(tmp["bidopentime"])
@@ -162,7 +166,7 @@ L:
 						if info.publishtime<v.publishtime{
 							continue
 						}
-						if info.ContainSpecialWord&&info.title!=v.title&&v.title!="" {
+						if info.titleSpecialWord&&info.title!=v.title&&v.title!="" {
 							continue
 						}
 						if v.projectcode != info.projectcode&&len([]rune(info.projectcode)) >=10&&v.projectcode!=""{
@@ -200,6 +204,9 @@ L:
 						t:= judgeCityType(v.area,info.area,v.city,info.city)
 						if n>=3||first_judge==true {
 							if t==2 {//同城
+								if conditionAgainRepeat(v,info) {
+									continue
+								}
 								b = true
 								id = v.id
 								break L
@@ -245,13 +252,16 @@ L:
 						}
 						if info.site != "" && v.site == info.site&&site_b {
 							if n>1||c_1||c_2 {
+								if conditionAgainRepeat(v,info) {
+									continue
+								}
 								b = true
 								id = v.id
 								log.Println("站点满足过滤")
 								break L
 							}
 						}else {
-							if info.ContainSpecialWord&&info.title!=v.title&&v.title!="" {
+							if info.titleSpecialWord&&info.title!=v.title&&v.title!="" {
 								continue
 							}
 							if v.projectcode != info.projectcode&&len([]rune(info.projectcode)) >=10&&v.projectcode!=""{
@@ -259,6 +269,9 @@ L:
 							}
 							//先决条件满足三要素,条件4
 							if n==3||c_4{
+								if conditionAgainRepeat(v,info) {
+									continue
+								}
 								b = true
 								id = v.id
 								break L
@@ -267,23 +280,39 @@ L:
 							//城市判断
 							if t==0||t==1 { //最少一个全国
 								if c_1 && (c_2||n>1) {
+
+									if conditionAgainRepeat(v,info) {
+										continue
+									}
 									b = true
 									id = v.id
 									break L
 								}
 								if c_2&&x>2{
+
+									if conditionAgainRepeat(v,info) {
+										continue
+									}
 									b = true
 									id = v.id
 									break L
 								}
 							}else if t==2 { //	省-市
 								if c_1||c_2||n>1 {
+									//新增二次判断逻辑
+									if conditionAgainRepeat(v,info) {
+										continue
+									}
 									b = true
 									id = v.id
 									break L
 								}
 							}else if t==3 {//	!省 !市
 								if (c_1&&n>1)||(c_2&&x>2){
+
+									if conditionAgainRepeat(v,info) {
+										continue
+									}
 									b = true
 									id = v.id
 									break L
@@ -291,6 +320,10 @@ L:
 
 							}else if t==4 {//	省 !市
 								if m>1||(c_1&&m>0)||(c_2&&x>1)||(c_3&&n>1){
+
+									if conditionAgainRepeat(v,info) {
+										continue
+									}
 									b = true
 									id = v.id
 									break L
@@ -301,46 +334,10 @@ L:
 						}
 					}
 				}
-
-				////非变更数据判重处理
-				//n := 0
-				//if v.buyer != "" && v.buyer == info.buyer {
-				//	n++
-				//}
-				//if v.projectname != "" && v.projectname == info.projectname {
-				//	n++
-				//}
-				//if !info.ContainSpecialWord && n > 1 {
-				//	b = true
-				//	id = v.id
-				//	break L
-				//} else if v.projectcode != "" && v.projectcode == info.projectcode {
-				//	n++
-				//}
-				//if !info.ContainSpecialWord && n > 1 || n > 2 {
-				//	b = true
-				//	id = v.id
-				//	break L
-				//}
-				////标题长度大于10且相等即为重复
-				////				if len([]rune(info.title)) > 10 && v.title == info.title {
-				////					b = true
-				////					id = v.id
-				////					break L
-				////				}
-				////标题长度大于10且包含关系+buyer/projectname/projectcode/city(全国/A的只判断包含关系即可)相等即为重复
-				//if len([]rune(info.title)) > 10 && len([]rune(v.title)) > 10 && (strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
-				//	if info.area == "全国" || n > 0 || info.city == v.city {
-				//		b = true
-				//		id = v.id
-				//		break L
-				//	}
-				//}
 			}
 		}
 	}
 
-
 	//往预存数据 d 添加
 	if !b {
 		ct, _ := strconv.ParseInt(info.id[:8], 16, 64)
@@ -378,7 +375,7 @@ func judgeCityType(v string, info string,v_c string,info_c string) (t int) {
 		}else {//同省非同城
 			t=4
 		}
-	}else {//有且一个全国
+	}else {//有且一个全国 ,包含多种情况,
 		t=1
 	}
 	return t
@@ -418,7 +415,46 @@ func conditionCodeTime(t1 int64 ,t2 int64,c1 string,c2 string) bool {
 	return false
 }
 
+func conditionAgainRepeat(v *Info ,info *Info) bool {
+	// Second-pass check on a candidate pair: callers "continue" past v when this returns true. The same-province case below is disabled.
+	//if v.area==info.area&&v.area!="全国"&&v.area!=""&&v.city==info.city {
+	//
+	//}

+	// Author's note: the "project code too short + amounts differ" case is shelved for now (no code for it here).




+	// The checks below only run when both records share the same non-empty buyer.
+	if info.buyer != "" &&v.buyer == info.buyer {
+		// Title rule: both titles are at least 10 runes, they differ, and at least one record has specialWord set.
+		if  len([]rune(v.title)) >=10&&len([]rune(info.title)) >=10&&v.title!=info.title&&(info.specialWord||v.specialWord){
+			return true
+		}

+		if info.subtype=="招标"||info.subtype=="邀标"||info.subtype=="询价"||
+			info.subtype=="竞谈"||info.subtype=="单一"||info.subtype=="竞价"||
+			info.subtype=="其他"||info.subtype=="变更" {
+			// For these subtypes: both budgets are non-zero and they differ.
+			if v.budget!=info.budget&&v.budget!=0&&info.budget!=0 {
+				return true
+			}
+		}else if info.subtype=="中标"||info.subtype=="成交"||info.subtype=="废标"||
+			info.subtype=="流标"||info.subtype=="合同"||info.subtype=="验收"||
+			info.subtype=="违规"{
+			// For these subtypes: bid amounts are both non-zero and differ, or winners are both non-empty and differ.
+			if (v.bidamount!=info.bidamount&&v.bidamount!=0&&info.bidamount!=0)||
+				(v.winner!=info.winner&&v.winner!=""&&info.winner!=""){
+				return true
+			}
+		}else {

+		}
+	}

+	return false
+}
 
 func (d *datamap) update(t int64) {
 	//每天0点清除历史数据

+ 24 - 27
udpfilterdup/src/main.go

@@ -6,6 +6,7 @@ package main
 
 import (
 	"encoding/json"
+	"flag"
 	"fmt"
 	"gopkg.in/mgo.v2/bson"
 	"log"
@@ -32,21 +33,24 @@ var (
 	nextNode     []map[string]interface{} //下节点数组
 	dupdays      = 5                      //初始化判重范围
 	DM           *datamap                 //判重数据
-	FilterRegexp = regexp.MustCompile("^_$")
-	lastid       = "5da3f2c5a5cb26b9b79847fc"
+	lastid       = ""
+	//5da3f2c5a5cb26b9b79847fc
+	//正则筛选相关
+	FilterRegTitle = regexp.MustCompile("^_$")
+	FilterRegTitle_1 = regexp.MustCompile("^_$")
+	FilterRegTitle_2 = regexp.MustCompile("^_$")
+
+
+
 
-	//5d767728a5cb26b9b7748868  //9万
-	//5da3f2c5a5cb26b9b79847fc  //76万
-	FilterRegTitle = regexp.MustCompile("[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包)")
-	inV_n int   //无效数据数量
 	siteArr     []map[string]interface{} //站点
-	//56404035af5374672e00059c
-	//5d767728a5cb26b9b7748868
+
+	inV_n int   //无效数据数量
 )
 
 func init() {
-	//flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
-	//flag.Parse()
+	flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
+	flag.Parse()
 	//172.17.145.163:27080
 	util.ReadConfig(&Sysconfig)
 	nextNode = util.ObjArrToMapArr(Sysconfig["nextNode"].([]interface{}))
@@ -60,19 +64,19 @@ func init() {
 	}
 	extract = mconf["extract"].(string)
 	extract_copy = mconf["extract_copy"].(string)
-
 	//bidding = mconf["bidding"].(string)
 	mgo.InitPool()
 
 
 	//测试临时注释
-	//dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
-	////加载数据
-	//DM = NewDatamap(dupdays, lastid)
-	//sw := util.ObjToString(Sysconfig["specialwords"])
-	//if sw != "" {
-	//	FilterRegexp = regexp.MustCompile(sw)
-	//}
+	dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
+	//加载数据
+	DM = NewDatamap(dupdays, lastid)
+	FilterRegTitle = regexp.MustCompile(util.ObjToString(Sysconfig["specialwords"]))
+	FilterRegTitle_1 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_1"]))
+	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
+
+
 
 
 	//数据库
@@ -89,7 +93,7 @@ func init() {
 }
 
 //新增一个方法 判断
-func main()  {
+func mainTest()  {
 	//log.Println("1")
 	//代码copy数据
 	//sessTest :=mgoTest.GetMgoConn()
@@ -137,9 +141,6 @@ func main()  {
 		if i%2000==0 {
 			log.Println("当前i:",i)
 		}
-		//if i>10000 {
-		//	break
-		//}
 		m1[(v1["_id"].(bson.ObjectId).Hex())]= util.IntAll(v1["repeat"])
 	}
 
@@ -152,9 +153,6 @@ func main()  {
 		if j%2000==0 {
 			log.Println("当前j:",j)
 		}
-		//if j>10000 {
-		//	break
-		//}
 		m2[(v2["_id"].(bson.ObjectId).Hex())]= util.IntAll(v2["repeat"])
 	}
 
@@ -183,7 +181,6 @@ func main()  {
 		}
 	}
 	//打印 1:0情况 少打印 300条    38841
-
 	mm:=0
 	for _,v:=range arr1 {
 		mm++
@@ -219,7 +216,7 @@ func main()  {
 
 
 
-func main22() {
+func main() {
 	go checkMapJob()
 
 	updport := Sysconfig["udpport"].(string)

+ 2 - 9
udps/main.go

@@ -24,18 +24,11 @@ func main() {
 	//2018-06-01,2019-02-20
 
 	/*
-	ObjectId("5d767728a5cb26b9b7748868")
-	ObjectId("5d77c881a5cb26b9b7de209d")  //9W条数据
-
-	56404035af5374672e00059c
-	5d4da9c8a5cb26b9b7b6bbcd  100万
-
-
 	5da3f2c5a5cb26b9b79847fc
 	5db2735ba5cb26b9b7c99c6f   76万
 	*/
-	flag.StringVar(&sid, "sid", "5da3f2c5a5cb26b9b79847fc", "开始id")
-	flag.StringVar(&eid, "eid", "5db2735ba5cb26b9b7c99c6f", "结束id")
+	flag.StringVar(&sid, "sid", "", "开始id")
+	flag.StringVar(&eid, "eid", "", "结束id")
 	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
 	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")