Browse Source

判重优化

apple 5 years ago
parent
commit
18f44815e1
3 changed files with 58 additions and 26 deletions
  1. 3 3
      udpfilterdup/src/config.json
  2. 14 10
      udpfilterdup/src/datamap.go
  3. 41 13
      udpfilterdup/src/main.go

+ 3 - 3
udpfilterdup/src/config.json

@@ -5,7 +5,7 @@
         "addr": "192.168.3.207:27092",
         "pool": 5,
         "db": "extract_kf",
-        "extract": "zheng_test1_jd1",
+        "extract": "zk",
         "site": {
             "dbname": "zhaolongyue",
             "coll": "site"
@@ -17,9 +17,9 @@
     },
     "nextNode": [],
     "isMerger": false,
-    "threads": 1,
+    "threads": 5,
     "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
-    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包)",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批)",
     "specialtitle_2": "项目([0-9a-zA-Z一二三四五六七八九十零123456789])",
     "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
 }

+ 14 - 10
udpfilterdup/src/datamap.go

@@ -23,6 +23,7 @@ type Info struct {
 	bidamount   float64 //中标金额
 	projectname string  //项目名称
 	projectcode string  //项目编号
+	contractnumber string //合同编号
 	publishtime int64   //发布时间
 	bidopentime int64   //开标时间
 	agencyaddr  string  //开标地点
@@ -190,6 +191,7 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.subtype = subtype
 	info.buyer = qutil.ObjToString(tmp["buyer"])
 	info.projectname = qutil.ObjToString(tmp["projectname"])
+	info.contractnumber = qutil.ObjToString(tmp["contractnumber"])
 	info.projectcode = qutil.ObjToString(tmp["projectcode"])
 	info.city = qutil.ObjToString(tmp["city"])
 	info.agency = qutil.ObjToString(tmp["agency"])
@@ -276,10 +278,8 @@ L:
 							continue //无包含关系
 						}
 						if strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title) {
-							reason = "标题关键词且包含关系"
-							b = true
-							source = v
-							break L
+							reason = reason+"标题关键词且包含关系"
+							//继续二级金额判断
 						}
 					}
 
@@ -735,8 +735,9 @@ func tenderRepeat_A(v *Info, info *Info, reason string) (bool ,string) {
 		ss = ss + "p2(单位)-"
 		p2 = true
 	}
-	if v.projectcode != "" && v.projectcode == info.projectcode {
-		ss = ss + "p3(编号)-"
+	if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode)>=5)||
+		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber)>=5){
+		ss = ss + "p3(编号组)-"
 		p3 = true
 	}
 	if v.budget != 0 && v.budget == info.budget {
@@ -781,7 +782,8 @@ func tenderRepeat_B(v *Info, info *Info, reason string) (bool,string) {
 	if v.buyer != "" && v.buyer == info.buyer {
 		m++
 	}
-	if v.projectcode != "" && v.projectcode == info.projectcode {
+	if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode)>=5)||
+		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber)>=5){
 		m++
 	}
 	if v.budget != 0 && v.budget == info.budget {
@@ -841,8 +843,9 @@ func winningRepeat_A(v *Info, info *Info, reason string) (bool,string) {
 		ss = ss + "p2(单位)-"
 		p2 = true
 	}
-	if v.projectcode != "" && v.projectcode == info.projectcode {
-		ss = ss + "p3(编号)-"
+	if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode)>=5)||
+		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber)>=5){
+		ss = ss + "p3(编号组)-"
 		p3 = true
 	}
 	if v.bidamount != 0 && v.bidamount == info.bidamount {
@@ -883,7 +886,8 @@ func winningRepeat_B(v *Info, info *Info, reason string) (bool,string) {
 	if v.buyer != "" && v.buyer == info.buyer {
 		m++
 	}
-	if v.projectcode != "" && v.projectcode == info.projectcode {
+	if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode)>=5)||
+		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber)>=5){
 		m++
 	}
 	if v.bidamount != 0 && v.bidamount == info.bidamount {

+ 41 - 13
udpfilterdup/src/main.go

@@ -38,14 +38,14 @@ var (
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
 	FilterRegTitle_2 = regexp.MustCompile("^_$")
 
-
 	isMerger bool                              //是否合并
 	threadNum int								   //线程数量
 	SiteMap  map[string]map[string]interface{} //站点map
 	idtype, sid, eid string //测试人员判重使用
 )
 
-func init() {
+func init2() {
+
 	flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
 	flag.StringVar(&sid, "sid", "", "开始id")
 	flag.StringVar(&eid, "eid", "", "结束id")
@@ -104,8 +104,14 @@ func main() {
 
 //测试组人员使用
 func mainT() {
-	//sid = "5dfbd43ce9d1f601e43fa402"
-	//eid = "5e0954b30cf41612e061d0c8"
+
+	/*
+	761414
+	ObjectId("5da3f2c5a5cb26b9b79847fc")
+	ObjectId("5db2735ba5cb26b9b7c99c6f")
+	*/
+	//sid = "5da3f2c5a5cb26b9b79847f0"
+	//eid = "5db2735ba5cb26b9b7c99c6f"
 	mapinfo := map[string]interface{}{}
 	if sid == "" || eid == "" {
 		log.Println("sid,eid参数不能为空")
@@ -115,7 +121,7 @@ func mainT() {
 	mapinfo["lteid"] = eid
 	mapinfo["stop"] = "true"
 	task([]byte{}, mapinfo)
-	time.Sleep(5 * time.Second)
+	time.Sleep(30 * time.Second)
 }
 func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 	fmt.Println("接受的段数据")
@@ -200,7 +206,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 			}()
 			info := NewInfo(tmp)
 			//是否为无效数据
-			if invalidData(info.buyer, info.projectname, info.projectcode) {
+			if invalidData(info.buyer, info.projectname, info.projectcode,info.contractnumber) {
 				updateExtract = append(updateExtract, []map[string]interface{}{
 					map[string]interface{}{
 						"_id": tmp["_id"],
@@ -286,7 +292,9 @@ func task(data []byte, mapInfo map[string]interface{}) {
 								update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
 							} else if value == 8 {
 								update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-							} else {
+							} else if value == 9 {
+								update_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+							}else {
 							}
 						}
 					}
@@ -427,7 +435,7 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
-			if invalidData(info.buyer, info.projectname, info.projectcode) {
+			if invalidData(info.buyer, info.projectname, info.projectcode,info.contractnumber) {
 				//mapLock.Lock()
 				updateExtract = append(updateExtract, []map[string]interface{}{
 					map[string]interface{}{
@@ -529,7 +537,9 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 									update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
 								} else if value == 8 {
 									update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-								} else {
+								} else if value == 9 {
+									update_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+								}else {
 
 								}
 							}
@@ -707,6 +717,21 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 		mergeArr = append(mergeArr, 8)
 	}
 
+	//9、合同编号
+	if source.contractnumber == "" && info.contractnumber != "" {
+		var arr []string
+		if source.mergemap["contractnumber"] == nil {
+			arr = make([]string, 0)
+		} else {
+			arr = source.mergemap["contractnumber"].([]string)
+		}
+		arr = append(arr, source.contractnumber)
+		source.mergemap["contractnumber"] = arr
+
+		source.contractnumber = info.contractnumber
+		mergeArr = append(mergeArr, 9)
+	}
+
 	//以上合并过于简单,待进一步优化
 	return source, mergeArr
 }
@@ -812,7 +837,7 @@ func basicDataScore(v *Info, info *Info) bool {
 	if v.buyer != "" {
 		m++
 	}
-	if v.projectcode != "" {
+	if v.projectcode != ""||v.contractnumber != "" {
 		m++
 	}
 	if v.budget != 0 {
@@ -843,7 +868,7 @@ func basicDataScore(v *Info, info *Info) bool {
 	if info.buyer != "" {
 		n++
 	}
-	if info.projectcode != "" {
+	if info.projectcode != "" || info.contractnumber != ""{
 		n++
 	}
 	if info.budget != 0 {
@@ -882,7 +907,7 @@ func basicDataScore(v *Info, info *Info) bool {
 }
 
 //无效数据
-func invalidData(d1 string, d2 string, d3 string) bool {
+func invalidData(d1 string, d2 string, d3 string, d4 string) bool {
 	var n int
 	if d1 != "" {
 		n++
@@ -893,7 +918,10 @@ func invalidData(d1 string, d2 string, d3 string) bool {
 	if d3 != "" {
 		n++
 	}
-	if n == 0 {
+	if d4 != "" {
+		n++
+	}
+ 	if n == 0 {
 		return true
 	}
 	return false