浏览代码

判重优化-备份

apple 4 年之前
父节点
当前提交
a9fc2b4e4b
共有 5 个文件被更改,包括 125 次插入和 34 次删除
  1. 83 22
      udpfilterdup/src/dataMethod.go
  2. 1 9
      udpfilterdup/src/datamap.go
  3. 2 1
      udpfilterdup/src/main.go
  4. 2 2
      udpfusion/src/main.go
  5. 37 0
      udpfusion/src/weightValue.go

+ 83 - 22
udpfilterdup/src/dataMethod.go

@@ -2,9 +2,9 @@ package main
 
 import (
 	"math"
+	qutil "qfw/util"
 	"regexp"
 	"strings"
-	qutil "qfw/util"
 )
 
 
@@ -75,31 +75,92 @@ func againRepeat(v *Info, info *Info) bool {
 	return false
 }
 
-////站点再次判断
-//func againSite(v *Info, info *Info) bool {
-//
-//	if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
-//		return true
-//	}
-//	if isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0{
-//		return true
-//	}
-//	if deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "" {
-//		return true
-//	}
-//	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
-//		return true
-//	}
-//	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
-//		return true
-//	}
-//
-//	return false
-//}
+// againContainSpecialWord re-checks two records whose titles both contain a special keyword (per the original note); it returns true as soon as any of the checks below fires, false otherwise.
+func againContainSpecialWord (v *Info, info *Info) bool {
 
+	if isBidopentimeInterval(info.bidopentime,v.bidopentime) { // helper not shown here; presumably compares the two bid-opening times — TODO confirm
+		return true
+	}
+	if v.budget != info.budget && v.budget != 0 && info.budget != 0 { // both budgets are non-zero and unequal
+		return true
+	}
+	if isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0{ // both bid amounts are non-zero and the helper (not shown here) reports true
+		return true
+	}
+	if deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "" { // both winners are non-empty and unequal after deleteExtraSpace
+		return true
+	}
+	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber { // both contract numbers are non-empty and unequal
+		return true
+	}
+	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode { // both project codes are non-empty and unequal
+		return true
+	}
+	// Compare the lot/section markers extracted from the two titles.
+	if dealTitleSpecial(v.title,info.title) {
+		return true
+	}
+
+	return false
+}
+
+// dealTitleSpecial extracts a lot/section marker from each title (rule 1 first, rule 2 only as a fallback), cleans both results, and returns true when they differ — including when only one title yields a marker.
+func dealTitleSpecial(title1 string,title2 string) bool{
+
+	regular1 := "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?" // rule 1: keyword followed by one character, optionally bracketed
+	regular2 := "[0-9a-zA-Z一二三四五六七八九十零123456789](包|标段|标包)" // rule 2: one character followed by the keyword
+	regx1_1,_ := regexp.Compile(regular1) // NOTE(review): compile error is discarded and the pattern is recompiled on every call
+	str1:=regx1_1.FindString(title1)
+	if str1!="" {
+		// (disabled debug log) title 1 matched by rule 1: str1
+	}else {
+		regx1_2,_ := regexp.Compile(regular2) // fallback, reached only when rule 1 found nothing in title1
+		str1=regx1_2.FindString(title1)
+		if str1!="" {
+			// (disabled debug log) title 1 matched by rule 2: str1
+		}
+	}
+
+	regx2_1,_ := regexp.Compile(regular1) // same two-step extraction, repeated for title2
+	str2:=regx2_1.FindString(title2)
+	if str2!="" {
+		// (disabled debug log) title 2 matched by rule 1: str2
+	}else {
+		regx2_2,_ := regexp.Compile(regular2)
+		str2=regx2_2.FindString(title2)
+		if str2!="" {
+			// (disabled debug log) title 2 matched by rule 2: str2
+		}
+	}
 
+	// Clean up whatever was extracted before comparing.
+	if str1!="" {
+		str1 = deleteExtraSpace(str1)
+		str1= strings.Replace(str1, "(", "", -1) // strip the brackets that rule 1 may have captured
+		str1= strings.Replace(str1, "(", "", -1)
+		str1= strings.Replace(str1, ")", "", -1)
+		str1= strings.Replace(str1, ")", "", -1)
+		str1 = convertArabicNumeralsAndLetters(str1) // helper not shown here; presumably maps numerals/letters to one canonical form — TODO confirm
+	}
 
+	if str2!="" {
+		str2 = deleteExtraSpace(str2)
+		str2= strings.Replace(str2, "(", "", -1)
+		str2= strings.Replace(str2, "(", "", -1)
+		str2= strings.Replace(str2, ")", "", -1)
+		str2= strings.Replace(str2, ")", "", -1)
+		str2 = convertArabicNumeralsAndLetters(str2)
+	}
 
+	// (disabled debug log) final values: str1, str2
+	if str1!=str2 {
+		// (disabled debug log) markers differ
+		return true
+	}else {
+		// (disabled debug log) markers match (also the case when neither title yielded a marker)
+		return false
+	}
+}
 
 
 //删除中标单位字符串中多余的空格(含tab)

+ 1 - 9
udpfilterdup/src/datamap.go

@@ -362,15 +362,7 @@ L:
 						}else {
 							if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
 								//无包含关系-即不相等
-								////特殊 标段X 类直接过滤
-								//if info.titleSpecialWord && v.titleSpecialWord {
-								//	continue
-								//}else {
-								//	if againRepeat(v, info) {
-								//		continue
-								//	}
-								//}
-								if againRepeat(v, info) {
+								if againContainSpecialWord(v, info) {
 									continue
 								}
 							}

+ 2 - 1
udpfilterdup/src/main.go

@@ -131,6 +131,7 @@ func init() {
 
 
 func main() {
+
 	go checkMapJob()
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
@@ -249,7 +250,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	wg := &sync.WaitGroup{}
 	n, repeateN := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-		if n%100 == 0 {
+		if n%1000 == 0 {
 			log.Println("current:", n, tmp["_id"],tmp["publishtime"], "repeateN:", repeateN)
 		}
 

+ 2 - 2
udpfusion/src/main.go

@@ -132,8 +132,8 @@ func startTask(data []byte, mapInfo map[string]interface{}) {
 			log.Println("当前数量:", index, tmp["_id"])
 		}
 
-
-
+		//we:=weight.NewWeightData([]string{})
+		//log.Println(we)
 
 
 

+ 37 - 0
udpfusion/src/weightValue.go

@@ -0,0 +1,37 @@
+package main
+
+import (
+	"log"
+	qu "qfw/util"
+	"sync"
+)
+
+type WeightInfo struct { // weighting attributes for one record; the meaning of each field is not established by the code shown — TODO document
+	maxLevel	bool
+	minLevel	bool
+	siteLevel string
+	elementScore    int
+	ranking		int
+}
+
+
+// weightDataMap pairs a mutex with a map from string keys to slices of *WeightInfo. (Original note: "general data de-duplication".)
+type weightDataMap struct {
+	lock   sync.Mutex // lock; presumably meant to protect data — no locking code is visible here
+	data   map[string][]*WeightInfo
+}
+
+func NewWeightData(arr []string) *weightDataMap { // NOTE(review): work-in-progress stub — every path returns nil
+	log.Print(qu.ObjToString("")) // prints the conversion of an empty string; presumably only keeps the log/qu imports in use — TODO confirm
+	// Test data: overwrites whatever the caller passed in arr.
+	arr = []string{"5f210d1752c1d9fbf849a6a2","5f20eb1da120e23754bc8422"}
+
+	log.Println(len(arr))
+	sess := mgo.GetMgoConn() // mgo is not in this file's import block; presumably a package-level value declared elsewhere — TODO confirm
+	defer mgo.DestoryMongoConn(sess) // deferred, so it runs when this function returns
+
+
+
+	return nil
+
+}