소스 검색

判重算法优化~(主要针对~适配竞品通用判重)

zhengkun 2 년 전
부모
커밋
0b09ed6e76
5개의 변경된 파일, 88개의 추가 그리고 176개의 삭제
  1. 42 109
      src/dataMethod.go
  2. 2 7
      src/dataMethodHeavy.go
  3. 24 43
      src/datamap.go
  4. 1 1
      src/increaseRepeat.go
  5. 19 16
      src/main.go

+ 42 - 109
src/dataMethod.go

@@ -7,9 +7,10 @@ import (
 	"strings"
 )
 
-var cleanNameReg_0 = regexp.MustCompile("(项目)(.{0,5})(招标公告)$")
-var cleanNameReg_1 = regexp.MustCompile("([(())])")
+var cleanNameReg_0 = regexp.MustCompile("([(())::\\s ])")
+var cleanNameReg_1 = regexp.MustCompile("(项目)(.{0,5})(招标|中标|中标结果|成交|候选人|竞谈|竞争性磋商)(公告)?$")
 var cleanNameReg_2 = regexp.MustCompile("(公告|公告公告)$")
+var cleanNameReg_3 = regexp.MustCompile("(公开)(比选|招标)")
 
 //完善判重数据检测-前置条件
 func convertArabicNumeralsAndLetters(data string) string {
@@ -30,6 +31,7 @@ func convertArabicNumeralsAndLetters(data string) string {
 	return newData
 }
 
+//特殊词处理
 func dealWithSpecialPhrases(str1 string, str2 string) (string, string) {
 	newStr1 := str1
 	newStr2 := str2
@@ -78,22 +80,19 @@ func againRepeat(v *Info, info *Info, site bool) bool {
 	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
 		return true
 	}
-
-	if judgeNameIsDifferent(v.title, info.title) {
-		if judgeNameIsDifferent(v.projectname, info.projectname) {
+	if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
+		return true
+	}
+	if v.c_title != "" && info.c_title != "" && v.c_title != info.c_title {
+		if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
 			return true
 		}
 	}
-	if judgeNameIsDifferent(v.projectname, info.projectname) {
-		return true
-	}
-
 	return false
 }
 
 //均含有关键词再次判断
 func againContainSpecialWord(v *Info, info *Info) bool {
-
 	if isBidopentimeInterval(info.bidopentime, v.bidopentime) {
 		return true
 	}
@@ -122,58 +121,34 @@ func againContainSpecialWord(v *Info, info *Info) bool {
 
 //提取标题-标段号处理
 func dealTitleSpecial(title1 string, title2 string) bool {
-
 	regular1 := "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789]+[))]?"
 	regular2 := "[0-9a-zA-Z一二三四五六七八九十零123456789]+(包|标段|标包)"
 	regx1_1, _ := regexp.Compile(regular1)
 	str1 := regx1_1.FindString(title1)
-	if str1 != "" {
-		//log.Println("标题1,规则一提取:",str1)
-	} else {
+	if str1 == "" {
 		regx1_2, _ := regexp.Compile(regular2)
 		str1 = regx1_2.FindString(title1)
-		if str1 != "" {
-			//log.Println("标题1,规则二提取:",str1)
-		}
 	}
-
 	regx2_1, _ := regexp.Compile(regular1)
 	str2 := regx2_1.FindString(title2)
-	if str2 != "" {
-		//log.Println("标题2,规则一提取:",str2)
-	} else {
+	if str2 == "" {
 		regx2_2, _ := regexp.Compile(regular2)
 		str2 = regx2_2.FindString(title2)
-		if str2 != "" {
-			//log.Println("标题2,规则二提取:",str2)
-		}
 	}
-
 	//根据提取的结果,在进行清洗
 	if str1 != "" {
 		str1 = deleteExtraSpace(str1)
-		str1 = strings.Replace(str1, "(", "", -1)
-		str1 = strings.Replace(str1, "(", "", -1)
-		str1 = strings.Replace(str1, ")", "", -1)
-		str1 = strings.Replace(str1, ")", "", -1)
+		str1 = cleanNameReg_0.ReplaceAllString(str1, "")
 		str1 = convertArabicNumeralsAndLetters(str1)
 	}
-
 	if str2 != "" {
 		str2 = deleteExtraSpace(str2)
-		str2 = strings.Replace(str2, "(", "", -1)
-		str2 = strings.Replace(str2, "(", "", -1)
-		str2 = strings.Replace(str2, ")", "", -1)
-		str2 = strings.Replace(str2, ")", "", -1)
+		str2 = cleanNameReg_0.ReplaceAllString(str2, "")
 		str2 = convertArabicNumeralsAndLetters(str2)
 	}
-
-	//log.Println("最终:",str1,str2)
 	if str1 != str2 {
-		//log.Println("不一致")
 		return true
 	} else {
-		//log.Println("一致")
 		return false
 	}
 }
@@ -196,7 +171,6 @@ func deleteExtraSpace(s string) string {
 
 //中标金额倍率:10000
 func isBidWinningAmount(f1 float64, f2 float64) bool {
-
 	if f1 == f2 || f1*10000 == f2 || f2*10000 == f1 {
 		return false
 	}
@@ -205,7 +179,6 @@ func isBidWinningAmount(f1 float64, f2 float64) bool {
 
 //时间间隔周期
 func isTimeIntervalPeriod(i1 int64, i2 int64) bool {
-
 	if math.Abs(float64(i1-i2)) < 172800.0 {
 		return true
 	} else {
@@ -255,7 +228,7 @@ func isPublishtimeInterval(i1 int64, i2 int64) bool {
 	}
 }
 
-//开标时间区间为一天
+//时间区间为一天
 func isTheSameDay(i1 int64, i2 int64) bool {
 	if i1 == 0 || i2 == 0 {
 		return false
@@ -266,15 +239,11 @@ func isTheSameDay(i1 int64, i2 int64) bool {
 	if day1 == day2 {
 		return true
 	}
-	//if math.Abs(float64(i1-i2)) <=86400.0 {
-	//	return true
-	//}
 	return false
 }
 
 //前置0 五要素均相等认为重复
 func leadingElementSame(v *Info, info *Info) bool {
-
 	isok := 0
 	if info.projectname != "" && v.projectname == info.projectname {
 		isok++
@@ -313,70 +282,20 @@ func buyerIsContinue(v *Info, info *Info) bool {
 	if !isTheSameDay(info.publishtime, v.publishtime) {
 		return true
 	}
-	if judgeNameIsDifferent(v.title, info.title) {
-		if judgeNameIsDifferent(v.projectname, info.projectname) {
+	if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
+		return true
+	}
+	if v.c_title != "" && info.c_title != "" && v.c_title != info.c_title {
+		if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
 			return true
 		}
 	}
-
-	if judgeNameIsDifferent(v.projectname, info.projectname) {
-		return true
-	}
-
 	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
 		return true
 	}
 	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
 		return true
 	}
-
-	return false
-}
-
-//采用通用清洗~判断是否为有效不同
-func judgeNameIsDifferent(name_1 string, name_2 string) bool {
-	if name_1 == "" || name_2 == "" {
-		return false
-	}
-	if name_1 == name_2 {
-		return false
-	}
-	//通用清洗
-	new_name_1, new_name_2 := cleanNameFilterRedundant(name_1), cleanNameFilterRedundant(name_2)
-	if new_name_1 == new_name_2 {
-		return false
-	}
-	return true
-}
-
-//临时清洗名称过滤冗余
-func cleanNameFilterRedundant(name string) string {
-	new_name := name
-	new_name = cleanNameReg_0.ReplaceAllString(new_name, "${1}${3}")
-	new_name = cleanNameReg_1.ReplaceAllString(new_name, "")
-	new_name = cleanNameReg_2.ReplaceAllString(new_name, "")
-
-	return new_name
-}
-
-//无效数据
-func invalidData(d1 string, d2 string, d3 string, d4 string) bool {
-	var n int
-	if d1 != "" {
-		n++
-	}
-	if d2 != "" {
-		n++
-	}
-	if d3 != "" {
-		n++
-	}
-	if d4 != "" {
-		n++
-	}
-	if n == 0 {
-		return true
-	}
 	return false
 }
 
@@ -446,28 +365,31 @@ func confrimBiddingData(source_id string, info_id string) (bool, map[string]inte
 	return isvalid, info_data, source_data
 }
 
-func IsJingPinData(s_href string, i_href string) bool {
-	if strings.Contains(s_href, "www.jianyu360") ||
-		strings.Contains(i_href, "www.jianyu360") {
+//是否为竞品链接
+func IsJpHref(href string) bool {
+	if strings.Contains(href, "www.jianyu360") && href != "" {
 		return true
 	}
 	return false
 }
 
+//验证竞品是否重复
 func confirmJingPinIsRepeatData(v *Info, info *Info) bool {
-	//先验证标题
-	if (strings.Contains(info.title, v.title) || strings.Contains(v.title, info.title)) &&
-		v.title != "" && info.title != "" {
+	//标题验证~是否有关联~是否需要清洗数据
+	if v.c_title != "" && info.c_title != "" { //标题相似判断
+		if !(strings.Contains(v.c_title, info.c_title) || strings.Contains(info.c_title, v.c_title)) {
+			return false
+		}
 		if !isTheSameDay(v.publishtime, info.publishtime) {
 			return false
 		}
 		if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
 			return false
 		}
-		if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0 && info.bidamount != 0 {
+		if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0.0 && info.bidamount != 0.0 {
 			return false
 		}
-		if deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "" {
+		if v.winner != "" && info.winner != "" && deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) {
 			return false
 		}
 		if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
@@ -479,4 +401,15 @@ func confirmJingPinIsRepeatData(v *Info, info *Info) bool {
 		return true
 	}
 	return false
-}
+}
+
+//通用清洗~清洗名称~过滤冗余~
+func cleanNameFilterRedundant(name string) string {
+	new_name := name
+	new_name = cleanNameReg_0.ReplaceAllString(new_name, "")
+	new_name = cleanNameReg_1.ReplaceAllString(new_name, "${1}${3}")
+	new_name = cleanNameReg_2.ReplaceAllString(new_name, "")
+	new_name = cleanNameReg_3.ReplaceAllString(new_name, "${2}")
+
+	return new_name
+}

+ 2 - 7
src/dataMethodHeavy.go

@@ -184,13 +184,13 @@ func tenderRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 			ss = ss + "p11-标题-"
 			p11 = true
 		} else {
-			if !judgeNameIsDifferent(v.title, info.title) {
+			if v.c_title != "" && info.c_title != "" &&
+				(strings.Contains(v.c_title, info.c_title) || strings.Contains(info.c_title, v.c_title)) {
 				ss = ss + "p11-标题-"
 				p11 = true
 			}
 		}
 	}
-
 	if info.subtype != "" && (p1 && p3 && p11) {
 		reason = reason + "满足招标A,3要素组合-" + ss + ","
 		return true, reason
@@ -423,15 +423,10 @@ func isTheSimilarName(name1 string, name2 string) bool {
 
 //快速低质量数据判重
 func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
-	//if !isTheSameDay(v.publishtime,info.publishtime) {
-	//	return false,reason
-	//}
-
 	//区间间隔24小时
 	if !isTimeIntervalPeriod(v.publishtime, info.publishtime) {
 		return false, reason
 	}
-
 	//首先判定是否为低质量数据    info目标数据
 	if info.title != "" && (info.agency == "" || v.agency == "") &&
 		(info.title == v.title) &&

+ 24 - 43
src/datamap.go

@@ -33,9 +33,11 @@ type Info struct {
 	site             string  //站点
 	href             string  //正文的url
 	repeatid         string  //重复id
+	specialWord      bool    //特殊词
 	titleSpecialWord bool    //标题特殊词
-	specialWord      bool    //再次判断的特殊词
-	is_site          bool    //是否站点城市
+	isJphref         bool    //是否竞品数据
+	c_title          string  //清洗后的标题
+	c_projectname    string  //清洗后的项目名称
 }
 
 var datelimit = float64(432000) //五天
@@ -141,11 +143,6 @@ func NewDatamap(days int, lastid string) *datamap {
 	nowTime := time.Now().Unix() //当前时间的时间戳
 	n, continuSum := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-
-		//source := util.ObjToMap(tmp["jsondata"]) //修复临时添加
-		//if util.IntAll((*source)["sourcewebsite"]) == 1 {
-		//	continue
-		//}
 		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 {
 
 		} else {
@@ -230,7 +227,11 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.repeatid = qutil.ObjToString(tmp["repeatid"])
 	info.specialWord = FilterRegTitle.MatchString(info.title)
 	info.titleSpecialWord = FilterRegTitle_0.MatchString(info.title) || FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
-	info.is_site = false
+	info.isJphref = IsJpHref(qutil.ObjToString(tmp["href"]))
+
+	//经过通用清洗后
+	info.c_title = cleanNameFilterRedundant(info.title)
+	info.c_projectname = cleanNameFilterRedundant(info.projectname)
 
 	return info
 }
@@ -240,7 +241,6 @@ func NewInfo(tmp map[string]interface{}) *Info {
 //判重方法
 func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
 	reason := ""
-	isTestLog := false
 	keys := []string{}
 	d.lock.Lock()
 	for k, _ := range d.keys { //不同时间段
@@ -262,13 +262,17 @@ L:
 		if len(data) > 0 { //对比v   找到同类型,同省或全国的数据作对比
 			for _, v := range data {
 				reason = ""
-				isTestLog = false
 				if v.id == info.id { //正常重复
 					return false, v, ""
 				}
-
+				//buyer 优先级高,有值且不相等过滤
+				if info.buyer != "" && v.buyer != "" && info.buyer != v.buyer {
+					if buyerIsContinue(v, info) {
+						continue
+					}
+				}
 				// 竞品判重模式
-				if IsJingPinData(v.href, info.href) {
+				if v.isJphref || info.isJphref {
 					if confirmJingPinIsRepeatData(v, info) {
 						reason = "竞品模式~重复"
 						b = true
@@ -277,29 +281,18 @@ L:
 						break L
 					}
 				}
-				//buyer 优先级高,有值且不相等过滤
-				if info.buyer != "" && v.buyer != "" && info.buyer != v.buyer {
-					if v.title != info.title && v.title != "" && info.title != "" {
-						isTestLog = true
-					}
-					if buyerIsContinue(v, info) {
-						continue
-					}
-				}
+				//站点补城市
 				if info.site != "" { //站点临时赋值
-					sitelock.Lock()
-					dict := SiteMap[info.site]
-					sitelock.Unlock()
-					if dict != nil {
-						if (info.area == "全国" && dict["area"] != "") ||
-							(info.city == "" && dict["city"] != "") {
-							info.is_site = true
+					if info.area == "全国" || info.city == "" {
+						sitelock.Lock()
+						dict := SiteMap[info.site]
+						sitelock.Unlock()
+						if dict != nil && qutil.ObjToString(dict["city"]) != "" {
 							info.area = qutil.ObjToString(dict["area"])
 							info.city = qutil.ObjToString(dict["city"])
 						}
 					}
 				}
-
 				//前置条件-五要素均相等
 				if leadingElementSame(v, info) {
 					reason = "五要素-相同-满足"
@@ -308,7 +301,6 @@ L:
 					reasons = reason
 					break L
 				}
-
 				//前置条件 - 站点相关
 				if info.site != "" && info.site == v.site {
 					if info.href != "" && info.href == v.href {
@@ -324,8 +316,6 @@ L:
 							continue
 						}
 					}
-					//
-
 					//不同href
 					if info.href != "" && info.href != v.href {
 						if v.title == info.title {
@@ -353,7 +343,7 @@ L:
 						continue
 					}
 				}
-				//前置条件3 - 标题相关,均含有关键词
+				//前置条件 - 标题相关,均含有关键词
 				if specialNum == 2 {
 					if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
 						v.title != "" && info.title != "" {
@@ -366,10 +356,8 @@ L:
 						if strings.Contains(letter1, "重新招标") || strings.Contains(letter2, "重新招标") {
 							letter1, letter2 = dealWithSpecialPhrases(letter1, letter2)
 						}
-
 						letter1 = cleanNameFilterRedundant(letter1)
 						letter2 = cleanNameFilterRedundant(letter2)
-
 						if letter1 == letter2 {
 							reason = reason + "标题关键词相等有效关系"
 							if !againRepeat(v, info, false) { //进行二级金额判断
@@ -380,15 +368,13 @@ L:
 							}
 						} else {
 							if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
-								//无包含关系-即不相等
-								if againContainSpecialWord(v, info) {
+								if againContainSpecialWord(v, info) { //无包含关系-即不相等
 									continue
 								}
 							}
 						}
 					}
 				}
-
 				//新增快速数据过少判重
 				if LowHeavy {
 					repeat := false
@@ -468,13 +454,8 @@ L:
 			areaArr = append(areaArr, info.area)
 			d.areakeys = areaArr
 		}
-
 		d.lock.Unlock()
 	}
-
-	if isTestLog {
-		reasons = reasons + "-新修改"
-	}
 	return
 }
 

+ 1 - 1
src/increaseRepeat.go

@@ -161,7 +161,7 @@ func increaseRepeat(mapInfo map[string]interface{}) {
 	log.Println("当前~判重~结束~", total, "重复~", repeatN)
 	//更新Ocr的标记
 	updateOcrFileData(mapInfo["lteid"].(string))
-	time.Sleep(15 * time.Second)
+	time.Sleep(10 * time.Second)
 	//任务完成,开始发送广播通知下面节点
 	log.Println("判重任务完成发送udp")
 	for _, to := range nextNode {

+ 19 - 16
src/main.go

@@ -40,7 +40,7 @@ var (
 	LowHeavy, TimingTask, IsFull, isUpdateSite bool
 	timingSpanDay, timingPubScope              int64
 	gtid, lastid, sec_gtid, sec_lteid, lteid   string
-	updatelock, datalock, numlock              sync.Mutex
+	updatelock, datalock, numlock, cronlock    sync.Mutex
 	userName, passWord                         string
 	jyfb_data                                  map[string]string
 	taskList                                   []map[string]interface{}
@@ -121,6 +121,7 @@ func initOther() {
 	c.Start()
 }
 func initSite() {
+	cronlock.Lock()
 	site := mconf["site"].(map[string]interface{})
 	SiteMap = make(map[string]map[string]interface{}, 0)
 	start := int(time.Now().Unix())
@@ -137,6 +138,7 @@ func initSite() {
 	}
 	isUpdateSite = false
 	log.Printf("new站点加载用时:%d秒,%d个\n", int(time.Now().Unix())-start, len(SiteMap))
+	cronlock.Unlock()
 }
 
 //初始化加载
@@ -154,6 +156,7 @@ func init() {
 }
 
 func mainT() {
+
 	IsFull = true
 	//AddGroupPool = newAddGroupPool()
 	//go AddGroupPool.addGroupData()
@@ -163,11 +166,9 @@ func mainT() {
 		"gtid":  "12ec61170ae152a3c2310f02",
 		"lteid": "92ec61170ae152a3c2310f02",
 	})
-
 	//gtid = "62ec2dd00ae152a3c230c1a1"
 	//lteid = "62ec2dd00ae152a3c230c1e1"
 	//historyRepeat()
-
 	time.Sleep(99999 * time.Hour)
 }
 
@@ -199,20 +200,22 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 		if err != nil {
 			udpclient.WriteUdp([]byte("err:"+err.Error()), mu.OP_NOOP, ra)
 		} else if mapInfo != nil {
-			key, _ := mapInfo["key"].(string)
-			if key == "" {
-				key = "udpok"
+			sid, eid := qu.ObjToString(mapInfo["gtid"]), qu.ObjToString(mapInfo["lteid"])
+			if sid == "" || eid == "" {
+				log.Println("接收id段异常-err ", "sid=", sid, ",eid=", eid)
+			} else {
+				key := sid + "-" + eid + "-" + qu.ObjToString(mapInfo["stype"])
+				udpclient.WriteUdp([]byte(key), mu.OP_NOOP, ra)
+				//计算是否需要加载站点~每天加载一次
+				if isUpdateSite {
+					initSite()
+				}
+				//插入任务-判断任务-是否存在
+				updatelock.Lock()
+				taskList = append(taskList, mapInfo)
+				log.Println("udp收到任务...数量:", len(taskList), "具体任务:", taskList)
+				updatelock.Unlock()
 			}
-			udpclient.WriteUdp([]byte(key), mu.OP_NOOP, ra)
-			//计算是否需要加载站点~每天加载一次
-			if isUpdateSite {
-				initSite()
-			}
-			//插入任务-判断任务-是否存在
-			updatelock.Lock()
-			taskList = append(taskList, mapInfo)
-			log.Println("udp收到任务...数量:", len(taskList), "具体任务:", taskList)
-			updatelock.Unlock()
 		}
 	case mu.OP_NOOP: //下个节点回应
 		ok := string(data)