Bladeren bron

备份~抽取版本

zhengkun 2 jaren geleden
bovenliggende
commit
38394c618c
8 gewijzigde bestanden met toevoegingen van 148 en 92 verwijderingen
  1. 15 1
      src/README.md
  2. 40 16
      src/dataMethod.go
  3. 55 59
      src/dataMethodHeavy.go
  4. 8 4
      src/datamap.go
  5. 7 2
      src/historyRepeat.go
  6. 11 4
      src/increaseRepeat.go
  7. 11 6
      src/main.go
  8. 1 0
      src/mgo.go

+ 15 - 1
src/README.md

@@ -789,4 +789,18 @@ func historyTaskDay() {
  		}
  		log.Println("继续下一段的历史判重")
  	}
- }	       					
+ }	       		
+
+
+
+
+func getDB() *mgo.Database { // getDB dials MongoDB at 127.0.0.1:27017 and returns a handle to the "zhengkun" database.
+session, err := mgo.Dial("127.0.0.1:27017")
+if err != nil {
+panic(err) // a failed dial panics; no error is returned to the caller
+}
+session.SetMode(mgo.Monotonic, true)
+db := session.DB("zhengkun")
+return db // NOTE(review): the session is neither closed nor returned by this function — confirm the intended lifetime
+}
+

+ 40 - 16
src/dataMethod.go

@@ -7,6 +7,10 @@ import (
 	"strings"
 )
 
+var cleanNameReg_0 = regexp.MustCompile("(项目)(.{0,5})(招标公告)$") // "项目", then up to 5 characters (any except newline), then "招标公告" at the end of the text
+var cleanNameReg_1 = regexp.MustCompile("([(())])") // a single "(" or ")" character; NOTE(review): both appear twice in the class — possibly full-width parentheses were intended, confirm against the repository
+var cleanNameReg_2 = regexp.MustCompile("(公告|公告公告)$") // "公告" or "公告公告" at the end of the text
+
 //完善判重数据检测-前置条件
 func convertArabicNumeralsAndLetters(data string) string {
 	newData := data
@@ -74,12 +78,13 @@ func againRepeat(v *Info, info *Info, site bool) bool {
 	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
 		return true
 	}
-	if v.title != info.title && v.title != "" && info.title != "" {
-		if v.projectname != info.projectname && v.projectname != "" && info.projectname != "" {
+
+	if judgeNameIsDifferent(v.title, info.title) {
+		if judgeNameIsDifferent(v.projectname, info.projectname) {
 			return true
 		}
 	}
-	if v.projectname != info.projectname && v.projectname != "" && info.projectname != "" {
+	if judgeNameIsDifferent(v.projectname, info.projectname) {
 		return true
 	}
 
@@ -308,23 +313,16 @@ func buyerIsContinue(v *Info, info *Info) bool {
 	if !isTheSameDay(info.publishtime, v.publishtime) {
 		return true
 	}
-	if v.title != info.title && v.title != "" && info.title != "" {
-		if v.projectname != info.projectname && v.projectname != "" && info.projectname != "" {
+	if judgeNameIsDifferent(v.title, info.title) {
+		if judgeNameIsDifferent(v.projectname, info.projectname) {
 			return true
 		}
 	}
-	if v.projectname != info.projectname && v.projectname != "" && info.projectname != "" {
+
+	if judgeNameIsDifferent(v.projectname, info.projectname) {
 		return true
 	}
-	//if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
-	//	return true
-	//}
-	//if isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0{
-	//	return true
-	//}
-	//if deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "" {
-	//	return true
-	//}
+
 	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
 		return true
 	}
@@ -335,6 +333,32 @@ func buyerIsContinue(v *Info, info *Info) bool {
 	return false
 }
 
+// judgeNameIsDifferent reports whether two names are effectively different: it returns false if either name is empty, if they are equal as given, or if they are equal after cleanNameFilterRedundant; otherwise it returns true.
+func judgeNameIsDifferent(name_1 string, name_2 string) bool {
+	if name_1 == "" || name_2 == "" {
+		return false // an empty name is never counted as a difference
+	}
+	if name_1 == name_2 {
+		return false
+	}
+	// Apply the shared cleanup to both names, then compare again.
+	new_name_1, new_name_2 := cleanNameFilterRedundant(name_1), cleanNameFilterRedundant(name_2)
+	if new_name_1 == new_name_2 {
+		return false
+	}
+	return true
+}
+
+// cleanNameFilterRedundant returns name after three regex replacement passes (original note: temporary cleanup that filters redundant parts of a name).
+func cleanNameFilterRedundant(name string) string {
+	new_name := name
+	new_name = cleanNameReg_0.ReplaceAllString(new_name, "${1}${3}") // keep capture groups 1 and 3, dropping whatever group 2 matched
+	new_name = cleanNameReg_1.ReplaceAllString(new_name, "") // delete every match
+	new_name = cleanNameReg_2.ReplaceAllString(new_name, "") // delete every match
+
+	return new_name
+}
+
 //无效数据
 func invalidData(d1 string, d2 string, d3 string, d4 string) bool {
 	var n int
@@ -358,7 +382,7 @@ func invalidData(d1 string, d2 string, d3 string, d4 string) bool {
 
 //判断~是否需要替换数据相关
 func judgeIsReplaceInfo(s_href string, i_href string) bool {
-	if strings.Contains(s_href, "https://www.jianyu360.cn") &&
+	if strings.Contains(s_href, "https://www.jianyu360.cn") && i_href != "" &&
 		!strings.Contains(i_href, "https://www.jianyu360.cn") {
 		return true
 	}

+ 55 - 59
src/dataMethodHeavy.go

@@ -68,18 +68,18 @@ func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 //判重方法2
 func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
 	isMeet := false
-	isAgency :=false
+	isAgency := false
 	//招标类-代理机构不同-广泛前后缀比较
 	if v.agency != info.agency && v.agency != "" && info.agency != "" {
 		//新增一层判断
 		if strings.Contains(v.agency, info.agency) || strings.Contains(info.agency, v.agency) {
 			isAgency = true
-		}else {
+		} else {
 			return false, reason
 		}
 	}
 
-	if (v.agency == info.agency && v.agency != "" && info.agency != "")|| isAgency {
+	if (v.agency == info.agency && v.agency != "" && info.agency != "") || isAgency {
 		if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
 			info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
 			info.subtype == "变更" || info.subtype == "其他" {
@@ -179,18 +179,24 @@ func tenderRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 		ss = ss + "p10-开标地点-"
 		p10 = true
 	}
-	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
-		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
-		ss = ss + "p11-标题-"
-		p11 = true
+	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 {
+		if strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title) {
+			ss = ss + "p11-标题-"
+			p11 = true
+		} else {
+			if !judgeNameIsDifferent(v.title, info.title) {
+				ss = ss + "p11-标题-"
+				p11 = true
+			}
+		}
 	}
 
-	if info.subtype !=""&&(p1 && p3 && p11)  {
+	if info.subtype != "" && (p1 && p3 && p11) {
 		reason = reason + "满足招标A,3要素组合-" + ss + ","
 		return true, reason
 	}
 
-	if  (p1 && p2 && p3) || (p1 && p2 && p4) || (p1 && p2 && p9) ||
+	if (p1 && p2 && p3) || (p1 && p2 && p4) || (p1 && p2 && p9) ||
 		(p1 && p2 && p10) || (p1 && p2 && p11) || (p1 && p3 && p9) || (p1 && p3 && p10) || (p1 && p3 && p4) ||
 		(p1 && p4 && p9) || (p1 && p4 && p10) || (p2 && p3 && p4) ||
 		(p2 && p3 && p9) || (p2 && p3 && p10) || (p2 && p3 && p11) ||
@@ -249,7 +255,7 @@ func tenderRepeat_C(v *Info, info *Info) bool {
 	if v.budget != 0 && info.budget != 0 && v.budget != info.budget {
 		return true
 	}
-	if v.bidopentime != 0 && info.bidopentime != 0 && isBidopentimeInterval(info.bidopentime,v.bidopentime) {
+	if v.bidopentime != 0 && info.bidopentime != 0 && isBidopentimeInterval(info.bidopentime, v.bidopentime) {
 		return true
 	}
 	return false
@@ -273,7 +279,7 @@ func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 		ss = ss + "p3-编号组--"
 		p3 = true
 	}
-	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
+	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount, info.bidamount) {
 		ss = ss + "p5-中标金-"
 		p5 = true
 	}
@@ -282,15 +288,14 @@ func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 		p6 = true
 	}
 
-
 	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
 		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
 		ss = ss + "p11-标题-"
 		p11 = true
 	}
 
-	if 	(p1 && p2 && p3) || (p1 && p2 && p5) || (p1 && p2 && p6) ||
-		(p1 && p2 && p11)|| (p1 && p3 && p11)||
+	if (p1 && p2 && p3) || (p1 && p2 && p5) || (p1 && p2 && p6) ||
+		(p1 && p2 && p11) || (p1 && p3 && p11) ||
 		(p1 && p3 && p5) || (p1 && p3 && p6) || (p1 && p5 && p6) ||
 		(p2 && p3 && p5) || (p2 && p3 && p6) || (p2 && p3 && p11) ||
 		(p2 && p5 && p6) || (p2 && p5 && p11) || (p2 && p6 && p11) ||
@@ -318,7 +323,7 @@ func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
 		m++
 	}
-	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
+	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount, info.bidamount) {
 		m++
 	}
 	if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
@@ -343,11 +348,11 @@ func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 //中标_C
 func winningRepeat_C(v *Info, info *Info) bool {
 
-	if v.bidamount != 0 && info.bidamount != 0 && isBidWinningAmount(v.bidamount,info.bidamount) {
+	if v.bidamount != 0 && info.bidamount != 0 && isBidWinningAmount(v.bidamount, info.bidamount) {
 		//避免抽错金额-
-		if ((v.projectcode!=""&&info.projectcode!=""&&v.projectcode==info.projectcode)||
-			(v.contractnumber!=""&&info.contractnumber!=""&&v.contractnumber==info.contractnumber)) &&
-			(v.winner!=""&&info.winner!=""&&v.winner==info.winner) {
+		if ((v.projectcode != "" && info.projectcode != "" && v.projectcode == info.projectcode) ||
+			(v.contractnumber != "" && info.contractnumber != "" && v.contractnumber == info.contractnumber)) &&
+			(v.winner != "" && info.winner != "" && v.winner == info.winner) {
 			return false
 		}
 		return true
@@ -408,20 +413,14 @@ func contractRepeat_C(v *Info, info *Info) bool {
 	return false
 }
 
-
-
-
-
-
 // isTheSimilarName reports whether either name contains the other as a substring; because strings.Contains(s, "") is true, it also returns true when either argument is empty.
-func isTheSimilarName(name1 string,name2 string) bool {
-	if strings.Contains(name1,name2) || strings.Contains(name2,name1) {
+func isTheSimilarName(name1 string, name2 string) bool {
+	if strings.Contains(name1, name2) || strings.Contains(name2, name1) {
 		return true
 	}
 	return false
 }
 
-
 //快速低质量数据判重
 func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
 	//if !isTheSameDay(v.publishtime,info.publishtime) {
@@ -429,73 +428,70 @@ func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
 	//}
 
 	//区间间隔24小时
-	if !isTimeIntervalPeriod(v.publishtime,info.publishtime) {
-		return false,reason
+	if !isTimeIntervalPeriod(v.publishtime, info.publishtime) {
+		return false, reason
 	}
 
-
 	//首先判定是否为低质量数据    info目标数据
-	if info.title!=""&&(info.agency==""||v.agency=="")&&
-		(info.title==v.title)&&
-		(info.projectcode==""||info.projectcode==v.projectcode)&&
-		info.contractnumber==""&&info.buyer=="" {
-		isValue:=0//五要素判断
-		if info.projectname != "" {//项目名称
+	if info.title != "" && (info.agency == "" || v.agency == "") &&
+		(info.title == v.title) &&
+		(info.projectcode == "" || info.projectcode == v.projectcode) &&
+		info.contractnumber == "" && info.buyer == "" {
+		isValue := 0                //五要素判断
+		if info.projectname != "" { //项目名称
 			isValue++
 		}
-		if info.budget != 0 {//预算
+		if info.budget != 0 { //预算
 			isValue++
 		}
-		if info.winner != ""{//中标单位
+		if info.winner != "" { //中标单位
 			isValue++
 		}
-		if info.bidamount != 0 {//中标金额
+		if info.bidamount != 0 { //中标金额
 			isValue++
 		}
-		if isValue==0 {
+		if isValue == 0 {
 			reason = reason + "---低质量-要素均为空-标题满足"
 			return true, reason
-		}else if isValue==1 {
+		} else if isValue == 1 {
 			isMeet := false
 			if isMeet, reason = judgeLowQualityData(v, info, reason); isMeet {
 				reason = reason + "---低质量-有且一个要素组合"
 				return true, reason
 			}
-		}else if isValue==2{
+		} else if isValue == 2 {
 			if info.subtype == "采购意向" { //特殊
-				if info.projectname!="" && info.projectname == v.projectname &&
+				if info.projectname != "" && info.projectname == v.projectname &&
 					info.budget != 0 && info.budget == v.budget &&
-					info.city != "" && info.city == v.city{
+					info.city != "" && info.city == v.city {
 					reason = reason + "---采购意向~同城~预算~名称均一致"
-					return true,reason
+					return true, reason
 				}
 			}
-		}else {
+		} else {
 
 		}
 	}
-	return false,reason
+	return false, reason
 }
 
-
-
 // judgeLowQualityData compares info with v on project name, budget, winner and bid amount, in that order; on the first match it appends a "---<field>" tag to reason and returns (true, reason), otherwise (false, reason). Original note: records the detailed reason per category.
 func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
-	if info.projectname!="" && isTheSimilarName(info.projectname,v.projectname) {
+	if info.projectname != "" && isTheSimilarName(info.projectname, v.projectname) { // NOTE(review): only info.projectname is checked for emptiness; isTheSimilarName uses strings.Contains, so an empty v.projectname also matches — confirm intended
 		reason = reason + "---项目名称"
-		return true,reason
+		return true, reason
 	}
-	if info.budget != 0 && info.budget == v.budget{// budget
+	if info.budget != 0 && info.budget == v.budget { // budget
 		reason = reason + "---预算"
-		return true,reason
+		return true, reason
 	}
-	if v.winner != "" && info.winner == v.winner{// winning bidder
+	if v.winner != "" && info.winner == v.winner { // winning bidder
 		reason = reason + "---中标单位"
-		return true,reason
+		return true, reason
 	}
-	if v.bidamount != 0 && info.bidamount == v.bidamount{// winning bid amount
+	if v.bidamount != 0 && info.bidamount == v.bidamount { // winning bid amount
 		reason = reason + "---中标金额"
-		return true,reason
+		return true, reason
 	}
-	return false,reason
-}
+	return false, reason
+}

+ 8 - 4
src/datamap.go

@@ -69,9 +69,9 @@ func TimedTaskDatamap(days int, lasttime int64, numIndex int) *datamap {
 	it := sess.DB(data_mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
 	n, continuSum := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-		if n%10000 == 0 {
-			log.Println("当前 n:", n, "数量:", continuSum, tmp["_id"], tmp["publishtime"])
-		}
+		//if n%10000 == 0 {
+		//	log.Println("当前 n:", n, "数量:", continuSum, tmp["_id"], tmp["publishtime"])
+		//}
 		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 ||
 			qutil.IntAll(tmp["dataging"]) == 1 {
 
@@ -355,8 +355,12 @@ L:
 						if strings.Contains(letter1, "重新招标") || strings.Contains(letter2, "重新招标") {
 							letter1, letter2 = dealWithSpecialPhrases(letter1, letter2)
 						}
+
+						letter1 = cleanNameFilterRedundant(letter1)
+						letter2 = cleanNameFilterRedundant(letter2)
+
 						if letter1 == letter2 {
-							reason = reason + "标题关键词相等关系"
+							reason = reason + "标题关键词相等有效关系"
 							if !againRepeat(v, info, false) { //进行二级金额判断
 								b = true
 								source = v

+ 7 - 2
src/historyRepeat.go

@@ -161,7 +161,7 @@ func historyRepeat() {
 					b, source, reason := curTM.check(info)
 					if b { //有重复,更新
 						repeateN++
-						if judgeIsReplaceInfo(source.href, info.href) {
+						if judgeIsReplaceInfo(source.href, info.href) && !IsFull {
 							datalock.Lock()
 							temp_source_id := source.id
 							temp_info_id := info.id
@@ -191,6 +191,10 @@ func historyRepeat() {
 								} else {
 									data_mgo.DeleteById(extract_back, temp_source_id)
 									data_mgo.Save(extract_back, ext_s_data)
+									is_del := data_mgo.DeleteById(extract, temp_source_id)
+									if is_del > 0 {
+										data_mgo.Save(extract, ext_s_data)
+									}
 								}
 								data_mgo.DeleteById(extract, temp_info_id)
 								data_mgo.Save(extract, ext_i_data)
@@ -202,7 +206,8 @@ func historyRepeat() {
 
 								//通道填充数据
 								msg := "id=" + temp_source_id
-								_ = MP.Publish(msg)
+								_ = nspdata_1.Publish(msg)
+								_ = nspdata_2.Publish(msg)
 
 							} else {
 								log.Println("替换~相关表~未查询到数据~", temp_source_id, "~", temp_info_id)

+ 11 - 4
src/increaseRepeat.go

@@ -88,7 +88,7 @@ func increaseRepeat(mapInfo map[string]interface{}) {
 					} else {
 						num++
 						//判断是否为~替换数据~模式
-						if judgeIsReplaceInfo(source.href, info.href) {
+						if judgeIsReplaceInfo(source.href, info.href) && !IsFull {
 							datalock.Lock()
 							temp_source_id := source.id
 							temp_info_id := info.id
@@ -108,8 +108,15 @@ func increaseRepeat(mapInfo map[string]interface{}) {
 								ext_i_data["repeat"] = 1
 								ext_i_data["repeat_id"] = temp_source_id
 								ext_i_data["repeat_reason"] = reason
+
 								data_mgo.DeleteById(extract, temp_source_id)
 								data_mgo.Save(extract, ext_s_data)
+
+								is_del := data_mgo.DeleteById(extract_back, temp_source_id)
+								if is_del > 0 {
+									data_mgo.Save(extract_back, ext_s_data)
+								}
+
 								data_mgo.DeleteById(extract, temp_info_id)
 								data_mgo.Save(extract, ext_i_data)
 
@@ -120,8 +127,8 @@ func increaseRepeat(mapInfo map[string]interface{}) {
 
 								//通道填充数据
 								msg := "id=" + temp_source_id
-								_ = MP.Publish(msg)
-
+								_ = nspdata_1.Publish(msg)
+								_ = nspdata_2.Publish(msg)
 							} else {
 								log.Println("替换~相关表~未查询到数据~", temp_source_id, "~", temp_info_id)
 							}
@@ -150,7 +157,7 @@ func increaseRepeat(mapInfo map[string]interface{}) {
 		}(dataArr)
 	}
 	wg.Wait()
-	log.Println("")
+	fmt.Println("")
 	log.Println("当前~判重~结束~", total, "重复~", repeatN)
 	//更新Ocr的标记
 	updateOcrFileData(mapInfo["lteid"].(string))

+ 11 - 6
src/main.go

@@ -44,7 +44,7 @@ var (
 	userName, passWord                         string
 	jyfb_data                                  map[string]string
 	taskList                                   []map[string]interface{}
-	MP                                         *nsqdata.Producer
+	nspdata_1, nspdata_2                       *nsqdata.Producer
 )
 
 func initMgo() {
@@ -100,7 +100,11 @@ func initOther() {
 	go Update.updateData()
 
 	var err error
-	MP, err = nsqdata.NewProducer("192.168.3.166:4150", "testnsq", true)
+	nspdata_1, err = nsqdata.NewProducer("172.17.4.232:4150", "bidding_id", true)
+	if err != nil {
+		log.Fatal("通道配置异常~", err)
+	}
+	nspdata_2, err = nsqdata.NewProducer("172.17.4.232:4150", "project_id", true)
 	if err != nil {
 		log.Fatal("通道配置异常~", err)
 	}
@@ -142,6 +146,7 @@ func init() {
 	initMgo()
 	initOther()
 	initSite()
+
 }
 
 func mainT() {
@@ -150,10 +155,10 @@ func mainT() {
 	//go AddGroupPool.addGroupData()
 	//fullDataRepeat() //全量判重
 
-	//increaseRepeat(map[string]interface{}{
-	//	"gtid":  "12ec61170ae152a3c2310f02",
-	//	"lteid": "92ec61170ae152a3c2310f02",
-	//})
+	increaseRepeat(map[string]interface{}{
+		"gtid":  "12ec61170ae152a3c2310f02",
+		"lteid": "92ec61170ae152a3c2310f02",
+	})
 
 	//gtid = "62ec2dd00ae152a3c230c1a1"
 	//lteid = "62ec2dd00ae152a3c230c1e1"

+ 1 - 0
src/mgo.go

@@ -270,6 +270,7 @@ func (m *MongodbSim) UpdateById(c, id string, doc map[string]interface{}) bool {
 	coll := m.C.Database(m.DbName).Collection(c)
 	_, err := coll.UpdateOne(m.Ctx, map[string]interface{}{"_id": StringTOBsonId(id)}, doc)
 	if err != nil {
+		log.Println(err)
 		return false
 	}
 	return true