Browse Source

Merge branch 'dev3.4.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4.2

# Conflicts:
#	udpdataclear/udpSensitiveWords/grpc_server/main.go
fengweiqiang 4 years ago
parent
commit
be3192ff9e

+ 39 - 41
udpdataclear/udpSensitiveWords/grpc_server/data.go

@@ -11,7 +11,6 @@ import (
 	"runtime"
 	"sensitiveWords.udp/util"
 	"strings"
-	"sync"
 	"time"
 )
 
@@ -193,10 +192,23 @@ func dealWithEsData(name string,tmpid string)  {
 
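 // Handle in-memory segmentation: load qy_name values from unique_qyxy into Filter and record an _id range (segment) each time heap usage crosses the threshold.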
 func dealWithDataMemory()  {
-	iter := MixDataMgo.GetMgoConn().C("unique_qyxy").Find(map[string]interface{}{
+
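+	// Temporary test: replaces MixDataMgo with a hard-coded connection. TODO: load the address and credentials from config instead of committing them.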
+	MixDataMgo = &util.MongodbSim{
+		MongodbAddr: "172.17.4.187:27082,172.17.145.163:27083",
+		Size:        20,
+		DbName:      "mixdata",
+		UserName:    "fengweiqiang",
+		PassWord:    "fwq@123123",
+	}
+	MixDataMgo.InitPool()
+
+	sess := MixDataMgo.GetMgoConn()
+	defer MixDataMgo.DestoryMongoConn(sess)
+	iter := sess.DB(MixDataMgo.DbName).C("unique_qyxy").Find(map[string]interface{}{
 		"_id": map[string]interface{}{
-			"$gte": util.BsonTOStringId("1fffffffffffffffffffffff"),
-			"$lte":  util.BsonTOStringId("9fffffffffffffffffffffff"),
+			"$gte": util.StringTOBsonId("1fffffffffffffffffffffff"),
+			"$lte":  util.StringTOBsonId("9fffffffffffffffffffffff"),
 		},
 	}).Sort("_id").Iter()
 	Filter = sensitive.New()
@@ -209,22 +221,22 @@ func dealWithDataMemory()  {
 		}
 		Filter.AddWord(tmp["qy_name"].(string))
 		initnum++
-		if initnum%100000==0 {
+		if initnum%50000==0 {
 			runtime.ReadMemStats(&m)
 			men :=util.ToMegaBytes(m.HeapAlloc)
 			log.Printf("current index %d\tos %.2f M",initnum, men)
-			if men>5*1024 { //7.5G
+			if men>7.5*1024 { //7.5G
 				saveIdArr = append(saveIdArr, map[string]string{
 					"start":start_id,
 					"end":util.BsonTOStringId(tmp["_id"]),
 				})
-				runtime.GC()
+				log.Println("分段:",start_id,util.BsonTOStringId(tmp["_id"]),men)
 				Filter = sensitive.New()
+				runtime.GC()
 				start_id = ""
-				time.Sleep(time.Second*5)
+				time.Sleep(time.Second*30)
 			}
 		}
-		break
 	}
 
 	saveIdArr = append(saveIdArr, map[string]string{
@@ -252,49 +264,35 @@ func temporaryTest()  {
 	}
 	sess := MixDataMgo.GetMgoConn()
 	defer MixDataMgo.DestoryMongoConn(sess)
-	//多线程升索引
-	pool_es := make(chan bool, 20)
-	wg_es := &sync.WaitGroup{}
 	it := sess.DB(MixDataMgo.DbName).C("winner_err_new").Find(&q).Iter()
 	total,isOK:=0,0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total % 1000 == 0 {
+		if total % 100 == 0 {
 			log.Println("current index",total,isOK)
 		}
 
 		name:=util.ObjToString(tmp["name"])
 		tmpid := util.BsonTOStringId(tmp["_id"])
-		pool_es <- true
-		wg_es.Add(1)
-
-		go func(name string,tmpid string) {
-			defer func() {
-				<-pool_es
-				wg_es.Done()
-			}()
-			new_name,b :=dealWithNameScoreRules(name)
-			//log.Println(b,name,new_name,tmpid)
-			if new_name!="" && b {
-				isOK++
-				MixDataMgo.UpdateById("winner_err_new",tmpid,map[string]interface{}{
-					"$set": map[string]interface{}{
-						"is_word": 1,
-						"name_word" : new_name,
-					},
-				})
-			}else {
-				MixDataMgo.UpdateById("winner_err_new",tmpid,map[string]interface{}{
-					"$set": map[string]interface{}{
-						"is_word": -1,
-						"name_word" : new_name,
-					},
-				})
-			}
-		}(name,tmpid)
+		new_name,b :=dealWithNameScoreRules(name)
+		if new_name!="" && b {
+			isOK++
+			MixDataMgo.UpdateById("winner_err_new",tmpid,map[string]interface{}{
+				"$set": map[string]interface{}{
+					"is_word": 1,
+					"name_word" : new_name,
+				},
+			})
+		}else {
+			MixDataMgo.UpdateById("winner_err_new",tmpid,map[string]interface{}{
+				"$set": map[string]interface{}{
+					"is_word": -1,
+					"name_word" : new_name,
+				},
+			})
+		}
 		tmp = make(map[string]interface{})
 	}
 
-	wg_es.Wait()
 
 	log.Println("is over",total,isOK)
 }

+ 18 - 18
udpdataclear/udpSensitiveWords/grpc_server/main.go

@@ -19,6 +19,24 @@ import (
 	"strings"
 )
 
+/*
+2021/04/28 14:31:35.755969 data.go:248: 第 0 段 6082579e19a23f9d3f39eff2 60825b1d9ce9fc2e6c8a5b92
+2021/04/28 14:31:35.756001 data.go:248: 第 1 段 60825b1d9ce9fc2e6c8a5b93 60825e5d9ce9fc2e6ccea709
+2021/04/28 14:31:35.756008 data.go:248: 第 2 段 60825e5d9ce9fc2e6ccea70a 608261cd9ce9fc2e6c13382e
+2021/04/28 14:31:35.756014 data.go:248: 第 3 段 608261cd9ce9fc2e6c13382f 6082658c9ce9fc2e6c574971
+2021/04/28 14:31:35.756021 data.go:248: 第 4 段 6082658c9ce9fc2e6c574972 608269619ce9fc2e6c9c668c
+2021/04/28 14:31:35.756029 data.go:248: 第 5 段 608269619ce9fc2e6c9c668d 60826d3e9ce9fc2e6ce1c9d8
+2021/04/28 14:31:35.756039 data.go:248: 第 6 段 60826d3e9ce9fc2e6ce1c9d9 608271559ce9fc2e6c26aca5
+2021/04/28 14:31:35.756046 data.go:248: 第 7 段 608271559ce9fc2e6c26aca6 608275929ce9fc2e6c6ca1ec
+2021/04/28 14:31:35.756053 data.go:248: 第 8 段 608275929ce9fc2e6c6ca1ed 608279e19ce9fc2e6cb2ddb8
+2021/04/28 14:31:35.756060 data.go:248: 第 9 段 608279e19ce9fc2e6cb2ddb9 60827e359ce9fc2e6cf96417
+2021/04/28 14:31:35.756069 data.go:248: 第 10 段 60827e359ce9fc2e6cf96419 608282299ce9fc2e6c4034ee
+2021/04/28 14:31:35.756077 data.go:248: 第 11 段 608282299ce9fc2e6c4034ef 608285b09ce9fc2e6c868546
+2021/04/28 14:31:35.756087 data.go:248: 第 12 段 608285b09ce9fc2e6c868547 608289199ce9fc2e6ccbc72e
+2021/04/28 14:31:35.756095 data.go:248: 第 13 段 608289199ce9fc2e6ccbc72f 608293f49ce9fc2e6cfdbf7b
+2021/04/28 14:31:35.756103 data.go:248: 第 14 段 608293f49ce9fc2e6cfdbfa7 
+*/
+
 const (
 	YAMLFILE = "./server.yaml"
 )
@@ -73,24 +91,6 @@ func init() {
 
 func main() {
 
-/*	//临时测试
-	MixDataMgo = &util.MongodbSim{
-		MongodbAddr: "172.17.4.187:27082,172.17.145.163:27083",
-		Size:        20,
-		DbName:      "mixdata",
-		UserName:    "fengweiqiang",
-		PassWord:    "fwq@123123",
-	}
-	MixDataMgo.InitPool()
-
-	Client_Es ,_= elastic.NewClient(http.DefaultClient, "http://172.17.145.170:9800")
-
-	es_type, es_index = "unique_qy","unique_qy"
-	temporaryTest()
-
-	//单独跑-分段
-	return*/
-
 	if YamlConfig.IsAddTask == 0 {
 		initSensitiveWordsData() //初始化敏感词数据
 	} else {

+ 32 - 20
udpdataclear/udpSensitiveWords/grpc_server/words.go

@@ -3,26 +3,24 @@ package main
 import (
 	"encoding/json"
 	"log"
-	"os"
 	"sensitiveWords.udp/util"
 	"strings"
+	"unicode"
 	"unicode/utf8"
 )
 
 func dealWithNameScoreRules(name string) (string,bool) {
 	new_name,new_score,isok :="",float64(0),false
-	old_name := escape(name)
+	old_name := escapeNew(name)
 	if old_name=="" {
 		return "",false
 	}
 	query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"`+old_name+`"}}],"must_not":[],"should":[]}},"from":"0","size":"1"}`
-	log.Println("222",query)
-	return "",false
 	tmp := make(map[string]interface{})
 	json.Unmarshal([]byte(query),&tmp)
 	searchResult, err := Client_Es.Search().Index(es_index).Type(es_type).Source(tmp).Do()
 	if err != nil {
-		//log.Println("ES查询出错",name,old_name)
+		log.Println("ES查询出错",name,old_name)
 		return "",false
 	}
 	resNum := len(searchResult.Hits.Hits)
@@ -42,6 +40,8 @@ func dealWithNameScoreRules(name string) (string,bool) {
 		}
 	}
 	if len(res)>0 && res != nil {
+		// TODO: analyze the scores and take the maximum (currently the first entry of res is used).
+
 		new_name = util.ObjToString(res[0]["name"])
 		new_score = util.Float64All(res[0]["score"])
 	}
@@ -97,26 +97,38 @@ func calculateWordCount(name string) ([]string,int) {
 	return arr,len(arr)
 }
 
-func escape(s string) string {
+//func escape(s string) string {
+//	news := ""
+//	s = strings.ReplaceAll(s," ","")
+//	for _, c := range s {
+//		//if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
+//		//	news = news + string(c)
+//		//}else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
+//		//	a := string([]rune{os.PathSeparator, '\\'})
+//		//	news = news + a + string(c)
+//		//} else {
+//		//	return ""
+//		//}
+//		if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
+//			a := string([]rune{os.PathSeparator,'\\'})
+//			//news = news + a + `\` + string(c)
+//			news = news + a  + string(c)
+//		} else {
+//			news = news + string(c)
+//		}
+//
+//	}
+//	return news
+//}
+
+
+func escapeNew(s string) string {
 	news := ""
 	s = strings.ReplaceAll(s," ","")
 	for _, c := range s {
-		//if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
-		//	news = news + string(c)
-		//}else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
-		//	a := string([]rune{os.PathSeparator, '\\'})
-		//	news = news + a + string(c)
-		//} else {
-		//	return ""
-		//}
-		if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
-			a := string([]rune{os.PathSeparator, '\\'})
-			//news = news + a + `\` + string(c)
-			news = news + a  + string(c)
-		} else {
+		if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
 			news = news + string(c)
 		}
-
 	}
 	return news
 }