浏览代码

备份-敏感词-分词-等

apple 4 年之前
父节点
当前提交
04ac3af888

+ 7 - 1
udpdataclear/udpSensitiveWords/config.json

@@ -22,5 +22,11 @@
     }
   ],
   "userName": "",
-  "passWord": ""
+  "passWord": "",
+  "winner_es_type": "azktest",
+  "winner_es_index": "azktest",
+  "buyer_es_type": "azktest",
+  "buyer_es_index": "azktest",
+  "agency_es_type": "azktest",
+  "agency_es_index": "azktest"
 }

+ 3 - 0
udpdataclear/udpSensitiveWords/go.mod

@@ -4,9 +4,12 @@ go 1.13
 
 require (
 	github.com/importcjj/sensitive v0.0.0-20200106142752-42d1c505be7b
+	github.com/mailru/easyjson v0.7.7 // indirect
+	github.com/olivere/elastic v6.2.35+incompatible
 	go.mongodb.org/mongo-driver v1.5.1
 	google.golang.org/grpc v1.36.1
 	google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0 // indirect
 	google.golang.org/protobuf v1.26.0
+	gopkg.in/olivere/elastic.v1 v1.0.1
 	gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c
 )

+ 8 - 0
udpdataclear/udpSensitiveWords/go.sum

@@ -68,6 +68,8 @@ github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9Y
 github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
 github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
 github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg=
+github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
 github.com/karrick/godirwalk v1.8.0/go.mod h1:H5KPZjojv4lE+QYImBI8xVtrBRgYrIVsaRPx4tDPEn4=
 github.com/karrick/godirwalk v1.10.3/go.mod h1:RoGL9dQei4vP9ilrpETWE8CLOZ1kiN0LhBygSwrAsHA=
 github.com/klauspost/compress v1.9.5 h1:U+CaK85mrNNb4k8BNOfgJtJ/gr6kswUCFj6miSzVC6M=
@@ -77,9 +79,13 @@ github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxv
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
+github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
 github.com/markbates/oncer v0.0.0-20181203154359-bf2de49a0be2/go.mod h1:Ld9puTsIW75CHf65OeIOkyKbteujpZVXDpWK6YGZbxE=
 github.com/markbates/safe v1.0.1/go.mod h1:nAqgmRi7cY2nqMc92/bSEeQA+R4OheNU2T1kNSCBdG0=
 github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
+github.com/olivere/elastic v6.2.35+incompatible h1:MMklYDy2ySi01s123CB2WLBuDMzFX4qhFcA5tKWJPgM=
+github.com/olivere/elastic v6.2.35+incompatible/go.mod h1:J+q1zQJTgAz9woqsbVRqGeB5G1iqDKVBWLNSYW8yfJ8=
 github.com/pelletier/go-toml v1.7.0/go.mod h1:vwGMzjaWMwyfHwgIBhI2YUM4fB6nL6lVAvS1LBMMhTE=
 github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
@@ -189,6 +195,8 @@ google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
+gopkg.in/olivere/elastic.v1 v1.0.1 h1:ZoJwTKCI0jJdVptoGB0QEFt/4bDUs6A5Pjrmn/Zb+5g=
+gopkg.in/olivere/elastic.v1 v1.0.1/go.mod h1:sMIrW2Y2hS8bEAqdTvdcrNN/KV21XXOfjdi4tHxwVnI=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=

+ 35 - 4
udpdataclear/udpSensitiveWords/grpc_server/main.go

@@ -10,9 +10,11 @@ import (
 	"io/ioutil"
 	"log"
 	"net"
+	"runtime"
 	"sensitiveWords.udp/proto_grpc"
 	"sensitiveWords.udp/util"
 	"strings"
+	"time"
 )
 
 const (
@@ -59,23 +61,52 @@ func init() {
 	}).Sort("_id").Iter()
 	Filter = sensitive.New()
 	var initnum uint
-	start_id,endid:="",""
+	saveIdArr ,start_id:= make([]map[string]string,0),""
+	var m runtime.MemStats
 	for tmp := map[string]interface{}{}; iter.Next(&tmp); tmp = map[string]interface{}{} {
 		if start_id=="" {
 			start_id = BsonTOStringId(tmp["_id"])
 		}
 		Filter.AddWord(tmp["qy_name"].(string))
 		initnum++
+		if initnum%10000==0 {
+			runtime.ReadMemStats(&m)
+			men :=toMegaBytes(m.HeapAlloc)
+			log.Printf("current index %d\tos %.2f M",initnum, men)
+			if men>500 {
+				saveIdArr = append(saveIdArr, map[string]string{
+					"start":start_id,
+					"end":BsonTOStringId(tmp["_id"]),
+				})
+				runtime.GC()
+				Filter = sensitive.New()
+				start_id = ""
+				time.Sleep(time.Second*5)
+			}
+		}
+		break
 
+	}
 
-		//Filter = sensitive.New()
-		//runtime.GC()
-
+	saveIdArr = append(saveIdArr, map[string]string{
+		"start":start_id,
+		"end":"",
+	})
 
+	for k,v:=range saveIdArr{
+		log.Println("第",k,"段",v["start"],v["end"])
 	}
+
 	log.Println("init ok", initnum)
 }
 
+
+func toMegaBytes(bytes uint64) float64 {
+	return float64(bytes) / 1024 / 1024
+}
+
+
+
 func main() {
 	lis, err := net.Listen("tcp", PORT)
 	if err != nil {

+ 2 - 2
udpdataclear/udpSensitiveWords/grpc_server/server.yaml

@@ -3,5 +3,5 @@ mongodbPoolSize: 10
 dbName: mixdata
 userName:
 passWord:
-taskGteId: 605d4d97a15e7ed8e49ec1ac
-taskLteId: 605d4d97a15e7ed8e49ec1de
+taskGteId: 1fffffffffffffffffffffff
+taskLteId: 9fffffffffffffffffffffff

+ 0 - 2
udpdataclear/udpSensitiveWords/main.go

@@ -8,12 +8,10 @@ import (
 )
 
 func init() {
-	log.Println("232")
 	util.InitC()
 }
 func main() {
 	// 主函数中添加
-	log.Println("222222")
 	go func() {
 		log.Println(http.ListenAndServe("127.0.0.1:8080", nil))
 	}()

+ 17 - 1
udpdataclear/udpSensitiveWords/util/config.go

@@ -2,7 +2,9 @@ package util
 
 import (
 	"google.golang.org/grpc"
+	"gopkg.in/olivere/elastic.v1"
 	"log"
+	"net/http"
 	"sensitiveWords.udp/proto_grpc"
 )
 
@@ -23,6 +25,14 @@ func InitC() {
 		PassWord:    Config["passWord"].(string),
 	}
 	BiddingMgo.InitPool()
+
+	Client_Es ,_= elastic.NewClient(http.DefaultClient, "http://192.168.3.11:9800")
+
+
+	winner_type, winner_index = Config["winner_es_type"].(string),Config["winner_es_index"].(string)
+	buyer_type, buyer_index = Config["buyer_es_type"].(string),Config["buyer_es_type"].(string)
+	agency_type, agency_index = Config["agency_es_type"].(string),Config["agency_es_type"].(string)
+
 	Fields = Config["fields"].(map[string]interface{})
 	FindBuyerC, FindAgencyC, FindWinnerC = Config["buyer_c"].(string), Config["agency_c"].(string), Config["winner_c"].(string)
 	qaddrs := Config["query_addrs"].([]interface{})
@@ -35,7 +45,9 @@ func InitC() {
 		QAddrs = append(QAddrs, &c)
 	}
 
-	log.Println(22222211111)
+
+
+
 }
 
 var Config map[string]interface{}
@@ -43,3 +55,7 @@ var BiddingMgo *MongodbSim
 var Fields map[string]interface{}
 var FindBuyerC, FindAgencyC, FindWinnerC string
 var QAddrs []*proto_grpc.SensitiveWordsClient
+var winner_type, winner_index	string
+var buyer_type, buyer_index		string
+var agency_type, agency_index	string
+var Client_Es  *elastic.Client

+ 19 - 3
udpdataclear/udpSensitiveWords/util/udputil.go

@@ -24,8 +24,7 @@ func ExtractUdp() {
 
 	sid := "1fffffffffffffffffffffff"
 	eid := "9fffffffffffffffffffffff"
-	log.Println(sid,eid)
-	//QuerySensitiveWords(sid,eid )
+	QuerySensitiveWords(sid,eid )
 }
 
 var task chan struct{} = make(chan struct{}, 1)
@@ -93,10 +92,14 @@ func QuerySensitiveWords(sid, eid string) {
 	}).Select(Fields).Sort("_id").Iter()
 
 	for tmp := map[string]interface{}{}; iter.Next(&tmp); tmp = map[string]interface{}{} {
-		log.Println(tmp["_id"])
 		if win, isok := tmp["winner"].(string); isok {
 			queryGrpcWinner := query_grpc(win, FindWinnerC)
 			if queryGrpcWinner == "" {
+				/**********处理未匹配的数据-进行es-分词打分比较**********/
+				new_name,b :=dealWithScoreRules(win,winner_type,winner_index)
+				if b {
+					tmp["winner"] = new_name
+				}
 			} else {
 				tmp["winner"] = queryGrpcWinner
 			}
@@ -104,6 +107,10 @@ func QuerySensitiveWords(sid, eid string) {
 		if win, isok := tmp["s_winner"].(string); isok {
 			queryGrpcWinner := query_grpc(win, FindWinnerC)
 			if queryGrpcWinner == "" {
+				new_name,b :=dealWithScoreRules(win,winner_type,winner_index)
+				if b {
+					tmp["s_winner"] = new_name
+				}
 			} else {
 				tmp["s_winner"] = queryGrpcWinner
 			}
@@ -112,6 +119,10 @@ func QuerySensitiveWords(sid, eid string) {
 		if agency, isok := tmp["agency"].(string); isok {
 			queryGrpcAgency := query_grpc(agency, FindAgencyC)
 			if queryGrpcAgency == "" {
+				new_name,b :=dealWithScoreRules(agency,agency_type,agency_index)
+				if b {
+					tmp["agency"] = new_name
+				}
 			} else {
 				tmp["agency"] = queryGrpcAgency
 			}
@@ -120,6 +131,10 @@ func QuerySensitiveWords(sid, eid string) {
 		if buyer, isok := tmp["buyer"].(string); isok {
 			queryGrpcBuyer := query_grpc(buyer, FindBuyerC)
 			if queryGrpcBuyer == "" {
+				new_name,b :=dealWithScoreRules(buyer,buyer_type,buyer_index)
+				if b {
+					tmp["buyer"] = new_name
+				}
 			} else {
 				tmp["buyer"] = queryGrpcBuyer
 			}
@@ -164,6 +179,7 @@ func QuerySensitiveWords(sid, eid string) {
 			}
 		}
 		num++
+		break //测试
 	}
 	log.Println("处理完成:", num)
 

+ 75 - 0
udpdataclear/udpSensitiveWords/util/word.go

@@ -0,0 +1,75 @@
+package util
+
+import (
+	"encoding/json"
+	"log"
+	"strings"
+	"unicode/utf8"
+)
+
+// dealWithScoreRules asks Elasticsearch for the single best hit whose
+// "azktest.name_2" field matches name and decides whether that hit is similar
+// enough to be used in place of name.
+//
+// estype and esindex select the Elasticsearch type and index to search.
+//
+// It returns the "name" field of the top hit ("" when the search failed or
+// produced nothing usable) and a flag that is true only when the hit ratio
+// reported by dealWithWordsRules is at least 0.8.
+func dealWithScoreRules(name string, estype string, esindex string) (string, bool) {
+	// Client_Es is a package-level variable; calling through a nil client
+	// would panic, so report the problem and give up instead.
+	if Client_Es == nil {
+		log.Println("从ES查询出错", "es client is not initialised")
+		return "", false
+	}
+
+	// Build the request body as a value instead of pasting name into a JSON
+	// string: a name containing `"` or `\` used to produce invalid JSON, and
+	// because the decode error was ignored an empty body was sent.
+	// NOTE(review): query_string still interprets query-syntax operators that
+	// appear in name (e.g. ':' or '('); confirm whether they need escaping.
+	query := map[string]interface{}{
+		"query": map[string]interface{}{
+			"bool": map[string]interface{}{
+				"must": []interface{}{
+					map[string]interface{}{
+						"query_string": map[string]interface{}{
+							"default_field": "azktest.name_2",
+							"query":         name,
+						},
+					},
+				},
+				"must_not": []interface{}{},
+				"should":   []interface{}{},
+			},
+		},
+		"from":   0,
+		"size":   1, // only the highest-scoring document is inspected
+		"sort":   []interface{}{},
+		"facets": map[string]interface{}{},
+	}
+
+	searchResult, err := Client_Es.Search().Index(esindex).Type(estype).Source(query).Do()
+	if err != nil {
+		log.Println("从ES查询出错", err.Error())
+		return "", false
+	}
+	if searchResult == nil || searchResult.Hits == nil {
+		return "", false
+	}
+
+	data := make(map[string]interface{})
+	for _, hit := range searchResult.Hits.Hits {
+		// A hit without a _source has nothing to decode.
+		if hit.Source == nil {
+			continue
+		}
+		if err := json.Unmarshal(*hit.Source, &data); err != nil {
+			log.Println("从ES查询出错", err.Error())
+		}
+	}
+
+	newName := objToString(data["name"])
+	if newName == "" {
+		return "", false
+	}
+
+	// Accept the candidate only when the hit ratio reaches the threshold.
+	total, hit := dealWithWordsRules(name, newName)
+	if total <= 0 {
+		// Nothing to compare against; avoids evaluating 0/0.
+		return newName, false
+	}
+	return newName, float64(hit)/float64(total) >= 0.8
+}
+
+// dealWithWordsRules measures how much of info_name can be found in
+// source_name, using the windows produced by calculateWordCount.
+//
+// It returns (total, hit): total is the number of windows calculateWordCount
+// yields for source_name, and hit is the number of info_name windows that
+// occur as a substring of source_name.
+func dealWithWordsRules(info_name string, source_name string) (int, int) {
+	_, sourceTotal := calculateWordCount(source_name)
+	infoWords, _ := calculateWordCount(info_name)
+
+	matched := 0
+	for i := 0; i < len(infoWords); i++ {
+		if !strings.Contains(source_name, infoWords[i]) {
+			continue
+		}
+		matched++
+	}
+	return sourceTotal, matched
+}
+
+//分词结果
+func calculateWordCount(name string) ([]string,int) {
+
+	arr, space := make([]string, 0), 2
+	total := utf8.RuneCountInString(name) - (space - 1)
+	if name == "" || total <= 0 {
+		return arr, 0
+	}
+	nameRune := []rune(name)
+	for i := 0; i < total; i++ {
+		new_str := string(nameRune[i : space+i])
+		arr = append(arr, new_str)
+	}
+	return arr, len(arr)
+}
+
+// objToString returns old as a string when its dynamic type is string and ""
+// for nil or any other type.
+func objToString(old interface{}) string {
+	if s, ok := old.(string); ok {
+		return s
+	}
+	return ""
+}