瀏覽代碼

备份 修改 - 优化

apple 4 年之前
父節點
當前提交
455e36c5d9

+ 3 - 0
udpdataclear/udpSensitiveWords/go.mod

@@ -7,11 +7,14 @@ require (
 	github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
 	github.com/importcjj/sensitive v0.0.0-20200106142752-42d1c505be7b
 	github.com/mailru/easyjson v0.7.7 // indirect
+	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/olivere/elastic v6.2.35+incompatible // indirect
 	github.com/prometheus/client_golang v1.10.0
+	github.com/spf13/pflag v1.0.3 // indirect
 	go.mongodb.org/mongo-driver v1.5.1
 	google.golang.org/grpc v1.36.1
 	google.golang.org/protobuf v1.26.0
+	gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect
 	gopkg.in/mgo.v2 v2.0.0-20190816093944-a6b53ec6cb22
 	gopkg.in/olivere/elastic.v1 v1.0.1
 	gopkg.in/yaml.v2 v2.3.0

+ 1 - 0
udpdataclear/udpSensitiveWords/main.go

@@ -5,6 +5,7 @@ import (
 )
 
 func init() { // Go runs init automatically before main starts.
+
 	util.InitC() // NOTE(review): InitC is defined in package util, outside this view — presumably one-time setup; confirm.
 }
 func main() {

+ 101 - 0
udpdataclear/udpSensitiveWords/util/udpdata.go

@@ -7,10 +7,13 @@ import (
 	"go.mongodb.org/mongo-driver/bson"
 	"go.mongodb.org/mongo-driver/bson/primitive"
 	"go.mongodb.org/mongo-driver/mongo/options"
+	"gopkg.in/olivere/elastic.v1"
 	"log"
 	"net"
+	"net/http"
 	"regexp"
 	"strings"
+	"sync"
 	"time"
 )
 
@@ -316,6 +319,95 @@ func dealWithEsData(name string, tmpid string) {
 
 }
 
+
+
+
+
+
+
+func TemporaryTest()  {
+	log.Println("测试......导出数据")
+
+	QfwMgo85 = &MongodbSim{
+		MongodbAddr: "172.17.4.187:27082,172.17.145.163:27083",
+		Size:        5,
+		DbName:      "mixdata",
+		UserName:    "fengweiqiang",
+		PassWord:    "fwq@123123",
+	}
+	QfwMgo85.InitPool()
+
+	Client_Es, _ = elastic.NewClient(http.DefaultClient, "http://ela.spdata.jianyu360.com")
+
+	es_type, es_index = "unique_qy", "unique_qy"
+
+
+	q := map[string]interface{}{
+		"check_history":map[string]interface{}{
+			"$exists":0,
+		},
+	}
+	sess := QfwMgo85.GetMgoConn()
+	defer QfwMgo85.DestoryMongoConn(sess)
+	//多线程升索引
+	pool_es := make(chan bool, 3)
+	wg_es := &sync.WaitGroup{}
+	it := sess.DB(QfwMgo85.DbName).C("winner_err_new").Find(&q).Iter()
+	total,isOK:=0,0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+		if total % 100 == 0 {
+			log.Println("current index",total,isOK)
+		}
+
+		name:=ObjToString(tmp["name"])
+		tmpid := BsonTOStringId(tmp["_id"])
+		pool_es <- true
+		wg_es.Add(1)
+
+		go func(name string,tmpid string) {
+			defer func() {
+				<-pool_es
+				wg_es.Done()
+			}()
+			//start := int(time.Now().Unix())
+			new_name,b,_ :=dealWithNameScoreRules(name)
+			//log.Println("耗时:",int(time.Now().Unix())-start,"秒",b,name,new_name,tmpid)
+			if new_name!="" && b {
+				isOK++
+				QfwMgo85.UpdateById("winner_err_new",tmpid,map[string]interface{}{
+					"$set": map[string]interface{}{
+						"is_word": 1,
+						"name_word" : new_name,
+					},
+				})
+			}else {
+				QfwMgo85.UpdateById("winner_err_new",tmpid,map[string]interface{}{
+					"$set": map[string]interface{}{
+						"is_word": -1,
+						"name_word" : new_name,
+					},
+				})
+			}
+
+		}(name,tmpid)
+		tmp = make(map[string]interface{})
+	}
+
+	wg_es.Wait()
+
+	log.Println("is over",total,isOK)
+}
+
+
+
+
+
+
+
+
+
+
+
 var reg_alias = regexp.MustCompile("(税务局|工商行政管理局|文化广播电视新闻出版局|外国专家局|" +
 	"中医药管理局|市场监督管理局|广播电视局|医疗保障局|机关事务管理局|粮食和物资储备局|" +
 	"监狱管理局|畜牧兽医局|食品药品监督管理局|城市管理行政执法局|城市管理局|国家保密局|密码管理局|" +
@@ -349,3 +441,12 @@ var con_strReg *regexp.Regexp = regexp.MustCompile("(\\?|?|%|代码标识|删
 	"[a-zA-Z]{5,}")
 
 var uncon_strReg *regexp.Regexp = regexp.MustCompile("(园|政府|集团|公司|有限|合伙|企|院|学|局|处)")
+
+
+
+var startWordReg_1 *regexp.Regexp = regexp.MustCompile("^(.{1,5})(省|市|县|州|自治区|特别行政区)") // Start of string: 1-5 characters followed by one of the listed administrative-division suffixes.
+var startWordReg_2 *regexp.Regexp = regexp.MustCompile("^(北京|天津|重庆|上海|河北|山西|" + // Start of string: one of the listed region names, written without any suffix.
+	"浙江|江西|湖北|吉林|海南|甘肃|广东|陕西|辽宁|山东|河南|云南|黑龙江|福建|贵州|江苏|安徽|" +
+	"湖南|四川|青海|台湾|新疆|内蒙古|宁夏|西藏|广西|澳门|香港)")
+
+var endWordReg *regexp.Regexp = regexp.MustCompile("(有限公司|有限责任公司)$") // End of string: one of the two listed company-form suffixes.

+ 42 - 1
udpdataclear/udpSensitiveWords/util/words.go

@@ -14,7 +14,14 @@ func dealWithNameScoreRules(name string) (string, bool, []map[string]interface{}
 	if old_name == "" {
 		return "", false, nil
 	}
-	query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"` + old_name + `"}}],"must_not":[],"should":[]}},"from":"0","size":"300"}`
+
+	query_name := old_name
+	endstr := endWordReg.FindString(query_name)
+	if endstr !="" {
+		query_name = strings.ReplaceAll(query_name,endstr,"")
+	}
+
+	query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"` + query_name + `"}}],"must_not":[],"should":[]}},"from":"0","size":"1"}`
 	tmp := make(map[string]interface{})
 	json.Unmarshal([]byte(query), &tmp)
 	searchResult, err := Client_Es.Search().Index(es_index).Type(es_type).Source(tmp).Do()
@@ -52,7 +59,41 @@ func dealWithNameScoreRules(name string) (string, bool, []map[string]interface{}
 				isok = true
 			} else {
 				if float64(hit)/float64(total) >= 0.8 && new_score > 4.0 {
+
+					str1,str2:=startWordReg_1.FindString(name),startWordReg_1.FindString(new_name)
+					if str1!="" && str2!="" {
+						if strings.Contains(str1,str2)||strings.Contains(str2,str1) {
+
+						}else {
+							return new_name, false, res
+						}
+					}
+					str1,str2 = startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
+					if str1!="" && str2!=""{
+						if str1 != str2 {
+							return new_name, false, res
+						}
+					}
 					isok = true
+				}else if new_score > 4.0 {
+					str1,str2:=name,new_name
+					str1 = strings.ReplaceAll(str1,"责任","")
+					str2 = strings.ReplaceAll(str2,"责任","")
+
+					str1 = strings.ReplaceAll(str1,"有限","")
+					str2 = strings.ReplaceAll(str2,"有限","")
+
+					str1 = strings.ReplaceAll(str1,"科技","")
+					str2 = strings.ReplaceAll(str2,"科技","")
+
+					str1 = strings.ReplaceAll(str1,"工程","")
+					str2 = strings.ReplaceAll(str2,"工程","")
+					if str1==str2 {
+						return new_name, true, res
+					}
+
+				}else {
+
 				}
 			}
 		}