Ver código fonte

备份-通用-分词es规则等

apple 4 anos atrás
pai
commit
ca5765f95f

+ 32 - 256
data_monitoring/words_vaild/src/main.go

@@ -1,239 +1,69 @@
 package main
 
 import (
-	"encoding/json"
-	"fmt"
-	"github.com/tealeg/xlsx"
 	"log"
-	"os"
 	qu "qfw/util"
 	"qfw/util/elastic"
 	"strings"
-	"sync"
 	"unicode/utf8"
-	"go.mongodb.org/mongo-driver/bson/primitive"
-
-)
-var (
-	sysconfig			map[string]interface{} //配置文件
-	save_mgo        	*MongodbSim
 )
 
 func init()  {
-	save_mgo = &MongodbSim{
-		MongodbAddr: "192.168.3.207:27092",
-		DbName:      "zhengkun",
-		Size:        5,
-	}
-	save_mgo.InitPool()
-
-	elastic.InitElasticSize("http://192.168.3.11:9800",20)
-}
-
-func dealWithDataXlsx()  {
-
-	q := map[string]interface{}{}
-	sess := save_mgo.GetMgoConn()
-	defer save_mgo.DestoryMongoConn(sess)
-	it := sess.DB(save_mgo.DbName).C("zk_test_words").Find(&q).Iter()
-	total:=0
-	saveArr := make([]map[string]string,0)
-	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total % 10000 == 0 {
-			log.Println("current index",total,tmp["_id"])
-		}
-
-		if total % 30 ==0 {
-			name:=qu.ObjToString(tmp["name"])
-			dict := make(map[string]string)
-			dict["name"] = name
-			for i:=0; i<5;i++ {
-				value,total,hit :="","",""
-				key := "word_"+fmt.Sprintf("%d",i)
-				if tmp[key]!=nil {
-
-					if arr,ok := tmp[key].(primitive.A);ok {
-						dataArr :=qu.ObjArrToMapArr(arr)
-						value =qu.ObjToString(dataArr[0]["name"])
-						if i!=0 {
-							total = fmt.Sprintf("%d",dataArr[0]["all_words"])
-							hit = fmt.Sprintf("%d",dataArr[0]["hit_words"])
-						}
-					}
-
-				}
-				key1,key2:="total"+fmt.Sprintf("%d",i),"hit"+fmt.Sprintf("%d",i)
-				dict[key] = value
-				dict[key1] = total
-				dict[key2] = hit
-
-			}
-			saveArr= append(saveArr,dict)
-		}
-		tmp = make(map[string]interface{})
-	}
-
-
-	os.Remove("words.xlsx")	//写excle
-	f :=xlsx.NewFile()
-
-
-	for i:=0; i<5;i++ {
-		key := "word_"+fmt.Sprintf("%d",i)
-		sheet, _ := f.AddSheet("统计"+key)
-		row := sheet.AddRow()
-		row.AddCell().Value = "name"
-		row.AddCell().Value = key
-		if i!=0 {
-			row.AddCell().Value = "total"
-			row.AddCell().Value = "hit"
-		}
-		key1,key2:="total"+fmt.Sprintf("%d",i),"hit"+fmt.Sprintf("%d",i)
-
-		for _,tmp := range saveArr {
-			row = sheet.AddRow()
-			row.AddCell().SetString(tmp["name"])
-			row.AddCell().SetString(tmp[key])
-			row.AddCell().SetString(fmt.Sprintf("%s",tmp[key1]))
-			row.AddCell().SetString(fmt.Sprintf("%s",tmp[key2]))
-		}
-	}
-
-	err := f.Save("words.xlsx")
-	if err != nil {
-		log.Println("保存xlsx失败:", err)
-	}else {
-		log.Println("保存xlsx成功:", err)
-	}
-
+	elastic.InitElasticSize("http://192.168.3.11:9800",10)
 }
 
 func main()  {
 
-	//导出xlsx
-	dealWithDataXlsx()
-	return
-
-
-
-
 	defer qu.Catch()
 	log.Println("处理 ... 指定企业名称 ...")
-
-	//分析错误数据
-	//
-	q := map[string]interface{}{}
-	sess := save_mgo.GetMgoConn()
-	defer save_mgo.DestoryMongoConn(sess)
-	//细节才需要遍历
-	it := sess.DB(save_mgo.DbName).C("zk_company_test").Find(&q).Iter()
-	total:=0
-	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total % 10000 == 0 {
-			log.Println("current index",total,tmp["_id"])
-		}
-
-		name:=qu.ObjToString(tmp["name"])
-		save_dict := make(map[string]interface{},0)
-		for i:=0; i<5;i++ {
-			key := "word_"+fmt.Sprintf("%d",i)
-			dataArr :=dealWithScoreRules(name,i)
-			if dataArr ==nil || len(dataArr)<1 {
-				//无数据
-			}else {
-				save_dict[key] = dealWithWordsRules(name,dataArr,i)
-			}
-
-		}
-
-		if len(save_dict)>0 {
-			save_dict["name"]  = name
-			save_mgo.Save("zk_test_words",save_dict)
-		}
-
-		tmp = make(map[string]interface{})
+	/*
+	云南和合泰商贸有限公司
+	安徽省微乡华艺环境工程有限公司
+	*/
+	new_name,b :=dealWithScoreRules("安徽省微乡华艺环境工程有限公司")
+	if b {
+		log.Println("最终",new_name)
 	}
 
 }
 
-//分数维度
-func dealWithScoreRules(name string,space int) []map[string]interface{} {
-	key := ""
-	if space>0&&space<5{
-		key = fmt.Sprintf("%d",space)
-	}
-	query:= `{"query":{"bool":{"must":[{"query_string":{"default_field":"azktest.name_`+key+`","query":"`+name+`"}}],"must_not":[],"should":[]}},"from":0,"size":3,"sort":[],"facets":{}}`
 
-	if key=="" {
-		query = `{"query":{"bool":{"must":[{"query_string":{"default_field":"azktest.name","query":"`+name+`"}}],"must_not":[],"should":[]}},"from":0,"size":3,"sort":[],"facets":{}}`
+func dealWithScoreRules(name string) (string,bool) {
+	new_name,isok :="",false
+	query:= `{"query":{"bool":{"must":[{"query_string":{"default_field":"azktest.name_2","query":"`+name+`"}}],"must_not":[],"should":[]}},"from":0,"size":1,"sort":[],"facets":{}}`
+	//默认取最高分-分析多个分-遍历器查询
+	data := *elastic.Get("azktest","azktest",query)
+	if len(data)>0 && data != nil {
+		new_name = qu.ObjToString(data[0]["name"])
 	}
-	client := elastic.GetEsConn()
-	defer elastic.DestoryEsConn(client)
-	searchResult, err := client.Search().Index("azktest").Type("azktest").Source(query).Do()
-	if err != nil {
-		log.Println("从ES查询出错", err.Error())
-		return nil
-	}
-	resNum := len(searchResult.Hits.Hits)
-	res := make([]map[string]interface{}, resNum)
-	if searchResult.Hits != nil {
-		if resNum < 5000 {
-			for i, hit := range searchResult.Hits.Hits {
-				data := make(map[string]interface{},0)
-				json.Unmarshal(*hit.Source, &data)
-				res[i] = map[string]interface{}{
-					"name":data["name"],
-					"score":*hit.Score,
-				}
-			}
-		} else {
-			log.Println("查询结果太多,查询到:", resNum, "条")
+	if new_name!="" { //分析hit比例
+		total,hit := dealWithWordsRules(name,new_name)
+		if float64(hit)/float64(total)>=0.8 {
+			isok = true
 		}
-
 	}
-	return res
+	return new_name,isok
 }
 
-
-
 //击中数量以及比例
-func dealWithWordsRules(name string ,source []map[string]interface{},space int) []map[string]interface{} {
-
-	nameArr,_ := calculateWordCount(name,space)
-	newArr := make([]map[string]interface{},0)
-	for _,v := range source {
-		total,hit :=0,0
-		source_name :=qu.ObjToString(v["name"])
-		_,total = calculateWordCount(source_name,space)
-		for _,v1 := range nameArr {
-			if strings.Contains(source_name,v1) {
-				hit++
-			}
-		}
-
-
-		if space==0 {
-			newArr = append(newArr, map[string]interface{}{
-				"name":source_name,
-				"score":qu.Float64All(v["score"]),
-			})
-		}else {
-			newArr = append(newArr, map[string]interface{}{
-				"name":source_name,
-				"score":qu.Float64All(v["score"]),
-				"all_words" : total,
-				"hit_words" : hit,
-			})
+func dealWithWordsRules(info_name string ,source_name string) (int,int){
+	total,hit :=0,0
+	nameArr,_ := calculateWordCount(info_name)
+	_,total = calculateWordCount(source_name)
+	for _,v1 := range nameArr {
+		if strings.Contains(source_name,v1) {
+			hit++
 		}
 	}
-	return newArr
+	return total,hit
 }
 
 //分词结果
-func calculateWordCount(name string,space int) ([]string,int) {
-	arr := make([]string,0)
+func calculateWordCount(name string) ([]string,int) {
+
+	arr ,space:= make([]string,0),2
 	total := utf8.RuneCountInString(name)-(space-1)
-	if name == "" || space<=0 || total<=0  {
+	if name == "" || total<=0  {
 		return arr,0
 	}
 	nameRune := []rune(name)
@@ -249,57 +79,3 @@ func calculateWordCount(name string,space int) ([]string,int) {
 
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-func readyDataEs()  {
-
-	q := map[string]interface{}{}
-	sess := save_mgo.GetMgoConn()
-	defer save_mgo.DestoryMongoConn(sess)
-	//多线程升索引
-	pool_es := make(chan bool, 10)
-	wg_es := &sync.WaitGroup{}
-	//细节才需要遍历
-	it := sess.DB(save_mgo.DbName).C("zk_company_name").Find(&q).Iter()
-	total:=0
-	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total % 10000 == 0 {
-			log.Println("current index",total,tmp["_id"])
-		}
-		savetmp := make(map[string]interface{}, 0)
-		savetmp["_id"] = tmp["_id"]
-		savetmp["name"] = qu.ObjToString(tmp["company_name"])
-		savetmp["name_1"] = qu.ObjToString(tmp["company_name"])
-		savetmp["name_2"] = qu.ObjToString(tmp["company_name"])
-		savetmp["name_3"] = qu.ObjToString(tmp["company_name"])
-		savetmp["name_4"] = qu.ObjToString(tmp["company_name"])
-		pool_es <- true
-		wg_es.Add(1)
-		go func(savetmp map[string]interface{}) {
-			defer func() {
-				<-pool_es
-				wg_es.Done()
-			}()
-			elastic.Save("azktest","azktest", savetmp)
-		}(savetmp)
-		tmp = make(map[string]interface{})
-	}
-	wg_es.Wait()
-
-
-	log.Println("is over",total)
-}
-

+ 304 - 0
data_monitoring/words_vaild/src1/main.go

@@ -0,0 +1,304 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"github.com/tealeg/xlsx"
+	"log"
+	"os"
+	qu "qfw/util"
+	"qfw/util/elastic"
+	"strings"
+	"sync"
+	"unicode/utf8"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+
+)
+var (
+	// sysconfig is described by the original comment as the configuration file.
+	// NOTE(review): nothing visible in this file reads or writes it — confirm it is still needed.
+	sysconfig			map[string]interface{} //configuration file
+	// save_mgo is the MongoDB handle shared by every function in this file;
+	// it is assigned and its pool initialised in init().
+	save_mgo        	*MongodbSim
+)
+
+// init prepares the two backends the tool talks to: it fills in the
+// package-level save_mgo handle and initialises its pool, then initialises
+// the Elasticsearch client (second argument 20 — presumably the pool size;
+// confirm against qfw/util/elastic).
+func init() {
+	mgo := new(MongodbSim)
+	mgo.MongodbAddr = "192.168.3.207:27092"
+	mgo.DbName = "zhengkun"
+	mgo.Size = 5
+
+	save_mgo = mgo
+	save_mgo.InitPool()
+
+	elastic.InitElasticSize("http://192.168.3.11:9800", 20)
+}
+
+// dealWithDataXlsx exports a sample of the "zk_test_words" collection to
+// words.xlsx, writing one sheet per word_N field (N = 0..4).
+//
+// Every 30th document is sampled. For a sampled document only the first entry
+// of each word_N array is exported: its name and, for N != 0, its all_words
+// and hit_words counters. Failures are logged; nothing is returned.
+func dealWithDataXlsx() {
+	const (
+		wordFields  = 5
+		sampleEvery = 30
+		outFile     = "words.xlsx"
+	)
+
+	q := map[string]interface{}{}
+	sess := save_mgo.GetMgoConn()
+	defer save_mgo.DestoryMongoConn(sess)
+	it := sess.DB(save_mgo.DbName).C("zk_test_words").Find(&q).Iter()
+
+	saveArr := make([]map[string]string, 0)
+	total := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+		if total%10000 == 0 {
+			log.Println("current index", total, tmp["_id"])
+		}
+		if total%sampleEvery == 0 {
+			saveArr = append(saveArr, xlsxRowFromDoc(tmp, wordFields))
+		}
+		// A fresh map per iteration, presumably so keys of the previous
+		// document cannot survive into the next decode.
+		tmp = make(map[string]interface{})
+	}
+
+	// Best effort: a missing old file is not an error worth reporting.
+	os.Remove(outFile)
+	f := xlsx.NewFile()
+
+	for i := 0; i < wordFields; i++ {
+		key := fmt.Sprintf("word_%d", i)
+		sheet, err := f.AddSheet("统计" + key)
+		if err != nil {
+			// A failed AddSheet leaves sheet nil; bail out instead of dereferencing it.
+			log.Println("保存xlsx失败:", err)
+			return
+		}
+
+		// Sheet 0 carries no counters, so header and data rows both stop
+		// after the second column there.
+		withCounts := i != 0
+		header := sheet.AddRow()
+		header.AddCell().Value = "name"
+		header.AddCell().Value = key
+		if withCounts {
+			header.AddCell().Value = "total"
+			header.AddCell().Value = "hit"
+		}
+
+		totalKey, hitKey := fmt.Sprintf("total%d", i), fmt.Sprintf("hit%d", i)
+		for _, rec := range saveArr {
+			row := sheet.AddRow()
+			row.AddCell().SetString(rec["name"])
+			row.AddCell().SetString(rec[key])
+			if withCounts {
+				row.AddCell().SetString(rec[totalKey])
+				row.AddCell().SetString(rec[hitKey])
+			}
+		}
+	}
+
+	if err := f.Save(outFile); err != nil {
+		log.Println("保存xlsx失败:", err)
+		return
+	}
+	log.Println("保存xlsx成功:", outFile)
+}
+
+// xlsxRowFromDoc flattens one document into the string columns used by the
+// export: "name" plus, for every field index i below wordFields, "word_<i>",
+// "total<i>" and "hit<i>". Columns without data are set to "".
+func xlsxRowFromDoc(doc map[string]interface{}, wordFields int) map[string]string {
+	row := map[string]string{"name": qu.ObjToString(doc["name"])}
+	for i := 0; i < wordFields; i++ {
+		key := fmt.Sprintf("word_%d", i)
+		value, all, hit := "", "", ""
+		// The type assertion already fails on a nil/missing field.
+		if arr, ok := doc[key].(primitive.A); ok {
+			// Guard the [0] access: an empty array would otherwise panic.
+			if dataArr := qu.ObjArrToMapArr(arr); len(dataArr) > 0 {
+				first := dataArr[0]
+				value = qu.ObjToString(first["name"])
+				if i != 0 {
+					all = xlsxCount(first["all_words"])
+					hit = xlsxCount(first["hit_words"])
+				}
+			}
+		}
+		row[key] = value
+		row[fmt.Sprintf("total%d", i)] = all
+		row[fmt.Sprintf("hit%d", i)] = hit
+	}
+	return row
+}
+
+// xlsxCount renders a counter that was decoded into an interface{}. The
+// default format is used instead of %d so that the result does not depend on
+// the concrete numeric type, and a missing counter becomes "" rather than a
+// formatting-error string.
+func xlsxCount(v interface{}) string {
+	if v == nil {
+		return ""
+	}
+	return fmt.Sprint(v)
+}
+
+// main runs the xlsx export and exits.
+//
+// The company-name analysis that used to sit below an unconditional return
+// (and was therefore unreachable) now lives in analyzeCompanyNames; call it
+// from here to run that step instead.
+func main() {
+	// export xlsx
+	dealWithDataXlsx()
+}
+
+// analyzeCompanyNames walks the "zk_company_test" collection and, for every
+// company name, queries Elasticsearch once per field index 0..4 via
+// dealWithScoreRules. Field indexes that produced hits are scored with
+// dealWithWordsRules and stored under "word_<i>"; documents with at least one
+// such entry are saved to "zk_test_words" together with the name.
+func analyzeCompanyNames() {
+	defer qu.Catch()
+	log.Println("处理 ... 指定企业名称 ...")
+
+	q := map[string]interface{}{}
+	sess := save_mgo.GetMgoConn()
+	defer save_mgo.DestoryMongoConn(sess)
+	it := sess.DB(save_mgo.DbName).C("zk_company_test").Find(&q).Iter()
+	total := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+		if total%10000 == 0 {
+			log.Println("current index", total, tmp["_id"])
+		}
+
+		name := qu.ObjToString(tmp["name"])
+		saveDict := make(map[string]interface{})
+		for i := 0; i < 5; i++ {
+			dataArr := dealWithScoreRules(name, i)
+			if len(dataArr) == 0 {
+				// no hits for this field index
+				continue
+			}
+			saveDict[fmt.Sprintf("word_%d", i)] = dealWithWordsRules(name, dataArr, i)
+		}
+
+		if len(saveDict) > 0 {
+			saveDict["name"] = name
+			save_mgo.Save("zk_test_words", saveDict)
+		}
+
+		tmp = make(map[string]interface{})
+	}
+}
+
+// dealWithScoreRules queries the "azktest" index for name and returns up to
+// three hits (the query asks for size 3), each as a map with the keys "name"
+// (taken from the hit's source document) and "score" (float64).
+//
+// space selects the field searched: 1..4 use "azktest.name_<space>", any
+// other value uses "azktest.name". nil is returned when the search fails or
+// yields no usable hits.
+func dealWithScoreRules(name string, space int) []map[string]interface{} {
+	field := "azktest.name"
+	if space > 0 && space < 5 {
+		field = fmt.Sprintf("azktest.name_%d", space)
+	}
+
+	// Encode the name as a JSON string so quotes or backslashes inside it
+	// cannot break (or alter) the request body.
+	// NOTE(review): query_string operators inside name are still interpreted
+	// by Elasticsearch — confirm whether they need escaping as well.
+	quoted, err := json.Marshal(name)
+	if err != nil {
+		log.Println("从ES查询出错", err.Error())
+		return nil
+	}
+	query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"` + field + `","query":` + string(quoted) + `}}],"must_not":[],"should":[]}},"from":0,"size":3,"sort":[],"facets":{}}`
+
+	client := elastic.GetEsConn()
+	defer elastic.DestoryEsConn(client)
+	searchResult, err := client.Search().Index("azktest").Type("azktest").Source(query).Do()
+	if err != nil {
+		log.Println("从ES查询出错", err.Error())
+		return nil
+	}
+	// Check Hits before touching Hits.Hits; the reverse order would panic on
+	// a response without a hits section.
+	if searchResult == nil || searchResult.Hits == nil {
+		return nil
+	}
+
+	resNum := len(searchResult.Hits.Hits)
+	if resNum >= 5000 {
+		log.Println("查询结果太多,查询到:", resNum, "条")
+		return nil
+	}
+
+	res := make([]map[string]interface{}, 0, resNum)
+	for _, hit := range searchResult.Hits.Hits {
+		if hit.Source == nil {
+			continue
+		}
+		data := make(map[string]interface{})
+		if err := json.Unmarshal(*hit.Source, &data); err != nil {
+			log.Println("从ES查询出错", err.Error())
+			continue
+		}
+		// Score is a pointer and may be absent; fall back to 0.
+		score := 0.0
+		if hit.Score != nil {
+			score = *hit.Score
+		}
+		res = append(res, map[string]interface{}{
+			"name":  data["name"],
+			"score": score,
+		})
+	}
+	return res
+}
+
+
+
+// dealWithWordsRules annotates every candidate in source with n-gram overlap
+// figures relative to name.
+//
+// name is cut into windows of space runes. For each candidate, "hit_words"
+// is the number of those windows that occur in the candidate's name and
+// "all_words" is the number of windows the candidate's own name yields. Both
+// keys are omitted when space is 0; "name" and "score" are always present.
+func dealWithWordsRules(name string, source []map[string]interface{}, space int) []map[string]interface{} {
+	grams, _ := calculateWordCount(name, space)
+	result := make([]map[string]interface{}, 0)
+
+	for _, candidate := range source {
+		candidateName := qu.ObjToString(candidate["name"])
+		_, allWords := calculateWordCount(candidateName, space)
+
+		hitWords := 0
+		for _, gram := range grams {
+			if strings.Contains(candidateName, gram) {
+				hitWords++
+			}
+		}
+
+		entry := map[string]interface{}{
+			"name":  candidateName,
+			"score": qu.Float64All(candidate["score"]),
+		}
+		if space != 0 {
+			entry["all_words"] = allWords
+			entry["hit_words"] = hitWords
+		}
+		result = append(result, entry)
+	}
+	return result
+}
+
+// calculateWordCount splits name into every window of space consecutive
+// runes (a sliding window advancing one rune at a time) and returns the
+// windows together with their count.
+//
+// An empty, non-nil slice and 0 are returned when name is empty, space is not
+// positive, or name is shorter than space runes.
+func calculateWordCount(name string, space int) ([]string, int) {
+	windows := utf8.RuneCountInString(name) - space + 1
+	if len(name) == 0 || space <= 0 || windows <= 0 {
+		return []string{}, 0
+	}
+
+	runes := []rune(name)
+	grams := make([]string, 0)
+	for start, end := 0, space; start < windows; start, end = start+1, end+1 {
+		grams = append(grams, string(runes[start:end]))
+	}
+	return grams, len(grams)
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// readyDataEs copies every document of the "zk_company_name" collection into
+// the "azktest" Elasticsearch index/type. Each saved record carries the
+// original _id and the company_name value under the keys name, name_1,
+// name_2, name_3 and name_4.
+//
+// Saves run in goroutines; a buffered channel of capacity 10 caps how many
+// are in flight, and the function waits for all of them before logging the
+// final count.
+func readyDataEs() {
+	sess := save_mgo.GetMgoConn()
+	defer save_mgo.DestoryMongoConn(sess)
+
+	var (
+		pending sync.WaitGroup
+		slots   = make(chan bool, 10)
+		filter  = map[string]interface{}{}
+		count   = 0
+	)
+
+	it := sess.DB(save_mgo.DbName).C("zk_company_name").Find(&filter).Iter()
+	for doc := make(map[string]interface{}); it.Next(&doc); count++ {
+		if count%10000 == 0 {
+			log.Println("current index", count, doc["_id"])
+		}
+
+		company := qu.ObjToString(doc["company_name"])
+		record := map[string]interface{}{
+			"_id":    doc["_id"],
+			"name":   company,
+			"name_1": company,
+			"name_2": company,
+			"name_3": company,
+			"name_4": company,
+		}
+
+		// Take a slot before spawning; this blocks once 10 saves are running.
+		slots <- true
+		pending.Add(1)
+		go func(rec map[string]interface{}) {
+			defer func() {
+				<-slots
+				pending.Done()
+			}()
+			elastic.Save("azktest", "azktest", rec)
+		}(record)
+
+		doc = make(map[string]interface{})
+	}
+	pending.Wait()
+
+	log.Println("is over", count)
+}
+

+ 0 - 0
data_monitoring/words_vaild/src/mgo.go → data_monitoring/words_vaild/src1/mgo.go