|
@@ -0,0 +1,305 @@
|
|
|
|
+package main
|
|
|
|
+
|
|
|
|
+import (
|
|
|
|
+ "encoding/json"
|
|
|
|
+ "fmt"
|
|
|
|
+ "github.com/tealeg/xlsx"
|
|
|
|
+ "log"
|
|
|
|
+ "os"
|
|
|
|
+ qu "qfw/util"
|
|
|
|
+ "qfw/util/elastic"
|
|
|
|
+ "strings"
|
|
|
|
+ "sync"
|
|
|
|
+ "unicode/utf8"
|
|
|
|
+ "go.mongodb.org/mongo-driver/bson/primitive"
|
|
|
|
+
|
|
|
|
+)
|
|
|
|
+var (
|
|
|
|
+ sysconfig map[string]interface{} //配置文件
|
|
|
|
+ save_mgo *MongodbSim
|
|
|
|
+)
|
|
|
|
+
|
|
|
|
+func init() {
|
|
|
|
+ save_mgo = &MongodbSim{
|
|
|
|
+ MongodbAddr: "192.168.3.207:27092",
|
|
|
|
+ DbName: "zhengkun",
|
|
|
|
+ Size: 5,
|
|
|
|
+ }
|
|
|
|
+ save_mgo.InitPool()
|
|
|
|
+
|
|
|
|
+ elastic.InitElasticSize("http://192.168.3.11:9800",20)
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func dealWithDataXlsx() {
|
|
|
|
+
|
|
|
|
+ q := map[string]interface{}{}
|
|
|
|
+ sess := save_mgo.GetMgoConn()
|
|
|
|
+ defer save_mgo.DestoryMongoConn(sess)
|
|
|
|
+ it := sess.DB(save_mgo.DbName).C("zk_test_words").Find(&q).Iter()
|
|
|
|
+ total:=0
|
|
|
|
+ saveArr := make([]map[string]string,0)
|
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
|
|
|
|
+ if total % 10000 == 0 {
|
|
|
|
+ log.Println("current index",total,tmp["_id"])
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if total % 30 ==0 {
|
|
|
|
+ name:=qu.ObjToString(tmp["name"])
|
|
|
|
+ dict := make(map[string]string)
|
|
|
|
+ dict["name"] = name
|
|
|
|
+ for i:=0; i<5;i++ {
|
|
|
|
+ value,total,hit :="","",""
|
|
|
|
+ key := "word_"+fmt.Sprintf("%d",i)
|
|
|
|
+ if tmp[key]!=nil {
|
|
|
|
+
|
|
|
|
+ if arr,ok := tmp[key].(primitive.A);ok {
|
|
|
|
+ dataArr :=qu.ObjArrToMapArr(arr)
|
|
|
|
+ value =qu.ObjToString(dataArr[0]["name"])
|
|
|
|
+ if i!=0 {
|
|
|
|
+ total = fmt.Sprintf("%d",dataArr[0]["all_words"])
|
|
|
|
+ hit = fmt.Sprintf("%d",dataArr[0]["hit_words"])
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+ key1,key2:="total"+fmt.Sprintf("%d",i),"hit"+fmt.Sprintf("%d",i)
|
|
|
|
+ dict[key] = value
|
|
|
|
+ dict[key1] = total
|
|
|
|
+ dict[key2] = hit
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+ saveArr= append(saveArr,dict)
|
|
|
|
+ }
|
|
|
|
+ tmp = make(map[string]interface{})
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ os.Remove("words.xlsx") //写excle
|
|
|
|
+ f :=xlsx.NewFile()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ for i:=0; i<5;i++ {
|
|
|
|
+ key := "word_"+fmt.Sprintf("%d",i)
|
|
|
|
+ sheet, _ := f.AddSheet("统计"+key)
|
|
|
|
+ row := sheet.AddRow()
|
|
|
|
+ row.AddCell().Value = "name"
|
|
|
|
+ row.AddCell().Value = key
|
|
|
|
+ if i!=0 {
|
|
|
|
+ row.AddCell().Value = "total"
|
|
|
|
+ row.AddCell().Value = "hit"
|
|
|
|
+ }
|
|
|
|
+ key1,key2:="total"+fmt.Sprintf("%d",i),"hit"+fmt.Sprintf("%d",i)
|
|
|
|
+
|
|
|
|
+ for _,tmp := range saveArr {
|
|
|
|
+ row = sheet.AddRow()
|
|
|
|
+ row.AddCell().SetString(tmp["name"])
|
|
|
|
+ row.AddCell().SetString(tmp[key])
|
|
|
|
+ row.AddCell().SetString(fmt.Sprintf("%s",tmp[key1]))
|
|
|
|
+ row.AddCell().SetString(fmt.Sprintf("%s",tmp[key2]))
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ err := f.Save("words.xlsx")
|
|
|
|
+ if err != nil {
|
|
|
|
+ log.Println("保存xlsx失败:", err)
|
|
|
|
+ }else {
|
|
|
|
+ log.Println("保存xlsx成功:", err)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func main() {
|
|
|
|
+
|
|
|
|
+ //导出xlsx
|
|
|
|
+ dealWithDataXlsx()
|
|
|
|
+ return
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ log.Println("处理 ... 指定企业名称 ...")
|
|
|
|
+
|
|
|
|
+ //分析错误数据
|
|
|
|
+ //
|
|
|
|
+ q := map[string]interface{}{}
|
|
|
|
+ sess := save_mgo.GetMgoConn()
|
|
|
|
+ defer save_mgo.DestoryMongoConn(sess)
|
|
|
|
+ //细节才需要遍历
|
|
|
|
+ it := sess.DB(save_mgo.DbName).C("zk_company_test").Find(&q).Iter()
|
|
|
|
+ total:=0
|
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
|
|
|
|
+ if total % 10000 == 0 {
|
|
|
|
+ log.Println("current index",total,tmp["_id"])
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ name:=qu.ObjToString(tmp["name"])
|
|
|
|
+ save_dict := make(map[string]interface{},0)
|
|
|
|
+ for i:=0; i<5;i++ {
|
|
|
|
+ key := "word_"+fmt.Sprintf("%d",i)
|
|
|
|
+ dataArr :=dealWithScoreRules(name,i)
|
|
|
|
+ if dataArr ==nil || len(dataArr)<1 {
|
|
|
|
+ //无数据
|
|
|
|
+ }else {
|
|
|
|
+ save_dict[key] = dealWithWordsRules(name,dataArr,i)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if len(save_dict)>0 {
|
|
|
|
+ save_dict["name"] = name
|
|
|
|
+ save_mgo.Save("zk_test_words",save_dict)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ tmp = make(map[string]interface{})
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+//分数维度
|
|
|
|
+func dealWithScoreRules(name string,space int) []map[string]interface{} {
|
|
|
|
+ key := ""
|
|
|
|
+ if space>0&&space<5{
|
|
|
|
+ key = fmt.Sprintf("%d",space)
|
|
|
|
+ }
|
|
|
|
+ query:= `{"query":{"bool":{"must":[{"query_string":{"default_field":"azktest.name_`+key+`","query":"`+name+`"}}],"must_not":[],"should":[]}},"from":0,"size":3,"sort":[],"facets":{}}`
|
|
|
|
+
|
|
|
|
+ if key=="" {
|
|
|
|
+ query = `{"query":{"bool":{"must":[{"query_string":{"default_field":"azktest.name","query":"`+name+`"}}],"must_not":[],"should":[]}},"from":0,"size":3,"sort":[],"facets":{}}`
|
|
|
|
+ }
|
|
|
|
+ client := elastic.GetEsConn()
|
|
|
|
+ defer elastic.DestoryEsConn(client)
|
|
|
|
+ searchResult, err := client.Search().Index("azktest").Type("azktest").Source(query).Do()
|
|
|
|
+ if err != nil {
|
|
|
|
+ log.Println("从ES查询出错", err.Error())
|
|
|
|
+ return nil
|
|
|
|
+ }
|
|
|
|
+ resNum := len(searchResult.Hits.Hits)
|
|
|
|
+ res := make([]map[string]interface{}, resNum)
|
|
|
|
+ if searchResult.Hits != nil {
|
|
|
|
+ if resNum < 5000 {
|
|
|
|
+ for i, hit := range searchResult.Hits.Hits {
|
|
|
|
+ data := make(map[string]interface{},0)
|
|
|
|
+ json.Unmarshal(*hit.Source, &data)
|
|
|
|
+ res[i] = map[string]interface{}{
|
|
|
|
+ "name":data["name"],
|
|
|
|
+ "score":*hit.Score,
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ log.Println("查询结果太多,查询到:", resNum, "条")
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ }
|
|
|
|
+ return res
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+//击中数量以及比例
|
|
|
|
+func dealWithWordsRules(name string ,source []map[string]interface{},space int) []map[string]interface{} {
|
|
|
|
+
|
|
|
|
+ nameArr,_ := calculateWordCount(name,space)
|
|
|
|
+ newArr := make([]map[string]interface{},0)
|
|
|
|
+ for _,v := range source {
|
|
|
|
+ total,hit :=0,0
|
|
|
|
+ source_name :=qu.ObjToString(v["name"])
|
|
|
|
+ _,total = calculateWordCount(source_name,space)
|
|
|
|
+ for _,v1 := range nameArr {
|
|
|
|
+ if strings.Contains(source_name,v1) {
|
|
|
|
+ hit++
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ if space==0 {
|
|
|
|
+ newArr = append(newArr, map[string]interface{}{
|
|
|
|
+ "name":source_name,
|
|
|
|
+ "score":qu.Float64All(v["score"]),
|
|
|
|
+ })
|
|
|
|
+ }else {
|
|
|
|
+ newArr = append(newArr, map[string]interface{}{
|
|
|
|
+ "name":source_name,
|
|
|
|
+ "score":qu.Float64All(v["score"]),
|
|
|
|
+ "all_words" : total,
|
|
|
|
+ "hit_words" : hit,
|
|
|
|
+ })
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return newArr
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
// calculateWordCount is the word-segmentation step: it slides a window of
// space runes over name, one rune at a time, and returns every window as a
// string together with the number of windows.
//
// An empty, non-nil slice and 0 are returned when name is empty, space is not
// positive, or name has fewer than space runes.
func calculateWordCount(name string, space int) ([]string, int) {
	grams := []string{}
	runeLen := utf8.RuneCountInString(name)

	switch {
	case name == "", space <= 0, runeLen < space:
		return grams, 0
	}

	chars := []rune(name)
	for lo, hi := 0, space; hi <= runeLen; lo, hi = lo+1, hi+1 {
		grams = append(grams, string(chars[lo:hi]))
	}
	return grams, len(grams)
}
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+func readyDataEs() {
|
|
|
|
+
|
|
|
|
+ q := map[string]interface{}{}
|
|
|
|
+ sess := save_mgo.GetMgoConn()
|
|
|
|
+ defer save_mgo.DestoryMongoConn(sess)
|
|
|
|
+ //多线程升索引
|
|
|
|
+ pool_es := make(chan bool, 10)
|
|
|
|
+ wg_es := &sync.WaitGroup{}
|
|
|
|
+ //细节才需要遍历
|
|
|
|
+ it := sess.DB(save_mgo.DbName).C("zk_company_name").Find(&q).Iter()
|
|
|
|
+ total:=0
|
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
|
|
|
|
+ if total % 10000 == 0 {
|
|
|
|
+ log.Println("current index",total,tmp["_id"])
|
|
|
|
+ }
|
|
|
|
+ savetmp := make(map[string]interface{}, 0)
|
|
|
|
+ savetmp["_id"] = tmp["_id"]
|
|
|
|
+ savetmp["name"] = qu.ObjToString(tmp["company_name"])
|
|
|
|
+ savetmp["name_1"] = qu.ObjToString(tmp["company_name"])
|
|
|
|
+ savetmp["name_2"] = qu.ObjToString(tmp["company_name"])
|
|
|
|
+ savetmp["name_3"] = qu.ObjToString(tmp["company_name"])
|
|
|
|
+ savetmp["name_4"] = qu.ObjToString(tmp["company_name"])
|
|
|
|
+ pool_es <- true
|
|
|
|
+ wg_es.Add(1)
|
|
|
|
+ go func(savetmp map[string]interface{}) {
|
|
|
|
+ defer func() {
|
|
|
|
+ <-pool_es
|
|
|
|
+ wg_es.Done()
|
|
|
|
+ }()
|
|
|
|
+ elastic.Save("azktest","azktest", savetmp)
|
|
|
|
+ }(savetmp)
|
|
|
|
+ tmp = make(map[string]interface{})
|
|
|
|
+ }
|
|
|
|
+ wg_es.Wait()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ log.Println("is over",total)
|
|
|
|
+}
|
|
|
|
+
|