123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304 |
- package main
- import (
- "encoding/json"
- "fmt"
- "github.com/tealeg/xlsx"
- "log"
- "os"
- qu "qfw/util"
- "qfw/util/elastic"
- "strings"
- "sync"
- "unicode/utf8"
- "go.mongodb.org/mongo-driver/bson/primitive"
- )
- var (
- sysconfig map[string]interface{} //配置文件
- save_mgo *MongodbSim
- )
- func init() {
- save_mgo = &MongodbSim{
- MongodbAddr: "192.168.3.207:27092",
- DbName: "zhengkun",
- Size: 5,
- }
- save_mgo.InitPool()
- elastic.InitElasticSize("http://192.168.3.11:9800",20)
- }
- func dealWithDataXlsx() {
- q := map[string]interface{}{}
- sess := save_mgo.GetMgoConn()
- defer save_mgo.DestoryMongoConn(sess)
- it := sess.DB(save_mgo.DbName).C("zk_test_words").Find(&q).Iter()
- total:=0
- saveArr := make([]map[string]string,0)
- for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
- if total % 10000 == 0 {
- log.Println("current index",total,tmp["_id"])
- }
- if total % 30 ==0 {
- name:=qu.ObjToString(tmp["name"])
- dict := make(map[string]string)
- dict["name"] = name
- for i:=0; i<5;i++ {
- value,total,hit :="","",""
- key := "word_"+fmt.Sprintf("%d",i)
- if tmp[key]!=nil {
- if arr,ok := tmp[key].(primitive.A);ok {
- dataArr :=qu.ObjArrToMapArr(arr)
- value =qu.ObjToString(dataArr[0]["name"])
- if i!=0 {
- total = fmt.Sprintf("%d",dataArr[0]["all_words"])
- hit = fmt.Sprintf("%d",dataArr[0]["hit_words"])
- }
- }
- }
- key1,key2:="total"+fmt.Sprintf("%d",i),"hit"+fmt.Sprintf("%d",i)
- dict[key] = value
- dict[key1] = total
- dict[key2] = hit
- }
- saveArr= append(saveArr,dict)
- }
- tmp = make(map[string]interface{})
- }
- os.Remove("words.xlsx") //写excle
- f :=xlsx.NewFile()
- for i:=0; i<5;i++ {
- key := "word_"+fmt.Sprintf("%d",i)
- sheet, _ := f.AddSheet("统计"+key)
- row := sheet.AddRow()
- row.AddCell().Value = "name"
- row.AddCell().Value = key
- if i!=0 {
- row.AddCell().Value = "total"
- row.AddCell().Value = "hit"
- }
- key1,key2:="total"+fmt.Sprintf("%d",i),"hit"+fmt.Sprintf("%d",i)
- for _,tmp := range saveArr {
- row = sheet.AddRow()
- row.AddCell().SetString(tmp["name"])
- row.AddCell().SetString(tmp[key])
- row.AddCell().SetString(fmt.Sprintf("%s",tmp[key1]))
- row.AddCell().SetString(fmt.Sprintf("%s",tmp[key2]))
- }
- }
- err := f.Save("words.xlsx")
- if err != nil {
- log.Println("保存xlsx失败:", err)
- }else {
- log.Println("保存xlsx成功:", err)
- }
- }
- func main() {
- //导出xlsx
- dealWithDataXlsx()
- return
- defer qu.Catch()
- log.Println("处理 ... 指定企业名称 ...")
- //分析错误数据
- //
- q := map[string]interface{}{}
- sess := save_mgo.GetMgoConn()
- defer save_mgo.DestoryMongoConn(sess)
- //细节才需要遍历
- it := sess.DB(save_mgo.DbName).C("zk_company_test").Find(&q).Iter()
- total:=0
- for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
- if total % 10000 == 0 {
- log.Println("current index",total,tmp["_id"])
- }
- name:=qu.ObjToString(tmp["name"])
- save_dict := make(map[string]interface{},0)
- for i:=0; i<5;i++ {
- key := "word_"+fmt.Sprintf("%d",i)
- dataArr :=dealWithScoreRules(name,i)
- if dataArr ==nil || len(dataArr)<1 {
- //无数据
- }else {
- save_dict[key] = dealWithWordsRules(name,dataArr,i)
- }
- }
- if len(save_dict)>0 {
- save_dict["name"] = name
- save_mgo.Save("zk_test_words",save_dict)
- }
- tmp = make(map[string]interface{})
- }
- }
- //分数维度
- func dealWithScoreRules(name string,space int) []map[string]interface{} {
- key := ""
- if space>0&&space<5{
- key = fmt.Sprintf("%d",space)
- }
- query:= `{"query":{"bool":{"must":[{"query_string":{"default_field":"azktest.name_`+key+`","query":"`+name+`"}}],"must_not":[],"should":[]}},"from":0,"size":3,"sort":[],"facets":{}}`
- if key=="" {
- query = `{"query":{"bool":{"must":[{"query_string":{"default_field":"azktest.name","query":"`+name+`"}}],"must_not":[],"should":[]}},"from":0,"size":3,"sort":[],"facets":{}}`
- }
- client := elastic.GetEsConn()
- defer elastic.DestoryEsConn(client)
- searchResult, err := client.Search().Index("azktest").Type("azktest").Source(query).Do()
- if err != nil {
- log.Println("从ES查询出错", err.Error())
- return nil
- }
- resNum := len(searchResult.Hits.Hits)
- res := make([]map[string]interface{}, resNum)
- if searchResult.Hits != nil {
- if resNum < 5000 {
- for i, hit := range searchResult.Hits.Hits {
- data := make(map[string]interface{},0)
- json.Unmarshal(*hit.Source, &data)
- res[i] = map[string]interface{}{
- "name":data["name"],
- "score":*hit.Score,
- }
- }
- } else {
- log.Println("查询结果太多,查询到:", resNum, "条")
- }
- }
- return res
- }
- //击中数量以及比例
- func dealWithWordsRules(name string ,source []map[string]interface{},space int) []map[string]interface{} {
- nameArr,_ := calculateWordCount(name,space)
- newArr := make([]map[string]interface{},0)
- for _,v := range source {
- total,hit :=0,0
- source_name :=qu.ObjToString(v["name"])
- _,total = calculateWordCount(source_name,space)
- for _,v1 := range nameArr {
- if strings.Contains(source_name,v1) {
- hit++
- }
- }
- if space==0 {
- newArr = append(newArr, map[string]interface{}{
- "name":source_name,
- "score":qu.Float64All(v["score"]),
- })
- }else {
- newArr = append(newArr, map[string]interface{}{
- "name":source_name,
- "score":qu.Float64All(v["score"]),
- "all_words" : total,
- "hit_words" : hit,
- })
- }
- }
- return newArr
- }
// calculateWordCount is the tokenizer: it returns every window of `space`
// consecutive runes of name, in order, together with the number of windows.
//
// An empty name, a non-positive space, or a name shorter than space runes
// yields an empty (non-nil) slice and a count of 0.
func calculateWordCount(name string, space int) ([]string, int) {
	windows := make([]string, 0)
	if space <= 0 || name == "" {
		return windows, 0
	}

	// Number of start positions that still leave room for a full window.
	count := utf8.RuneCountInString(name) - space + 1
	if count <= 0 {
		return windows, 0
	}

	// Slice by runes, not bytes, so multi-byte characters stay intact.
	runes := []rune(name)
	for start, end := 0, space; start < count; start, end = start+1, end+1 {
		windows = append(windows, string(runes[start:end]))
	}
	return windows, len(windows)
}
- func readyDataEs() {
- q := map[string]interface{}{}
- sess := save_mgo.GetMgoConn()
- defer save_mgo.DestoryMongoConn(sess)
- //多线程升索引
- pool_es := make(chan bool, 10)
- wg_es := &sync.WaitGroup{}
- //细节才需要遍历
- it := sess.DB(save_mgo.DbName).C("zk_company_name").Find(&q).Iter()
- total:=0
- for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
- if total % 10000 == 0 {
- log.Println("current index",total,tmp["_id"])
- }
- savetmp := make(map[string]interface{}, 0)
- savetmp["_id"] = tmp["_id"]
- savetmp["name"] = qu.ObjToString(tmp["company_name"])
- savetmp["name_1"] = qu.ObjToString(tmp["company_name"])
- savetmp["name_2"] = qu.ObjToString(tmp["company_name"])
- savetmp["name_3"] = qu.ObjToString(tmp["company_name"])
- savetmp["name_4"] = qu.ObjToString(tmp["company_name"])
- pool_es <- true
- wg_es.Add(1)
- go func(savetmp map[string]interface{}) {
- defer func() {
- <-pool_es
- wg_es.Done()
- }()
- elastic.Save("azktest","azktest", savetmp)
- }(savetmp)
- tmp = make(map[string]interface{})
- }
- wg_es.Wait()
- log.Println("is over",total)
- }
|