123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326 |
- // score
- package extract
- import (
- "fmt"
- ju "jy/util"
- "log"
- qu "qfw/util"
- "regexp"
- "strconv"
- "strings"
- "unicode/utf8"
- )
// Scoring configuration shared by init and ScoreFields. All of it is filled
// in by init from the JSON files under ./res.

// SoreConfig holds the contents of ./res/fieldscore.json keyed by section or
// field name. (The spelling is kept as-is because the name is exported.)
var SoreConfig map[string]map[string]interface{}

// TagConfig holds ./res/tagscore.json: block tag -> field name -> weight.
var TagConfig map[string]map[string]float64

// TagConfigDesc holds ./res/tagscoredesc.json.
var TagConfigDesc map[string]string

// RepeatScore is the score added per duplicate of a candidate value; it is
// read from SoreConfig["other"]["repeat"]["score"].
var RepeatScore float64

// BlockScore is the base score multiplied by the block-tag weight; it is
// read from SoreConfig["other"]["block"]["score"].
var BlockScore float64

// CommonScore is the default score table from
// SoreConfig["extractype"]["common"].
var CommonScore map[string]float64

// FieldsScore holds the per-field score tables from
// SoreConfig["extractype"]["fields"], keyed by field name.
var FieldsScore map[string]map[string]float64
- func init() {
- qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
- qu.ReadConfig("./res/tagscore.json", &TagConfig)
- qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
- if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok {
- RepeatScore = qu.Float64All(repeat["score"])
- }
- if block, ok := SoreConfig["other"]["block"].(map[string]interface{}); ok {
- BlockScore = qu.Float64All(block["score"])
- }
- //通用抽取属性打分配置
- if tmp, ok := SoreConfig["extractype"]["common"].(map[string]interface{}); ok {
- CommonScore = map[string]float64{}
- for k, v := range tmp {
- CommonScore[k] = qu.Float64All(v)
- }
- }
- log.Println(CommonScore)
- //指定抽取属性打分配置
- if tmp, ok := SoreConfig["extractype"]["fields"].(map[string]interface{}); ok {
- FieldsScore = map[string]map[string]float64{}
- for key, fieldmap := range tmp {
- fieldscore := map[string]float64{}
- if field, ok := fieldmap.(map[string]interface{}); ok {
- for k, score := range field {
- fieldscore[k] = qu.Float64All(score)
- }
- }
- FieldsScore[key] = fieldscore
- }
- }
- log.Println(FieldsScore)
- //实例化正则
- for _, tmp := range SoreConfig {
- //log.Println(tmp)
- if tmp["type"] == "string" {
- if positions, ok := tmp["positivewords"].([]interface{}); ok {
- for _, position := range positions {
- if p, ok := position.(map[string]interface{}); ok {
- qu.Try(func() {
- strReq, _ := p["regstr"].(string)
- if strings.Contains(strReq, "\\u") {
- strReq = strings.Replace(strReq, "\\", "\\\\", -1)
- strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
- strReq, _ = strconv.Unquote(`"` + strReq + `"`)
- p["regexp"] = regexp.MustCompile(strReq)
- } else {
- p["regexp"] = regexp.MustCompile(strReq)
- }
- }, func(err interface{}) {
- log.Println(err)
- })
- }
- }
- }
- if positions, ok := tmp["negativewords"].([]interface{}); ok {
- for _, position := range positions {
- if p, ok := position.(map[string]interface{}); ok {
- qu.Try(func() {
- strReq, _ := p["regstr"].(string)
- if strings.Contains(strReq, "\\u") {
- strReq = strings.Replace(strReq, "\\", "\\\\", -1)
- strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
- strReq, _ = strconv.Unquote(`"` + strReq + `"`)
- p["regexp"] = regexp.MustCompile(strReq)
- } else {
- p["regexp"] = regexp.MustCompile(strReq)
- }
- }, func(err interface{}) {
- log.Println(err)
- })
- }
- }
- }
- if winnerorders, ok := tmp["winnerorder"].([]interface{}); ok {
- for _, winnerorder := range winnerorders {
- if p, ok := winnerorder.(map[string]interface{}); ok {
- qu.Try(func() {
- strReq, _ := p["regstr"].(string)
- if strings.Contains(strReq, "\\u") {
- strReq = strings.Replace(strReq, "\\", "\\\\", -1)
- strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
- strReq, _ = strconv.Unquote(`"` + strReq + `"`)
- p["regexp"] = regexp.MustCompile(strReq)
- } else {
- p["regexp"] = regexp.MustCompile(strReq)
- }
- }, func(err interface{}) {
- log.Println(err)
- })
- }
- }
- }
- }
- }
- }
- //结果打分
- func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
- qu.Catch()
- result := j.Result
- for field, tmps := range result {
- for tmpsindex, tmpsvalue := range tmps {
- describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
- if tmpsvalue.ExtFrom == "title" { //标题打分初始化
- tmps[tmpsindex].Score += CommonScore["title"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
- }
- //是否有段标签
- if len(tmpsvalue.BlockTag) > 0 {
- //有标签段
- var qz float64 = 0.0 //取权重最高的
- for key := range tmpsvalue.BlockTag {
- //key = "其他"//TODO 测试用
- if TagConfig[key][field] > qz {
- qz = TagConfig[key][field]
- }
- }
- tmps[tmpsindex].Score += BlockScore * qz //乘以权重系数
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
- } else {
- //没有段标签,走其他
- //qz := TagConfig["其他"][field]
- //tmps[tmpsindex].Score += 2 * qz //乘以权重系数
- }
- //抽取类型打分
- if FieldsScore[field] != nil { //指定抽取属性打分配置
- fieldscore := FieldsScore[field]
- if strings.Contains(tmpsvalue.Type, "colon") {
- tmps[tmpsindex].Score += fieldscore["colon"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["colon"]})
- } else if strings.Contains(tmpsvalue.Type, "space") {
- tmps[tmpsindex].Score += fieldscore["space"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["space"]})
- } else if strings.Contains(tmpsvalue.Type, "table") {
- tmps[tmpsindex].Score += fieldscore["table"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["table"]})
- } else if strings.Contains(tmpsvalue.Type, "regexp") {
- tmps[tmpsindex].Score += fieldscore["regexp"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["regexp"]})
- }
- } else { //通用抽取属性打分配置
- if strings.Contains(tmpsvalue.Type, "colon") {
- tmps[tmpsindex].Score += CommonScore["colon"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["colon"]})
- } else if strings.Contains(tmpsvalue.Type, "space") {
- tmps[tmpsindex].Score += CommonScore["space"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["space"]})
- } else if strings.Contains(tmpsvalue.Type, "table") {
- tmps[tmpsindex].Score += CommonScore["table"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["table"]})
- } else if strings.Contains(tmpsvalue.Type, "regexp") {
- tmps[tmpsindex].Score += CommonScore["regexp"]
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["regexp"]})
- }
- }
- scoreRule := SoreConfig[field]
- if scoreRule == nil {
- continue
- }
- //配置打分
- if scoreRule["type"] == "string" {
- //1.长度打分
- valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
- if valueLen < 1 {
- continue
- }
- if valueLen > 100 && field != "projectscope" {
- tmps[tmpsindex].Score = -99
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Value: tmpsvalue.Value, Score: -99})
- }
- if lengths, ok := scoreRule["length"].([]interface{}); ok {
- for _, tmp := range lengths {
- if length, ok := tmp.(map[string]interface{}); ok {
- if ranges, ok := length["range"].([]interface{}); ok {
- gt := qu.IntAll(ranges[0])
- lte := qu.IntAll(ranges[1])
- if lte < 0 { //∞
- lte = 999999
- }
- score := qu.Float64All(ranges[2])
- if valueLen > gt && valueLen <= lte {
- tmps[tmpsindex].Score += score
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(gt, "<", valueLen, "<=", lte), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: score})
- break
- }
- }
- }
- }
- }
- //2.负面词打分
- if positions, ok := scoreRule["negativewords"].([]interface{}); ok {
- for _, position := range positions {
- if p, ok := position.(map[string]interface{}); ok {
- qu.Try(func() {
- if p["regexp"] != nil {
- reg := p["regexp"].(*regexp.Regexp)
- if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
- tmps[tmpsindex].Score += qu.Float64All(p["score"])
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: "negativewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.negativewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
- }
- }
- }, func(err interface{}) {
- log.Println(err)
- })
- }
- }
- }
- //3.正面词打分
- if positions, ok := scoreRule["positivewords"].([]interface{}); ok {
- for _, position := range positions {
- if p, ok := position.(map[string]interface{}); ok {
- qu.Try(func() {
- if p["regexp"] != nil {
- reg := p["regexp"].(*regexp.Regexp)
- if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
- tmps[tmpsindex].Score += qu.Float64All(p["score"])
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: "positivewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.positivewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
- }
- }
- }, func(err interface{}) {
- log.Println(err)
- })
- }
- }
- }
- //4.中标候选人打分
- if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
- for _, winnerorder := range winnerorders {
- if p, ok := winnerorder.(map[string]interface{}); ok {
- qu.Try(func() {
- if p["regexp"] != nil {
- reg := p["regexp"].(*regexp.Regexp)
- if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
- tmps[tmpsindex].Score += qu.Float64All(p["score"])
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder" + fmt.Sprint(p["describe"]), Code: "winnerorder", RuleText: reg.String(), ScoreFrom: "fieldscore.json.winnerorder", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
- }
- }
- }, func(err interface{}) {
- log.Println(err)
- })
- }
- }
- }
- }
- //5.数据范围打分
- if scoreRule["type"] == "float" {
- min := qu.IntAll(scoreRule["min"])
- max := qu.IntAll(scoreRule["max"])
- val := qu.IntAll(tmpsvalue.Value)
- scores, _ := scoreRule["score"].([]interface{})
- if len(scores) < 3 {
- continue
- }
- if val < min && 0 < val {
- tmps[tmpsindex].Score += qu.Float64All(scores[0])
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
- } else if val > max {
- tmps[tmpsindex].Score += qu.Float64All(scores[2])
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
- } else if val <= max && val >= min {
- tmps[tmpsindex].Score += qu.Float64All(scores[1])
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
- }
- }
- //其他打分配置
- // decimal
- if scoreRule["type"] == "decimal" {
- min := qu.IntAll(scoreRule["min"])
- max := qu.IntAll(scoreRule["max"])
- val := qu.IntAll(tmpsvalue.Value)
- scores, _ := scoreRule["score"].([]interface{})
- if len(scores) < 3 {
- continue
- }
- if val > max {
- tmps[tmpsindex].Score += qu.Float64All(scores[2])
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
- } else if val <= max && val > min {
- tmps[tmpsindex].Score += qu.Float64All(scores[1])
- tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
- }
- }
- }
- //计算重复值,并加分=重复数量*乘系数
- valrepeat := map[string]int{}
- for _, v := range tmps {
- valrepeat[fmt.Sprint(v.Value)] += 1
- }
- for index, v := range tmps {
- v.ValRepeat = valrepeat[fmt.Sprint(v.Value)] - 1
- if v.ValRepeat > 0 {
- score := RepeatScore * float64(v.ValRepeat)
- v.Score += score
- tmps[index].ScoreItem = append(tmps[index].ScoreItem, &ju.ScoreItem{Des: "重复次数打分repeat", Code: field + ".repeat", RuleText: "repeat:" + fmt.Sprint(v.ValRepeat), ScoreFrom: "fieldscore.json." + field, Value: v.Value, Score: score})
- }
- }
- }
- return result
- }
|