// score package extract import ( "fmt" ju "jy/util" "log" qu "qfw/util" "regexp" "strconv" "strings" "unicode/utf8" ) var SoreConfig map[string]map[string]interface{} var TagConfig map[string]map[string]float64 func init() { qu.ReadConfig("./res/tagscore.json", &TagConfig) qu.ReadConfig("./res/fieldscore.json", &SoreConfig) //实例化正则 for _, tmp := range SoreConfig { //log.Println(tmp) if tmp["type"] == "string" { if positions, ok := tmp["positivewords"].([]interface{}); ok { for _, position := range positions { if p, ok := position.(map[string]interface{}); ok { qu.Try(func() { strReq, _ := p["regstr"].(string) if strings.Contains(strReq, "\\u") { strReq = strings.Replace(strReq, "\\", "\\\\", -1) strReq = strings.Replace(strReq, "\\\\u", "\\u", -1) strReq, _ = strconv.Unquote(`"` + strReq + `"`) p["regexp"] = regexp.MustCompile(strReq) } else { p["regexp"] = regexp.MustCompile(strReq) } }, func(err interface{}) { log.Println(err) }) } } } if positions, ok := tmp["negativewords"].([]interface{}); ok { for _, position := range positions { if p, ok := position.(map[string]interface{}); ok { qu.Try(func() { strReq, _ := p["regstr"].(string) if strings.Contains(strReq, "\\u") { strReq = strings.Replace(strReq, "\\", "\\\\", -1) strReq = strings.Replace(strReq, "\\\\u", "\\u", -1) strReq, _ = strconv.Unquote(`"` + strReq + `"`) p["regexp"] = regexp.MustCompile(strReq) } else { p["regexp"] = regexp.MustCompile(strReq) } }, func(err interface{}) { log.Println(err) }) } } } if winnerorders, ok := tmp["winnerorder"].([]interface{}); ok { for _, winnerorder := range winnerorders { if p, ok := winnerorder.(map[string]interface{}); ok { qu.Try(func() { strReq, _ := p["regstr"].(string) if strings.Contains(strReq, "\\u") { strReq = strings.Replace(strReq, "\\", "\\\\", -1) strReq = strings.Replace(strReq, "\\\\u", "\\u", -1) strReq, _ = strconv.Unquote(`"` + strReq + `"`) p["regexp"] = regexp.MustCompile(strReq) } else { p["regexp"] = regexp.MustCompile(strReq) } }, func(err interface{}) { log.Println(err) }) } } } } } } //结果打分 func ScoreFields(j *ju.Job) map[string][]*ju.ExtField { result := j.Result qu.Catch() for field, tmps := range result { for tmpsindex, tmpsvalue := range tmps { //是否有段标签 if len(tmpsvalue.BlockTag) > 0 { //有标签段 var qz float64 = 0.0 //取权重最高的 var tgk string for key := range tmpsvalue.BlockTag { //key = "其他"//TODO 测试用 if TagConfig[key][field] > qz { qz = TagConfig[key][field] tgk = key } } tmps[tmpsindex].Score += 2 * qz //乘以权重系数 tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", Type: tgk + field, ExtFrom: "tagscore.json", Value: tmpsvalue.Value, Score: 2 * qz}) } else { //没有段标签,走其他 //qz := TagConfig["其他"][field] //tmps[tmpsindex].Score += 2 * qz //乘以权重系数 } if tmpsvalue.ExtFrom != "title" { //非标题抽取 //是否有kv值 if strings.Contains(tmpsvalue.Type, "colon") { tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "colonkv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])}) } else if strings.Contains(tmpsvalue.Type, "space") { tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "spacekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])}) } else if strings.Contains(tmpsvalue.Type, "table") { tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "tablekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])}) } } if tmpsvalue.ExtFrom != "title" { //非标题抽取 if strings.Contains(tmpsvalue.Type, "regexp") { tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])}) } } else { if strings.Contains(tmpsvalue.Type, "regexp") { tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1 tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1}) } } scoreRule := SoreConfig[field] if scoreRule == nil { continue } if scoreRule["type"] == "string" { //1.长度打分 valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value)) if valueLen < 1 { continue } if valueLen > 100 && field != "projectscope" { tmps[tmpsindex].Score = -99 tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Type: "length", Value: tmpsvalue.Value, Score: -99}) } if lengths, ok := scoreRule["length"].([]interface{}); ok { for _, tmp := range lengths { if length, ok := tmp.(map[string]interface{}); ok { min := qu.IntAll(length["min"]) max := qu.IntAll(length["max"]) scores, _ := length["score"].([]interface{}) if len(scores) < 3 { continue } if valueLen < min { tmps[tmpsindex].Score += qu.Float64All(scores[0]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, "<", min), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])}) } else if valueLen > max { tmps[tmpsindex].Score += qu.Float64All(scores[2]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", max), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])}) } else { tmps[tmpsindex].Score += qu.Float64All(scores[1]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", min, "&&", valueLen, "<", max), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])}) } } } } //2.负面词打分 if positions, ok := scoreRule["negativewords"].([]interface{}); ok { for _, position := range positions { if p, ok := position.(map[string]interface{}); ok { qu.Try(func() { if p["regexp"] != nil { reg := p["regexp"].(*regexp.Regexp) if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) { tmps[tmpsindex].Score += qu.Float64All(p["score"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: field + ".negativewords", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])}) } } }, func(err interface{}) { log.Println(err) }) } } } //3.正面词打分 if positions, ok := scoreRule["positivewords"].([]interface{}); ok { for _, position := range positions { if p, ok := position.(map[string]interface{}); ok { qu.Try(func() { if p["regexp"] != nil { reg := p["regexp"].(*regexp.Regexp) if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) { tmps[tmpsindex].Score += qu.Float64All(p["score"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: field + ".positivewords", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])}) } } }, func(err interface{}) { log.Println(err) }) } } } //4.位置打分 if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok { for _, winnerorder := range winnerorders { if p, ok := winnerorder.(map[string]interface{}); ok { qu.Try(func() { if p["regexp"] != nil { reg := p["regexp"].(*regexp.Regexp) if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) { tmps[tmpsindex].Score += qu.Float64All(p["score"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder" + fmt.Sprint(p["describe"]), Code: field + ".winnerorder", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])}) } } }, func(err interface{}) { log.Println(err) }) } } } } //5.数据范围打分 if scoreRule["type"] == "float" { min := qu.IntAll(scoreRule["min"]) max := qu.IntAll(scoreRule["max"]) val := qu.IntAll(tmpsvalue.Value) scores, _ := scoreRule["score"].([]interface{}) if len(scores) < 3 { continue } if val < min && 0 < val { tmps[tmpsindex].Score += qu.Float64All(scores[0]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])}) } else if val > max { tmps[tmpsindex].Score += qu.Float64All(scores[2]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])}) } else if val <= max && val >= min { tmps[tmpsindex].Score += qu.Float64All(scores[1]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])}) } } //其他打分配置 // decimal if scoreRule["type"] == "decimal" { min := qu.IntAll(scoreRule["min"]) max := qu.IntAll(scoreRule["max"]) val := qu.IntAll(tmpsvalue.Value) scores, _ := scoreRule["score"].([]interface{}) if len(scores) < 3 { continue } if val > max { tmps[tmpsindex].Score += qu.Float64All(scores[2]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])}) } else if val <= max && val > min { tmps[tmpsindex].Score += qu.Float64All(scores[1]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])}) } } } } return result }