// score package extract import ( "fmt" ju "jy/util" "log" qu "qfw/util" "regexp" "strconv" "strings" "sync" "unicode/utf8" ) var ( lockscore sync.RWMutex SoreConfig map[string]map[string]interface{} TagConfig map[string]map[string]float64 TagConfigDesc map[string]string RepeatScore, BlockScore float64 CommonScore map[string]float64 FieldsScore map[string]map[string]float64 ) func init() { qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc) qu.ReadConfig("./res/tagscore.json", &TagConfig) qu.ReadConfig("./res/fieldscore.json", &SoreConfig) if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok { RepeatScore = qu.Float64All(repeat["score"]) } if block, ok := SoreConfig["other"]["block"].(map[string]interface{}); ok { BlockScore = qu.Float64All(block["score"]) } //通用抽取属性打分配置 if tmp, ok := SoreConfig["extractype"]["common"].(map[string]interface{}); ok { CommonScore = map[string]float64{} for k, v := range tmp { CommonScore[k] = qu.Float64All(v) } } //指定抽取属性打分配置 if tmp, ok := SoreConfig["extractype"]["fields"].(map[string]interface{}); ok { FieldsScore = map[string]map[string]float64{} for key, fieldmap := range tmp { fieldscore := map[string]float64{} if field, ok := fieldmap.(map[string]interface{}); ok { for k, score := range field { fieldscore[k] = qu.Float64All(score) } } FieldsScore[key] = fieldscore } } //实例化正则 for _, tmp := range SoreConfig { //log.Println(tmp) if tmp["type"] == "string" { if positions, ok := tmp["positivewords"].([]interface{}); ok { for _, position := range positions { if p, ok := position.(map[string]interface{}); ok { qu.Try(func() { strReq, _ := p["regstr"].(string) if strings.Contains(strReq, "\\u") { strReq = strings.Replace(strReq, "\\", "\\\\", -1) strReq = strings.Replace(strReq, "\\\\u", "\\u", -1) strReq, _ = strconv.Unquote(`"` + strReq + `"`) p["regexp"] = regexp.MustCompile(strReq) } else { p["regexp"] = regexp.MustCompile(strReq) } }, func(err interface{}) { log.Println(err) }) } } } if positions, ok := tmp["negativewords"].([]interface{}); ok { for _, position := range positions { if p, ok := position.(map[string]interface{}); ok { qu.Try(func() { strReq, _ := p["regstr"].(string) if strings.Contains(strReq, "\\u") { strReq = strings.Replace(strReq, "\\", "\\\\", -1) strReq = strings.Replace(strReq, "\\\\u", "\\u", -1) strReq, _ = strconv.Unquote(`"` + strReq + `"`) p["regexp"] = regexp.MustCompile(strReq) } else { p["regexp"] = regexp.MustCompile(strReq) } }, func(err interface{}) { log.Println(err) }) } } } } } } //结果打分 func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField { qu.Catch() result := j.Result for field, tmps := range result { if field == "projectcode" { tmps = projectWeightClear(tmps) } locktag.Lock() taglength := len(ftag[field]) locktag.Unlock() for tmpsindex, tmpsvalue := range tmps { //没有抽取到值,不打分 if string_value := fmt.Sprint(tmpsvalue.Value); string_value == "" || string_value == "0" || string_value == "" { tmps[tmpsindex].Score = -10 tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10}) continue } lockscore.Lock() describe := qu.ObjToString(SoreConfig["extractype"]["describe"]) lockscore.Unlock() //是否有段标签 if len(tmpsvalue.BlockTag) > 0 { //有标签段 var qz float64 = 0.0 //取权重最高的 for key := range tmpsvalue.BlockTag { //key = "其他"//TODO 测试用 lockscore.Lock() if TagConfig[key][field] > qz { qz = TagConfig[key][field] } lockscore.Unlock() } tmps[tmpsindex].Score += ju.FloatFormat(BlockScore*qz, 4) //乘以权重系数 tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz}) } else { //没有段标签,走其他 //qz := TagConfig["其他"][field] //tmps[tmpsindex].Score += 2 * qz //乘以权重系数 } //抽取类型打分 lockscore.Lock() fieldscore := FieldsScore[field] typescore := float64(0) titlescore := float64(0) if fieldscore != nil { //指定抽取属性打分配置 if tmpsvalue.ExtFrom == "title" { //标题打分初始化 titlescore = fieldscore["title"] } typescore = fieldscore[tmpsvalue.Type] } else { //通用抽取属性打分配置 if tmpsvalue.ExtFrom == "title" { //标题打分初始化 titlescore = CommonScore["title"] } typescore = CommonScore[tmpsvalue.Type] } lockscore.Unlock() if titlescore > 0 { tmps[tmpsindex].Score += titlescore tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore}) } tmps[tmpsindex].Score += typescore tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: typescore}) //kv权重打分 if fieldscore != nil { //指定抽取属性打分配置 if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" { if taglength == 0 { continue } weightscore := ju.FloatFormat(float64(qu.Float64All(fieldscore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4) tmps[tmpsindex].Score += weightscore tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore}) } else { //正则权重,暂不考虑 } } else { if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" { if taglength == 0 { continue } weightscore := ju.FloatFormat(float64(qu.Float64All(CommonScore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4) tmps[tmpsindex].Score += weightscore tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore}) } else { //正则权重,暂不考虑 } } lockscore.Lock() scoreRule := SoreConfig[field] lockscore.Unlock() if scoreRule == nil { continue } //配置打分 if scoreRule["type"] == "string" { //1.长度打分 valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value)) if valueLen < 1 { tmps[tmpsindex].Score = -10 tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen < 1 && field != "projectscope"直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10}) continue } if valueLen > 100 && field != "projectscope" { tmps[tmpsindex].Score = -99 tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Value: tmpsvalue.Value, Score: -99}) } if lengths, ok := scoreRule["length"].([]interface{}); ok { for _, tmp := range lengths { if length, ok := tmp.(map[string]interface{}); ok { if ranges, ok := length["range"].([]interface{}); ok { gt := qu.IntAll(ranges[0]) lte := qu.IntAll(ranges[1]) if lte < 0 { //∞ lte = 999999 } score := qu.Float64All(ranges[2]) if valueLen > gt && valueLen <= lte { tmps[tmpsindex].Score += score tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(gt, "<", valueLen, "<=", lte), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: score}) break } } } } } //2.负面词打分 if positions, ok := scoreRule["negativewords"].([]interface{}); ok { for _, position := range positions { if p, ok := position.(map[string]interface{}); ok { qu.Try(func() { if p["regexp"] != nil { reg := p["regexp"].(*regexp.Regexp) if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) { tmps[tmpsindex].Score += qu.Float64All(p["score"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: "negativewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.negativewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])}) } } }, func(err interface{}) { log.Println(err) }) } } } //3.正面词打分 if positions, ok := scoreRule["positivewords"].([]interface{}); ok { for _, position := range positions { if p, ok := position.(map[string]interface{}); ok { qu.Try(func() { if p["regexp"] != nil { reg := p["regexp"].(*regexp.Regexp) if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) { tmps[tmpsindex].Score += qu.Float64All(p["score"]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: "positivewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.positivewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])}) } } }, func(err interface{}) { log.Println(err) }) } } } } //4.数据范围打分 if scoreRule["type"] == "float" { min := qu.IntAll(scoreRule["min"]) max := qu.IntAll(scoreRule["max"]) val := qu.IntAll(tmpsvalue.Value) scores, _ := scoreRule["score"].([]interface{}) if len(scores) < 3 || val == 0 { continue } if val < min && 0 < val { tmps[tmpsindex].Score += qu.Float64All(scores[0]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])}) } else if val > max { tmps[tmpsindex].Score += qu.Float64All(scores[2]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])}) } else if val <= max && val >= min { tmps[tmpsindex].Score += qu.Float64All(scores[1]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])}) } } //其他打分配置 // decimal if scoreRule["type"] == "decimal" { min := qu.IntAll(scoreRule["min"]) max := qu.IntAll(scoreRule["max"]) val := qu.IntAll(tmpsvalue.Value) scores, _ := scoreRule["score"].([]interface{}) if len(scores) < 3 { continue } if val > max { tmps[tmpsindex].Score += qu.Float64All(scores[2]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])}) } else if val <= max && val > min { tmps[tmpsindex].Score += qu.Float64All(scores[1]) tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])}) } } } //计算重复值,并加分=重复数量*乘系数 valrepeat := map[string]int{} for _, v := range tmps { valrepeat[fmt.Sprint(v.Value)] += 1 } for index, v := range tmps { v.ValRepeat = valrepeat[fmt.Sprint(v.Value)] - 1 if v.ValRepeat > 0 { score := RepeatScore * float64(v.ValRepeat) v.Score += score tmps[index].ScoreItem = append(tmps[index].ScoreItem, &ju.ScoreItem{Des: "重复次数打分repeat", Code: field + ".repeat", RuleText: "repeat:" + fmt.Sprint(v.ValRepeat), ScoreFrom: "fieldscore.json." + field, Value: v.Value, Score: score}) } v.Score = ju.FloatFormat(v.Score, 4) } } return result } //项目编号权重清理 func projectWeightClear(tmps []*ju.ExtField) []*ju.ExtField { newList := make([]*ju.ExtField, 0) if len(tmps) < 1 { return newList } ju.Sort(tmps) tmpWeight := -999 //记录最大权重 tmpIndex := -999 //记录最大权重下标 vmap := make(map[string]int, 0) for i, v := range tmps { if v.Weight == 0 { newList = append(newList, v) continue } else if v.Weight > tmpWeight { tmpWeight = v.Weight tmpIndex = i } else if v.Weight == tmpWeight { if utf8.RuneCountInString(qu.ObjToString(v.Value)) >= 5 && utf8.RuneCountInString(qu.ObjToString(v.Value)) <= 38 && v.Value != tmps[tmpIndex].Value { vmap[qu.ObjToString(v.Value)] = i } } } if tmpIndex != -999 { newList = append(newList, tmps[tmpIndex]) } if len(vmap) > 0 { for _, v := range vmap { newList = append(newList, tmps[v]) } } return newList }