score.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. // score
  2. package extract
  3. import (
  4. "fmt"
  5. ju "jy/util"
  6. "log"
  7. qu "qfw/util"
  8. "regexp"
  9. "strconv"
  10. "strings"
  11. "unicode/utf8"
  12. )
  13. var (
  14. SoreConfig map[string]map[string]interface{}
  15. TagConfig map[string]map[string]float64
  16. TagConfigDesc map[string]string
  17. RepeatScore, BlockScore float64
  18. CommonScore map[string]float64
  19. FieldsScore map[string]map[string]float64
  20. )
  21. func init() {
  22. qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
  23. qu.ReadConfig("./res/tagscore.json", &TagConfig)
  24. qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
  25. if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok {
  26. RepeatScore = qu.Float64All(repeat["score"])
  27. }
  28. if block, ok := SoreConfig["other"]["block"].(map[string]interface{}); ok {
  29. BlockScore = qu.Float64All(block["score"])
  30. }
  31. //通用抽取属性打分配置
  32. if tmp, ok := SoreConfig["extractype"]["common"].(map[string]interface{}); ok {
  33. CommonScore = map[string]float64{}
  34. for k, v := range tmp {
  35. CommonScore[k] = qu.Float64All(v)
  36. }
  37. }
  38. log.Println(CommonScore)
  39. //指定抽取属性打分配置
  40. if tmp, ok := SoreConfig["extractype"]["fields"].(map[string]interface{}); ok {
  41. FieldsScore = map[string]map[string]float64{}
  42. for key, fieldmap := range tmp {
  43. fieldscore := map[string]float64{}
  44. if field, ok := fieldmap.(map[string]interface{}); ok {
  45. for k, score := range field {
  46. fieldscore[k] = qu.Float64All(score)
  47. }
  48. }
  49. FieldsScore[key] = fieldscore
  50. }
  51. }
  52. log.Println(FieldsScore)
  53. //实例化正则
  54. for _, tmp := range SoreConfig {
  55. //log.Println(tmp)
  56. if tmp["type"] == "string" {
  57. if positions, ok := tmp["positivewords"].([]interface{}); ok {
  58. for _, position := range positions {
  59. if p, ok := position.(map[string]interface{}); ok {
  60. qu.Try(func() {
  61. strReq, _ := p["regstr"].(string)
  62. if strings.Contains(strReq, "\\u") {
  63. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  64. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  65. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  66. p["regexp"] = regexp.MustCompile(strReq)
  67. } else {
  68. p["regexp"] = regexp.MustCompile(strReq)
  69. }
  70. }, func(err interface{}) {
  71. log.Println(err)
  72. })
  73. }
  74. }
  75. }
  76. if positions, ok := tmp["negativewords"].([]interface{}); ok {
  77. for _, position := range positions {
  78. if p, ok := position.(map[string]interface{}); ok {
  79. qu.Try(func() {
  80. strReq, _ := p["regstr"].(string)
  81. if strings.Contains(strReq, "\\u") {
  82. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  83. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  84. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  85. p["regexp"] = regexp.MustCompile(strReq)
  86. } else {
  87. p["regexp"] = regexp.MustCompile(strReq)
  88. }
  89. }, func(err interface{}) {
  90. log.Println(err)
  91. })
  92. }
  93. }
  94. }
  95. if winnerorders, ok := tmp["winnerorder"].([]interface{}); ok {
  96. for _, winnerorder := range winnerorders {
  97. if p, ok := winnerorder.(map[string]interface{}); ok {
  98. qu.Try(func() {
  99. strReq, _ := p["regstr"].(string)
  100. if strings.Contains(strReq, "\\u") {
  101. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  102. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  103. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  104. p["regexp"] = regexp.MustCompile(strReq)
  105. } else {
  106. p["regexp"] = regexp.MustCompile(strReq)
  107. }
  108. }, func(err interface{}) {
  109. log.Println(err)
  110. })
  111. }
  112. }
  113. }
  114. }
  115. }
  116. }
  117. //结果打分
  118. func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
  119. qu.Catch()
  120. result := j.Result
  121. for field, tmps := range result {
  122. for tmpsindex, tmpsvalue := range tmps {
  123. describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
  124. if tmpsvalue.ExtFrom == "title" { //标题打分初始化
  125. tmps[tmpsindex].Score += CommonScore["title"]
  126. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
  127. }
  128. //是否有段标签
  129. if len(tmpsvalue.BlockTag) > 0 {
  130. //有标签段
  131. var qz float64 = 0.0 //取权重最高的
  132. for key := range tmpsvalue.BlockTag {
  133. //key = "其他"//TODO 测试用
  134. if TagConfig[key][field] > qz {
  135. qz = TagConfig[key][field]
  136. }
  137. }
  138. tmps[tmpsindex].Score += BlockScore * qz //乘以权重系数
  139. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
  140. } else {
  141. //没有段标签,走其他
  142. //qz := TagConfig["其他"][field]
  143. //tmps[tmpsindex].Score += 2 * qz //乘以权重系数
  144. }
  145. //抽取类型打分
  146. if FieldsScore[field] != nil { //指定抽取属性打分配置
  147. fieldscore := FieldsScore[field]
  148. if strings.Contains(tmpsvalue.Type, "colon") {
  149. tmps[tmpsindex].Score += fieldscore["colon"]
  150. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["colon"]})
  151. } else if strings.Contains(tmpsvalue.Type, "space") {
  152. tmps[tmpsindex].Score += fieldscore["space"]
  153. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["space"]})
  154. } else if strings.Contains(tmpsvalue.Type, "table") {
  155. tmps[tmpsindex].Score += fieldscore["table"]
  156. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["table"]})
  157. } else if strings.Contains(tmpsvalue.Type, "regexp") {
  158. tmps[tmpsindex].Score += fieldscore["regexp"]
  159. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["regexp"]})
  160. }
  161. } else { //通用抽取属性打分配置
  162. if strings.Contains(tmpsvalue.Type, "colon") {
  163. tmps[tmpsindex].Score += CommonScore["colon"]
  164. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["colon"]})
  165. } else if strings.Contains(tmpsvalue.Type, "space") {
  166. tmps[tmpsindex].Score += CommonScore["space"]
  167. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["space"]})
  168. } else if strings.Contains(tmpsvalue.Type, "table") {
  169. tmps[tmpsindex].Score += CommonScore["table"]
  170. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["table"]})
  171. } else if strings.Contains(tmpsvalue.Type, "regexp") {
  172. tmps[tmpsindex].Score += CommonScore["regexp"]
  173. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["regexp"]})
  174. }
  175. }
  176. scoreRule := SoreConfig[field]
  177. if scoreRule == nil {
  178. continue
  179. }
  180. //配置打分
  181. if scoreRule["type"] == "string" {
  182. //1.长度打分
  183. valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
  184. if valueLen < 1 {
  185. continue
  186. }
  187. if valueLen > 100 && field != "projectscope" {
  188. tmps[tmpsindex].Score = -99
  189. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Value: tmpsvalue.Value, Score: -99})
  190. }
  191. if lengths, ok := scoreRule["length"].([]interface{}); ok {
  192. for _, tmp := range lengths {
  193. if length, ok := tmp.(map[string]interface{}); ok {
  194. if ranges, ok := length["range"].([]interface{}); ok {
  195. gt := qu.IntAll(ranges[0])
  196. lte := qu.IntAll(ranges[1])
  197. if lte < 0 { //∞
  198. lte = 999999
  199. }
  200. score := qu.Float64All(ranges[2])
  201. if valueLen > gt && valueLen <= lte {
  202. tmps[tmpsindex].Score += score
  203. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(gt, "<", valueLen, "<=", lte), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: score})
  204. break
  205. }
  206. }
  207. }
  208. }
  209. }
  210. //2.负面词打分
  211. if positions, ok := scoreRule["negativewords"].([]interface{}); ok {
  212. for _, position := range positions {
  213. if p, ok := position.(map[string]interface{}); ok {
  214. qu.Try(func() {
  215. if p["regexp"] != nil {
  216. reg := p["regexp"].(*regexp.Regexp)
  217. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  218. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  219. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: "negativewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.negativewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  220. }
  221. }
  222. }, func(err interface{}) {
  223. log.Println(err)
  224. })
  225. }
  226. }
  227. }
  228. //3.正面词打分
  229. if positions, ok := scoreRule["positivewords"].([]interface{}); ok {
  230. for _, position := range positions {
  231. if p, ok := position.(map[string]interface{}); ok {
  232. qu.Try(func() {
  233. if p["regexp"] != nil {
  234. reg := p["regexp"].(*regexp.Regexp)
  235. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  236. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  237. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: "positivewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.positivewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  238. }
  239. }
  240. }, func(err interface{}) {
  241. log.Println(err)
  242. })
  243. }
  244. }
  245. }
  246. //4.中标候选人打分
  247. if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
  248. for _, winnerorder := range winnerorders {
  249. if p, ok := winnerorder.(map[string]interface{}); ok {
  250. qu.Try(func() {
  251. if p["regexp"] != nil {
  252. reg := p["regexp"].(*regexp.Regexp)
  253. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  254. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  255. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder" + fmt.Sprint(p["describe"]), Code: "winnerorder", RuleText: reg.String(), ScoreFrom: "fieldscore.json.winnerorder", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  256. }
  257. }
  258. }, func(err interface{}) {
  259. log.Println(err)
  260. })
  261. }
  262. }
  263. }
  264. }
  265. //5.数据范围打分
  266. if scoreRule["type"] == "float" {
  267. min := qu.IntAll(scoreRule["min"])
  268. max := qu.IntAll(scoreRule["max"])
  269. val := qu.IntAll(tmpsvalue.Value)
  270. scores, _ := scoreRule["score"].([]interface{})
  271. if len(scores) < 3 {
  272. continue
  273. }
  274. if val < min && 0 < val {
  275. tmps[tmpsindex].Score += qu.Float64All(scores[0])
  276. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
  277. } else if val > max {
  278. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  279. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  280. } else if val <= max && val >= min {
  281. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  282. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  283. }
  284. }
  285. //其他打分配置
  286. // decimal
  287. if scoreRule["type"] == "decimal" {
  288. min := qu.IntAll(scoreRule["min"])
  289. max := qu.IntAll(scoreRule["max"])
  290. val := qu.IntAll(tmpsvalue.Value)
  291. scores, _ := scoreRule["score"].([]interface{})
  292. if len(scores) < 3 {
  293. continue
  294. }
  295. if val > max {
  296. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  297. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  298. } else if val <= max && val > min {
  299. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  300. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  301. }
  302. }
  303. }
  304. //计算重复值,并加分=重复数量*乘系数
  305. valrepeat := map[string]int{}
  306. for _, v := range tmps {
  307. valrepeat[fmt.Sprint(v.Value)] += 1
  308. }
  309. for index, v := range tmps {
  310. v.ValRepeat = valrepeat[fmt.Sprint(v.Value)] - 1
  311. if v.ValRepeat > 0 {
  312. score := RepeatScore * float64(v.ValRepeat)
  313. v.Score += score
  314. tmps[index].ScoreItem = append(tmps[index].ScoreItem, &ju.ScoreItem{Des: "重复次数打分repeat", Code: field + ".repeat", RuleText: "repeat:" + fmt.Sprint(v.ValRepeat), ScoreFrom: "fieldscore.json." + field, Value: v.Value, Score: score})
  315. }
  316. }
  317. }
  318. return result
  319. }