score.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. // score
  2. package extract
  3. import (
  4. "fmt"
  5. ju "jy/util"
  6. "log"
  7. qu "qfw/util"
  8. "regexp"
  9. "strconv"
  10. "strings"
  11. "unicode/utf8"
  12. )
  13. var SoreConfig map[string]map[string]interface{}
  14. var TagConfig map[string]map[string]float64
  15. func init() {
  16. qu.ReadConfig("./res/tagscore.json", &TagConfig)
  17. qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
  18. //实例化正则
  19. for _, tmp := range SoreConfig {
  20. //log.Println(tmp)
  21. if tmp["type"] == "string" {
  22. if positions, ok := tmp["positivewords"].([]interface{}); ok {
  23. for _, position := range positions {
  24. if p, ok := position.(map[string]interface{}); ok {
  25. qu.Try(func() {
  26. strReq, _ := p["regstr"].(string)
  27. if strings.Contains(strReq, "\\u") {
  28. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  29. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  30. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  31. p["regexp"] = regexp.MustCompile(strReq)
  32. } else {
  33. p["regexp"] = regexp.MustCompile(strReq)
  34. }
  35. }, func(err interface{}) {
  36. log.Println(err)
  37. })
  38. }
  39. }
  40. }
  41. if positions, ok := tmp["negativewords"].([]interface{}); ok {
  42. for _, position := range positions {
  43. if p, ok := position.(map[string]interface{}); ok {
  44. qu.Try(func() {
  45. strReq, _ := p["regstr"].(string)
  46. if strings.Contains(strReq, "\\u") {
  47. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  48. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  49. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  50. p["regexp"] = regexp.MustCompile(strReq)
  51. } else {
  52. p["regexp"] = regexp.MustCompile(strReq)
  53. }
  54. }, func(err interface{}) {
  55. log.Println(err)
  56. })
  57. }
  58. }
  59. }
  60. if winnerorders, ok := tmp["winnerorder"].([]interface{}); ok {
  61. for _, winnerorder := range winnerorders {
  62. if p, ok := winnerorder.(map[string]interface{}); ok {
  63. qu.Try(func() {
  64. strReq, _ := p["regstr"].(string)
  65. if strings.Contains(strReq, "\\u") {
  66. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  67. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  68. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  69. p["regexp"] = regexp.MustCompile(strReq)
  70. } else {
  71. p["regexp"] = regexp.MustCompile(strReq)
  72. }
  73. }, func(err interface{}) {
  74. log.Println(err)
  75. })
  76. }
  77. }
  78. }
  79. }
  80. }
  81. }
  82. //结果打分
  83. func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
  84. result := j.Result
  85. qu.Catch()
  86. for field, tmps := range result {
  87. for tmpsindex, tmpsvalue := range tmps {
  88. //是否有段标签
  89. if len(tmpsvalue.BlockTag) > 0 {
  90. //有标签段
  91. var qz float64 = 0.0 //取权重最高的
  92. var tgk string
  93. for key := range tmpsvalue.BlockTag {
  94. //key = "其他"//TODO 测试用
  95. if TagConfig[key][field] > qz {
  96. qz = TagConfig[key][field]
  97. tgk = key
  98. }
  99. }
  100. tmps[tmpsindex].Score += 2 * qz //乘以权重系数
  101. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", Type: tgk + field, ExtFrom: "tagscore.json", Value: tmpsvalue.Value, Score: 2 * qz})
  102. } else {
  103. //没有段标签,走其他
  104. //qz := TagConfig["其他"][field]
  105. //tmps[tmpsindex].Score += 2 * qz //乘以权重系数
  106. }
  107. if tmpsvalue.ExtFrom != "title" { //非标题抽取
  108. //是否有kv值
  109. if strings.Contains(tmpsvalue.Type, "colon") {
  110. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
  111. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "colonkv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
  112. } else if strings.Contains(tmpsvalue.Type, "space") {
  113. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
  114. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "spacekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
  115. } else if strings.Contains(tmpsvalue.Type, "table") {
  116. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
  117. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "tablekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
  118. }
  119. }
  120. if tmpsvalue.ExtFrom != "title" { //非标题抽取
  121. if strings.Contains(tmpsvalue.Type, "regexp") {
  122. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
  123. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
  124. }
  125. } else {
  126. if strings.Contains(tmpsvalue.Type, "regexp") {
  127. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1
  128. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1})
  129. }
  130. }
  131. scoreRule := SoreConfig[field]
  132. if scoreRule == nil {
  133. continue
  134. }
  135. if scoreRule["type"] == "string" {
  136. //1.长度打分
  137. valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
  138. if valueLen < 1 {
  139. continue
  140. }
  141. if valueLen > 100 && field != "projectscope" {
  142. tmps[tmpsindex].Score = -99
  143. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Type: "length", Value: tmpsvalue.Value, Score: -99})
  144. }
  145. if lengths, ok := scoreRule["length"].([]interface{}); ok {
  146. for _, tmp := range lengths {
  147. if length, ok := tmp.(map[string]interface{}); ok {
  148. min := qu.IntAll(length["min"])
  149. max := qu.IntAll(length["max"])
  150. scores, _ := length["score"].([]interface{})
  151. if len(scores) < 3 {
  152. continue
  153. }
  154. if valueLen < min {
  155. tmps[tmpsindex].Score += qu.Float64All(scores[0])
  156. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, "<", min), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
  157. } else if valueLen > max {
  158. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  159. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", max), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  160. } else {
  161. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  162. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", min, "&&", valueLen, "<", max), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  163. }
  164. }
  165. }
  166. }
  167. //2.负面词打分
  168. if positions, ok := scoreRule["negativewords"].([]interface{}); ok {
  169. for _, position := range positions {
  170. if p, ok := position.(map[string]interface{}); ok {
  171. qu.Try(func() {
  172. if p["regexp"] != nil {
  173. reg := p["regexp"].(*regexp.Regexp)
  174. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  175. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  176. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: field + ".negativewords", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  177. }
  178. }
  179. }, func(err interface{}) {
  180. log.Println(err)
  181. })
  182. }
  183. }
  184. }
  185. //3.正面词打分
  186. if positions, ok := scoreRule["positivewords"].([]interface{}); ok {
  187. for _, position := range positions {
  188. if p, ok := position.(map[string]interface{}); ok {
  189. qu.Try(func() {
  190. if p["regexp"] != nil {
  191. reg := p["regexp"].(*regexp.Regexp)
  192. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  193. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  194. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: field + ".positivewords", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  195. }
  196. }
  197. }, func(err interface{}) {
  198. log.Println(err)
  199. })
  200. }
  201. }
  202. }
  203. //4.位置打分
  204. if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
  205. for _, winnerorder := range winnerorders {
  206. if p, ok := winnerorder.(map[string]interface{}); ok {
  207. qu.Try(func() {
  208. if p["regexp"] != nil {
  209. reg := p["regexp"].(*regexp.Regexp)
  210. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  211. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  212. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder" + fmt.Sprint(p["describe"]), Code: field + ".winnerorder", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  213. }
  214. }
  215. }, func(err interface{}) {
  216. log.Println(err)
  217. })
  218. }
  219. }
  220. }
  221. }
  222. //5.数据范围打分
  223. if scoreRule["type"] == "float" {
  224. min := qu.IntAll(scoreRule["min"])
  225. max := qu.IntAll(scoreRule["max"])
  226. val := qu.IntAll(tmpsvalue.Value)
  227. scores, _ := scoreRule["score"].([]interface{})
  228. if len(scores) < 3 {
  229. continue
  230. }
  231. if val < min && 0 < val {
  232. tmps[tmpsindex].Score += qu.Float64All(scores[0])
  233. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
  234. } else if val > max {
  235. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  236. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  237. } else if val <= max && val >= min {
  238. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  239. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  240. }
  241. }
  242. //其他打分配置
  243. // decimal
  244. if scoreRule["type"] == "decimal" {
  245. min := qu.IntAll(scoreRule["min"])
  246. max := qu.IntAll(scoreRule["max"])
  247. val := qu.IntAll(tmpsvalue.Value)
  248. scores, _ := scoreRule["score"].([]interface{})
  249. if len(scores) < 3 {
  250. continue
  251. }
  252. if val > max {
  253. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  254. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  255. } else if val <= max && val > min {
  256. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  257. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  258. }
  259. }
  260. }
  261. }
  262. return result
  263. }