score.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. // score
  2. package extract
  3. import (
  4. "fmt"
  5. ju "jy/util"
  6. "log"
  7. qu "qfw/util"
  8. "regexp"
  9. "strconv"
  10. "strings"
  11. "unicode/utf8"
  12. )
  13. var SoreConfig map[string]map[string]interface{}
  14. var TagConfig map[string]map[string]float64
  15. var TagConfigDesc map[string]string
  16. var TitleScore float64
  17. func init() {
  18. qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
  19. qu.ReadConfig("./res/tagscore.json", &TagConfig)
  20. qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
  21. TitleScore = qu.Float64All(SoreConfig["extractype"]["title"])
  22. //实例化正则
  23. for _, tmp := range SoreConfig {
  24. //log.Println(tmp)
  25. if tmp["type"] == "string" {
  26. if positions, ok := tmp["positivewords"].([]interface{}); ok {
  27. for _, position := range positions {
  28. if p, ok := position.(map[string]interface{}); ok {
  29. qu.Try(func() {
  30. strReq, _ := p["regstr"].(string)
  31. if strings.Contains(strReq, "\\u") {
  32. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  33. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  34. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  35. p["regexp"] = regexp.MustCompile(strReq)
  36. } else {
  37. p["regexp"] = regexp.MustCompile(strReq)
  38. }
  39. }, func(err interface{}) {
  40. log.Println(err)
  41. })
  42. }
  43. }
  44. }
  45. if positions, ok := tmp["negativewords"].([]interface{}); ok {
  46. for _, position := range positions {
  47. if p, ok := position.(map[string]interface{}); ok {
  48. qu.Try(func() {
  49. strReq, _ := p["regstr"].(string)
  50. if strings.Contains(strReq, "\\u") {
  51. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  52. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  53. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  54. p["regexp"] = regexp.MustCompile(strReq)
  55. } else {
  56. p["regexp"] = regexp.MustCompile(strReq)
  57. }
  58. }, func(err interface{}) {
  59. log.Println(err)
  60. })
  61. }
  62. }
  63. }
  64. if winnerorders, ok := tmp["winnerorder"].([]interface{}); ok {
  65. for _, winnerorder := range winnerorders {
  66. if p, ok := winnerorder.(map[string]interface{}); ok {
  67. qu.Try(func() {
  68. strReq, _ := p["regstr"].(string)
  69. if strings.Contains(strReq, "\\u") {
  70. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  71. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  72. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  73. p["regexp"] = regexp.MustCompile(strReq)
  74. } else {
  75. p["regexp"] = regexp.MustCompile(strReq)
  76. }
  77. }, func(err interface{}) {
  78. log.Println(err)
  79. })
  80. }
  81. }
  82. }
  83. }
  84. }
  85. }
  86. //结果打分
  87. func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
  88. result := j.Result
  89. qu.Catch()
  90. for field, tmps := range result {
  91. for tmpsindex, tmpsvalue := range tmps {
  92. if tmpsvalue.ExtFrom == "title" { //标题打分初始化
  93. tmps[tmpsindex].Score += TitleScore
  94. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: TitleScore})
  95. }
  96. //是否有段标签
  97. if len(tmpsvalue.BlockTag) > 0 {
  98. //有标签段
  99. var qz float64 = 0.0 //取权重最高的
  100. for key := range tmpsvalue.BlockTag {
  101. //key = "其他"//TODO 测试用
  102. if TagConfig[key][field] > qz {
  103. qz = TagConfig[key][field]
  104. }
  105. }
  106. tmps[tmpsindex].Score += 2 * qz //乘以权重系数
  107. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: 2 * qz})
  108. } else {
  109. //没有段标签,走其他
  110. //qz := TagConfig["其他"][field]
  111. //tmps[tmpsindex].Score += 2 * qz //乘以权重系数
  112. }
  113. //抽取类型打分
  114. if strings.Contains(tmpsvalue.Type, "colon") {
  115. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
  116. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
  117. } else if strings.Contains(tmpsvalue.Type, "space") {
  118. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
  119. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
  120. } else if strings.Contains(tmpsvalue.Type, "table") {
  121. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
  122. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
  123. } else if strings.Contains(tmpsvalue.Type, "regexp") {
  124. tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
  125. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
  126. }
  127. scoreRule := SoreConfig[field]
  128. if scoreRule == nil {
  129. continue
  130. }
  131. if scoreRule["type"] == "string" {
  132. //1.长度打分
  133. valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
  134. if valueLen < 1 {
  135. continue
  136. }
  137. if valueLen > 100 && field != "projectscope" {
  138. tmps[tmpsindex].Score = -99
  139. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Value: tmpsvalue.Value, Score: -99})
  140. }
  141. if lengths, ok := scoreRule["length"].([]interface{}); ok {
  142. for _, tmp := range lengths {
  143. if length, ok := tmp.(map[string]interface{}); ok {
  144. min := qu.IntAll(length["min"])
  145. max := qu.IntAll(length["max"])
  146. scores, _ := length["score"].([]interface{})
  147. if len(scores) < 3 {
  148. continue
  149. }
  150. if valueLen < min {
  151. tmps[tmpsindex].Score += qu.Float64All(scores[0])
  152. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, "<", min), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
  153. } else if valueLen > max {
  154. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  155. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", max), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  156. } else {
  157. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  158. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", min, "&&", valueLen, "<", max), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  159. }
  160. }
  161. }
  162. }
  163. //2.负面词打分
  164. if positions, ok := scoreRule["negativewords"].([]interface{}); ok {
  165. for _, position := range positions {
  166. if p, ok := position.(map[string]interface{}); ok {
  167. qu.Try(func() {
  168. if p["regexp"] != nil {
  169. reg := p["regexp"].(*regexp.Regexp)
  170. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  171. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  172. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: "negativewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.negativewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  173. }
  174. }
  175. }, func(err interface{}) {
  176. log.Println(err)
  177. })
  178. }
  179. }
  180. }
  181. //3.正面词打分
  182. if positions, ok := scoreRule["positivewords"].([]interface{}); ok {
  183. for _, position := range positions {
  184. if p, ok := position.(map[string]interface{}); ok {
  185. qu.Try(func() {
  186. if p["regexp"] != nil {
  187. reg := p["regexp"].(*regexp.Regexp)
  188. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  189. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  190. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: "positivewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.positivewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  191. }
  192. }
  193. }, func(err interface{}) {
  194. log.Println(err)
  195. })
  196. }
  197. }
  198. }
  199. //4.位置打分
  200. if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
  201. for _, winnerorder := range winnerorders {
  202. if p, ok := winnerorder.(map[string]interface{}); ok {
  203. qu.Try(func() {
  204. if p["regexp"] != nil {
  205. reg := p["regexp"].(*regexp.Regexp)
  206. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  207. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  208. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder" + fmt.Sprint(p["describe"]), Code: "winnerorder", RuleText: reg.String(), ScoreFrom: "fieldscore.json.winnerorder", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  209. }
  210. }
  211. }, func(err interface{}) {
  212. log.Println(err)
  213. })
  214. }
  215. }
  216. }
  217. }
  218. //5.数据范围打分
  219. if scoreRule["type"] == "float" {
  220. min := qu.IntAll(scoreRule["min"])
  221. max := qu.IntAll(scoreRule["max"])
  222. val := qu.IntAll(tmpsvalue.Value)
  223. scores, _ := scoreRule["score"].([]interface{})
  224. if len(scores) < 3 {
  225. continue
  226. }
  227. if val < min && 0 < val {
  228. tmps[tmpsindex].Score += qu.Float64All(scores[0])
  229. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
  230. } else if val > max {
  231. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  232. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  233. } else if val <= max && val >= min {
  234. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  235. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  236. }
  237. }
  238. //其他打分配置
  239. // decimal
  240. if scoreRule["type"] == "decimal" {
  241. min := qu.IntAll(scoreRule["min"])
  242. max := qu.IntAll(scoreRule["max"])
  243. val := qu.IntAll(tmpsvalue.Value)
  244. scores, _ := scoreRule["score"].([]interface{})
  245. if len(scores) < 3 {
  246. continue
  247. }
  248. if val > max {
  249. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  250. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  251. } else if val <= max && val > min {
  252. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  253. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  254. }
  255. }
  256. }
  257. }
  258. return result
  259. }