score.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. // score
  2. package extract
  3. import (
  4. "fmt"
  5. ju "jy/util"
  6. "log"
  7. qu "qfw/util"
  8. "regexp"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "unicode/utf8"
  13. )
  14. var (
  15. lockscore sync.RWMutex
  16. SoreConfig map[string]map[string]interface{}
  17. TagConfig map[string]map[string]float64
  18. TagConfigDesc map[string]string
  19. RepeatScore, BlockScore float64
  20. CommonScore map[string]float64
  21. FieldsScore map[string]map[string]float64
  22. )
  23. func init() {
  24. qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
  25. qu.ReadConfig("./res/tagscore.json", &TagConfig)
  26. qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
  27. if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok {
  28. RepeatScore = qu.Float64All(repeat["score"])
  29. }
  30. if block, ok := SoreConfig["other"]["block"].(map[string]interface{}); ok {
  31. BlockScore = qu.Float64All(block["score"])
  32. }
  33. //通用抽取属性打分配置
  34. if tmp, ok := SoreConfig["extractype"]["common"].(map[string]interface{}); ok {
  35. CommonScore = map[string]float64{}
  36. for k, v := range tmp {
  37. CommonScore[k] = qu.Float64All(v)
  38. }
  39. }
  40. //指定抽取属性打分配置
  41. if tmp, ok := SoreConfig["extractype"]["fields"].(map[string]interface{}); ok {
  42. FieldsScore = map[string]map[string]float64{}
  43. for key, fieldmap := range tmp {
  44. fieldscore := map[string]float64{}
  45. if field, ok := fieldmap.(map[string]interface{}); ok {
  46. for k, score := range field {
  47. fieldscore[k] = qu.Float64All(score)
  48. }
  49. }
  50. FieldsScore[key] = fieldscore
  51. }
  52. }
  53. //实例化正则
  54. for _, tmp := range SoreConfig {
  55. //log.Println(tmp)
  56. if tmp["type"] == "string" {
  57. if positions, ok := tmp["positivewords"].([]interface{}); ok {
  58. for _, position := range positions {
  59. if p, ok := position.(map[string]interface{}); ok {
  60. qu.Try(func() {
  61. strReq, _ := p["regstr"].(string)
  62. if strings.Contains(strReq, "\\u") {
  63. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  64. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  65. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  66. p["regexp"] = regexp.MustCompile(strReq)
  67. } else {
  68. p["regexp"] = regexp.MustCompile(strReq)
  69. }
  70. }, func(err interface{}) {
  71. log.Println(err)
  72. })
  73. }
  74. }
  75. }
  76. if positions, ok := tmp["negativewords"].([]interface{}); ok {
  77. for _, position := range positions {
  78. if p, ok := position.(map[string]interface{}); ok {
  79. qu.Try(func() {
  80. strReq, _ := p["regstr"].(string)
  81. if strings.Contains(strReq, "\\u") {
  82. strReq = strings.Replace(strReq, "\\", "\\\\", -1)
  83. strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
  84. strReq, _ = strconv.Unquote(`"` + strReq + `"`)
  85. p["regexp"] = regexp.MustCompile(strReq)
  86. } else {
  87. p["regexp"] = regexp.MustCompile(strReq)
  88. }
  89. }, func(err interface{}) {
  90. log.Println(err)
  91. })
  92. }
  93. }
  94. }
  95. }
  96. }
  97. }
  98. //结果打分
  99. func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
  100. qu.Catch()
  101. result := j.Result
  102. for field, tmps := range result {
  103. if field == "projectcode" {
  104. tmps = projectWeightClear(tmps)
  105. }
  106. locktag.Lock()
  107. taglength := len(ftag[field])
  108. locktag.Unlock()
  109. for tmpsindex, tmpsvalue := range tmps {
  110. //没有抽取到值,不打分
  111. if string_value := fmt.Sprint(tmpsvalue.Value); string_value == "" || string_value == "0" || string_value == "<nil>" {
  112. if field == "budget" || field == "bidamount" {
  113. if tmpsvalue.IsTrue {
  114. //continue
  115. }else {
  116. tmps[tmpsindex].Score = -10
  117. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
  118. continue
  119. }
  120. }else {
  121. tmps[tmpsindex].Score = -10
  122. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
  123. continue
  124. }
  125. }
  126. lockscore.Lock()
  127. describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
  128. lockscore.Unlock()
  129. //是否有段标签
  130. if len(tmpsvalue.BlockTag) > 0 {
  131. //有标签段
  132. var qz float64 = 0.0 //取权重最高的
  133. for key := range tmpsvalue.BlockTag {
  134. //key = "其他"//TODO 测试用
  135. lockscore.Lock()
  136. if TagConfig[key][field] > qz {
  137. qz = TagConfig[key][field]
  138. }
  139. lockscore.Unlock()
  140. }
  141. tmps[tmpsindex].Score += ju.FloatFormat(BlockScore*qz, 4) //乘以权重系数
  142. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
  143. } else {
  144. //没有段标签,走其他
  145. //qz := TagConfig["其他"][field]
  146. //tmps[tmpsindex].Score += 2 * qz //乘以权重系数
  147. }
  148. //抽取类型打分
  149. lockscore.Lock()
  150. fieldscore := FieldsScore[field]
  151. typescore := float64(0)
  152. titlescore := float64(0)
  153. if fieldscore != nil { //指定抽取属性打分配置
  154. if tmpsvalue.ExtFrom == "title" { //标题打分初始化
  155. titlescore = fieldscore["title"]
  156. }
  157. typescore = fieldscore[tmpsvalue.Type]
  158. } else { //通用抽取属性打分配置
  159. if tmpsvalue.ExtFrom == "title" { //标题打分初始化
  160. titlescore = CommonScore["title"]
  161. }
  162. typescore = CommonScore[tmpsvalue.Type]
  163. }
  164. lockscore.Unlock()
  165. if titlescore > 0 {
  166. tmps[tmpsindex].Score += titlescore
  167. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: titlescore})
  168. }
  169. tmps[tmpsindex].Score += typescore
  170. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: typescore})
  171. //kv权重打分
  172. if fieldscore != nil { //指定抽取属性打分配置
  173. if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
  174. if taglength == 0 {
  175. continue
  176. }
  177. weightscore := ju.FloatFormat(float64(qu.Float64All(fieldscore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
  178. tmps[tmpsindex].Score += weightscore
  179. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
  180. } else {
  181. //正则权重,暂不考虑
  182. }
  183. } else {
  184. if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
  185. if taglength == 0 {
  186. continue
  187. }
  188. weightscore := ju.FloatFormat(float64(qu.Float64All(CommonScore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
  189. tmps[tmpsindex].Score += weightscore
  190. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
  191. } else {
  192. //正则权重,暂不考虑
  193. }
  194. }
  195. lockscore.Lock()
  196. scoreRule := SoreConfig[field]
  197. lockscore.Unlock()
  198. if scoreRule == nil {
  199. continue
  200. }
  201. //配置打分
  202. if scoreRule["type"] == "string" {
  203. //1.长度打分
  204. valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
  205. if valueLen < 1 {
  206. tmps[tmpsindex].Score = -10
  207. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen < 1 && field != "projectscope"直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
  208. continue
  209. }
  210. if valueLen > 100 && field != "projectscope" {
  211. tmps[tmpsindex].Score = -99
  212. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Value: tmpsvalue.Value, Score: -99})
  213. }
  214. if lengths, ok := scoreRule["length"].([]interface{}); ok {
  215. for _, tmp := range lengths {
  216. if length, ok := tmp.(map[string]interface{}); ok {
  217. if ranges, ok := length["range"].([]interface{}); ok {
  218. gt := qu.IntAll(ranges[0])
  219. lte := qu.IntAll(ranges[1])
  220. if lte < 0 { //∞
  221. lte = 999999
  222. }
  223. score := qu.Float64All(ranges[2])
  224. if valueLen > gt && valueLen <= lte {
  225. tmps[tmpsindex].Score += score
  226. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(gt, "<", valueLen, "<=", lte), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: score})
  227. break
  228. }
  229. }
  230. }
  231. }
  232. }
  233. //2.负面词打分
  234. if positions, ok := scoreRule["negativewords"].([]interface{}); ok {
  235. for _, position := range positions {
  236. if p, ok := position.(map[string]interface{}); ok {
  237. qu.Try(func() {
  238. if p["regexp"] != nil {
  239. reg := p["regexp"].(*regexp.Regexp)
  240. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  241. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  242. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: "negativewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.negativewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  243. }
  244. }
  245. }, func(err interface{}) {
  246. log.Println(err)
  247. })
  248. }
  249. }
  250. }
  251. //3.正面词打分
  252. if positions, ok := scoreRule["positivewords"].([]interface{}); ok {
  253. for _, position := range positions {
  254. if p, ok := position.(map[string]interface{}); ok {
  255. qu.Try(func() {
  256. if p["regexp"] != nil {
  257. reg := p["regexp"].(*regexp.Regexp)
  258. if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
  259. tmps[tmpsindex].Score += qu.Float64All(p["score"])
  260. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: "positivewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.positivewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
  261. }
  262. }
  263. }, func(err interface{}) {
  264. log.Println(err)
  265. })
  266. }
  267. }
  268. }
  269. }
  270. //4.数据范围打分
  271. if scoreRule["type"] == "float" {
  272. min := qu.IntAll(scoreRule["min"])
  273. max := qu.IntAll(scoreRule["max"])
  274. val := qu.IntAll(tmpsvalue.Value)
  275. scores, _ := scoreRule["score"].([]interface{})
  276. if len(scores) < 3 || val < 0 {
  277. continue
  278. }
  279. if val < min && 0 < val {
  280. tmps[tmpsindex].Score += qu.Float64All(scores[0])
  281. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
  282. } else if val > max {
  283. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  284. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  285. } else if val <= max && val >= min {
  286. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  287. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  288. }
  289. }
  290. //其他打分配置
  291. // decimal
  292. if scoreRule["type"] == "decimal" {
  293. min := qu.IntAll(scoreRule["min"])
  294. max := qu.IntAll(scoreRule["max"])
  295. val := qu.IntAll(tmpsvalue.Value)
  296. scores, _ := scoreRule["score"].([]interface{})
  297. if len(scores) < 3 {
  298. continue
  299. }
  300. if val > max {
  301. tmps[tmpsindex].Score += qu.Float64All(scores[2])
  302. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
  303. } else if val <= max && val > min {
  304. tmps[tmpsindex].Score += qu.Float64All(scores[1])
  305. tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
  306. }
  307. }
  308. }
  309. //计算重复值,并加分=重复数量*乘系数
  310. valrepeat := map[string]int{}
  311. for _, v := range tmps {
  312. valrepeat[fmt.Sprint(v.Value)] += 1
  313. }
  314. for index, v := range tmps {
  315. v.ValRepeat = valrepeat[fmt.Sprint(v.Value)] - 1
  316. if v.ValRepeat > 0 {
  317. score := RepeatScore * float64(v.ValRepeat)
  318. v.Score += score
  319. tmps[index].ScoreItem = append(tmps[index].ScoreItem, &ju.ScoreItem{Des: "重复次数打分repeat", Code: field + ".repeat", RuleText: "repeat:" + fmt.Sprint(v.ValRepeat), ScoreFrom: "fieldscore.json." + field, Value: v.Value, Score: score})
  320. }
  321. v.Score = ju.FloatFormat(v.Score, 4)
  322. }
  323. }
  324. return result
  325. }
  326. //项目编号权重清理
  327. func projectWeightClear(tmps []*ju.ExtField) []*ju.ExtField {
  328. newList := make([]*ju.ExtField, 0)
  329. if len(tmps) < 1 {
  330. return newList
  331. }
  332. ju.Sort(tmps)
  333. tmpWeight := -999 //记录最大权重
  334. tmpIndex := -999 //记录最大权重下标
  335. vmap := make(map[string]int, 0)
  336. for i, v := range tmps {
  337. if v.Weight == 0 {
  338. newList = append(newList, v)
  339. continue
  340. } else if v.Weight > tmpWeight {
  341. tmpWeight = v.Weight
  342. tmpIndex = i
  343. } else if v.Weight == tmpWeight {
  344. //if utf8.RuneCountInString(qu.ObjToString(v.Value)) >= 5 && utf8.RuneCountInString(qu.ObjToString(v.Value)) <= 38 && v.Value != tmps[tmpIndex].Value {
  345. // vmap[qu.ObjToString(v.Value)] = i
  346. //}
  347. newList = append(newList, v)
  348. }
  349. }
  350. if tmpIndex != -999 {
  351. newList = append(newList, tmps[tmpIndex])
  352. }
  353. if len(vmap) > 0 {
  354. for _, v := range vmap {
  355. newList = append(newList, tmps[v])
  356. }
  357. }
  358. return newList
  359. }