score.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. package main
  2. import (
  3. "fmt"
  4. "math"
  5. "strings"
  6. "time"
  7. "github.com/adrg/strutil"
  8. "github.com/adrg/strutil/metrics"
  9. )
  10. type ScoredResult struct {
  11. Document EsDocument
  12. Confidence float64
  13. }
  14. type InputData struct {
  15. ProjectCode string
  16. Area string
  17. City string
  18. ProjectName string
  19. ConstructionUnit string
  20. AreaCode string
  21. PlannedInvestment float64
  22. BaseDate time.Time
  23. }
  24. type EsDocument struct {
  25. Id string `json:"id"`
  26. Title string `json:"title"`
  27. Projectname string `json:"projectname"`
  28. ProjectCode string `json:"projectcode"`
  29. Toptype string `json:"toptype"`
  30. Subtype string `json:"subtype"`
  31. Area string `json:"area"`
  32. City string `json:"city"`
  33. Buyer string `json:"buyer"`
  34. Budget float64 `json:"budget"`
  35. Bidamount float64 `json:"bidamount"`
  36. Winner string `json:"winner"`
  37. Detail string `json:"detail"`
  38. Publishtime int64 `json:"publishtime"`
  39. Agency string `json:"agency"`
  40. SWinner string `json:"s_winner"`
  41. WinnerTel string `json:"winnertel"`
  42. BuyerTel string `json:"buyertel"`
  43. BuyerPerson string `json:"buyerperson"`
  44. Score float64 `json:"score"`
  45. }
  46. // calculateConfidenceScore calculates a score from 0-100 based on multiple factors.
  47. func calculateConfidenceScore(target InputData, candidate EsDocument) float64 {
  48. var totalScore float64 = 0.0
  49. nameSimilarity := EnhancedSimilarity(strings.ReplaceAll(target.ProjectName, target.ConstructionUnit, ""), strings.ReplaceAll(strings.ReplaceAll(candidate.Projectname, candidate.Buyer, ""), target.ConstructionUnit, ""))
  50. // 1. Project Name Similarity (Weight: 40 points)
  51. totalScore += nameSimilarity * 70.0
  52. //log.Println(target.ProjectName, candidate.Projectname, nameSimilarity)
  53. // 2. Construction Unit Similarity (Weight: 30 points)
  54. unitSimilarity := EnhancedSimilarity(target.ConstructionUnit, candidate.Buyer)
  55. totalScore += unitSimilarity * 10.0
  56. //log.Println(target.ConstructionUnit, candidate.Buyer, unitSimilarity)
  57. if target.Area != "" && target.Area == candidate.Area {
  58. totalScore += 5
  59. if target.City != "" && target.City == candidate.City {
  60. totalScore += 5
  61. }
  62. }
  63. // 3. Project Code Presence
  64. if target.ProjectCode != "" && strings.Contains(candidate.Detail, target.ProjectCode) {
  65. totalScore += 5.0
  66. }
  67. // 4. 投资额
  68. if target.PlannedInvestment > 0 && candidate.Budget > 0 {
  69. diff := math.Abs(target.PlannedInvestment - candidate.Budget)
  70. percentDiff := diff / math.Max(target.PlannedInvestment, 1) // Avoid division by zero
  71. // Simple linear decay: 100% score for 0% diff, 0% score for 100% diff or more
  72. investmentScore := math.Max(0, (1.0-percentDiff)*5.0)
  73. totalScore += investmentScore
  74. }
  75. //计算时间
  76. if target.BaseDate.Year() > 2000 && candidate.Publishtime > 1600000000 {
  77. totalScore += TimeProximity(target.BaseDate, time.Unix(candidate.Publishtime, 0)) * 5
  78. }
  79. return totalScore
  80. }
  81. // 计算时间
  82. func TimeProximity(t1, t2 time.Time) float64 {
  83. // 解析时间
  84. // 计算天数差值(取绝对值)
  85. days := int(t1.Sub(t2).Abs().Hours() / 24)
  86. // 根据天数差值返回不同的相似度
  87. switch {
  88. case days < 100:
  89. return 1.0
  90. case days < 200:
  91. return 0.7
  92. case days < 365:
  93. return 0.55
  94. case days < 700:
  95. return 0.3
  96. default:
  97. return 0.15
  98. }
  99. }
  100. func EnhancedSimilarity(str1, str2 string) float64 {
  101. // 标准化:去除空格和标点(根据中文需求调整)
  102. s1 := strings.TrimSpace(str1)
  103. s2 := strings.TrimSpace(str2)
  104. // 1. 检查完全相等
  105. if s1 == "" || s2 == "" {
  106. return 0
  107. } else if s1 == s2 {
  108. return 1.0
  109. }
  110. // 2. 检查包含关系(权重50%)
  111. if containmentScore := checkContainment(s1, s2); containmentScore > 0 {
  112. return containmentScore
  113. }
  114. // 3. 使用改进的Jaro-Winkler(考虑中文分词)
  115. jw := metrics.NewJaroWinkler()
  116. baseScore := strutil.Similarity(s1, s2, jw)
  117. // 4. 添加公共子串权重
  118. substringScore := checkLongestCommonSubstring(s1, s2)
  119. // 5. 组合得分
  120. finalScore := 0.5*baseScore + 0.5*substringScore
  121. return finalScore
  122. }
  123. // 检查包含关系,返回0-1的得分
  124. func checkContainment(s1, s2 string) float64 {
  125. // 确保s1是较短的字符串
  126. if len(s1) > len(s2) {
  127. s1, s2 = s2, s1
  128. }
  129. // 完全包含
  130. if strings.Contains(s2, s1) {
  131. // 被包含部分占长字符串的比例
  132. ratio := float64(len(s1)) / float64(len(s2))
  133. // 基础分0.7 + 比例调整(最多加到0.95)
  134. return 0.9 + ratio*0.1
  135. }
  136. return 0
  137. }
  138. // 检查最长公共子串
  139. func checkLongestCommonSubstring(s1, s2 string) float64 {
  140. m := len(s1)
  141. n := len(s2)
  142. // 动态规划表
  143. dp := make([][]int, m+1)
  144. for i := range dp {
  145. dp[i] = make([]int, n+1)
  146. }
  147. maxLen := 0
  148. for i := 1; i <= m; i++ {
  149. for j := 1; j <= n; j++ {
  150. if s1[i-1] == s2[j-1] {
  151. dp[i][j] = dp[i-1][j-1] + 1
  152. if dp[i][j] > maxLen {
  153. maxLen = dp[i][j]
  154. }
  155. }
  156. }
  157. }
  158. // 计算得分:最长公共子串占两个字符串平均长度的比例
  159. avgLen := (m + n) / 2
  160. if avgLen == 0 {
  161. return 0
  162. }
  163. return float64(maxLen) / float64(avgLen)
  164. }
  165. func main000() {
  166. str1 := "绿色低碳节能项目"
  167. str2 := "河北纵横集团丰南钢铁有限公司绿色低碳节能项目"
  168. similarity := EnhancedSimilarity(str1, str2)
  169. fmt.Printf("相似度: %.4f\n", similarity)
  170. // 更多测试用例
  171. testCases := []struct {
  172. s1, s2 string
  173. }{
  174. {"绿色低碳", "绿色低碳节能项目"},
  175. {"钢铁项目", "丰南钢铁有限公司项目"},
  176. {"节能环保", "新能源开发"},
  177. {"相同的字符串", "相同的字符串"},
  178. }
  179. for _, tc := range testCases {
  180. sim := EnhancedSimilarity(tc.s1, tc.s2)
  181. fmt.Printf("'%s' vs '%s': %.4f\n", tc.s1, tc.s2, sim)
  182. }
  183. }