123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214 |
- package main
- import (
- "fmt"
- "math"
- "strings"
- "time"
- "github.com/adrg/strutil"
- "github.com/adrg/strutil/metrics"
- )
- type ScoredResult struct {
- Document EsDocument
- Confidence float64
- }
// InputData describes the project being looked up. calculateConfidenceScore
// compares these fields against each candidate EsDocument.
type InputData struct {
	// ProjectCode, when non-empty, is searched for verbatim inside the
	// candidate's Detail text.
	ProjectCode string

	// Area and City are compared for exact equality with the candidate's
	// Area and City; empty values never match.
	Area, City string

	// ProjectName is compared with the candidate's Projectname after
	// ConstructionUnit has been stripped from both. ConstructionUnit is also
	// compared with the candidate's Buyer on its own.
	ProjectName, ConstructionUnit string

	// AreaCode is not read by the scoring code in this file.
	AreaCode string

	// PlannedInvestment is compared with the candidate's Budget; the
	// comparison is skipped unless both values are positive.
	PlannedInvestment float64

	// BaseDate is compared with the candidate's publish time; the comparison
	// is skipped unless the year is later than 2000.
	BaseDate time.Time
}
// EsDocument is one candidate record, decoded from JSON using the lower-case
// keys given in the struct tags.
//
// NOTE(review): the "Es" prefix presumably refers to an Elasticsearch hit;
// confirm against the code that fills this struct.
type EsDocument struct {
	// Identifier and descriptive text. Projectname is the field compared
	// against InputData.ProjectName during scoring.
	Id          string `json:"id"`
	Title       string `json:"title"`
	Projectname string `json:"projectname"`
	ProjectCode string `json:"projectcode"`
	Toptype     string `json:"toptype"`
	Subtype     string `json:"subtype"`

	// Location, compared for exact equality with InputData.Area / City.
	Area string `json:"area"`
	City string `json:"city"`

	// Buyer is compared with InputData.ConstructionUnit and is stripped from
	// Projectname before the name comparison. Budget is compared with
	// InputData.PlannedInvestment.
	Buyer     string  `json:"buyer"`
	Budget    float64 `json:"budget"`
	Bidamount float64 `json:"bidamount"`
	Winner    string  `json:"winner"`

	// Detail is the text searched for InputData.ProjectCode.
	Detail string `json:"detail"`

	// Publishtime is interpreted as Unix seconds by calculateConfidenceScore.
	Publishtime int64 `json:"publishtime"`

	// The remaining fields are not read by the scoring code in this file.
	Agency      string  `json:"agency"`
	SWinner     string  `json:"s_winner"`
	WinnerTel   string  `json:"winnertel"`
	BuyerTel    string  `json:"buyertel"`
	BuyerPerson string  `json:"buyerperson"`
	Score       float64 `json:"score"`
}
- // calculateConfidenceScore calculates a score from 0-100 based on multiple factors.
- func calculateConfidenceScore(target InputData, candidate EsDocument) float64 {
- var totalScore float64 = 0.0
- nameSimilarity := EnhancedSimilarity(strings.ReplaceAll(target.ProjectName, target.ConstructionUnit, ""), strings.ReplaceAll(strings.ReplaceAll(candidate.Projectname, candidate.Buyer, ""), target.ConstructionUnit, ""))
- // 1. Project Name Similarity (Weight: 40 points)
- totalScore += nameSimilarity * 70.0
- //log.Println(target.ProjectName, candidate.Projectname, nameSimilarity)
- // 2. Construction Unit Similarity (Weight: 30 points)
- unitSimilarity := EnhancedSimilarity(target.ConstructionUnit, candidate.Buyer)
- totalScore += unitSimilarity * 10.0
- //log.Println(target.ConstructionUnit, candidate.Buyer, unitSimilarity)
- if target.Area != "" && target.Area == candidate.Area {
- totalScore += 5
- if target.City != "" && target.City == candidate.City {
- totalScore += 5
- }
- }
- // 3. Project Code Presence
- if target.ProjectCode != "" && strings.Contains(candidate.Detail, target.ProjectCode) {
- totalScore += 5.0
- }
- // 4. 投资额
- if target.PlannedInvestment > 0 && candidate.Budget > 0 {
- diff := math.Abs(target.PlannedInvestment - candidate.Budget)
- percentDiff := diff / math.Max(target.PlannedInvestment, 1) // Avoid division by zero
- // Simple linear decay: 100% score for 0% diff, 0% score for 100% diff or more
- investmentScore := math.Max(0, (1.0-percentDiff)*5.0)
- totalScore += investmentScore
- }
- //计算时间
- if target.BaseDate.Year() > 2000 && candidate.Publishtime > 1600000000 {
- totalScore += TimeProximity(target.BaseDate, time.Unix(candidate.Publishtime, 0)) * 5
- }
- return totalScore
- }
// TimeProximity maps the absolute gap between t1 and t2, measured in whole
// days (fractions are discarded), to a similarity value:
//
//	under 100 days -> 1.0
//	under 200 days -> 0.7
//	under 365 days -> 0.55
//	under 700 days -> 0.3
//	otherwise      -> 0.15
//
// The argument order does not matter.
func TimeProximity(t1, t2 time.Time) float64 {
	gapDays := int(t1.Sub(t2).Abs().Hours() / 24)

	// Tiers are listed in ascending order; the first limit that exceeds the
	// gap decides the result.
	tiers := [...]struct {
		limit int
		score float64
	}{
		{100, 1.0},
		{200, 0.7},
		{365, 0.55},
		{700, 0.3},
	}
	for _, tier := range tiers {
		if gapDays < tier.limit {
			return tier.score
		}
	}
	return 0.15
}
- func EnhancedSimilarity(str1, str2 string) float64 {
- // 标准化:去除空格和标点(根据中文需求调整)
- s1 := strings.TrimSpace(str1)
- s2 := strings.TrimSpace(str2)
- // 1. 检查完全相等
- if s1 == "" || s2 == "" {
- return 0
- } else if s1 == s2 {
- return 1.0
- }
- // 2. 检查包含关系(权重50%)
- if containmentScore := checkContainment(s1, s2); containmentScore > 0 {
- return containmentScore
- }
- // 3. 使用改进的Jaro-Winkler(考虑中文分词)
- jw := metrics.NewJaroWinkler()
- baseScore := strutil.Similarity(s1, s2, jw)
- // 4. 添加公共子串权重
- substringScore := checkLongestCommonSubstring(s1, s2)
- // 5. 组合得分
- finalScore := 0.5*baseScore + 0.5*substringScore
- return finalScore
- }
// checkContainment reports whether one of the two strings occurs inside the
// other. When it does, the result is 0.9 plus up to 0.1 in proportion to how
// much of the longer string the shorter one covers (lengths are measured in
// bytes); when it does not, the result is 0.
func checkContainment(s1, s2 string) float64 {
	// Only the shorter string can be contained in the longer one.
	shorter, longer := s1, s2
	if len(shorter) > len(longer) {
		shorter, longer = longer, shorter
	}

	if !strings.Contains(longer, shorter) {
		return 0
	}

	coverage := float64(len(shorter)) / float64(len(longer))
	return 0.9 + coverage*0.1
}
// checkLongestCommonSubstring returns the length of the longest run of
// consecutive characters shared by s1 and s2, divided by the average length
// of the two strings. The result lies in [0, 1]; it is 0 when either string
// is empty or when the strings share no character.
//
// Lengths and comparisons are in Unicode code points rather than bytes, so
// two different multi-byte characters that merely share leading UTF-8 bytes
// never count as a partial match. (Each invalid UTF-8 byte is treated as
// U+FFFD by the rune conversion.)
func checkLongestCommonSubstring(s1, s2 string) float64 {
	long, short := []rune(s1), []rune(s2)
	if len(long) == 0 || len(short) == 0 {
		return 0
	}
	// Keep the shorter string on the row axis so the scratch rows stay small.
	if len(long) < len(short) {
		long, short = short, long
	}

	// Classic longest-common-substring DP. Only the previous row is ever
	// consulted, so two rows replace the full table. curr[j+1] holds the
	// length of the common run ending at the current rune of long and
	// short[j].
	prev := make([]int, len(short)+1)
	curr := make([]int, len(short)+1)
	longest := 0
	for _, a := range long {
		for j, b := range short {
			if a != b {
				curr[j+1] = 0
				continue
			}
			curr[j+1] = prev[j] + 1
			if curr[j+1] > longest {
				longest = curr[j+1]
			}
		}
		prev, curr = curr, prev
	}

	// longest / ((m+n)/2), written as 2*longest/(m+n) so the average is not
	// truncated by integer division when the combined length is odd.
	return float64(2*longest) / float64(len(long)+len(short))
}
- func main000() {
- str1 := "绿色低碳节能项目"
- str2 := "河北纵横集团丰南钢铁有限公司绿色低碳节能项目"
- similarity := EnhancedSimilarity(str1, str2)
- fmt.Printf("相似度: %.4f\n", similarity)
- // 更多测试用例
- testCases := []struct {
- s1, s2 string
- }{
- {"绿色低碳", "绿色低碳节能项目"},
- {"钢铁项目", "丰南钢铁有限公司项目"},
- {"节能环保", "新能源开发"},
- {"相同的字符串", "相同的字符串"},
- }
- for _, tc := range testCases {
- sim := EnhancedSimilarity(tc.s1, tc.s2)
- fmt.Printf("'%s' vs '%s': %.4f\n", tc.s1, tc.s2, sim)
- }
- }
|