package main

import (
	"fmt"
	"math"
	"strings"
	"time"

	"github.com/adrg/strutil"
	"github.com/adrg/strutil/metrics"
)

// ScoredResult pairs an EsDocument with a confidence value.
type ScoredResult struct {
	Document   EsDocument
	Confidence float64
}

// InputData describes the target project that candidate documents are
// scored against in calculateConfidenceScore.
type InputData struct {
	ProjectCode       string
	Area              string
	City              string
	ProjectName       string
	ConstructionUnit  string
	AreaCode          string
	PlannedInvestment float64
	BaseDate          time.Time
}

// EsDocument is a candidate record; the struct tags give the JSON key each
// field is decoded from.
type EsDocument struct {
	Id          string  `json:"id"`
	Title       string  `json:"title"`
	Projectname string  `json:"projectname"`
	ProjectCode string  `json:"projectcode"`
	Toptype     string  `json:"toptype"`
	Subtype     string  `json:"subtype"`
	Area        string  `json:"area"`
	City        string  `json:"city"`
	Buyer       string  `json:"buyer"`
	Budget      float64 `json:"budget"`
	Bidamount   float64 `json:"bidamount"`
	Winner      string  `json:"winner"`
	Detail      string  `json:"detail"`
	Publishtime int64   `json:"publishtime"` // passed to time.Unix as seconds
	Agency      string  `json:"agency"`
	SWinner     string  `json:"s_winner"`
	WinnerTel   string  `json:"winnertel"`
	BuyerTel    string  `json:"buyertel"`
	BuyerPerson string  `json:"buyerperson"`
	Score       float64 `json:"score"`
}

// calculateConfidenceScore sums several independent signals describing how
// well candidate matches target. The components and their maximum
// contributions are:
//
//   - project name similarity: 70
//   - construction unit vs. buyer similarity: 10
//   - area equality: 5, plus city equality: 5 (city only counts when the area matched)
//   - target project code found in the candidate detail text: 5
//   - planned investment vs. budget closeness: 5
//   - base date vs. publish time proximity: 5
//
// The theoretical maximum is therefore 105, not 100.
func calculateConfidenceScore(target InputData, candidate EsDocument) float64 {
	var totalScore float64

	// 1. Project name similarity (up to 70 points). The construction unit is
	// stripped from the target name, and both the buyer and the construction
	// unit are stripped from the candidate name, before comparing.
	targetName := strings.ReplaceAll(target.ProjectName, target.ConstructionUnit, "")
	candidateName := strings.ReplaceAll(candidate.Projectname, candidate.Buyer, "")
	candidateName = strings.ReplaceAll(candidateName, target.ConstructionUnit, "")
	nameSimilarity := EnhancedSimilarity(targetName, candidateName)
	totalScore += nameSimilarity * 70.0
	//log.Println(target.ProjectName, candidate.Projectname, nameSimilarity)

	// 2. Construction unit similarity (up to 10 points).
	unitSimilarity := EnhancedSimilarity(target.ConstructionUnit, candidate.Buyer)
	totalScore += unitSimilarity * 10.0
	//log.Println(target.ConstructionUnit, candidate.Buyer, unitSimilarity)

	// Region match: the city is only considered once the area already matches.
	if target.Area != "" && target.Area == candidate.Area {
		totalScore += 5
		if target.City != "" && target.City == candidate.City {
			totalScore += 5
		}
	}

	// 3. Project code presence in the candidate detail text (5 points).
	if target.ProjectCode != "" && strings.Contains(candidate.Detail, target.ProjectCode) {
		totalScore += 5.0
	}

	// 4. Investment amount (up to 5 points), only when both sides are positive.
	if target.PlannedInvestment > 0 && candidate.Budget > 0 {
		diff := math.Abs(target.PlannedInvestment - candidate.Budget)
		// The divisor is floored at 1 so very small investments do not blow up the ratio.
		percentDiff := diff / math.Max(target.PlannedInvestment, 1)
		// Linear decay: full score at 0% difference, zero at 100% difference or more.
		investmentScore := math.Max(0, (1.0-percentDiff)*5.0)
		totalScore += investmentScore
	}

	// 5. Time proximity (up to 5 points). Skipped when the base date's year is
	// not after 2000 or the publish time is not after Unix second 1600000000.
	if target.BaseDate.Year() > 2000 && candidate.Publishtime > 1600000000 {
		totalScore += TimeProximity(target.BaseDate, time.Unix(candidate.Publishtime, 0)) * 5
	}

	return totalScore
}

// TimeProximity maps the absolute distance between t1 and t2, in whole days
// (fractions truncated), to a stepped similarity value: under 100 days is
// 1.0, under 200 is 0.7, under 365 is 0.55, under 700 is 0.3, and anything
// further apart is 0.15.
func TimeProximity(t1, t2 time.Time) float64 {
	days := int(t1.Sub(t2).Abs().Hours() / 24)

	switch {
	case days < 100:
		return 1.0
	case days < 200:
		return 0.7
	case days < 365:
		return 0.55
	case days < 700:
		return 0.3
	default:
		return 0.15
	}
}

// EnhancedSimilarity returns a similarity score for two strings.
//
// Both inputs are trimmed of leading and trailing whitespace only; no other
// normalization is applied. The score is then decided in order:
//
//  1. 0 if either trimmed string is empty.
//  2. 1 if the trimmed strings are equal.
//  3. The checkContainment score if one string contains the other.
//  4. Otherwise the mean of the Jaro-Winkler similarity and the
//     longest-common-substring score.
func EnhancedSimilarity(str1, str2 string) float64 {
	s1 := strings.TrimSpace(str1)
	s2 := strings.TrimSpace(str2)

	if s1 == "" || s2 == "" {
		return 0
	}
	if s1 == s2 {
		return 1.0
	}

	// Containment short-circuits the fuzzy metrics below.
	if containmentScore := checkContainment(s1, s2); containmentScore > 0 {
		return containmentScore
	}

	jw := metrics.NewJaroWinkler()
	baseScore := strutil.Similarity(s1, s2, jw)

	substringScore := checkLongestCommonSubstring(s1, s2)

	// Equal weighting of the two metrics.
	return 0.5*baseScore + 0.5*substringScore
}

// checkContainment reports whether the shorter of the two strings occurs
// inside the longer one. When it does, the result is 0.9 plus up to 0.1 in
// proportion to how much of the longer string the shorter one covers
// (lengths measured in bytes), so the value lies in (0.9, 1.0]. It returns 0
// when there is no containment or when either string is empty.
func checkContainment(s1, s2 string) float64 {
	// Make s1 the shorter string (by byte length, which is what matters for
	// strings.Contains).
	if len(s1) > len(s2) {
		s1, s2 = s2, s1
	}

	// An empty string is trivially "contained" in anything; treating that as
	// a match is meaningless and, with two empty strings, would divide 0 by 0.
	if len(s1) == 0 {
		return 0
	}

	if strings.Contains(s2, s1) {
		ratio := float64(len(s1)) / float64(len(s2))
		return 0.9 + ratio*0.1
	}

	return 0
}

// checkLongestCommonSubstring returns the length of the longest common
// contiguous substring of s1 and s2 divided by the average length of the two
// strings, giving a value in [0, 1]. Lengths are counted in runes so that
// multi-byte characters which merely share leading UTF-8 bytes are not
// treated as partial matches. It returns 0 when either string is empty.
func checkLongestCommonSubstring(s1, s2 string) float64 {
	r1, r2 := []rune(s1), []rune(s2)
	m, n := len(r1), len(r2)
	if m == 0 || n == 0 {
		return 0
	}

	// Rolling two-row DP: curr[j] is the length of the common suffix of
	// r1[:i] and r2[:j]. Index 0 of both rows is never written and stays 0.
	prev := make([]int, n+1)
	curr := make([]int, n+1)
	maxLen := 0

	for i := 1; i <= m; i++ {
		for j := 1; j <= n; j++ {
			if r1[i-1] != r2[j-1] {
				// Rows are reused, so stale values must be cleared explicitly.
				curr[j] = 0
				continue
			}
			curr[j] = prev[j-1] + 1
			if curr[j] > maxLen {
				maxLen = curr[j]
			}
		}
		prev, curr = curr, prev
	}

	// maxLen / ((m+n)/2), computed in floating point so an odd total length
	// does not truncate the average and inflate the score.
	return float64(2*maxLen) / float64(m+n)
}

// main000 prints EnhancedSimilarity results for a handful of sample string
// pairs. It is not called from anywhere in this file.
func main000() {
	str1 := "绿色低碳节能项目"
	str2 := "河北纵横集团丰南钢铁有限公司绿色低碳节能项目"

	similarity := EnhancedSimilarity(str1, str2)
	fmt.Printf("相似度: %.4f\n", similarity)

	// Additional sample pairs.
	testCases := []struct {
		s1, s2 string
	}{
		{"绿色低碳", "绿色低碳节能项目"},
		{"钢铁项目", "丰南钢铁有限公司项目"},
		{"节能环保", "新能源开发"},
		{"相同的字符串", "相同的字符串"},
	}

	for _, tc := range testCases {
		sim := EnhancedSimilarity(tc.s1, tc.s2)
		fmt.Printf("'%s' vs '%s': %.4f\n", tc.s1, tc.s2, sim)
	}
}