// Package main computes the cosine similarity between two text strings,
// using gojieba for Chinese word segmentation.
package main

import (
	"math"
	"strings"

	"github.com/axgle/mahonia"
	"github.com/yanyiwu/gojieba"
)

// preprocessText normalizes text before segmentation: it decodes the input
// as GBK into UTF-8, then keeps only ASCII letters and basic CJK ideographs,
// dropping everything else (digits, punctuation, and whitespace included).
//
// NOTE(review): the GBK decode is applied unconditionally — if the input is
// already valid UTF-8 this may mangle it. Confirm the expected input encoding
// with the callers.
func preprocessText(text string) string {
	// Decode from GBK to UTF-8.
	enc := mahonia.NewDecoder("gbk")
	text = enc.ConvertString(text)

	// Keep only ASCII letters and Chinese characters.
	var processedText strings.Builder
	for _, char := range text {
		if (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || isChinese(char) {
			processedText.WriteRune(char)
		}
	}
	return processedText.String()
}

// calculateCosineSimilarity returns the cosine similarity of two term-frequency
// vectors, in [0, 1]. If either vector has zero magnitude (e.g. an empty map),
// it returns 0.0 rather than dividing by zero.
func calculateCosineSimilarity(vector1, vector2 map[string]int) float64 {
	dotProduct := 0
	magnitude1 := 0
	magnitude2 := 0

	// Accumulate the dot product and magnitude terms over vector1.
	// For terms missing from vector2, count2 is the zero value 0, so the
	// dot-product contribution is naturally zero.
	for term, count1 := range vector1 {
		count2, exists := vector2[term]
		dotProduct += count1 * count2
		magnitude1 += count1 * count1
		if exists {
			magnitude2 += count2 * count2
		}
	}

	// Add magnitude contributions from terms that appear only in vector2.
	for term, count2 := range vector2 {
		if _, exists := vector1[term]; !exists {
			magnitude2 += count2 * count2
		}
	}

	// Guard against division by zero for empty/degenerate vectors.
	if magnitude1 == 0 || magnitude2 == 0 {
		return 0.0
	}
	return float64(dotProduct) / (math.Sqrt(float64(magnitude1)) * math.Sqrt(float64(magnitude2)))
}

// calculateTermFrequency segments text with gojieba (full mode, per the
// `true` flag to Cut) and returns a map from each word to its occurrence count.
func calculateTermFrequency(text string, segmenter *gojieba.Jieba) map[string]int {
	words := segmenter.Cut(text, true)
	termFrequency := make(map[string]int)
	for _, word := range words {
		termFrequency[word]++
	}
	return termFrequency
}

// cosineSimilarity computes the cosine similarity between two raw text
// strings: it preprocesses both, builds term-frequency vectors via jieba
// segmentation, and compares them.
//
// NOTE(review): a new Jieba instance (dictionary load) is created per call,
// which is expensive; callers invoking this in a loop may want to hoist the
// segmenter.
func cosineSimilarity(text1, text2 string) float64 {
	segmenter := gojieba.NewJieba()
	defer segmenter.Free()

	text1 = preprocessText(text1)
	text2 = preprocessText(text2)

	vector1 := calculateTermFrequency(text1, segmenter)
	vector2 := calculateTermFrequency(text2, segmenter)

	return calculateCosineSimilarity(vector1, vector2)
}

// isChinese reports whether r falls in the CJK Unified Ideographs block
// (U+4E00–U+9FFF). Extension blocks (e.g. U+3400–U+4DBF) are not covered.
func isChinese(r rune) bool {
	return r >= '\u4e00' && r <= '\u9fff'
}