12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- package main
- import (
- "github.com/axgle/mahonia"
- "github.com/yanyiwu/gojieba"
- "math"
- "strings"
- )
- // preprocessText 对文本进行预处理,包括转换编码和去除非字母字符
- func preprocessText(text string) string {
- // 转换编码为UTF-8
- enc := mahonia.NewDecoder("gbk")
- text = enc.ConvertString(text)
- // 去除非字母字符
- var processedText strings.Builder
- for _, char := range text {
- if (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || isChinese(char) {
- processedText.WriteRune(char)
- }
- }
- return processedText.String()
- }
// calculateCosineSimilarity returns the cosine similarity of two sparse
// term-count vectors, i.e. dot(v1, v2) / (|v1| * |v2|).
//
// A term missing from one map counts as zero in that vector. If either
// vector has zero magnitude (including nil or empty maps) the result is 0.
// The result is always within [-1, 1].
func calculateCosineSimilarity(vector1, vector2 map[string]int) float64 {
	var dotProduct, magnitude1, magnitude2 int

	// Looking up a missing key yields 0, so terms that appear only in vector1
	// add nothing to the dot product.
	for term, count1 := range vector1 {
		dotProduct += count1 * vector2[term]
		magnitude1 += count1 * count1
	}
	for _, count2 := range vector2 {
		magnitude2 += count2 * count2
	}

	if magnitude1 == 0 || magnitude2 == 0 {
		return 0.0
	}

	// Take a single square root of the product instead of multiplying two
	// separately rounded roots: Sqrt(3)*Sqrt(3) is slightly below 3, which
	// would push the similarity of identical vectors above 1.
	similarity := float64(dotProduct) / math.Sqrt(float64(magnitude1)*float64(magnitude2))

	// Clamp any residual rounding error (possible once the product of the
	// magnitudes is too large to be represented exactly).
	if similarity > 1 {
		return 1
	}
	if similarity < -1 {
		return -1
	}
	return similarity
}
- // calculateTermFrequency 计算文本中每个词的词频
- func calculateTermFrequency(text string, segmenter *gojieba.Jieba) map[string]int {
- // 使用 gojieba 包进行中文分词
- words := segmenter.Cut(text, true)
- termFrequency := make(map[string]int)
- for _, word := range words {
- termFrequency[word]++
- }
- return termFrequency
- }
- // cosineSimilarity 计算两个文本字符串的余弦相似度
- func cosineSimilarity(text1, text2 string) float64 {
- // 初始化 gojieba 分词器
- segmenter := gojieba.NewJieba()
- defer segmenter.Free()
- // 预处理文本
- text1 = preprocessText(text1)
- text2 = preprocessText(text2)
- // 计算词频向量
- vector1 := calculateTermFrequency(text1, segmenter)
- vector2 := calculateTermFrequency(text2, segmenter)
- // 计算并返回余弦相似度
- return calculateCosineSimilarity(vector1, vector2)
- }
// isChinese reports whether r lies in the inclusive code point range
// U+4E00..U+9FFF.
func isChinese(r rune) bool {
	const (
		rangeFirst = '\u4e00'
		rangeLast  = '\u9fff'
	)
	if r < rangeFirst {
		return false
	}
	return r <= rangeLast
}
|