utils.go 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. package main
import (
	"math"
	"strings"
	"unicode/utf8"

	"github.com/axgle/mahonia"
	"github.com/yanyiwu/gojieba"
)
  8. // preprocessText 对文本进行预处理,包括转换编码和去除非字母字符
  9. func preprocessText(text string) string {
  10. // 转换编码为UTF-8
  11. enc := mahonia.NewDecoder("gbk")
  12. text = enc.ConvertString(text)
  13. // 去除非字母字符
  14. var processedText strings.Builder
  15. for _, char := range text {
  16. if (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || isChinese(char) {
  17. processedText.WriteRune(char)
  18. }
  19. }
  20. return processedText.String()
  21. }
  22. // calculateCosineSimilarity 计算两个向量的余弦相似度
  23. func calculateCosineSimilarity(vector1, vector2 map[string]int) float64 {
  24. dotProduct := 0
  25. magnitude1 := 0
  26. magnitude2 := 0
  27. // 计算点积和向量长度
  28. for term, count1 := range vector1 {
  29. count2, exists := vector2[term]
  30. dotProduct += count1 * count2
  31. magnitude1 += count1 * count1
  32. if exists {
  33. magnitude2 += count2 * count2
  34. }
  35. }
  36. // 处理向量2中独有的项
  37. for term, count2 := range vector2 {
  38. if _, exists := vector1[term]; !exists {
  39. magnitude2 += count2 * count2
  40. }
  41. }
  42. // 计算余弦相似度
  43. if magnitude1 == 0 || magnitude2 == 0 {
  44. return 0.0
  45. }
  46. return float64(dotProduct) / (math.Sqrt(float64(magnitude1)) * math.Sqrt(float64(magnitude2)))
  47. }
  48. // calculateTermFrequency 计算文本中每个词的词频
  49. func calculateTermFrequency(text string, segmenter *gojieba.Jieba) map[string]int {
  50. // 使用 gojieba 包进行中文分词
  51. words := segmenter.Cut(text, true)
  52. termFrequency := make(map[string]int)
  53. for _, word := range words {
  54. termFrequency[word]++
  55. }
  56. return termFrequency
  57. }
  58. // cosineSimilarity 计算两个文本字符串的余弦相似度
  59. func cosineSimilarity(text1, text2 string) float64 {
  60. // 初始化 gojieba 分词器
  61. segmenter := gojieba.NewJieba()
  62. defer segmenter.Free()
  63. // 预处理文本
  64. text1 = preprocessText(text1)
  65. text2 = preprocessText(text2)
  66. // 计算词频向量
  67. vector1 := calculateTermFrequency(text1, segmenter)
  68. vector2 := calculateTermFrequency(text2, segmenter)
  69. // 计算并返回余弦相似度
  70. return calculateCosineSimilarity(vector1, vector2)
  71. }
  72. // isChinese 判断字符是否为中文
  73. func isChinese(r rune) bool {
  74. return r >= '\u4e00' && r <= '\u9fff'
  75. }