main.go 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. package main
  2. import (
  3. "context"
  4. "fmt"
  5. "github.com/axgle/mahonia"
  6. "github.com/yanyiwu/gojieba"
  7. "go.mongodb.org/mongo-driver/bson"
  8. "go.mongodb.org/mongo-driver/mongo"
  9. "go.mongodb.org/mongo-driver/mongo/options"
  10. "go.mongodb.org/mongo-driver/mongo/readpref"
  11. "log"
  12. "math"
  13. "strings"
  14. )
  15. func main() {
  16. // 连接 MongoDB
  17. clientOptions := options.Client().ApplyURI("mongodb://192.168.3.166:27082")
  18. // 连接到MongoDB
  19. client, err := mongo.Connect(context.Background(), clientOptions)
  20. if err != nil {
  21. fmt.Println(err)
  22. }
  23. defer client.Disconnect(context.TODO())
  24. // 检查 MongoDB 连接
  25. err = client.Ping(context.Background(), readpref.Primary())
  26. if err != nil {
  27. log.Fatal(err)
  28. }
  29. // 获取 MongoDB 中的标题数据
  30. newsData, err := getNewsDataFromMongoDB(client, "wcc", "hp_news")
  31. if err != nil {
  32. log.Fatal(err)
  33. }
  34. // 使用 gojieba 进行中文分词
  35. jieba := gojieba.NewJieba()
  36. defer jieba.Free()
  37. // 计算相似度并更新 MongoDB 数据
  38. updateSimilarityField(newsData, jieba, client, "wcc", "hp_news")
  39. }
  40. // getNewsDataFromMongoDB 从 MongoDB 中获取新闻数据
  41. func getNewsDataFromMongoDB(client *mongo.Client, database, collection string) ([]map[string]interface{}, error) {
  42. var newsData []map[string]interface{}
  43. coll := client.Database(database).Collection(collection)
  44. cur, err := coll.Find(context.Background(), nil)
  45. if err != nil {
  46. return nil, err
  47. }
  48. defer cur.Close(context.Background())
  49. for cur.Next(context.Background()) {
  50. var result map[string]interface{}
  51. err := cur.Decode(&result)
  52. if err != nil {
  53. return nil, err
  54. }
  55. newsData = append(newsData, result)
  56. }
  57. return newsData, nil
  58. }
  59. // preprocessText 对文本进行预处理,包括转换编码和去除非字母字符
  60. func preprocessText(text string) string {
  61. // 转换编码为UTF-8
  62. enc := mahonia.NewDecoder("gbk")
  63. text = enc.ConvertString(text)
  64. // 去除非字母字符
  65. var processedText strings.Builder
  66. for _, char := range text {
  67. if (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || isChinese(char) {
  68. processedText.WriteRune(char)
  69. }
  70. }
  71. return processedText.String()
  72. }
  73. // isChinese 判断字符是否为中文
  74. func isChinese(r rune) bool {
  75. return r >= '\u4e00' && r <= '\u9fff'
  76. }
  77. // updateSimilarityField 计算相似度并更新 MongoDB 数据
  78. func updateSimilarityField(newsData []map[string]interface{}, jieba *gojieba.Jieba, client *mongo.Client, database, collection string) {
  79. // 计算相似度并更新 MongoDB 数据
  80. for i := range newsData {
  81. title1 := newsData[i]["title"].(string)
  82. title1 = preprocessText(title1)
  83. vector1 := buildVector(jieba.Cut(title1, true))
  84. for j := range newsData {
  85. if i == j {
  86. continue // 不要计算相同文本的相似度
  87. }
  88. title2 := newsData[j]["title"].(string)
  89. title2 = preprocessText(title2)
  90. vector2 := buildVector(jieba.Cut(title2, true))
  91. similarity := cosineSimilarity(vector1, vector2)
  92. // 更新 MongoDB 中的数据,添加相似度字段
  93. filter := bson.M{"_id": newsData[i]["_id"]}
  94. update := bson.M{"$set": bson.M{"similarity": similarity}}
  95. _, err := client.Database(database).Collection(collection).UpdateOne(context.Background(), filter, update)
  96. if err != nil {
  97. log.Println("Error updating document:", err)
  98. }
  99. }
  100. }
  101. }
  102. // buildVector 构建词频向量
  103. func buildVector(words []string) map[string]int {
  104. vector := make(map[string]int)
  105. for _, word := range words {
  106. vector[word]++
  107. }
  108. return vector
  109. }
  110. // cosineSimilarity 计算余弦相似度
  111. func cosineSimilarity(vector1, vector2 map[string]int) float64 {
  112. dotProduct := 0
  113. magnitude1 := 0
  114. magnitude2 := 0
  115. for word, count1 := range vector1 {
  116. if count2, found := vector2[word]; found {
  117. dotProduct += count1 * count2
  118. }
  119. magnitude1 += count1 * count1
  120. }
  121. for _, count2 := range vector2 {
  122. magnitude2 += count2 * count2
  123. }
  124. if magnitude1 == 0 || magnitude2 == 0 {
  125. return 0.0 // 避免除以零的情况
  126. }
  127. return float64(dotProduct) / (math.Sqrt(float64(magnitude1)) * math.Sqrt(float64(magnitude2)))
  128. }