package main import ( "context" "fmt" "github.com/axgle/mahonia" "github.com/yanyiwu/gojieba" "go.mongodb.org/mongo-driver/bson" "go.mongodb.org/mongo-driver/mongo" "go.mongodb.org/mongo-driver/mongo/options" "go.mongodb.org/mongo-driver/mongo/readpref" "log" "math" "strings" ) func main() { // 连接 MongoDB clientOptions := options.Client().ApplyURI("mongodb://192.168.3.166:27082") // 连接到MongoDB client, err := mongo.Connect(context.Background(), clientOptions) if err != nil { fmt.Println(err) } defer client.Disconnect(context.TODO()) // 检查 MongoDB 连接 err = client.Ping(context.Background(), readpref.Primary()) if err != nil { log.Fatal(err) } // 获取 MongoDB 中的标题数据 newsData, err := getNewsDataFromMongoDB(client, "wcc", "hp_news") if err != nil { log.Fatal(err) } // 使用 gojieba 进行中文分词 jieba := gojieba.NewJieba() defer jieba.Free() // 计算相似度并更新 MongoDB 数据 updateSimilarityField(newsData, jieba, client, "wcc", "hp_news") } // getNewsDataFromMongoDB 从 MongoDB 中获取新闻数据 func getNewsDataFromMongoDB(client *mongo.Client, database, collection string) ([]map[string]interface{}, error) { var newsData []map[string]interface{} coll := client.Database(database).Collection(collection) cur, err := coll.Find(context.Background(), nil) if err != nil { return nil, err } defer cur.Close(context.Background()) for cur.Next(context.Background()) { var result map[string]interface{} err := cur.Decode(&result) if err != nil { return nil, err } newsData = append(newsData, result) } return newsData, nil } // preprocessText 对文本进行预处理,包括转换编码和去除非字母字符 func preprocessText(text string) string { // 转换编码为UTF-8 enc := mahonia.NewDecoder("gbk") text = enc.ConvertString(text) // 去除非字母字符 var processedText strings.Builder for _, char := range text { if (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || isChinese(char) { processedText.WriteRune(char) } } return processedText.String() } // isChinese 判断字符是否为中文 func isChinese(r rune) bool { return r >= '\u4e00' && r <= '\u9fff' } // updateSimilarityField 计算相似度并更新 MongoDB 数据 func updateSimilarityField(newsData []map[string]interface{}, jieba *gojieba.Jieba, client *mongo.Client, database, collection string) { // 计算相似度并更新 MongoDB 数据 for i := range newsData { title1 := newsData[i]["title"].(string) title1 = preprocessText(title1) vector1 := buildVector(jieba.Cut(title1, true)) for j := range newsData { if i == j { continue // 不要计算相同文本的相似度 } title2 := newsData[j]["title"].(string) title2 = preprocessText(title2) vector2 := buildVector(jieba.Cut(title2, true)) similarity := cosineSimilarity(vector1, vector2) // 更新 MongoDB 中的数据,添加相似度字段 filter := bson.M{"_id": newsData[i]["_id"]} update := bson.M{"$set": bson.M{"similarity": similarity}} _, err := client.Database(database).Collection(collection).UpdateOne(context.Background(), filter, update) if err != nil { log.Println("Error updating document:", err) } } } } // buildVector 构建词频向量 func buildVector(words []string) map[string]int { vector := make(map[string]int) for _, word := range words { vector[word]++ } return vector } // cosineSimilarity 计算余弦相似度 func cosineSimilarity(vector1, vector2 map[string]int) float64 { dotProduct := 0 magnitude1 := 0 magnitude2 := 0 for word, count1 := range vector1 { if count2, found := vector2[word]; found { dotProduct += count1 * count2 } magnitude1 += count1 * count1 } for _, count2 := range vector2 { magnitude2 += count2 * count2 } if magnitude1 == 0 || magnitude2 == 0 { return 0.0 // 避免除以零的情况 } return float64(dotProduct) / (math.Sqrt(float64(magnitude1)) * math.Sqrt(float64(magnitude2))) }