123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155 |
- package main
- import (
- "context"
- "fmt"
- "github.com/axgle/mahonia"
- "github.com/yanyiwu/gojieba"
- "go.mongodb.org/mongo-driver/bson"
- "go.mongodb.org/mongo-driver/mongo"
- "go.mongodb.org/mongo-driver/mongo/options"
- "go.mongodb.org/mongo-driver/mongo/readpref"
- "log"
- "math"
- "strings"
- )
- func main() {
- // 连接 MongoDB
- clientOptions := options.Client().ApplyURI("mongodb://192.168.3.166:27082")
- // 连接到MongoDB
- client, err := mongo.Connect(context.Background(), clientOptions)
- if err != nil {
- fmt.Println(err)
- }
- defer client.Disconnect(context.TODO())
- // 检查 MongoDB 连接
- err = client.Ping(context.Background(), readpref.Primary())
- if err != nil {
- log.Fatal(err)
- }
- // 获取 MongoDB 中的标题数据
- newsData, err := getNewsDataFromMongoDB(client, "wcc", "hp_news")
- if err != nil {
- log.Fatal(err)
- }
- // 使用 gojieba 进行中文分词
- jieba := gojieba.NewJieba()
- defer jieba.Free()
- // 计算相似度并更新 MongoDB 数据
- updateSimilarityField(newsData, jieba, client, "wcc", "hp_news")
- }
- // getNewsDataFromMongoDB 从 MongoDB 中获取新闻数据
- func getNewsDataFromMongoDB(client *mongo.Client, database, collection string) ([]map[string]interface{}, error) {
- var newsData []map[string]interface{}
- coll := client.Database(database).Collection(collection)
- cur, err := coll.Find(context.Background(), nil)
- if err != nil {
- return nil, err
- }
- defer cur.Close(context.Background())
- for cur.Next(context.Background()) {
- var result map[string]interface{}
- err := cur.Decode(&result)
- if err != nil {
- return nil, err
- }
- newsData = append(newsData, result)
- }
- return newsData, nil
- }
- // preprocessText 对文本进行预处理,包括转换编码和去除非字母字符
- func preprocessText(text string) string {
- // 转换编码为UTF-8
- enc := mahonia.NewDecoder("gbk")
- text = enc.ConvertString(text)
- // 去除非字母字符
- var processedText strings.Builder
- for _, char := range text {
- if (char >= 'a' && char <= 'z') || (char >= 'A' && char <= 'Z') || isChinese(char) {
- processedText.WriteRune(char)
- }
- }
- return processedText.String()
- }
// isChinese reports whether r lies in the inclusive code point range
// U+4E00..U+9FFF.
func isChinese(r rune) bool {
	const (
		first rune = 0x4E00
		last  rune = 0x9FFF
	)
	if r < first {
		return false
	}
	return r <= last
}
- // updateSimilarityField 计算相似度并更新 MongoDB 数据
- func updateSimilarityField(newsData []map[string]interface{}, jieba *gojieba.Jieba, client *mongo.Client, database, collection string) {
- // 计算相似度并更新 MongoDB 数据
- for i := range newsData {
- title1 := newsData[i]["title"].(string)
- title1 = preprocessText(title1)
- vector1 := buildVector(jieba.Cut(title1, true))
- for j := range newsData {
- if i == j {
- continue // 不要计算相同文本的相似度
- }
- title2 := newsData[j]["title"].(string)
- title2 = preprocessText(title2)
- vector2 := buildVector(jieba.Cut(title2, true))
- similarity := cosineSimilarity(vector1, vector2)
- // 更新 MongoDB 中的数据,添加相似度字段
- filter := bson.M{"_id": newsData[i]["_id"]}
- update := bson.M{"$set": bson.M{"similarity": similarity}}
- _, err := client.Database(database).Collection(collection).UpdateOne(context.Background(), filter, update)
- if err != nil {
- log.Println("Error updating document:", err)
- }
- }
- }
- }
// buildVector tallies how often each word occurs in words and returns the
// counts keyed by word. The returned map is never nil, even for empty input.
func buildVector(words []string) map[string]int {
	counts := map[string]int{}
	for idx := 0; idx < len(words); idx++ {
		counts[words[idx]] += 1
	}
	return counts
}
// cosineSimilarity returns the cosine of the angle between two term-frequency
// vectors: their dot product divided by the product of their Euclidean norms.
// It returns 0 when either vector has zero magnitude, which avoids a division
// by zero.
func cosineSimilarity(vector1, vector2 map[string]int) float64 {
	// squaredNorm sums the squares of all counts in v.
	squaredNorm := func(v map[string]int) int {
		total := 0
		for _, c := range v {
			total += c * c
		}
		return total
	}

	norm1, norm2 := squaredNorm(vector1), squaredNorm(vector2)
	if norm1 == 0 || norm2 == 0 {
		return 0.0
	}

	// A word absent from vector2 reads as 0 and so adds nothing to the sum.
	dot := 0
	for word, c1 := range vector1 {
		dot += c1 * vector2[word]
	}

	return float64(dot) / (math.Sqrt(float64(norm1)) * math.Sqrt(float64(norm2)))
}
|