package main

import (
	"fmt"
	"regexp"
	"strings"

	"github.com/yanyiwu/gojieba"
	"gorm.io/gorm"
	util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
)

// Patterns are compiled once at package scope: the extractors below are
// called inside nested loops, and regexp.MustCompile is comparatively costly.
var (
	// reBondTitle captures, in order: year, province, bond type, optional
	// phase and optional parenthesised note of a bond title.
	reBondTitle = regexp.MustCompile(`(?P<year>\d{4}年)(?P<province>[\p{Han}]{2,3}省)(?P<type>[\p{Han}]{2,20}债券)(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)

	// reBondCombo captures bond type, optional phase and optional
	// parenthesised note; an optional separator may follow the type.
	reBondCombo = regexp.MustCompile(`(?P<type>[\p{Han}]{2,12}债券)[,、，]?(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)

	reYearOnly     = regexp.MustCompile(`\d{4}年`)
	reProvinceOnly = regexp.MustCompile(`[\p{Han}]{2,3}省`)

	reKwYear     = regexp.MustCompile(`\d{4}(年|年度)`)
	reKwProvince = regexp.MustCompile(`20\d{2}年([\p{Han}]{2,3}省)`)
	reKwPhase    = regexp.MustCompile(`(第[一二三四五六七八九十百]{1,3}期|[一二三四五六七八九十百]{1,3}至[一二三四五六七八九十百]{1,3}期)`)
	reKwBracket  = regexp.MustCompile(`\([^)]+\)`)

	// parenNormalizer rewrites full-width parentheses to ASCII ones so the
	// bracket patterns above only have to deal with one form.
	parenNormalizer = strings.NewReplacer("（", "(", "）", ")")

	// bondTypesBySpecificity lists the known bond types, most specific first.
	// The order matters: "专项债券" is a substring of "政府专项债券" and
	// "再融资专项债券", so testing the short names first would make the long
	// ones unreachable.
	bondTypesBySpecificity = []string{
		"再融资专项债券",
		"再融资一般债券",
		"政府专项债券",
		"政府一般债券",
		"再融资债券",
		"专项债券",
		"一般债券",
	}

	// keywordBondTypes are the tokens ExtractKeywords looks for, in priority
	// order, among the words produced by the tokenizer.
	keywordBondTypes = []string{"专项债券", "专项债", "一般债券", "一般债", "再融资债", "再融资一般债"}

	// fileExtensions are the suffixes stripped by removeFileExtension.
	fileExtensions = []string{".pdf", ".doc", ".docx", ".xls", ".xlsx", ".txt", ".zip"}
)

// Legacy implementation, kept for reference only.
//
//func dealAttachment1() {
//	sess := Mgo.GetMgoConn()
//	defer Mgo.DestoryMongoConn(sess)
//	query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Select(nil).Iter()
//	count := 0
//
//	bonds := make([]BondInfo, 0) // MySQL bond data
//	for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
//		if count%100 == 0 {
//			log.Info("current:", zap.Int("count", count), zap.Any("title", tmp["title"]))
//		}
//
//		if atta, ok := tmp["attachments"]; ok {
//			if atm, ok := atta.(map[string]interface{}); ok {
//				for _, v := range atm {
//					if dd, ok := v.(map[string]interface{}); ok {
//						// file name inside attachments
//						filename := util.ObjToString(dd["filename"])
//						fmt.Println(filename)
//					}
//				}
//			}
//		}
//	}
//}

// dealAttachment matches bond attachment file names stored in MongoDB
// (py_theme.special_purpose_bond_files_detail) against the bond names stored
// in the MySQL table zxz_bond_info, and prints every matching pair.
//
// For each attachment only the first matching bond is reported.
func dealAttachment(db *gorm.DB) {
	sess := Mgo.GetMgoConn()
	defer Mgo.DestoryMongoConn(sess)

	// 1. Load every bond row from MySQL.
	var bonds []BondInfo
	if err := db.Table("zxz_bond_info").Find(&bonds).Error; err != nil {
		fmt.Println("获取 MySQL 债券数据失败:", err)
		return
	}

	// 2. Initialise the tokenizer.
	tokenizer := gojieba.NewJieba()
	defer tokenizer.Free()

	// Bond keywords do not depend on the attachment, so extract them once
	// here instead of re-tokenizing every bond name for every file.
	bondKeywords := make([]map[string]string, len(bonds))
	for i := range bonds {
		bondKeywords[i] = ExtractKeywords(bonds[i].BondName, tokenizer)
	}

	// 3. Walk the MongoDB collection.
	query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Iter()
	count := 0
	tmp := make(map[string]interface{})
	for query.Next(&tmp) {
		count++
		if count%100 == 0 {
			fmt.Println("Progress:", count)
		}

		attachments, ok := tmp["attachments"].(map[string]interface{})
		if !ok {
			continue
		}
		for _, item := range attachments {
			attMap, ok := item.(map[string]interface{})
			if !ok {
				continue
			}
			filename := util.ObjToString(attMap["filename"])
			if filename == "" {
				// Nothing to extract; IsMatch would reject it anyway.
				continue
			}
			cleanFilename := removeFileExtension(filename)

			// Keywords of the MongoDB attachment.
			k1 := ExtractKeywords(cleanFilename, tokenizer)

			// Compare against each MySQL bond_name in turn.
			for i, k2 := range bondKeywords {
				if IsMatch(k1, k2) {
					fmt.Printf("匹配成功: [%s] <=> [%s]\n", cleanFilename, bonds[i].BondName)
					// The matched pair could be persisted to a file or database here.
					break
				}
			}
		}
	}

	// NOTE(review): assumes the iterator is an mgo-style Iter whose Close
	// reports any error hit while iterating — confirm against the Mgo wrapper.
	if err := query.Close(); err != nil {
		fmt.Println("MongoDB iteration error:", err)
	}
}

// normalizeParens converts full-width parentheses in text to ASCII ones.
func normalizeParens(text string) string {
	return parenNormalizer.Replace(text)
}

// ExtractKeywordsV4 finds every "<year><province><bond type>[phase][(note)]"
// occurrence in text and returns one map per occurrence with the keys
// "year", "province", "type", "phase" and "bracket". Optional parts that are
// absent yield an empty string. The "bracket" value excludes the parentheses.
func ExtractKeywordsV4(text string) []map[string]string {
	text = normalizeParens(text)

	matches := reBondTitle.FindAllStringSubmatch(text, -1)
	results := make([]map[string]string, 0, len(matches))
	for _, m := range matches {
		// m[0] is the whole match; m[1..5] follow the group order of reBondTitle.
		results = append(results, map[string]string{
			"year":     m[1],
			"province": m[2],
			"type":     m[3],
			"phase":    m[4],
			"bracket":  m[5],
		})
	}
	return results
}

// ExtractKeywordsV3WithJieba extracts a single year and province from text
// and pairs them with every "<bond type>[phase][(note)]" combination found.
//
// The province is searched only in the text following the first year. The
// "type" value is the most specific known bond type contained in the matched
// type text, or the matched text itself when no known type is contained.
// The tokenizer parameter is currently unused and kept for API compatibility.
func ExtractKeywordsV3WithJieba(text string, tokenizer *gojieba.Jieba) []map[string]string {
	text = normalizeParens(text)

	year := reYearOnly.FindString(text)

	province := ""
	if year != "" {
		afterYear := text[strings.Index(text, year)+len(year):]
		province = reProvinceOnly.FindString(afterYear)
	}

	matches := reBondCombo.FindAllStringSubmatch(text, -1)
	results := make([]map[string]string, 0, len(matches))
	for _, m := range matches {
		rawType, phase, bracket := m[1], m[2], m[3]

		bondType := rawType // fallback when no known type is contained
		for _, known := range bondTypesBySpecificity {
			if strings.Contains(rawType, known) {
				bondType = known
				break
			}
		}

		results = append(results, map[string]string{
			"year":     year,
			"province": province,
			"type":     bondType,
			"phase":    phase,
			"bracket":  bracket,
		})
	}
	return results
}

// ExtractKeywords extracts five keyword fields from text and returns them in
// a map with the keys "year", "province", "phase", "bracket" and "type".
// A field that cannot be found is an empty string.
//
// A known file extension is stripped and full-width parentheses are
// normalised before extraction. "bracket" is the first parenthesised note,
// including its (ASCII) parentheses. "type" is the first entry of
// keywordBondTypes that the tokenizer produced as a word.
func ExtractKeywords(text string, tokenizer *gojieba.Jieba) map[string]string {
	text = normalizeParens(removeFileExtension(text))

	// Tokenize and index the words for membership tests.
	words := tokenizer.Cut(text, true)
	wordSet := make(map[string]struct{}, len(words))
	for _, w := range words {
		wordSet[w] = struct{}{}
	}

	province := ""
	if m := reKwProvince.FindStringSubmatch(text); len(m) == 2 {
		province = m[1]
	}

	bondType := ""
	for _, t := range keywordBondTypes {
		if _, ok := wordSet[t]; ok {
			bondType = t
			break
		}
	}

	return map[string]string{
		"year":     reKwYear.FindString(text),
		"province": province,
		"phase":    reKwPhase.FindString(text),
		"bracket":  reKwBracket.FindString(text),
		"type":     bondType,
	}
}

// removeFileExtension strips one common document extension from the end of
// text. The comparison is case-sensitive; text is returned unchanged when it
// ends with none of the known extensions.
func removeFileExtension(text string) string {
	for _, ext := range fileExtensions {
		if strings.HasSuffix(text, ext) {
			return strings.TrimSuffix(text, ext)
		}
	}
	return text
}

// IsMatch reports whether every entry of k1 is non-empty and equal to the
// entry with the same key in k2. An empty k1 therefore matches anything,
// and keys present only in k2 are ignored.
func IsMatch(k1, k2 map[string]string) bool {
	for key, want := range k1 {
		if want == "" || want != k2[key] {
			return false
		}
	}
	return true
}