|
@@ -0,0 +1,243 @@
|
|
|
+package main
|
|
|
+
|
|
|
+import (
|
|
|
+ "fmt"
|
|
|
+ "github.com/yanyiwu/gojieba"
|
|
|
+ "gorm.io/gorm"
|
|
|
+ util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
|
+ "regexp"
|
|
|
+ "strings"
|
|
|
+)
|
|
|
+
|
|
|
+// dealAttachment 处理债券附件
|
|
|
+//func dealAttachment1() {
|
|
|
+// sess := Mgo.GetMgoConn()
|
|
|
+// defer Mgo.DestoryMongoConn(sess)
|
|
|
+// query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Select(nil).Iter()
|
|
|
+// count := 0
|
|
|
+//
|
|
|
+// bonds := make([]BondInfo, 0) // MySQL 债券数据
|
|
|
+// for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
|
|
|
+// if count%100 == 0 {
|
|
|
+// log.Info("current:", zap.Int("count", count), zap.Any("title", tmp["title"]))
|
|
|
+// }
|
|
|
+// //
|
|
|
+// if atta, ok := tmp["attachments"]; ok {
|
|
|
+// if atm, ok := atta.(map[string]interface{}); ok {
|
|
|
+// for _, v := range atm {
|
|
|
+// if dd, ok := v.(map[string]interface{}); ok {
|
|
|
+// //attachments 里面的文件名称
|
|
|
+// filename := util.ObjToString(dd["filename"])
|
|
|
+// fmt.Println(filename)
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+//}
|
|
|
+
|
|
|
+func dealAttachment(db *gorm.DB) {
|
|
|
+ sess := Mgo.GetMgoConn()
|
|
|
+ defer Mgo.DestoryMongoConn(sess)
|
|
|
+
|
|
|
+ // 1. 获取 MySQL 所有债券数据
|
|
|
+ var bonds []BondInfo
|
|
|
+ if err := db.Table("zxz_bond_info").Find(&bonds).Error; err != nil {
|
|
|
+ fmt.Println("获取 MySQL 债券数据失败:", err)
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ // 2. 初始化分词器
|
|
|
+ tokenizer := gojieba.NewJieba()
|
|
|
+ defer tokenizer.Free()
|
|
|
+
|
|
|
+ // 3. 遍历 MongoDB
|
|
|
+ query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Iter()
|
|
|
+ count := 0
|
|
|
+ tmp := make(map[string]interface{})
|
|
|
+
|
|
|
+ for query.Next(&tmp) {
|
|
|
+ count++
|
|
|
+ if count%100 == 0 {
|
|
|
+ fmt.Println("Progress:", count)
|
|
|
+ }
|
|
|
+
|
|
|
+ if attachments, ok := tmp["attachments"].(map[string]interface{}); ok {
|
|
|
+ for _, item := range attachments {
|
|
|
+ if attMap, ok := item.(map[string]interface{}); ok {
|
|
|
+ filename := util.ObjToString(attMap["filename"])
|
|
|
+ cleanFilename := strings.TrimSuffix(filename, ".pdf")
|
|
|
+ // 提取 MongoDB 附件关键词
|
|
|
+ k1 := ExtractKeywords(cleanFilename, tokenizer)
|
|
|
+
|
|
|
+ // 逐个匹配 MySQL 中的 bond_name
|
|
|
+ for _, bond := range bonds {
|
|
|
+ k2 := ExtractKeywords(bond.BondName, tokenizer)
|
|
|
+ if IsMatch(k1, k2) {
|
|
|
+ fmt.Printf("匹配成功: [%s] <=> [%s]\n", cleanFilename, bond.BondName)
|
|
|
+ // 可记录匹配对结果到文件/数据库
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
// keywordsV4Brackets rewrites full-width parentheses (U+FF08, U+FF09) to their
// ASCII forms so that keywordsV4Pattern can recognise the bracket note.
var keywordsV4Brackets = strings.NewReplacer("\uFF08", "(", "\uFF09", ")")

// keywordsV4Pattern matches, in order: year, province, bond type, an optional
// phase and an optional parenthesised note. It is compiled once at package
// scope instead of on every call.
var keywordsV4Pattern = regexp.MustCompile(`(?P<year>\d{4}年)(?P<province>[\p{Han}]{2,3}省)(?P<type>[\p{Han}]{2,20}债券)(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)

// ExtractKeywordsV4 extracts bond keywords from text.
//
// Every non-overlapping occurrence of "<year><province><type>[phase][(note)]"
// yields one map with the keys "year", "province", "type", "phase" and
// "bracket". Optional parts that are absent are stored as empty strings, and
// "bracket" holds the note without its parentheses. Full-width parentheses are
// treated like ASCII ones. The returned slice is non-nil and empty when
// nothing matches.
func ExtractKeywordsV4(text string) []map[string]string {
	text = keywordsV4Brackets.Replace(text)

	matches := keywordsV4Pattern.FindAllStringSubmatch(text, -1)
	results := make([]map[string]string, 0, len(matches))

	// Each match always carries the full text plus the five capture groups, so
	// no length check is needed before indexing.
	for _, m := range matches {
		results = append(results, map[string]string{
			"year":     m[1],
			"province": m[2],
			"type":     m[3],
			"phase":    m[4],
			"bracket":  m[5],
		})
	}

	return results
}
|
|
|
+
|
|
|
+func ExtractKeywordsV3WithJieba(text string, tokenizer *gojieba.Jieba) []map[string]string {
|
|
|
+ text = strings.ReplaceAll(text, "(", "(")
|
|
|
+ text = strings.ReplaceAll(text, ")", ")")
|
|
|
+
|
|
|
+ // 提取年份
|
|
|
+ yearReg := regexp.MustCompile(`\d{4}年`)
|
|
|
+ year := yearReg.FindString(text)
|
|
|
+
|
|
|
+ // 提取“年”后的省份
|
|
|
+ province := ""
|
|
|
+ if year != "" {
|
|
|
+ afterYear := text[strings.Index(text, year)+len(year):]
|
|
|
+ provinceReg := regexp.MustCompile(`[\p{Han}]{2,3}省`)
|
|
|
+ province = provinceReg.FindString(afterYear)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 债券类型词典
|
|
|
+ bondTypes := []string{
|
|
|
+ "专项债券", "政府专项债券", "一般债券", "政府一般债券",
|
|
|
+ "再融资专项债券", "再融资一般债券", "再融资债券",
|
|
|
+ }
|
|
|
+
|
|
|
+ // combo 匹配正则
|
|
|
+ comboReg := regexp.MustCompile(`(?P<type>[\p{Han}]{2,12}债券)[,、,]?(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)
|
|
|
+ matches := comboReg.FindAllStringSubmatch(text, -1)
|
|
|
+
|
|
|
+ results := make([]map[string]string, 0, len(matches))
|
|
|
+
|
|
|
+ for _, match := range matches {
|
|
|
+ if len(match) < 4 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ bondType := match[1]
|
|
|
+ phase := match[2]
|
|
|
+ bracket := match[3]
|
|
|
+
|
|
|
+ // 精确类型匹配
|
|
|
+ bestMatch := ""
|
|
|
+ for _, t := range bondTypes {
|
|
|
+ if strings.Contains(bondType, t) {
|
|
|
+ bestMatch = t
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if bestMatch == "" {
|
|
|
+ bestMatch = bondType // fallback
|
|
|
+ }
|
|
|
+
|
|
|
+ results = append(results, map[string]string{
|
|
|
+ "year": year,
|
|
|
+ "province": province,
|
|
|
+ "type": bestMatch,
|
|
|
+ "phase": phase,
|
|
|
+ "bracket": bracket,
|
|
|
+ })
|
|
|
+ }
|
|
|
+
|
|
|
+ return results
|
|
|
+}
|
|
|
+
|
|
|
+// ExtractKeywords 提取五类关键词
|
|
|
+func ExtractKeywords(text string, tokenizer *gojieba.Jieba) map[string]string {
|
|
|
+ text = removeFileExtension(text)
|
|
|
+ // 分词
|
|
|
+ words := tokenizer.Cut(text, true)
|
|
|
+ wordSet := make(map[string]bool)
|
|
|
+ for _, w := range words {
|
|
|
+ wordSet[w] = true
|
|
|
+ }
|
|
|
+
|
|
|
+ // 正则抽取
|
|
|
+ yearReg := regexp.MustCompile(`\d{4}(年|年度)`)
|
|
|
+ provinceReg := regexp.MustCompile(`20\d{2}年([\p{Han}]{2,3}省)`)
|
|
|
+ phaseReg := regexp.MustCompile(`(第[一二三四五六七八九十百]{1,3}期|[一二三四五六七八九十百]{1,3}至[一二三四五六七八九十百]{1,3}期)`)
|
|
|
+ bracketReg := regexp.MustCompile(`([^)]+)`)
|
|
|
+
|
|
|
+ // 提取关键字段
|
|
|
+ year := yearReg.FindString(text)
|
|
|
+ province := ""
|
|
|
+ if match := provinceReg.FindStringSubmatch(text); len(match) == 2 {
|
|
|
+ province = match[1]
|
|
|
+ }
|
|
|
+ phase := phaseReg.FindString(text)
|
|
|
+ bracket := bracketReg.FindString(text)
|
|
|
+
|
|
|
+ // 通过词判断类型
|
|
|
+ bondType := ""
|
|
|
+ for _, t := range []string{"专项债券", "专项债", "一般债券", "一般债", "再融资债", "再融资一般债"} {
|
|
|
+ if wordSet[t] {
|
|
|
+ bondType = t
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return map[string]string{
|
|
|
+ "year": year,
|
|
|
+ "province": province,
|
|
|
+ "phase": phase,
|
|
|
+ "bracket": bracket,
|
|
|
+ "type": bondType,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
// removeFileExtension strips one trailing, well-known file extension
// (.pdf, .doc, .docx, .xls, .xlsx, .txt, .zip) from text.
//
// The comparison ignores case, so "REPORT.PDF" becomes "REPORT". Text that
// does not end in one of the listed extensions is returned unchanged.
func removeFileExtension(text string) string {
	suffixes := [...]string{".pdf", ".doc", ".docx", ".xls", ".xlsx", ".txt", ".zip"}
	for _, ext := range suffixes {
		// Compare only the tail of text so the original casing of the
		// remaining name is preserved.
		if cut := len(text) - len(ext); cut >= 0 && strings.EqualFold(text[cut:], ext) {
			return text[:cut]
		}
	}
	return text
}
|
|
|
+
|
|
|
// IsMatch reports whether every keyword in k1 is non-empty and has the same
// value under the same key in k2.
//
// Keys that exist only in k2 are ignored. An empty or nil k1 carries no
// keywords to compare and therefore never matches.
func IsMatch(k1, k2 map[string]string) bool {
	if len(k1) == 0 {
		return false
	}
	for field, want := range k1 {
		// An empty field means the keyword could not be extracted; treat
		// that as a mismatch rather than as agreement.
		if want == "" || want != k2[field] {
			return false
		}
	}
	return true
}
|