123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243 |
- package main
- import (
- "fmt"
- "github.com/yanyiwu/gojieba"
- "gorm.io/gorm"
- util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "regexp"
- "strings"
- )
- // dealAttachment 处理债券附件
- //func dealAttachment1() {
- // sess := Mgo.GetMgoConn()
- // defer Mgo.DestoryMongoConn(sess)
- // query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Select(nil).Iter()
- // count := 0
- //
- // bonds := make([]BondInfo, 0) // MySQL 债券数据
- // for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
- // if count%100 == 0 {
- // log.Info("current:", zap.Int("count", count), zap.Any("title", tmp["title"]))
- // }
- // //
- // if atta, ok := tmp["attachments"]; ok {
- // if atm, ok := atta.(map[string]interface{}); ok {
- // for _, v := range atm {
- // if dd, ok := v.(map[string]interface{}); ok {
- // //attachments 里面的文件名称
- // filename := util.ObjToString(dd["filename"])
- // fmt.Println(filename)
- // }
- // }
- // }
- // }
- // }
- //}
- func dealAttachment(db *gorm.DB) {
- sess := Mgo.GetMgoConn()
- defer Mgo.DestoryMongoConn(sess)
- // 1. 获取 MySQL 所有债券数据
- var bonds []BondInfo
- if err := db.Table("zxz_bond_info").Find(&bonds).Error; err != nil {
- fmt.Println("获取 MySQL 债券数据失败:", err)
- return
- }
- // 2. 初始化分词器
- tokenizer := gojieba.NewJieba()
- defer tokenizer.Free()
- // 3. 遍历 MongoDB
- query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Iter()
- count := 0
- tmp := make(map[string]interface{})
- for query.Next(&tmp) {
- count++
- if count%100 == 0 {
- fmt.Println("Progress:", count)
- }
- if attachments, ok := tmp["attachments"].(map[string]interface{}); ok {
- for _, item := range attachments {
- if attMap, ok := item.(map[string]interface{}); ok {
- filename := util.ObjToString(attMap["filename"])
- cleanFilename := strings.TrimSuffix(filename, ".pdf")
- // 提取 MongoDB 附件关键词
- k1 := ExtractKeywords(cleanFilename, tokenizer)
- // 逐个匹配 MySQL 中的 bond_name
- for _, bond := range bonds {
- k2 := ExtractKeywords(bond.BondName, tokenizer)
- if IsMatch(k1, k2) {
- fmt.Printf("匹配成功: [%s] <=> [%s]\n", cleanFilename, bond.BondName)
- // 可记录匹配对结果到文件/数据库
- break
- }
- }
- }
- }
- }
- }
- }
// bondTitlePatternV4 captures, in order: the year ("NNNN年"), the province
// ("..省"), the bond type (ending in "债券"), an optional phase ("..期") and an
// optional note enclosed in ASCII parentheses. It is compiled once at package
// scope rather than on every call.
var bondTitlePatternV4 = regexp.MustCompile(`(?P<year>\d{4}年)(?P<province>[\p{Han}]{2,3}省)(?P<type>[\p{Han}]{2,20}债券)(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)

// ExtractKeywordsV4 finds every "year + province + bond type [+ phase]
// [+ (note)]" sequence in text and returns one map per occurrence with the
// keys "year", "province", "type", "phase" and "bracket". Optional parts that
// are absent yield an empty string. The result is an empty, non-nil slice when
// nothing matches.
func ExtractKeywordsV4(text string) []map[string]string {
	// Normalise full-width parentheses (U+FF08 / U+FF09) to ASCII so the
	// bracket group of the pattern can match them. The escapes are spelled
	// out so the two look-alike characters cannot be confused.
	text = strings.ReplaceAll(text, "\uff08", "(")
	text = strings.ReplaceAll(text, "\uff09", ")")

	matches := bondTitlePatternV4.FindAllStringSubmatch(text, -1)
	results := make([]map[string]string, 0, len(matches))
	for _, m := range matches {
		// m always holds the full match plus the five capture groups.
		results = append(results, map[string]string{
			"year":     m[1],
			"province": m[2],
			"type":     m[3],
			"phase":    m[4],
			"bracket":  m[5],
		})
	}
	return results
}
- func ExtractKeywordsV3WithJieba(text string, tokenizer *gojieba.Jieba) []map[string]string {
- text = strings.ReplaceAll(text, "(", "(")
- text = strings.ReplaceAll(text, ")", ")")
- // 提取年份
- yearReg := regexp.MustCompile(`\d{4}年`)
- year := yearReg.FindString(text)
- // 提取“年”后的省份
- province := ""
- if year != "" {
- afterYear := text[strings.Index(text, year)+len(year):]
- provinceReg := regexp.MustCompile(`[\p{Han}]{2,3}省`)
- province = provinceReg.FindString(afterYear)
- }
- // 债券类型词典
- bondTypes := []string{
- "专项债券", "政府专项债券", "一般债券", "政府一般债券",
- "再融资专项债券", "再融资一般债券", "再融资债券",
- }
- // combo 匹配正则
- comboReg := regexp.MustCompile(`(?P<type>[\p{Han}]{2,12}债券)[,、,]?(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)
- matches := comboReg.FindAllStringSubmatch(text, -1)
- results := make([]map[string]string, 0, len(matches))
- for _, match := range matches {
- if len(match) < 4 {
- continue
- }
- bondType := match[1]
- phase := match[2]
- bracket := match[3]
- // 精确类型匹配
- bestMatch := ""
- for _, t := range bondTypes {
- if strings.Contains(bondType, t) {
- bestMatch = t
- break
- }
- }
- if bestMatch == "" {
- bestMatch = bondType // fallback
- }
- results = append(results, map[string]string{
- "year": year,
- "province": province,
- "type": bestMatch,
- "phase": phase,
- "bracket": bracket,
- })
- }
- return results
- }
- // ExtractKeywords 提取五类关键词
- func ExtractKeywords(text string, tokenizer *gojieba.Jieba) map[string]string {
- text = removeFileExtension(text)
- // 分词
- words := tokenizer.Cut(text, true)
- wordSet := make(map[string]bool)
- for _, w := range words {
- wordSet[w] = true
- }
- // 正则抽取
- yearReg := regexp.MustCompile(`\d{4}(年|年度)`)
- provinceReg := regexp.MustCompile(`20\d{2}年([\p{Han}]{2,3}省)`)
- phaseReg := regexp.MustCompile(`(第[一二三四五六七八九十百]{1,3}期|[一二三四五六七八九十百]{1,3}至[一二三四五六七八九十百]{1,3}期)`)
- bracketReg := regexp.MustCompile(`([^)]+)`)
- // 提取关键字段
- year := yearReg.FindString(text)
- province := ""
- if match := provinceReg.FindStringSubmatch(text); len(match) == 2 {
- province = match[1]
- }
- phase := phaseReg.FindString(text)
- bracket := bracketReg.FindString(text)
- // 通过词判断类型
- bondType := ""
- for _, t := range []string{"专项债券", "专项债", "一般债券", "一般债", "再融资债", "再融资一般债"} {
- if wordSet[t] {
- bondType = t
- break
- }
- }
- return map[string]string{
- "year": year,
- "province": province,
- "phase": phase,
- "bracket": bracket,
- "type": bondType,
- }
- }
// removeFileExtension strips one trailing common document or archive extension
// (.pdf, .doc, .docx, .xls, .xlsx, .txt, .zip) from text and returns the rest.
// The comparison ignores letter case, so "report.PDF" and "report.pdf" both
// yield "report". At most one extension is removed; text is returned unchanged
// when it ends in none of the listed extensions.
func removeFileExtension(text string) string {
	suffixes := [...]string{".pdf", ".doc", ".docx", ".xls", ".xlsx", ".txt", ".zip"}
	for _, ext := range suffixes {
		if len(text) < len(ext) {
			continue
		}
		cut := len(text) - len(ext)
		// The extensions are pure ASCII, so a tail that folds equal to one is
		// ASCII too and cut always falls on a rune boundary.
		if strings.EqualFold(text[cut:], ext) {
			return text[:cut]
		}
	}
	return text
}
// IsMatch reports whether every field of k1 is non-empty and has the same
// value under the same key in k2. A single empty field in k1 makes the result
// false, keys that exist only in k2 are ignored, and a k1 without any keys
// matches everything.
func IsMatch(k1, k2 map[string]string) bool {
	for field, want := range k1 {
		if want == "" || k2[field] != want {
			return false
		}
	}
	return true
}
|