|
@@ -1,243 +0,0 @@
|
|
|
-package main
|
|
|
-
|
|
|
-import (
|
|
|
- "fmt"
|
|
|
- "github.com/yanyiwu/gojieba"
|
|
|
- "gorm.io/gorm"
|
|
|
- util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
|
- "regexp"
|
|
|
- "strings"
|
|
|
-)
|
|
|
-
|
|
|
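// dealAttachment1 (disabled): earlier draft of the bond-attachment handler that
// only printed each attachment's filename; kept commented out for reference.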
|
|
|
-//func dealAttachment1() {
|
|
|
-// sess := Mgo.GetMgoConn()
|
|
|
-// defer Mgo.DestoryMongoConn(sess)
|
|
|
-// query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Select(nil).Iter()
|
|
|
-// count := 0
|
|
|
-//
|
|
|
-// bonds := make([]BondInfo, 0) // MySQL 债券数据
|
|
|
-// for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
|
|
|
-// if count%100 == 0 {
|
|
|
-// log.Info("current:", zap.Int("count", count), zap.Any("title", tmp["title"]))
|
|
|
-// }
|
|
|
-// //
|
|
|
-// if atta, ok := tmp["attachments"]; ok {
|
|
|
-// if atm, ok := atta.(map[string]interface{}); ok {
|
|
|
-// for _, v := range atm {
|
|
|
-// if dd, ok := v.(map[string]interface{}); ok {
|
|
|
-// //attachments 里面的文件名称
|
|
|
-// filename := util.ObjToString(dd["filename"])
|
|
|
-// fmt.Println(filename)
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-//}
|
|
|
-
|
|
|
-func dealAttachment(db *gorm.DB) {
|
|
|
- sess := Mgo.GetMgoConn()
|
|
|
- defer Mgo.DestoryMongoConn(sess)
|
|
|
-
|
|
|
- // 1. 获取 MySQL 所有债券数据
|
|
|
- var bonds []BondInfo
|
|
|
- if err := db.Table("zxz_bond_info").Find(&bonds).Error; err != nil {
|
|
|
- fmt.Println("获取 MySQL 债券数据失败:", err)
|
|
|
- return
|
|
|
- }
|
|
|
-
|
|
|
- // 2. 初始化分词器
|
|
|
- tokenizer := gojieba.NewJieba()
|
|
|
- defer tokenizer.Free()
|
|
|
-
|
|
|
- // 3. 遍历 MongoDB
|
|
|
- query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Iter()
|
|
|
- count := 0
|
|
|
- tmp := make(map[string]interface{})
|
|
|
-
|
|
|
- for query.Next(&tmp) {
|
|
|
- count++
|
|
|
- if count%100 == 0 {
|
|
|
- fmt.Println("Progress:", count)
|
|
|
- }
|
|
|
-
|
|
|
- if attachments, ok := tmp["attachments"].(map[string]interface{}); ok {
|
|
|
- for _, item := range attachments {
|
|
|
- if attMap, ok := item.(map[string]interface{}); ok {
|
|
|
- filename := util.ObjToString(attMap["filename"])
|
|
|
- cleanFilename := strings.TrimSuffix(filename, ".pdf")
|
|
|
- // 提取 MongoDB 附件关键词
|
|
|
- k1 := ExtractKeywords(cleanFilename, tokenizer)
|
|
|
-
|
|
|
- // 逐个匹配 MySQL 中的 bond_name
|
|
|
- for _, bond := range bonds {
|
|
|
- k2 := ExtractKeywords(bond.BondName, tokenizer)
|
|
|
- if IsMatch(k1, k2) {
|
|
|
- fmt.Printf("匹配成功: [%s] <=> [%s]\n", cleanFilename, bond.BondName)
|
|
|
- // 可记录匹配对结果到文件/数据库
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
// parenNormalizerV4 rewrites full-width parentheses (U+FF08, U+FF09) to their
// ASCII forms so that the bracket group of bondTitleV4Re can match them.
var parenNormalizerV4 = strings.NewReplacer("\uFF08", "(", "\uFF09", ")")

// bondTitleV4Re matches, in order: a four-digit year ending in "年", a province
// name ending in "省", a bond type ending in "债券", an optional phase ending in
// "期", and an optional parenthesised note. Compiled once at package scope
// rather than on every call.
var bondTitleV4Re = regexp.MustCompile(`(?P<year>\d{4}年)(?P<province>[\p{Han}]{2,3}省)(?P<type>[\p{Han}]{2,20}债券)(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)

// ExtractKeywordsV4 extracts every "year + province + bond type [+ phase]
// [+ (note)]" sequence found in text.
//
// Each match yields one map with the keys "year", "province", "type", "phase"
// and "bracket". Optional parts that are absent are stored as "". The
// "bracket" value is the text inside the parentheses, without the parentheses
// themselves. Full-width parentheses are accepted as well as ASCII ones.
//
// The result is an empty, non-nil slice when nothing matches.
func ExtractKeywordsV4(text string) []map[string]string {
	text = parenNormalizerV4.Replace(text)

	matches := bondTitleV4Re.FindAllStringSubmatch(text, -1)
	results := make([]map[string]string, 0, len(matches))

	// Every submatch slice has exactly 6 entries (whole match + 5 groups), so
	// the indices below are always in range.
	for _, m := range matches {
		results = append(results, map[string]string{
			"year":     m[1],
			"province": m[2],
			"type":     m[3],
			"phase":    m[4],
			"bracket":  m[5],
		})
	}

	return results
}
|
|
|
-
|
|
|
-func ExtractKeywordsV3WithJieba(text string, tokenizer *gojieba.Jieba) []map[string]string {
|
|
|
- text = strings.ReplaceAll(text, "(", "(")
|
|
|
- text = strings.ReplaceAll(text, ")", ")")
|
|
|
-
|
|
|
- // 提取年份
|
|
|
- yearReg := regexp.MustCompile(`\d{4}年`)
|
|
|
- year := yearReg.FindString(text)
|
|
|
-
|
|
|
- // 提取“年”后的省份
|
|
|
- province := ""
|
|
|
- if year != "" {
|
|
|
- afterYear := text[strings.Index(text, year)+len(year):]
|
|
|
- provinceReg := regexp.MustCompile(`[\p{Han}]{2,3}省`)
|
|
|
- province = provinceReg.FindString(afterYear)
|
|
|
- }
|
|
|
-
|
|
|
- // 债券类型词典
|
|
|
- bondTypes := []string{
|
|
|
- "专项债券", "政府专项债券", "一般债券", "政府一般债券",
|
|
|
- "再融资专项债券", "再融资一般债券", "再融资债券",
|
|
|
- }
|
|
|
-
|
|
|
- // combo 匹配正则
|
|
|
- comboReg := regexp.MustCompile(`(?P<type>[\p{Han}]{2,12}债券)[,、,]?(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)
|
|
|
- matches := comboReg.FindAllStringSubmatch(text, -1)
|
|
|
-
|
|
|
- results := make([]map[string]string, 0, len(matches))
|
|
|
-
|
|
|
- for _, match := range matches {
|
|
|
- if len(match) < 4 {
|
|
|
- continue
|
|
|
- }
|
|
|
- bondType := match[1]
|
|
|
- phase := match[2]
|
|
|
- bracket := match[3]
|
|
|
-
|
|
|
- // 精确类型匹配
|
|
|
- bestMatch := ""
|
|
|
- for _, t := range bondTypes {
|
|
|
- if strings.Contains(bondType, t) {
|
|
|
- bestMatch = t
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
- if bestMatch == "" {
|
|
|
- bestMatch = bondType // fallback
|
|
|
- }
|
|
|
-
|
|
|
- results = append(results, map[string]string{
|
|
|
- "year": year,
|
|
|
- "province": province,
|
|
|
- "type": bestMatch,
|
|
|
- "phase": phase,
|
|
|
- "bracket": bracket,
|
|
|
- })
|
|
|
- }
|
|
|
-
|
|
|
- return results
|
|
|
-}
|
|
|
-
|
|
|
-// ExtractKeywords 提取五类关键词
|
|
|
-func ExtractKeywords(text string, tokenizer *gojieba.Jieba) map[string]string {
|
|
|
- text = removeFileExtension(text)
|
|
|
- // 分词
|
|
|
- words := tokenizer.Cut(text, true)
|
|
|
- wordSet := make(map[string]bool)
|
|
|
- for _, w := range words {
|
|
|
- wordSet[w] = true
|
|
|
- }
|
|
|
-
|
|
|
- // 正则抽取
|
|
|
- yearReg := regexp.MustCompile(`\d{4}(年|年度)`)
|
|
|
- provinceReg := regexp.MustCompile(`20\d{2}年([\p{Han}]{2,3}省)`)
|
|
|
- phaseReg := regexp.MustCompile(`(第[一二三四五六七八九十百]{1,3}期|[一二三四五六七八九十百]{1,3}至[一二三四五六七八九十百]{1,3}期)`)
|
|
|
- bracketReg := regexp.MustCompile(`([^)]+)`)
|
|
|
-
|
|
|
- // 提取关键字段
|
|
|
- year := yearReg.FindString(text)
|
|
|
- province := ""
|
|
|
- if match := provinceReg.FindStringSubmatch(text); len(match) == 2 {
|
|
|
- province = match[1]
|
|
|
- }
|
|
|
- phase := phaseReg.FindString(text)
|
|
|
- bracket := bracketReg.FindString(text)
|
|
|
-
|
|
|
- // 通过词判断类型
|
|
|
- bondType := ""
|
|
|
- for _, t := range []string{"专项债券", "专项债", "一般债券", "一般债", "再融资债", "再融资一般债"} {
|
|
|
- if wordSet[t] {
|
|
|
- bondType = t
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- return map[string]string{
|
|
|
- "year": year,
|
|
|
- "province": province,
|
|
|
- "phase": phase,
|
|
|
- "bracket": bracket,
|
|
|
- "type": bondType,
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
// removeFileExtension strips one trailing document extension from text.
//
// The recognised extensions are .pdf, .doc, .docx, .xls, .xlsx, .txt and .zip.
// Matching ignores letter case, so "report.PDF" is handled like "report.pdf".
// At most one extension is removed; text is returned unchanged when it ends in
// none of them.
func removeFileExtension(text string) string {
	suffixes := []string{".pdf", ".doc", ".docx", ".xls", ".xlsx", ".txt", ".zip"}
	for _, ext := range suffixes {
		// Compare only the tail of text that has the same byte length as ext.
		cut := len(text) - len(ext)
		if cut >= 0 && strings.EqualFold(text[cut:], ext) {
			return text[:cut]
		}
	}
	return text
}
|
|
|
-
|
|
|
// IsMatch reports whether every field of k1 is non-empty and has the same
// value in k2.
//
// Only the keys of k1 are inspected: keys that exist solely in k2 are ignored.
// A field whose value in k1 is "" counts as a mismatch, and a key missing from
// k2 reads as "" and therefore never matches. With no keys in k1 the result is
// true.
func IsMatch(k1, k2 map[string]string) bool {
	for field, want := range k1 {
		if want == "" || want != k2[field] {
			return false
		}
	}
	return true
}
|