attach.go 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. package main
  2. import (
  3. "fmt"
  4. "github.com/yanyiwu/gojieba"
  5. "gorm.io/gorm"
  6. util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  7. "regexp"
  8. "strings"
  9. )
  10. // dealAttachment 处理债券附件
  11. //func dealAttachment1() {
  12. // sess := Mgo.GetMgoConn()
  13. // defer Mgo.DestoryMongoConn(sess)
  14. // query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Select(nil).Iter()
  15. // count := 0
  16. //
  17. // bonds := make([]BondInfo, 0) // MySQL 债券数据
  18. // for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
  19. // if count%100 == 0 {
  20. // log.Info("current:", zap.Int("count", count), zap.Any("title", tmp["title"]))
  21. // }
  22. // //
  23. // if atta, ok := tmp["attachments"]; ok {
  24. // if atm, ok := atta.(map[string]interface{}); ok {
  25. // for _, v := range atm {
  26. // if dd, ok := v.(map[string]interface{}); ok {
  27. // //attachments 里面的文件名称
  28. // filename := util.ObjToString(dd["filename"])
  29. // fmt.Println(filename)
  30. // }
  31. // }
  32. // }
  33. // }
  34. // }
  35. //}
  36. func dealAttachment(db *gorm.DB) {
  37. sess := Mgo.GetMgoConn()
  38. defer Mgo.DestoryMongoConn(sess)
  39. // 1. 获取 MySQL 所有债券数据
  40. var bonds []BondInfo
  41. if err := db.Table("zxz_bond_info").Find(&bonds).Error; err != nil {
  42. fmt.Println("获取 MySQL 债券数据失败:", err)
  43. return
  44. }
  45. // 2. 初始化分词器
  46. tokenizer := gojieba.NewJieba()
  47. defer tokenizer.Free()
  48. // 3. 遍历 MongoDB
  49. query := sess.DB("py_theme").C("special_purpose_bond_files_detail").Find(nil).Iter()
  50. count := 0
  51. tmp := make(map[string]interface{})
  52. for query.Next(&tmp) {
  53. count++
  54. if count%100 == 0 {
  55. fmt.Println("Progress:", count)
  56. }
  57. if attachments, ok := tmp["attachments"].(map[string]interface{}); ok {
  58. for _, item := range attachments {
  59. if attMap, ok := item.(map[string]interface{}); ok {
  60. filename := util.ObjToString(attMap["filename"])
  61. cleanFilename := strings.TrimSuffix(filename, ".pdf")
  62. // 提取 MongoDB 附件关键词
  63. k1 := ExtractKeywords(cleanFilename, tokenizer)
  64. // 逐个匹配 MySQL 中的 bond_name
  65. for _, bond := range bonds {
  66. k2 := ExtractKeywords(bond.BondName, tokenizer)
  67. if IsMatch(k1, k2) {
  68. fmt.Printf("匹配成功: [%s] <=> [%s]\n", cleanFilename, bond.BondName)
  69. // 可记录匹配对结果到文件/数据库
  70. break
  71. }
  72. }
  73. }
  74. }
  75. }
  76. }
  77. }
  // v4ParenNormalizer rewrites full-width parentheses to their ASCII forms so
  // that v4BondPattern only has to recognise one bracket style.
  var v4ParenNormalizer = strings.NewReplacer("（", "(", "）", ")")

  // v4BondPattern matches, in order: year, province, bond type, an optional
  // phase and an optional parenthesised note. Compiled once at package scope.
  var v4BondPattern = regexp.MustCompile(`(?P<year>\d{4}年)(?P<province>[\p{Han}]{2,3}省)(?P<type>[\p{Han}]{2,20}债券)(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)

  // ExtractKeywordsV4 finds every "<year><province><type>[phase][(note)]"
  // occurrence in text and returns one map per occurrence with the keys
  // "year", "province", "type", "phase" and "bracket".
  //
  // Full-width parentheses are normalised to ASCII before matching, so a note
  // written as （…） is captured the same way as (…). "bracket" holds the note
  // without its parentheses. Optional parts that are absent are returned as
  // empty strings. When nothing matches, an empty (non-nil) slice is returned.
  func ExtractKeywordsV4(text string) []map[string]string {
  	text = v4ParenNormalizer.Replace(text)

  	matches := v4BondPattern.FindAllStringSubmatch(text, -1)
  	results := make([]map[string]string, 0, len(matches))
  	for _, m := range matches {
  		// m[0] is the whole match; m[1]..m[5] follow the group order of
  		// v4BondPattern. FindAllStringSubmatch always yields one slot per
  		// group, so no length check is needed.
  		results = append(results, map[string]string{
  			"year":     m[1],
  			"province": m[2],
  			"type":     m[3],
  			"phase":    m[4],
  			"bracket":  m[5],
  		})
  	}
  	return results
  }
  104. func ExtractKeywordsV3WithJieba(text string, tokenizer *gojieba.Jieba) []map[string]string {
  105. text = strings.ReplaceAll(text, "(", "(")
  106. text = strings.ReplaceAll(text, ")", ")")
  107. // 提取年份
  108. yearReg := regexp.MustCompile(`\d{4}年`)
  109. year := yearReg.FindString(text)
  110. // 提取“年”后的省份
  111. province := ""
  112. if year != "" {
  113. afterYear := text[strings.Index(text, year)+len(year):]
  114. provinceReg := regexp.MustCompile(`[\p{Han}]{2,3}省`)
  115. province = provinceReg.FindString(afterYear)
  116. }
  117. // 债券类型词典
  118. bondTypes := []string{
  119. "专项债券", "政府专项债券", "一般债券", "政府一般债券",
  120. "再融资专项债券", "再融资一般债券", "再融资债券",
  121. }
  122. // combo 匹配正则
  123. comboReg := regexp.MustCompile(`(?P<type>[\p{Han}]{2,12}债券)[,、,]?(?P<phase>[一二三四五六七八九十百至]{1,6}期)?(?:\((?P<bracket>[^)]+)\))?`)
  124. matches := comboReg.FindAllStringSubmatch(text, -1)
  125. results := make([]map[string]string, 0, len(matches))
  126. for _, match := range matches {
  127. if len(match) < 4 {
  128. continue
  129. }
  130. bondType := match[1]
  131. phase := match[2]
  132. bracket := match[3]
  133. // 精确类型匹配
  134. bestMatch := ""
  135. for _, t := range bondTypes {
  136. if strings.Contains(bondType, t) {
  137. bestMatch = t
  138. break
  139. }
  140. }
  141. if bestMatch == "" {
  142. bestMatch = bondType // fallback
  143. }
  144. results = append(results, map[string]string{
  145. "year": year,
  146. "province": province,
  147. "type": bestMatch,
  148. "phase": phase,
  149. "bracket": bracket,
  150. })
  151. }
  152. return results
  153. }
  154. // ExtractKeywords 提取五类关键词
  155. func ExtractKeywords(text string, tokenizer *gojieba.Jieba) map[string]string {
  156. text = removeFileExtension(text)
  157. // 分词
  158. words := tokenizer.Cut(text, true)
  159. wordSet := make(map[string]bool)
  160. for _, w := range words {
  161. wordSet[w] = true
  162. }
  163. // 正则抽取
  164. yearReg := regexp.MustCompile(`\d{4}(年|年度)`)
  165. provinceReg := regexp.MustCompile(`20\d{2}年([\p{Han}]{2,3}省)`)
  166. phaseReg := regexp.MustCompile(`(第[一二三四五六七八九十百]{1,3}期|[一二三四五六七八九十百]{1,3}至[一二三四五六七八九十百]{1,3}期)`)
  167. bracketReg := regexp.MustCompile(`([^)]+)`)
  168. // 提取关键字段
  169. year := yearReg.FindString(text)
  170. province := ""
  171. if match := provinceReg.FindStringSubmatch(text); len(match) == 2 {
  172. province = match[1]
  173. }
  174. phase := phaseReg.FindString(text)
  175. bracket := bracketReg.FindString(text)
  176. // 通过词判断类型
  177. bondType := ""
  178. for _, t := range []string{"专项债券", "专项债", "一般债券", "一般债", "再融资债", "再融资一般债"} {
  179. if wordSet[t] {
  180. bondType = t
  181. break
  182. }
  183. }
  184. return map[string]string{
  185. "year": year,
  186. "province": province,
  187. "phase": phase,
  188. "bracket": bracket,
  189. "type": bondType,
  190. }
  191. }
  // knownFileExtensions lists the extensions removeFileExtension strips.
  var knownFileExtensions = []string{".pdf", ".doc", ".docx", ".xls", ".xlsx", ".txt", ".zip"}

  // removeFileExtension strips one trailing well-known file extension from
  // text and returns the remainder. The comparison ignores case, so "a.PDF"
  // and "a.pdf" both become "a". Text without a known extension is returned
  // unchanged. At most one extension is removed ("a.tar.zip" becomes "a.tar").
  func removeFileExtension(text string) string {
  	for _, ext := range knownFileExtensions {
  		cut := len(text) - len(ext)
  		// Extensions are pure ASCII, so a case-insensitive match on the tail
  		// guarantees that cut falls on a character boundary.
  		if cut >= 0 && strings.EqualFold(text[cut:], ext) {
  			return text[:cut]
  		}
  	}
  	return text
  }
  // IsMatch reports whether every entry of k1 has a non-empty value that is
  // equal to the value stored under the same key in k2.
  //
  // An empty value in k1 never matches, even when k2 holds an empty value
  // too. Keys that exist only in k2 are ignored, and an empty or nil k1
  // matches anything.
  func IsMatch(k1, k2 map[string]string) bool {
  	for field, want := range k1 {
  		if want == "" || want != k2[field] {
  			return false
  		}
  	}
  	return true
  }