spacekv.go 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  // Recognizes space-separated key/value (kv) pairs.
  2. package pretreated
  3. import (
  4. "jy/util"
  5. "regexp"
  6. "strings"
  7. )
  // SpacekvEntity groups the methods that extract key/value pairs from
  // space-separated text. It has no fields.
  type SpacekvEntity struct{}

  // SspacekvEntity is a package-level pointer to a SpacekvEntity.
  var SspacekvEntity = new(SpacekvEntity)

  // filterLine matches punctuation whose presence makes divideKV reject the
  // whole line.
  // NOTE(review): several characters appear twice in the class; this looks like
  // full-width/half-width pairs flattened by an encoding step. Confirm against
  // the repository copy.
  var filterLine = regexp.MustCompile("[::,,。??'\"“”‘’·~!…+=|&*#$【】]")

  // filterSpaceKey matches a parenthesised segment; divideKV removes such
  // segments from a candidate key before validating it.
  var filterSpaceKey = regexp.MustCompile("[((][^((]+[))]")

  // excludeSpaceKey matches characters that make divideKV discard a candidate
  // key (dots, list separators and various bracket forms).
  // NOTE(review): the class contains U+FFFD (replacement character), which is
  // presumably a mis-encoded original character. Confirm against the
  // repository copy.
  var excludeSpaceKey = regexp.MustCompile("[.、�\\[【{{〔<《\\]】}}〕>》]")
  15. func (se *SpacekvEntity) Entrance(text, title string, contactFormat *util.ContactFormat) *util.JobKv {
  16. lines := se.getLines(text)
  17. kvMaps := []*util.Kv{}
  18. for _, line := range lines {
  19. kvMap := se.divideKV(line)
  20. if kvMap == nil {
  21. continue
  22. }
  23. kvMaps = append(kvMaps, kvMap...)
  24. }
  25. FormatContactKv(&kvMaps, title, nil, contactFormat)
  26. kvTags := GetKvTags(kvMaps, title, nil)
  27. return &util.JobKv{
  28. Kvs: kvMaps,
  29. KvTags: kvTags,
  30. }
  31. }
  32. //空格分kv
  33. func (se *SpacekvEntity) divideKV(line string) []*util.Kv {
  34. line = strings.TrimSpace(line)
  35. line = regReplAllSpace.ReplaceAllString(line, " ")
  36. line = TimeHM.ReplaceAllString(line, "D$1H$2M")
  37. if line == "" || strings.Count(line, " ") == 0 || filterLine.MatchString(line) {
  38. return nil
  39. }
  40. kv := strings.Split(line, " ")
  41. kvs := []*util.Kv{}
  42. for i := 0; i+1 <= len(kv)-1; i = i + 2 {
  43. k, v := kv[i], kv[i+1]
  44. k = filterSpaceKey.ReplaceAllString(k, "")
  45. //key字数限制
  46. if len([]rune(k)) <= 1 || len([]rune(k)) > 15 {
  47. continue
  48. }
  49. //过滤key
  50. if excludeSpaceKey.MatchString(k) {
  51. continue
  52. }
  53. kvs = append(kvs, &util.Kv{Key: k, Value: v})
  54. }
  55. return kvs
  56. }
  57. //分段
  58. func (se *SpacekvEntity) getLines(text string) []string {
  59. lines := strings.FieldsFunc(text, func(r rune) bool {
  60. return r == 10 || r == 13
  61. })
  62. arrays := []string{}
  63. for _, line := range lines {
  64. line = regTrimSpace.ReplaceAllString(line, "")
  65. if line == "" {
  66. continue
  67. }
  68. arrays = append(arrays, line)
  69. }
  70. return arrays
  71. }