htmlclear.go 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. package util
  2. import (
  3. mc "app.yhyue.com/moapp/jybase/common"
  4. "regexp"
  5. "strings"
  6. )
  7. //
  // Cut removes markup from HTML fragments. It bundles the regular
  // expressions that ClearHtml applies; obtain one from NewCut, because the
  // zero value holds nil patterns and ClearHtml would panic on it.
  type Cut struct {
  	// scriptTag matches a whole <script> or <style> element, body included.
  	scriptTag *regexp.Regexp
  	// sa matches an anchor element; capture group 1 is the text between the
  	// opening and closing tag.
  	sa *regexp.Regexp
  	// replBlankLine matches a run of whitespace that ends in CR or LF.
  	replBlankLine *regexp.Regexp
  	// replStartWrap matches whitespace (ASCII, U+3000, U+2003, U+00A0) at the
  	// very start or very end of the text.
  	replStartWrap *regexp.Regexp
  	// multiCR matches one or more line breaks, each optionally followed by
  	// whitespace (ASCII, U+3000, U+2003, U+00A0).
  	multiCR *regexp.Regexp
  	// tag matches any single tag of the form <...>.
  	tag *regexp.Regexp
  }

  // NewCut returns a Cut with every pattern compiled.
  //
  // All patterns are constants, so they are compiled with regexp.MustCompile:
  // an invalid pattern is a programming error and should fail at construction
  // time instead of leaving a nil *regexp.Regexp that panics later inside
  // ClearHtml.
  func NewCut() *Cut {
  	return &Cut{
  		// (?s) lets "." cross line breaks so multi-line scripts are matched.
  		scriptTag: regexp.MustCompile("(?s)<(script|style)[^>]*>.+?</(script|style)>"),
  		// NOTE(review): the class [a|A] also contains a literal '|', and there
  		// is no boundary after the tag name, so any opening tag whose name
  		// starts with "a" can begin a match. Left unchanged on purpose.
  		sa:            regexp.MustCompile("<[a|A]\\s*[^>]*>(.*?)</[a|A]>"),
  		replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
  		replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
  		multiCR:       regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+"),
  		tag:           regexp.MustCompile("<[^>]+>"),
  	}
  }

  // ClearHtml cleans up an HTML fragment and returns it with every remaining
  // line break rendered as "<br/>".
  //
  // The steps run in this order:
  //  1. drop a newline that directly follows '>';
  //  2. remove every occurrence of the literal in the second Replace call;
  //  3. lower-case the full text of every tag (attributes included);
  //  4. delete <script>/<style> elements together with their content;
  //  5. replace anchor elements with their inner text;
  //  6. trim leading and trailing whitespace of the whole text;
  //  7. collapse whitespace runs that end in a line break, then collapse line
  //     breaks followed by indentation, into a single "\n";
  //  8. turn each "\n" into "<br/>".
  //
  // Tags other than script, style and anchors are kept.
  func (c *Cut) ClearHtml(src string) string {
  	src = strings.Replace(src, ">\n", ">", -1)
  	// NOTE(review): this literal may originally have been a non-breaking space
  	// or "&nbsp;" that was altered in transit; confirm against the upstream
  	// file before relying on it.
  	src = strings.Replace(src, " ", "", -1)
  	// Lower-case tags first so the case-sensitive patterns below match.
  	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
  	src = c.scriptTag.ReplaceAllString(src, "")
  	src = c.sa.ReplaceAllString(src, "$1")
  	src = c.replStartWrap.ReplaceAllString(src, "")
  	src = c.replBlankLine.ReplaceAllString(src, "\n")
  	// ReplaceAllString returns the rewritten text and leaves its argument
  	// untouched, so the result has to be assigned for this step to take effect.
  	src = c.multiCR.ReplaceAllString(src, "\n")
  	return strings.Replace(src, "\n", "<br/>", -1)
  }
  79. //判断table是否加表格线
  80. func isHasBoder(con string, reg *regexp.Regexp) bool {
  81. res := reg.FindAllStringSubmatch(con, -1)
  82. hasBorder := false
  83. for _, v := range res {
  84. for k, val := range v {
  85. if k > 0 && k%2 == 0 && mc.IntAll(val) > 0 {
  86. hasBorder = true
  87. break
  88. }
  89. }
  90. }
  91. return hasBorder
  92. }