// clearHtml.go (4.5 KB)
  1. package util
  2. import (
  3. "app.yhyue.com/moapp/jybase/common"
  4. "github.com/PuerkitoBio/goquery"
  5. "regexp"
  6. "strings"
  7. "unicode/utf8"
  8. )
  9. type Cut struct {
  10. tag *regexp.Regexp
  11. scripttag *regexp.Regexp
  12. inputag *regexp.Regexp
  13. isborder *regexp.Regexp
  14. hiddentag *regexp.Regexp
  15. styletag *regexp.Regexp
  16. colstag *regexp.Regexp
  17. rowstag *regexp.Regexp
  18. display *regexp.Regexp
  19. multiCR *regexp.Regexp
  20. replBlankLine *regexp.Regexp
  21. replStartWrap *regexp.Regexp
  22. replTags2CR []string
  23. retainTags2CR []string
  24. }
  25. func NewCut() *Cut {
  26. t, _ := regexp.Compile("<[^>]+>")
  27. m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
  28. //sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
  29. //ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
  30. scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>")
  31. hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
  32. input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
  33. cols, _ := regexp.Compile(`colspan="\d+"`)
  34. rows, _ := regexp.Compile(`rowspan="\d+"`)
  35. dis, _ := regexp.Compile(`display:none`)
  36. border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`)
  37. return &Cut{
  38. tag: t,
  39. scripttag: scs,
  40. hiddentag: hiddentag,
  41. inputag: input,
  42. colstag: cols,
  43. rowstag: rows,
  44. isborder: border,
  45. display: dis,
  46. multiCR: m,
  47. replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
  48. replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
  49. replTags2CR: []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"},
  50. retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"},
  51. }
  52. }
  53. // 清理HTML标签
  54. func (c *Cut) ClearHtml(src string) string {
  55. src = strings.Replace(src, ">\n", ">", -1)
  56. src = strings.Replace(src, " ", "", -1)
  57. //标签全转小写
  58. src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
  59. //清script,style
  60. src = c.scripttag.ReplaceAllString(src, "")
  61. //清理input
  62. src = c.hiddentag.ReplaceAllString(src, "")
  63. src = c.inputag.ReplaceAllString(src, "$2")
  64. document, err := goquery.NewDocumentFromReader(strings.NewReader(src))
  65. if err == nil {
  66. if tmpstr, err := document.Each(func(i int, sel *goquery.Selection) {
  67. sel.Find("td").Each(func(i int, selection *goquery.Selection) {
  68. val, b := selection.Attr("title")
  69. if b && strings.Trim(val, " ") != "" {
  70. tmpstr := strings.TrimFunc(selection.Text(), func(r rune) bool {
  71. return r == 9 || r == 32
  72. })
  73. if utf8.RuneCountInString(strings.Trim(tmpstr, " ")) < utf8.RuneCountInString(strings.Trim(val, " ")) {
  74. selection.SetText(strings.Trim(val, " "))
  75. }
  76. }
  77. })
  78. }).Html(); err == nil {
  79. src = tmpstr
  80. }
  81. }
  82. //换结束标签
  83. src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
  84. tmp = strings.Replace(tmp, " ", "", -1)
  85. //保留这些标签
  86. for _, v := range c.retainTags2CR {
  87. if "<"+v+">" == tmp || "</"+v+">" == tmp {
  88. if tmp == "</table>" {
  89. return tmp + "\n"
  90. }
  91. return tmp
  92. }
  93. if strings.HasPrefix(tmp, "<"+v) {
  94. dispstrs := c.display.FindAllString(tmp, -1)
  95. rowstrs := c.rowstag.FindAllString(tmp, -1)
  96. colstrs := c.colstag.FindAllString(tmp, -1)
  97. con := "<" + v
  98. if con == "<table" {
  99. if isHasBoder(tmp, c.isborder) {
  100. con = con + ` border="1"`
  101. }
  102. }
  103. if len(colstrs) > 0 { //处理多列合并
  104. con += " " + colstrs[0]
  105. }
  106. if len(rowstrs) > 0 { //处理多行合并
  107. con += " " + rowstrs[0]
  108. }
  109. if len(dispstrs) > 0 {
  110. con += " style=\"" + dispstrs[0] + "\""
  111. }
  112. return con + ">"
  113. }
  114. }
  115. if tmp == "<br>" || tmp == "<br/>" || tmp == "<center>" || tmp == "</center>" {
  116. return "\n"
  117. }
  118. if tmp[1] != 47 { //开始标签
  119. for _, v := range c.replTags2CR {
  120. if v == tmp[1:len(tmp)-1] {
  121. return "\n"
  122. }
  123. }
  124. return ""
  125. }
  126. for _, v := range c.replTags2CR {
  127. if v == tmp[2:len(tmp)-1] {
  128. return "\n"
  129. }
  130. }
  131. return ""
  132. })
  133. src = c.replStartWrap.ReplaceAllString(src, "")
  134. src = c.replBlankLine.ReplaceAllString(src, "\n")
  135. //清除多余换行
  136. //return c.multiCR.ReplaceAllString(src, "\n")
  137. return strings.Replace(src, "\n", "<br/>", -1)
  138. }
  139. // 判断table是否加表格线
  140. func isHasBoder(con string, reg *regexp.Regexp) bool {
  141. res := reg.FindAllStringSubmatch(con, -1)
  142. hasBorder := false
  143. for _, v := range res {
  144. for k, val := range v {
  145. if k > 0 && k%2 == 0 && common.IntAll(val) > 0 {
  146. hasBorder = true
  147. break
  148. }
  149. }
  150. }
  151. return hasBorder
  152. }