// clearHtml.go (5.1 KB)

  1. package spiderutil
  2. import (
  3. qu "qfw/util"
  4. "regexp"
  5. "strings"
  6. "unicode/utf8"
  7. "github.com/PuerkitoBio/goquery"
  8. )
  // Cut bundles the precompiled regular expressions and the tag lists that
  // ClearHtml uses to reduce an HTML fragment to text plus a minimal set of
  // table tags. Build one with NewCut; the zero value has nil patterns.
  type Cut struct {
  	// annotate matches HTML comments ("<!-- ... -->").
  	annotate *regexp.Regexp
  	// tag matches any single "<...>" tag.
  	tag *regexp.Regexp
  	// scripttag matches a whole <script> or <style> element, body included.
  	scripttag *regexp.Regexp
  	// inputag matches an <input> tag that has a value attribute; capture
  	// group 2 is the attribute's value.
  	inputag *regexp.Regexp
  	// isborder matches border, cellpadding and cellspacing attributes with
  	// a numeric value.
  	isborder *regexp.Regexp
  	// hiddentag matches <input type="hidden"> tags.
  	hiddentag *regexp.Regexp
  	// styletag is not assigned by NewCut.
  	styletag *regexp.Regexp
  	// colstag matches a colspan="N" attribute.
  	colstag *regexp.Regexp
  	// rowstag matches a rowspan="N" attribute.
  	rowstag *regexp.Regexp
  	// gttag matches the "&gt;" entity.
  	gttag *regexp.Regexp
  	// lttag matches the "&lt;" entity.
  	lttag *regexp.Regexp
  	// quotag matches the "&#34;" entity.
  	quotag *regexp.Regexp
  	// display matches the literal text "display:none".
  	display *regexp.Regexp
  	// multiCR matches a run of line breaks together with the whitespace
  	// that follows each of them.
  	multiCR *regexp.Regexp
  	// replBlankLine matches whitespace that ends in a line break.
  	replBlankLine *regexp.Regexp
  	// replStartWrap matches whitespace (including U+3000, U+2003 and
  	// U+00A0) at the very start or very end of the text.
  	replStartWrap *regexp.Regexp
  	// replTags2CR lists the tag names that ClearHtml turns into a newline.
  	replTags2CR []string
  	// retainTags2CR lists the table-related tag names that ClearHtml keeps.
  	retainTags2CR []string
  }

  // NewCut returns a Cut with all of its patterns compiled and its tag lists
  // populated. Every pattern is a constant, so MustCompile is used throughout
  // instead of discarding the error from Compile.
  func NewCut() *Cut {
  	// The element body is matched with ".*?" rather than ".+?": with ".+?"
  	// an empty element such as <script src="a.js"></script> could not end at
  	// its own closing tag, so the match ran on to the next </script> or
  	// </style> and swallowed everything in between.
  	scriptOrStyle := regexp.MustCompile("(?s)<(script|style)[^>]*>.*?</(script|style)>")

  	return &Cut{
  		annotate:      regexp.MustCompile("(?s)<(!%-%-|!--).*?(%-%-|--)>"),
  		tag:           regexp.MustCompile("<[^>]+>"),
  		scripttag:     scriptOrStyle,
  		hiddentag:     regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`),
  		inputag:       regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`),
  		colstag:       regexp.MustCompile(`colspan="\d+"`),
  		isborder:      regexp.MustCompile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`),
  		rowstag:       regexp.MustCompile(`rowspan="\d+"`),
  		gttag:         regexp.MustCompile("&gt;"),
  		lttag:         regexp.MustCompile("&lt;"),
  		quotag:        regexp.MustCompile("&#34;"),
  		display:       regexp.MustCompile(`display:none`),
  		multiCR:       regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+"),
  		replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
  		replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
  		replTags2CR:   []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"},
  		retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"},
  	}
  }
  67. //清理HTML标签
  68. func (c *Cut) ClearHtml(src string) string {
  69. src = c.replBlankLine.ReplaceAllString(src, "")
  70. src = strings.Replace(src, ">\n", ">", -1)
  71. src = strings.Replace(src, " ", "", -1)
  72. //标签全转小写
  73. src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
  74. //清script,style
  75. src = c.scripttag.ReplaceAllString(src, "")
  76. //清理注释文本
  77. src = c.annotate.ReplaceAllString(src, "")
  78. //清理input
  79. src = c.hiddentag.ReplaceAllString(src, "")
  80. src = c.inputag.ReplaceAllString(src, "$2")
  81. document, err := goquery.NewDocumentFromReader(strings.NewReader(src))
  82. if err == nil {
  83. if tmpstr, err := document.Each(func(i int, sel *goquery.Selection) {
  84. sel.Find("td").Each(func(i int, selection *goquery.Selection) {
  85. val, b := selection.Attr("title")
  86. if b && strings.Trim(val, " ") != "" {
  87. tmpstr := strings.TrimFunc(selection.Text(), func(r rune) bool {
  88. return r == 9 || r == 32
  89. })
  90. if utf8.RuneCountInString(strings.Trim(tmpstr, " ")) < utf8.RuneCountInString(strings.Trim(val, " ")) {
  91. selection.SetText(strings.Trim(val, " "))
  92. }
  93. }
  94. })
  95. }).Html(); err == nil {
  96. src = tmpstr
  97. }
  98. }
  99. //换结束标签
  100. src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
  101. tmp = strings.Replace(tmp, " ", "", -1)
  102. //保留这些标签
  103. for _, v := range c.retainTags2CR {
  104. if "<"+v+">" == tmp || "</"+v+">" == tmp {
  105. if tmp == "</table>" {
  106. return tmp + "\n"
  107. }
  108. return tmp
  109. }
  110. if strings.HasPrefix(tmp, "<"+v) {
  111. dispstrs := c.display.FindAllString(tmp, -1)
  112. rowstrs := c.rowstag.FindAllString(tmp, -1)
  113. colstrs := c.colstag.FindAllString(tmp, -1)
  114. con := "<" + v
  115. if con == "<table" {
  116. if isHasBoder(tmp, c.isborder) {
  117. con = con + ` border="1"`
  118. }
  119. }
  120. if len(colstrs) > 0 { //处理多列合并
  121. con += " " + colstrs[0]
  122. }
  123. if len(rowstrs) > 0 { //处理多行合并
  124. con += " " + rowstrs[0]
  125. }
  126. if len(dispstrs) > 0 {
  127. con += " style=\"" + dispstrs[0] + "\""
  128. }
  129. return con + ">"
  130. }
  131. }
  132. if tmp == "<br>" || tmp == "<br/>" || tmp == "<center>" || tmp == "</center>" || tmp == "<ul>" || tmp == "</ul>" {
  133. return "\n"
  134. }
  135. if tmp[1] != 47 { //开始标签
  136. for _, v := range c.replTags2CR {
  137. if v == tmp[1:len(tmp)-1] {
  138. return "\n"
  139. }
  140. }
  141. return ""
  142. }
  143. for _, v := range c.replTags2CR {
  144. if v == tmp[2:len(tmp)-1] {
  145. return "\n"
  146. }
  147. }
  148. return ""
  149. })
  150. src = c.replStartWrap.ReplaceAllString(src, "")
  151. src = c.replBlankLine.ReplaceAllString(src, "\n")
  152. //清除多余换行
  153. c.multiCR.ReplaceAllString(src, "\n")
  154. src = c.gttag.ReplaceAllString(src, ">")
  155. src = c.lttag.ReplaceAllString(src, "<")
  156. src = c.quotag.ReplaceAllString(src, `"`)
  157. return strings.Replace(src, "\n", "<br/>", -1)
  158. }
  159. //判断table是否加表格线
  160. func isHasBoder(con string, reg *regexp.Regexp) bool {
  161. res := reg.FindAllStringSubmatch(con, -1)
  162. hasBorder := false
  163. for _, v := range res {
  164. for k, val := range v {
  165. if k > 0 && k%2 == 0 && qu.IntAll(val) > 0 {
  166. hasBorder = true
  167. break
  168. }
  169. }
  170. }
  171. return hasBorder
  172. }