|
@@ -0,0 +1,166 @@
|
|
|
+package util
|
|
|
+
|
|
|
+import (
|
|
|
+ mc "app.yhyue.com/moapp/jybase/common"
|
|
|
+ "github.com/PuerkitoBio/goquery"
|
|
|
+ "regexp"
|
|
|
+ "strings"
|
|
|
+ "unicode/utf8"
|
|
|
+)
|
|
|
+
|
|
|
// Cut bundles the pre-compiled regular expressions and tag lists that
// ClearHtml uses to reduce an HTML fragment to text plus a bare table skeleton.
// Instances are normally obtained from NewCut.
type Cut struct {
	// Patterns applied to the raw markup before it is parsed.
	tag       *regexp.Regexp // any single tag, "<...>"
	scripttag *regexp.Regexp // whole <script>/<style> elements including their body
	styletag  *regexp.Regexp // not populated by NewCut and not read by ClearHtml in this file
	annotate  *regexp.Regexp // HTML comments
	hiddentag *regexp.Regexp // <input type="hidden"> elements
	inputag   *regexp.Regexp // <input ... value="..."> elements; capture group 2 is the value

	// Patterns used to pick the attributes kept on retained table tags.
	colstag  *regexp.Regexp // colspan="N"
	rowstag  *regexp.Regexp // rowspan="N"
	isborder *regexp.Regexp // border / cellpadding / cellspacing with a numeric value
	display  *regexp.Regexp // the literal "display:none"

	// Whitespace normalisation patterns.
	multiCR       *regexp.Regexp // runs of line breaks, optionally followed by blanks
	replBlankLine *regexp.Regexp // whitespace that ends in a line break
	replStartWrap *regexp.Regexp // leading / trailing whitespace of the whole text

	// Tag name lists.
	replTags2CR   []string // tags that are replaced by a line break
	retainTags2CR []string // table-related tags that are kept in the output
}
|
|
|
+
|
|
|
+//
|
|
|
+func NewCut() *Cut {
|
|
|
+ t, _ := regexp.Compile("<[^>]+>")
|
|
|
+ m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
|
|
|
+ //sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
|
|
|
+ //ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
|
|
|
+ scs := regexp.MustCompile("(?s)<(script|style)[^>]*>.+?</(script|style)>")
|
|
|
+ at := regexp.MustCompile("(?s)<(!%-%-|!--).*?(%-%-|--)>") //注释 css
|
|
|
+ hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
|
|
|
+ input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
|
|
|
+ cols, _ := regexp.Compile(`colspan="\d+"`)
|
|
|
+ rows, _ := regexp.Compile(`rowspan="\d+"`)
|
|
|
+ border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`)
|
|
|
+ dis, _ := regexp.Compile(`display:none`)
|
|
|
+ return &Cut{
|
|
|
+ annotate: at,
|
|
|
+ tag: t,
|
|
|
+ scripttag: scs,
|
|
|
+ hiddentag: hiddentag,
|
|
|
+ inputag: input,
|
|
|
+ colstag: cols,
|
|
|
+ isborder: border,
|
|
|
+ rowstag: rows,
|
|
|
+ display: dis,
|
|
|
+ multiCR: m,
|
|
|
+ replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
|
|
|
+ replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
|
|
|
+ replTags2CR: []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"},
|
|
|
+ retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"},
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+//清理HTML标签
|
|
|
+func (c *Cut) ClearHtml(src string) string {
|
|
|
+ src = c.replBlankLine.ReplaceAllString(src, "")
|
|
|
+ src = strings.Replace(src, ">\n", ">", -1)
|
|
|
+ src = strings.Replace(src, " ", "", -1)
|
|
|
+ //标签全转小写
|
|
|
+ src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
|
|
|
+ //清script,style
|
|
|
+ src = c.scripttag.ReplaceAllString(src, "")
|
|
|
+ //清理注释文本
|
|
|
+ src = c.annotate.ReplaceAllString(src, "")
|
|
|
+ //清理input
|
|
|
+ src = c.hiddentag.ReplaceAllString(src, "")
|
|
|
+ src = c.inputag.ReplaceAllString(src, "$2")
|
|
|
+ document, err := goquery.NewDocumentFromReader(strings.NewReader(src))
|
|
|
+ if err == nil {
|
|
|
+ if tmpstr, err := document.Each(func(i int, sel *goquery.Selection) {
|
|
|
+ sel.Find("td").Each(func(i int, selection *goquery.Selection) {
|
|
|
+ val, b := selection.Attr("title")
|
|
|
+ if b && strings.Trim(val, " ") != "" {
|
|
|
+ tmpstr := strings.TrimFunc(selection.Text(), func(r rune) bool {
|
|
|
+ return r == 9 || r == 32
|
|
|
+ })
|
|
|
+ if utf8.RuneCountInString(strings.Trim(tmpstr, " ")) < utf8.RuneCountInString(strings.Trim(val, " ")) {
|
|
|
+ selection.SetText(strings.Trim(val, " "))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
+ }).Html(); err == nil {
|
|
|
+ src = tmpstr
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //换结束标签
|
|
|
+ src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
|
|
|
+ tmp = strings.Replace(tmp, " ", "", -1)
|
|
|
+ //保留这些标签
|
|
|
+ for _, v := range c.retainTags2CR {
|
|
|
+ if "<"+v+">" == tmp || "</"+v+">" == tmp {
|
|
|
+ if tmp == "</table>" {
|
|
|
+ return tmp + "\n"
|
|
|
+ }
|
|
|
+ return tmp
|
|
|
+ }
|
|
|
+ if strings.HasPrefix(tmp, "<"+v) {
|
|
|
+ dispstrs := c.display.FindAllString(tmp, -1)
|
|
|
+ rowstrs := c.rowstag.FindAllString(tmp, -1)
|
|
|
+ colstrs := c.colstag.FindAllString(tmp, -1)
|
|
|
+ con := "<" + v
|
|
|
+ if con == "<table" {
|
|
|
+ if isHasBoder(tmp, c.isborder) {
|
|
|
+ con = con + ` border="1"`
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if len(colstrs) > 0 { //处理多列合并
|
|
|
+ con += " " + colstrs[0]
|
|
|
+ }
|
|
|
+ if len(rowstrs) > 0 { //处理多行合并
|
|
|
+ con += " " + rowstrs[0]
|
|
|
+ }
|
|
|
+ if len(dispstrs) > 0 {
|
|
|
+ con += " style=\"" + dispstrs[0] + "\""
|
|
|
+ }
|
|
|
+ return con + ">"
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if tmp == "<br>" || tmp == "<br/>" || tmp == "<center>" || tmp == "</center>" || tmp == "<ul>" || tmp == "</ul>" {
|
|
|
+ return "\n"
|
|
|
+ }
|
|
|
+ if tmp[1] != 47 { //开始标签
|
|
|
+ for _, v := range c.replTags2CR {
|
|
|
+ if v == tmp[1:len(tmp)-1] {
|
|
|
+ return "\n"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return ""
|
|
|
+ }
|
|
|
+ for _, v := range c.replTags2CR {
|
|
|
+ if v == tmp[2:len(tmp)-1] {
|
|
|
+ return "\n"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return ""
|
|
|
+ })
|
|
|
+ src = c.replStartWrap.ReplaceAllString(src, "")
|
|
|
+ src = c.replBlankLine.ReplaceAllString(src, "\n")
|
|
|
+ //清除多余换行
|
|
|
+ c.multiCR.ReplaceAllString(src, "\n")
|
|
|
+ return strings.Replace(src, "\n", "<br/>", -1)
|
|
|
+}
|
|
|
+
|
|
|
+//判断table是否加表格线
|
|
|
+func isHasBoder(con string, reg *regexp.Regexp) bool {
|
|
|
+ res := reg.FindAllStringSubmatch(con, -1)
|
|
|
+ hasBorder := false
|
|
|
+ for _, v := range res {
|
|
|
+ for k, val := range v {
|
|
|
+ if k > 0 && k%2 == 0 && mc.IntAll(val) > 0 {
|
|
|
+ hasBorder = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return hasBorder
|
|
|
+}
|