package util import ( qu "app.yhyue.com/moapp/jybase/common" "github.com/PuerkitoBio/goquery" "regexp" "strings" "unicode/utf8" ) type Cut struct { tag *regexp.Regexp scripttag *regexp.Regexp inputag *regexp.Regexp isborder *regexp.Regexp hiddentag *regexp.Regexp styletag *regexp.Regexp colstag *regexp.Regexp rowstag *regexp.Regexp display *regexp.Regexp multiCR *regexp.Regexp replBlankLine *regexp.Regexp replStartWrap *regexp.Regexp replTags2CR []string retainTags2CR []string } func NewCut() *Cut { t, _ := regexp.Compile("<[^>]+>") m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+") //sc, _ := regexp.Compile("\\") //ss, _ := regexp.Compile("\\") scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+(script|style)>") hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`) input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`) cols, _ := regexp.Compile(`colspan="\d+"`) rows, _ := regexp.Compile(`rowspan="\d+"`) dis, _ := regexp.Compile(`display:none`) border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`) return &Cut{ tag: t, scripttag: scs, hiddentag: hiddentag, inputag: input, colstag: cols, rowstag: rows, isborder: border, display: dis, multiCR: m, replBlankLine: regexp.MustCompile("\\s+[\r\n]"), replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"), replTags2CR: []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"}, retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"}, } } // 清理HTML标签 func (c *Cut) ClearHtml(src string) string { src = strings.Replace(src, ">\n", ">", -1) src = strings.Replace(src, " ", "", -1) //标签全转小写 src = c.tag.ReplaceAllStringFunc(src, strings.ToLower) //清script,style src = c.scripttag.ReplaceAllString(src, "") //清理input src = c.hiddentag.ReplaceAllString(src, "") src = c.inputag.ReplaceAllString(src, "$2") document, err := goquery.NewDocumentFromReader(strings.NewReader(src)) if err == nil { if tmpstr, err := document.Each(func(i int, sel *goquery.Selection) { sel.Find("td").Each(func(i int, selection *goquery.Selection) { val, b := selection.Attr("title") if b && strings.Trim(val, " ") != "" { tmpstr := strings.TrimFunc(selection.Text(), func(r rune) bool { return r == 9 || r == 32 }) if utf8.RuneCountInString(strings.Trim(tmpstr, " ")) < utf8.RuneCountInString(strings.Trim(val, " ")) { selection.SetText(strings.Trim(val, " ")) } } }) }).Html(); err == nil { src = tmpstr } } //换结束标签 src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string { tmp = strings.Replace(tmp, " ", "", -1) //保留这些标签 for _, v := range c.retainTags2CR { if "<"+v+">" == tmp || ""+v+">" == tmp { if tmp == "" { return tmp + "\n" } return tmp } if strings.HasPrefix(tmp, "<"+v) { dispstrs := c.display.FindAllString(tmp, -1) rowstrs := c.rowstag.FindAllString(tmp, -1) colstrs := c.colstag.FindAllString(tmp, -1) con := "<" + v if con == "