123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- package util
- import (
- "github.com/PuerkitoBio/goquery"
- "regexp"
- "strings"
- "unicode/utf8"
- )
// Cut bundles the precompiled regular expressions and the tag lists that
// ClearHtml uses to turn an HTML fragment into text with simplified table
// markup. Build one with NewCut.
type Cut struct {
	// Patterns whose matches ClearHtml deletes outright: HTML comments,
	// script/style elements and hidden input tags.
	annotate, scripttag, hiddentag *regexp.Regexp

	// tag matches any single tag; inputag captures the value attribute of an
	// input tag so the tag can be replaced by that value.
	tag, inputag *regexp.Regexp

	// Attribute patterns looked up inside the retained table tags.
	isborder, colstag, rowstag, display *regexp.Regexp

	// styletag is neither assigned by NewCut nor read by ClearHtml in this file.
	styletag *regexp.Regexp

	// Whitespace and line-break normalisation patterns.
	multiCR, replBlankLine, replStartWrap *regexp.Regexp

	// replTags2CR lists the tags that are replaced by a line break;
	// retainTags2CR lists the table tags that are kept in the output.
	replTags2CR, retainTags2CR []string
}
- //
- func NewCut() *Cut {
- t, _ := regexp.Compile("<[^>]+>")
- m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
- //sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
- //ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
- scs := regexp.MustCompile("(?s)<(script|style)[^>]*>.+?</(script|style)>")
- at := regexp.MustCompile("(?s)<(!%-%-|!--).*?(%-%-|--)>") //注释 css
- hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
- input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
- cols, _ := regexp.Compile(`colspan="\d+"`)
- rows, _ := regexp.Compile(`rowspan="\d+"`)
- border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`)
- dis, _ := regexp.Compile(`display:none`)
- return &Cut{
- annotate: at,
- tag: t,
- scripttag: scs,
- hiddentag: hiddentag,
- inputag: input,
- colstag: cols,
- isborder: border,
- rowstag: rows,
- display: dis,
- multiCR: m,
- replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
- replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
- replTags2CR: []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"},
- retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"},
- }
- }
- //清理HTML标签
- func (c *Cut) ClearHtml(src string) string {
- src = c.replBlankLine.ReplaceAllString(src, "")
- src = strings.Replace(src, ">\n", ">", -1)
- src = strings.Replace(src, " ", "", -1)
- //标签全转小写
- src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
- //清script,style
- src = c.scripttag.ReplaceAllString(src, "")
- //清理注释文本
- src = c.annotate.ReplaceAllString(src, "")
- //清理input
- src = c.hiddentag.ReplaceAllString(src, "")
- src = c.inputag.ReplaceAllString(src, "$2")
- document, err := goquery.NewDocumentFromReader(strings.NewReader(src))
- if err == nil {
- if tmpstr, err := document.Each(func(i int, sel *goquery.Selection) {
- sel.Find("td").Each(func(i int, selection *goquery.Selection) {
- val, b := selection.Attr("title")
- if b && strings.Trim(val, " ") != "" {
- tmpstr := strings.TrimFunc(selection.Text(), func(r rune) bool {
- return r == 9 || r == 32
- })
- if utf8.RuneCountInString(strings.Trim(tmpstr, " ")) < utf8.RuneCountInString(strings.Trim(val, " ")) {
- selection.SetText(strings.Trim(val, " "))
- }
- }
- })
- }).Html(); err == nil {
- src = tmpstr
- }
- }
- //换结束标签
- src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
- tmp = strings.Replace(tmp, " ", "", -1)
- //保留这些标签
- for _, v := range c.retainTags2CR {
- if "<"+v+">" == tmp || "</"+v+">" == tmp {
- if tmp == "</table>" {
- return tmp + "\n"
- }
- return tmp
- }
- if strings.HasPrefix(tmp, "<"+v) {
- dispstrs := c.display.FindAllString(tmp, -1)
- rowstrs := c.rowstag.FindAllString(tmp, -1)
- colstrs := c.colstag.FindAllString(tmp, -1)
- con := "<" + v
- if con == "<table" {
- if isHasBoder(tmp, c.isborder) {
- con = con + ` border="1"`
- }
- }
- if len(colstrs) > 0 { //处理多列合并
- con += " " + colstrs[0]
- }
- if len(rowstrs) > 0 { //处理多行合并
- con += " " + rowstrs[0]
- }
- if len(dispstrs) > 0 {
- con += " style=\"" + dispstrs[0] + "\""
- }
- return con + ">"
- }
- }
- if tmp == "<br>" || tmp == "<br/>" || tmp == "<center>" || tmp == "</center>" || tmp == "<ul>" || tmp == "</ul>" {
- return "\n"
- }
- if tmp[1] != 47 { //开始标签
- for _, v := range c.replTags2CR {
- if v == tmp[1:len(tmp)-1] {
- return "\n"
- }
- }
- return ""
- }
- for _, v := range c.replTags2CR {
- if v == tmp[2:len(tmp)-1] {
- return "\n"
- }
- }
- return ""
- })
- src = c.replStartWrap.ReplaceAllString(src, "")
- src = c.replBlankLine.ReplaceAllString(src, "\n")
- //清除多余换行
- c.multiCR.ReplaceAllString(src, "\n")
- return strings.Replace(src, "\n", "<br/>", -1)
- }
- //判断table是否加表格线
- func isHasBoder(con string, reg *regexp.Regexp) bool {
- res := reg.FindAllStringSubmatch(con, -1)
- hasBorder := false
- for _, v := range res {
- for k, val := range v {
- if k > 0 && k%2 == 0 && IntAll(val) > 0 {
- hasBorder = true
- break
- }
- }
- }
- return hasBorder
- }
|