package util

import (
	"regexp"
	"strings"

	mc "app.yhyue.com/moapp/jybase/common"
)

// Cut bundles the compiled regular expressions used by ClearHtml.
// Create it with NewCut so that every pattern is compiled; calling ClearHtml
// on a Cut whose patterns are nil panics.
type Cut struct {
	scriptTag     *regexp.Regexp // whole <script>/<style> elements
	sa            *regexp.Regexp // anchor elements; group 1 is the text between the tags
	replBlankLine *regexp.Regexp // a whitespace run that ends in CR or LF
	replStartWrap *regexp.Regexp // whitespace at the very start or very end of the input
	multiCR       *regexp.Regexp // runs of line breaks plus the whitespace that follows them
	tag           *regexp.Regexp // any single tag, "<...>"

	// Disabled fields, kept for reference.
	//annotate *regexp.Regexp
	//tag *regexp.Regexp
	//inputag *regexp.Regexp
	//isborder *regexp.Regexp
	//hiddentag *regexp.Regexp
	//styletag *regexp.Regexp
	//colstag *regexp.Regexp
	//rowstag *regexp.Regexp
	//display *regexp.Regexp
	//replTags2CR []string
	//retainTags2CR []string
}

// NewCut returns a Cut with all of its patterns compiled.
//
// Every pattern is a constant, so MustCompile is used throughout: a pattern
// that fails to compile is a programming error and should surface
// immediately instead of leaving a nil *regexp.Regexp behind.
func NewCut() *Cut {
	// The closing-tag anchors matter: a pattern that ends in a lazy
	// quantifier matches as little as possible, so without them only the
	// opening tag would be removed and the element body (and the closing
	// tag) would stay in the output.
	scs := regexp.MustCompile("(?s)<(script|style)[^>]*>.+?</(script|style)>")
	sa := regexp.MustCompile("<[a|A]\\s*[^>]*>(.*?)</[a|A]>")
	bl := regexp.MustCompile("\\s+[\r\n]")
	// U+3000 (ideographic space), U+2003 (em space) and U+00A0 (no-break
	// space) are listed explicitly because \s only covers ASCII whitespace.
	sw := regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
	m := regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
	t := regexp.MustCompile("<[^>]+>")

	// Disabled patterns, kept for reference.
	//t, _ := regexp.Compile("<[^>]+>")
	////sc, _ := regexp.Compile("\\]*\\>*[^\\>]+\\")
	////ss, _ := regexp.Compile("\\]*\\>*[^\\>]+\\")
	//at := regexp.MustCompile("(?s)<(!%-%-|!--).*?(%-%-|--)>") // comments, css
	//hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
	//input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
	//cols, _ := regexp.Compile(`colspan="\d+"`)
	//rows, _ := regexp.Compile(`rowspan="\d+"`)
	//border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`)
	//dis, _ := regexp.Compile(`display:none`)

	return &Cut{
		scriptTag:     scs,
		sa:            sa,
		replBlankLine: bl,
		replStartWrap: sw,
		multiCR:       m,
		tag:           t,
		//annotate: at,
		//tag: t,
		//hiddentag: hiddentag,
		//inputag: input,
		//colstag: cols,
		//isborder: border,
		//rowstag: rows,
		//display: dis,
		//replTags2CR: []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"},
		//retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"},
	}
}

// ClearHtml cleans up an HTML fragment and returns the result.
//
// It joins a tag to the line that follows it, lower-cases every tag, removes
// <script> and <style> elements together with their content, replaces each
// anchor element with the text between its tags, trims whitespace from both
// ends of the input and collapses blank lines. Tags other than script, style
// and anchor are left in place.
func (c *Cut) ClearHtml(src string) string {
	// A line break directly after a tag is dropped so the tag and the
	// following text end up on one line.
	src = strings.Replace(src, ">\n", ">", -1)
	// NOTE(review): this removes every ASCII space, including the ones inside
	// tags and between words. Confirm that is intended; the literal may
	// originally have been an HTML entity such as a non-breaking space.
	src = strings.Replace(src, " ", "", -1)

	// Lower-case every tag (names and attributes alike) so the patterns
	// below only have to deal with one spelling.
	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)

	// Drop script and style elements.
	src = c.scriptTag.ReplaceAllString(src, "")
	// Keep only the text between the anchor tags.
	src = c.sa.ReplaceAllString(src, "$1")

	src = c.replStartWrap.ReplaceAllString(src, "")
	src = c.replBlankLine.ReplaceAllString(src, "\n")
	// Collapse repeated line breaks. The result has to be assigned back:
	// ReplaceAllString returns a new string and leaves its argument untouched.
	src = c.multiCR.ReplaceAllString(src, "\n")

	// NOTE(review): the replacement text here is a plain line break, which
	// makes this call a no-op. Confirm whether an HTML line-break tag was
	// meant instead.
	return strings.Replace(src, "\n", "\n", -1)
}

// isHasBoder reports whether any match of reg in con carries a positive
// number in one of its even-numbered capture groups (2, 4, 6, ...).
//
// It is used to decide whether a table should be drawn with grid lines. The
// disabled border pattern in NewCut shows the expected shape: each attribute
// is one odd-numbered group and its numeric value is the even-numbered group
// nested inside it.
// NOTE(review): mc.IntAll is assumed to convert its argument to an int and to
// yield 0 for empty or non-numeric text; verify against the jybase package.
func isHasBoder(con string, reg *regexp.Regexp) bool {
	for _, match := range reg.FindAllStringSubmatch(con, -1) {
		for k, val := range match {
			// Index 0 is the whole match; the values sit in the even groups.
			if k > 0 && k%2 == 0 && mc.IntAll(val) > 0 {
				return true
			}
		}
	}
	return false
}