package util
import (
mc "app.yhyue.com/moapp/jybase/common"
"regexp"
"strings"
)
// Cut bundles the precompiled regular expressions that ClearHtml applies.
// Build it with NewCut; the zero value holds nil patterns and must not be
// used with ClearHtml.
//
// Disabled fields kept for reference: annotate, inputag, isborder, hiddentag,
// styletag, colstag, rowstag, display (*regexp.Regexp) and replTags2CR,
// retainTags2CR ([]string).
type Cut struct {
	// scriptTag matches a <script...> or <style...> opening tag through the
	// nearest following "script>" or "style>".
	scriptTag *regexp.Regexp
	// sa matches an anchor element and captures its inner text as group 1.
	sa *regexp.Regexp
	// replBlankLine matches a run of whitespace that ends in CR or LF.
	replBlankLine *regexp.Regexp
	// replStartWrap matches whitespace (including U+3000, U+2003 and U+00A0)
	// at the very start or the very end of the text.
	replStartWrap *regexp.Regexp
	// multiCR matches runs of line breaks, each optionally followed by
	// whitespace (including U+3000, U+2003 and U+00A0).
	multiCR *regexp.Regexp
	// tag matches any single tag, from "<" to the next ">".
	tag *regexp.Regexp
}

// NewCut compiles the patterns used by ClearHtml and returns them as a *Cut.
//
// Every pattern is a constant, so regexp.MustCompile is used throughout: a
// malformed pattern panics here instead of leaving a nil field behind that
// would only fail later inside ClearHtml.
func NewCut() *Cut {
	// Patterns that are currently disabled, kept for reference:
	//   annotate (comments / css): (?s)<(!%-%-|!--).*?(%-%-|--)>
	//   hiddentag:                 <\s*input[^<]*type=("|')hidden("|')[^<]*>
	//   inputag:                   <\s*input[^<]*value=("|')([^>"']*)[^<]*>
	//   colstag:                   colspan="\d+"
	//   rowstag:                   rowspan="\d+"
	//   isborder:                  (border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")
	//   display:                   display:none
	//   replTags2CR:               div, p, br, h1, h2, h3, h4, h5
	//   retainTags2CR:             table, thead, tfoot, tbody, th, td, tr
	return &Cut{
		// (?s) lets '.' cross line breaks so multi-line scripts are matched.
		scriptTag: regexp.MustCompile("(?s)<(script|style)[^>]*>.+?(script|style)>"),
		// The closing part is "</a>" so that group 1 is exactly the inner
		// text (previously "<a>x</a>" was rewritten to "x</"), and the class
		// is [aA] rather than [a|A], which also matched a literal '|'.
		// The opening part stays loose (\s* may be empty) because ClearHtml
		// strips spaces first, turning `<a href=...>` into `<ahref=...>`.
		// NOTE(review): that looseness also matches other tags starting with
		// "a" (e.g. <abbr>) - confirm whether that matters for real inputs.
		sa:            regexp.MustCompile("<[aA]\\s*[^>]*>(.*?)</[aA]>"),
		replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
		replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
		multiCR:       regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+"),
		tag:           regexp.MustCompile("<[^>]+>"),
	}
}

// ClearHtml normalises an HTML fragment and returns the result. In order it:
//
//  1. drops a line feed that directly follows ">";
//  2. removes every space character;
//  3. lower-cases each tag (the whole "<...>" text, attribute values included);
//  4. removes <script> and <style> elements together with their content;
//  5. replaces anchor elements with their inner text;
//  6. trims whitespace at the start and end of the text;
//  7. collapses whitespace runs that end in a line break, and line breaks
//     plus the whitespace following them, into a single "\n";
//  8. turns every remaining "\n" into "<br/>".
//
// Tags other than script, style and anchors are kept.
func (c *Cut) ClearHtml(src string) string {
	src = strings.Replace(src, ">\n", ">", -1)
	// NOTE(review): this strips every plain space, including the ones that
	// separate a tag name from its attributes. The literal may originally
	// have been an HTML entity such as "&nbsp;" - confirm.
	src = strings.Replace(src, " ", "", -1)

	// Lower-case tags so the patterns below only need to handle one spelling.
	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)

	// Remove script/style elements, then unwrap anchors to their text.
	src = c.scriptTag.ReplaceAllString(src, "")
	src = c.sa.ReplaceAllString(src, "$1")

	src = c.replStartWrap.ReplaceAllString(src, "")
	src = c.replBlankLine.ReplaceAllString(src, "\n")
	// Collapse redundant line breaks. The result used to be discarded, which
	// made this step a no-op.
	src = c.multiCR.ReplaceAllString(src, "\n")

	// NOTE(review): the replacement literal contained a raw line break in the
	// reviewed source, which does not compile. "<br/>" is assumed because
	// step 1 removes line feeds after tags, which suggests the remaining ones
	// are meant to become HTML line breaks - confirm against the real file.
	return strings.Replace(src, "\n", "<br/>", -1)
}
// isHasBoder reports whether a table should be drawn with border lines.
//
// It runs reg over con and returns true as soon as any match has an
// even-numbered capture group (2, 4, 6, ...) whose text mc.IntAll converts to
// a value greater than zero; otherwise it returns false.
//
// The even-group layout fits a pattern such as
// (border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)"), where the
// even groups are the numeric attribute values - presumably the intended
// input; confirm against callers.
//
// NOTE(review): mc.IntAll is assumed to be a side-effect-free conversion that
// yields 0 for empty or non-numeric text - confirm.
func isHasBoder(con string, reg *regexp.Regexp) bool {
	for _, groups := range reg.FindAllStringSubmatch(con, -1) {
		// Group 0 is the whole match and odd groups are the attribute
		// wrappers, so only indices 2, 4, 6, ... are inspected.
		for i := 2; i < len(groups); i += 2 {
			if mc.IntAll(groups[i]) > 0 {
				// Return immediately: the old inner break left the outer
				// loop scanning all remaining matches for no benefit.
				return true
			}
		}
	}
	return false
}