|
@@ -11,6 +11,7 @@ import (
|
|
|
|
|
|
//
|
|
|
type Cut struct {
|
|
|
+ annotate *regexp.Regexp
|
|
|
tag *regexp.Regexp
|
|
|
scripttag *regexp.Regexp
|
|
|
inputag *regexp.Regexp
|
|
@@ -33,7 +34,8 @@ func NewCut() *Cut {
|
|
|
m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
|
|
|
//sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
|
|
|
//ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
|
|
|
- scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>")
|
|
|
+ scs := regexp.MustCompile("(?s)<(script|style)[^>]*>.*?</(script|style)>") // (?s): '.' spans newlines; .*? (not .+?) so an empty-bodied <script src=...></script> ends at its own closing tag
|
|
|
+ at := regexp.MustCompile("(?s)<!--.*?-->") // matches HTML comments <!-- ... --> across lines; '-' is a literal in Go regexp, and '%' is not an escape character
|
|
|
hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
|
|
|
input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
|
|
|
cols, _ := regexp.Compile(`colspan="\d+"`)
|
|
@@ -41,6 +43,7 @@ func NewCut() *Cut {
|
|
|
border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`)
|
|
|
dis, _ := regexp.Compile(`display:none`)
|
|
|
return &Cut{
|
|
|
+ annotate: at,
|
|
|
tag: t,
|
|
|
scripttag: scs,
|
|
|
hiddentag: hiddentag,
|
|
@@ -65,6 +68,8 @@ func (c *Cut) ClearHtml(src string) string {
|
|
|
src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
|
|
|
// Strip script and style elements.
|
|
|
src = c.scripttag.ReplaceAllString(src, "")
|
|
|
+ // Remove comment text.
|
|
|
+ src = c.annotate.ReplaceAllString(src, "")
|
|
|
// Clean up input elements.
|
|
|
src = c.hiddentag.ReplaceAllString(src, "")
|
|
|
src = c.inputag.ReplaceAllString(src, "$2")
|