Ver Fonte

优化过滤style、script标签内容

maxiaoshan há 3 anos
pai
commit
0029bbba30
1 ficheiro alterado com 6 adições e 1 exclusão
  1. 6 1
      src/spiderutil/clearHtml.go

+ 6 - 1
src/spiderutil/clearHtml.go

@@ -11,6 +11,7 @@ import (
 
 //
 type Cut struct {
+	annotate      *regexp.Regexp
 	tag           *regexp.Regexp
 	scripttag     *regexp.Regexp
 	inputag       *regexp.Regexp
@@ -33,7 +34,8 @@ func NewCut() *Cut {
 	m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
 	//sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
 	//ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
-	scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>")
+	scs := regexp.MustCompile("(?s)<(script|style)[^>]*>.+?</(script|style)>")
+	at := regexp.MustCompile("(?s)<!%-%-.*?%-%->")
 	hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
 	input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
 	cols, _ := regexp.Compile(`colspan="\d+"`)
@@ -41,6 +43,7 @@ func NewCut() *Cut {
 	border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`)
 	dis, _ := regexp.Compile(`display:none`)
 	return &Cut{
+		annotate:      at,
 		tag:           t,
 		scripttag:     scs,
 		hiddentag:     hiddentag,
@@ -65,6 +68,8 @@ func (c *Cut) ClearHtml(src string) string {
 	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
 	//清script,style
 	src = c.scripttag.ReplaceAllString(src, "")
+	//清理注释文本
+	src = c.annotate.ReplaceAllString(src, "")
 	//清理input
 	src = c.hiddentag.ReplaceAllString(src, "")
 	src = c.inputag.ReplaceAllString(src, "$2")