|
@@ -8,11 +8,12 @@ import (
|
|
|
|
|
|
//
|
|
|
type Cut struct {
|
|
|
- scripttag *regexp.Regexp
|
|
|
+ scriptTag *regexp.Regexp
|
|
|
sa *regexp.Regexp
|
|
|
replBlankLine *regexp.Regexp
|
|
|
replStartWrap *regexp.Regexp
|
|
|
multiCR *regexp.Regexp
|
|
|
+ tag *regexp.Regexp
|
|
|
//annotate *regexp.Regexp
|
|
|
//tag *regexp.Regexp
|
|
|
//inputag *regexp.Regexp
|
|
@@ -33,6 +34,7 @@ func NewCut() *Cut {
|
|
|
bl := regexp.MustCompile("\\s+[\r\n]")
|
|
|
sw := regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
|
|
|
m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
|
|
|
+ t, _ := regexp.Compile("<[^>]+>")
|
|
|
|
|
|
//t, _ := regexp.Compile("<[^>]+>")
|
|
|
////sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
|
|
@@ -45,11 +47,12 @@ func NewCut() *Cut {
|
|
|
//border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`)
|
|
|
//dis, _ := regexp.Compile(`display:none`)
|
|
|
return &Cut{
|
|
|
- scripttag: scs,
|
|
|
+ scriptTag: scs,
|
|
|
sa: sa,
|
|
|
replBlankLine: bl,
|
|
|
replStartWrap: sw,
|
|
|
multiCR: m,
|
|
|
+ tag: t,
|
|
|
//annotate: at,
|
|
|
//tag: t,
|
|
|
//hiddentag: hiddentag,
|
|
@@ -65,8 +68,12 @@ func NewCut() *Cut {
|
|
|
|
|
|
// ClearHtml cleans up HTML tags.
|
|
|
func (c *Cut) ClearHtml(src string) string {
|
|
|
+ src = strings.Replace(src, ">\n", ">", -1)
|
|
|
+ src = strings.Replace(src, " ", "", -1)
|
|
|
+ //标签全转小写
|
|
|
+ src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
|
|
|
//清script,style
|
|
|
- src = c.scripttag.ReplaceAllString(src, "")
|
|
|
+ src = c.scriptTag.ReplaceAllString(src, "")
|
|
|
src = c.sa.ReplaceAllString(src, "$1")
|
|
|
src = c.replStartWrap.ReplaceAllString(src, "")
|
|
|
src = c.replBlankLine.ReplaceAllString(src, "\n")
|