|
@@ -0,0 +1,319 @@
|
|
|
+package main
|
|
|
+
|
|
|
+import (
|
|
|
+ "github.com/PuerkitoBio/goquery"
|
|
|
+ qu "qfw/util"
|
|
|
+ "regexp"
|
|
|
+ "strings"
|
|
|
+ "unicode/utf8"
|
|
|
+)
|
|
|
+const (
|
|
|
+ conStr = "([\\s\u3000\u2003\u00a0]+|\\\\t)" //所有空格
|
|
|
+)
|
|
|
+var clearpkg = regexp.MustCompile("(标示|标识)")
|
|
|
+var saveThead = regexp.MustCompile("(?is)<thead>(.+?)</thead>")
|
|
|
+var (
|
|
|
+ LReg = regexp.MustCompile("^" + conStr)
|
|
|
+ cut = newCut()
|
|
|
+)
|
|
|
+var thbf = regexp.MustCompile("(?i)</?t(head|body|foot)>")
|
|
|
+
|
|
|
// at and ed are the first and last characters of an HTML character entity
// such as "&amp;". cutLableStr uses them to delimit candidate entities.
var (
	at = '&'
	ed = ';'
)

// lableMap maps the HTML entities understood by cutLableStr to the character
// they stand for. Keys are written in full entity form (leading '&', trailing
// ';') because cutLableStr looks up the complete "&...;" run; keys without
// those delimiters can never match.
var lableMap = map[string]rune{
	"&amp;":  '&',
	"&nbsp;": ' ',
	"&gt;":   '>',
	"&lt;":   '<',
}
|
|
|
+
|
|
|
// Cut bundles the compiled patterns and tag lists that ClearHtml uses to
// reduce an HTML fragment to text plus bare table markup. Instances are
// built by newCut.
type Cut struct {
	// Patterns applied to the raw markup.
	tag       *regexp.Regexp // any single HTML tag
	scripttag *regexp.Regexp // <script>/<style> elements including their body
	inputag   *regexp.Regexp // <input> tags carrying a value attribute
	hiddentag *regexp.Regexp // <input type="hidden"> tags
	styletag  *regexp.Regexp // not assigned by newCut in the visible code

	// Patterns applied to the text of a single table tag.
	colstag *regexp.Regexp // colspan="N" attribute
	rowstag *regexp.Regexp // rowspan="N" attribute
	display *regexp.Regexp // the text "display:none"

	// Patterns used for the final whitespace clean-up.
	multiCR       *regexp.Regexp // runs of line breaks and blanks
	replBlankLine *regexp.Regexp // whitespace directly before a line break
	replStartWrap *regexp.Regexp // blanks at the very start or end of the text

	// Tag-name lists consulted when rewriting tags.
	replTags2CR   []string // tags that are replaced by a line break
	retainTags2CR []string // table-related tags that are kept
}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+func cleandetail(detail string) string {
|
|
|
+ detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
|
|
|
+ //全文的需要修复表格
|
|
|
+ detail = repairCon(detail)
|
|
|
+ detail = cutLableStr(detail)
|
|
|
+ detail = cut.ClearHtml(detail)
|
|
|
+
|
|
|
+ return detail
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+func newCut() *Cut {
|
|
|
+ t, _ := regexp.Compile("<[^>]+>")
|
|
|
+ m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
|
|
|
+ //sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
|
|
|
+ //ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
|
|
|
+ scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>")
|
|
|
+ hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
|
|
|
+ input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
|
|
|
+ cols, _ := regexp.Compile(`colspan="\d+"`)
|
|
|
+ rows, _ := regexp.Compile(`rowspan="\d+"`)
|
|
|
+ dis, _ := regexp.Compile(`display:none`)
|
|
|
+ return &Cut{
|
|
|
+ tag: t,
|
|
|
+ scripttag: scs,
|
|
|
+ hiddentag: hiddentag,
|
|
|
+ inputag: input,
|
|
|
+ colstag: cols,
|
|
|
+ rowstag: rows,
|
|
|
+ display: dis,
|
|
|
+ multiCR: m,
|
|
|
+ replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
|
|
|
+ replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
|
|
|
+ replTags2CR: []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"},
|
|
|
+ retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"},
|
|
|
+ }
|
|
|
+}
|
|
|
+func (c *Cut) ClearHtml(src string) string {
|
|
|
+ src = strings.Replace(src, ">\n", ">", -1)
|
|
|
+ src = strings.Replace(src, " ", "", -1)
|
|
|
+ //标签全转小写
|
|
|
+ src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
|
|
|
+ //清script,style
|
|
|
+ src = c.scripttag.ReplaceAllString(src, "")
|
|
|
+ //清理input
|
|
|
+ src = c.hiddentag.ReplaceAllString(src, "")
|
|
|
+ src = c.inputag.ReplaceAllString(src, "$2")
|
|
|
+ document, err := goquery.NewDocumentFromReader(strings.NewReader(src))
|
|
|
+ if err == nil {
|
|
|
+ if tmpstr,err := document.Each(func(i int, sel *goquery.Selection) {
|
|
|
+ sel.Find("td").Each(func(i int, selection *goquery.Selection) {
|
|
|
+ val, b := selection.Attr("title")
|
|
|
+ if b && strings.Trim(val, " ") != "" {
|
|
|
+ tmpstr := strings.TrimFunc(selection.Text(), func(r rune) bool {
|
|
|
+ return r == 9|| r == 32
|
|
|
+ })
|
|
|
+ if utf8.RuneCountInString(strings.Trim(tmpstr, " ")) < utf8.RuneCountInString(strings.Trim(val, " ")) {
|
|
|
+ selection.SetText(strings.Trim(val, " "))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
+ }).Html();err == nil{
|
|
|
+ src = tmpstr
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //换结束标签
|
|
|
+ src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
|
|
|
+ tmp = strings.Replace(tmp, " ", "", -1)
|
|
|
+ //保留这些标签
|
|
|
+ for _, v := range c.retainTags2CR {
|
|
|
+ if "<"+v+">" == tmp || "</"+v+">" == tmp {
|
|
|
+ if tmp == "</table>" {
|
|
|
+ return tmp + "\n"
|
|
|
+ }
|
|
|
+ return tmp
|
|
|
+ }
|
|
|
+ if strings.HasPrefix(tmp, "<"+v) {
|
|
|
+ dispstrs := c.display.FindAllString(tmp, -1)
|
|
|
+ rowstrs := c.rowstag.FindAllString(tmp, -1)
|
|
|
+ colstrs := c.colstag.FindAllString(tmp, -1)
|
|
|
+ c := "<" + v
|
|
|
+ if len(colstrs) > 0 { //处理多列合并
|
|
|
+ c += " " + colstrs[0]
|
|
|
+ }
|
|
|
+ if len(rowstrs) > 0 { //处理多行合并
|
|
|
+ c += " " + rowstrs[0]
|
|
|
+ }
|
|
|
+ if len(dispstrs) > 0 {
|
|
|
+ c += " style=\"" + dispstrs[0] + "\""
|
|
|
+ }
|
|
|
+ return c + ">"
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if tmp == "<br>" || tmp == "</ul>" ||tmp == "<ul>" || tmp == "<br/>" || tmp == "<center>" || tmp == "</center>"{
|
|
|
+ return "\n"
|
|
|
+ }
|
|
|
+ if tmp[1] != 47 { //开始标签
|
|
|
+ for _, v := range c.replTags2CR {
|
|
|
+ if v == tmp[1:len(tmp)-1] {
|
|
|
+ return "\n"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return ""
|
|
|
+ }
|
|
|
+ for _, v := range c.replTags2CR {
|
|
|
+ if v == tmp[2:len(tmp)-1] {
|
|
|
+ return "\n"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return ""
|
|
|
+ })
|
|
|
+ src = c.replStartWrap.ReplaceAllString(src, "")
|
|
|
+ src = c.replBlankLine.ReplaceAllString(src, "\n")
|
|
|
+ //清除多余换行
|
|
|
+ return c.multiCR.ReplaceAllString(src, "\n")
|
|
|
+ //return strings.Replace(src, "\n", "<br/>", -1)
|
|
|
+}
|
|
|
+
|
|
|
+//处理转义标签
|
|
|
+func cutLableStr(con string) string {
|
|
|
+ for i := 0; i < 3; i++ {
|
|
|
+ runes := []rune{}
|
|
|
+ pools := []rune{}
|
|
|
+ bpool := false
|
|
|
+ strings.IndexFunc(con, func(s rune) bool {
|
|
|
+ if !bpool && s == at {
|
|
|
+ bpool = true
|
|
|
+ pools = []rune{}
|
|
|
+ }
|
|
|
+ if bpool {
|
|
|
+ pools = append(pools, s)
|
|
|
+ if s == ed { //结束
|
|
|
+ lb := lableMap[string(pools)]
|
|
|
+ if lb != 0 {
|
|
|
+ runes = append(runes, lb)
|
|
|
+ } else {
|
|
|
+ runes = append(runes, pools...)
|
|
|
+ }
|
|
|
+ bpool = false
|
|
|
+ } else if len(pools) > 6 {
|
|
|
+ bpool = false
|
|
|
+ runes = append(runes, pools...)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ runes = append(runes, s)
|
|
|
+ }
|
|
|
+ return false
|
|
|
+ })
|
|
|
+ str1 := string(runes)
|
|
|
+ if i > 0 && con == str1 {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ con = str1
|
|
|
+ }
|
|
|
+ return con
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+func repairCon(con string) string {
|
|
|
+ con = clearpkg.ReplaceAllString(con, "")
|
|
|
+ res := saveThead.FindAllStringSubmatch(con, 1)
|
|
|
+ th := ""
|
|
|
+ if len(res) == 1 && len(res[0]) == 2 {
|
|
|
+ th = trimLeftSpace(res[0][1], "")
|
|
|
+ }
|
|
|
+ con = thbf.ReplaceAllString(con, "")
|
|
|
+ con = trimLeftSpace(con, "")
|
|
|
+ itbody := strings.Index(con, "<tr")
|
|
|
+ iLen := 3
|
|
|
+ if itbody == 0 {
|
|
|
+ con = findpos(con, iLen, itbody)
|
|
|
+ } else {
|
|
|
+ itable := strings.Index(con, "<table")
|
|
|
+ if itable == -1 || itable > itbody {
|
|
|
+ con = findpos(con, iLen, itbody)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //保留第一个thead
|
|
|
+ if th != "" {
|
|
|
+ con = strings.Replace(con, th, "<thead>"+th+"</thead>", 1)
|
|
|
+ }
|
|
|
+ //u.Debug(con)
|
|
|
+ return con
|
|
|
+}
|
|
|
+func findpos(con string, iLen, start int) (newcon string) {
|
|
|
+ defer qu.Catch()
|
|
|
+ n := len(con)
|
|
|
+ layer := 0
|
|
|
+ pos := 0
|
|
|
+ if start >= 0 {
|
|
|
+ if iLen == 6 {
|
|
|
+ for i := iLen + start; i < len(con); i++ {
|
|
|
+ if con[i] == '<' && i+6 < n {
|
|
|
+ str := con[i : i+6]
|
|
|
+ if str == "</tbod" {
|
|
|
+ if layer == 0 {
|
|
|
+ pos = i
|
|
|
+ break
|
|
|
+ } else {
|
|
|
+ layer--
|
|
|
+ }
|
|
|
+ i += 6
|
|
|
+ } else if str == "<tbody" {
|
|
|
+ layer++
|
|
|
+ i += 6
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if pos+7 <= n && start+6 < pos {
|
|
|
+ newcon = con[:start] + "<table" + con[start+6:pos] + "</table" + con[pos+7:]
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ layer++
|
|
|
+ nq := 0
|
|
|
+ lasttr := 0
|
|
|
+ for i := iLen + start; i < len(con); i++ {
|
|
|
+ if con[i] == '<' && i+4 < n {
|
|
|
+ if nq == 0 {
|
|
|
+ str := con[i : i+4]
|
|
|
+ if str == "</tr" {
|
|
|
+ if layer <= 0 {
|
|
|
+ pos = i //正常情况不会存在此类情况
|
|
|
+ break
|
|
|
+ } else {
|
|
|
+ layer--
|
|
|
+ lasttr = i
|
|
|
+ }
|
|
|
+ i += 4
|
|
|
+ } else if str[:3] == "<tr" {
|
|
|
+ layer++
|
|
|
+ i += 4
|
|
|
+ } else if str == "<tab" && i+6 < n && con[i+4:i+6] == "le" {
|
|
|
+ if layer == 0 {
|
|
|
+ break
|
|
|
+ } else {
|
|
|
+ //内嵌的表格
|
|
|
+ nq++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if i+6 < n {
|
|
|
+ str := con[i : i+6]
|
|
|
+ if str == "</tabl" {
|
|
|
+ nq--
|
|
|
+ } else if str == "<table" {
|
|
|
+ nq++
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if pos == 0 && lasttr > 3 {
|
|
|
+ pos = lasttr + 5
|
|
|
+ } else if pos > 0 {
|
|
|
+ pos += 5
|
|
|
+ }
|
|
|
+ if pos <= n && pos < len(con) && start < pos {
|
|
|
+ newcon = con[:start] + "<table>" + con[start:pos] + "</table>" + con[pos:]
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if newcon == "" {
|
|
|
+ newcon = con
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+func trimLeftSpace(con, repl string) string {
|
|
|
+ return LReg.ReplaceAllString(con, repl)
|
|
|
+}
|