package spider_com

import (
	"regexp"
	"strings"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
)

// NOTE(review): this file reached review with its line structure collapsed and
// with tag-like text stripped out of several string literals. Every literal that
// had to be reconstructed is marked with a NOTE(review) comment below and must be
// confirmed against the upstream copy before merging.

// Cut holds the precompiled patterns and tag lists used by ClearHtml to reduce
// an HTML fragment to text while keeping a simplified table skeleton.
type Cut struct {
	annotate      *regexp.Regexp // HTML comments: <!-- ... --> (and the %-%- variant)
	tag           *regexp.Regexp // any single tag: <...>
	scripttag     *regexp.Regexp // <script>/<style> elements including their bodies
	inputag       *regexp.Regexp // <input ... value="..."> ; group 2 captures the value
	isborder      *regexp.Regexp // border / cellpadding / cellspacing attributes with numeric values
	hiddentag     *regexp.Regexp // <input ... type="hidden" ...>
	styletag      *regexp.Regexp // not initialised by NewCut and not used in this file
	colstag       *regexp.Regexp // colspan="N"
	rowstag       *regexp.Regexp // rowspan="N"
	display       *regexp.Regexp // the literal text display:none
	multiCR       *regexp.Regexp // runs of line breaks, each optionally followed by blanks
	replBlankLine *regexp.Regexp // whitespace run that ends in a line break
	replStartWrap *regexp.Regexp // leading / trailing blanks of the whole text
	replTags2CR   []string       // tags that are replaced by a line break
	retainTags2CR []string       // table tags that are kept (with reduced attributes)
}

// NewCut returns a Cut with all patterns compiled. Every pattern is a constant,
// so MustCompile is used: a bad pattern is a programming error and should fail
// loudly instead of leaving a nil *regexp.Regexp behind.
func NewCut() *Cut {
	return &Cut{
		// Comments / conditional css blocks.
		annotate: regexp.MustCompile("(?s)<(!%-%-|!--).*?(%-%-|--)>"),
		tag:      regexp.MustCompile("<[^>]+>"),
		// NOTE(review): the closing-tag half of this pattern was stripped from the
		// reviewed copy and has been reconstructed. The body uses `.*?` rather than
		// `.+?` so an empty <script src=...></script> cannot run on to the next
		// closing tag and swallow the content in between.
		scripttag:     regexp.MustCompile("(?s)<(script|style)[^>]*>.*?</(script|style)>"),
		hiddentag:     regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`),
		inputag:       regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`),
		colstag:       regexp.MustCompile(`colspan="\d+"`),
		isborder:      regexp.MustCompile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`),
		rowstag:       regexp.MustCompile(`rowspan="\d+"`),
		display:       regexp.MustCompile(`display:none`),
		multiCR:       regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+"),
		replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
		replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
		replTags2CR:   []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"},
		retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"},
	}
}

// ClearHtml strips markup from src. Script/style elements, comments and hidden
// inputs are removed, visible inputs are replaced by their value, table tags are
// kept with only colspan/rowspan/display:none preserved, block-level tags become
// line breaks and every other tag is dropped. Line breaks in the result are
// finally emitted as HTML break tags.
func (c *Cut) ClearHtml(src string) string {
	src = c.replBlankLine.ReplaceAllString(src, "")
	src = strings.Replace(src, ">\n", ">", -1)
	// NOTE(review): the reviewed copy shows a single blank here; it is assumed to
	// be a decoded "&nbsp;" entity. Confirm against upstream.
	src = strings.Replace(src, "&nbsp;", "", -1)
	// Lower-case every tag so the literal comparisons below work.
	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
	// Drop script and style elements.
	src = c.scripttag.ReplaceAllString(src, "")
	// Drop comments.
	src = c.annotate.ReplaceAllString(src, "")
	// Drop hidden inputs, replace the remaining inputs by their value.
	src = c.hiddentag.ReplaceAllString(src, "")
	src = c.inputag.ReplaceAllString(src, "$2")

	src = fillCellsFromTitle(src)

	// Rewrite or remove every remaining tag.
	src = c.tag.ReplaceAllStringFunc(src, c.rewriteTag)

	src = c.replStartWrap.ReplaceAllString(src, "")
	src = c.replBlankLine.ReplaceAllString(src, "\n")
	// Collapse redundant line breaks. The original discarded this result, which
	// turned the step into a no-op.
	src = c.multiCR.ReplaceAllString(src, "\n")
	// NOTE(review): the replacement literal was stripped from the reviewed copy;
	// "<br/>" is reconstructed. Confirm against upstream.
	return strings.Replace(src, "\n", "<br/>", -1)
}

// fillCellsFromTitle parses src with goquery and, for every <td> whose non-blank
// title attribute is longer (in runes) than its trimmed text, replaces the cell
// text with the title. It is best effort: if parsing or serialising fails, src
// is returned unchanged.
func fillCellsFromTitle(src string) string {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(src))
	if err != nil {
		return src
	}
	doc.Find("td").Each(func(_ int, cell *goquery.Selection) {
		raw, ok := cell.Attr("title")
		title := strings.Trim(raw, " ")
		if !ok || title == "" {
			return
		}
		text := strings.Trim(cell.Text(), "\t ")
		if utf8.RuneCountInString(text) < utf8.RuneCountInString(title) {
			cell.SetText(title)
		}
	})
	out, err := doc.Html()
	if err != nil {
		return src
	}
	return out
}

// rewriteTag maps one lower-cased tag to its replacement: a retained table tag
// (reduced to selected attributes), a line break, or the empty string.
func (c *Cut) rewriteTag(tmp string) string {
	tmp = strings.Replace(tmp, " ", "", -1)

	// Table structure is kept.
	for _, v := range c.retainTags2CR {
		// NOTE(review): the closing-tag literals in this branch were stripped from
		// the reviewed copy; "</"+v+">" and "</table>" are reconstructions.
		if tmp == "<"+v+">" || tmp == "</"+v+">" {
			if tmp == "</table>" {
				return tmp + "\n"
			}
			return tmp
		}
		if !strings.HasPrefix(tmp, "<"+v) {
			continue
		}
		con := "<" + v
		// NOTE(review): this condition and its body were lost in the reviewed
		// copy. Reconstructed from the otherwise unused isborder pattern and
		// isHasBoder helper; the emitted attribute is a guess.
		if v == "table" && isHasBoder(tmp, c.isborder) {
			con += ` border="1"`
		}
		if cols := c.colstag.FindString(tmp); cols != "" { // merged columns
			con += " " + cols
		}
		if rows := c.rowstag.FindString(tmp); rows != "" { // merged rows
			con += " " + rows
		}
		if disp := c.display.FindString(tmp); disp != "" {
			con += ` style="` + disp + `"`
		}
		return con + ">"
	}

	// NOTE(review): the reviewed copy compared tmp against six literals here, all
	// of which were stripped. Only the line-break forms are restored; recover the
	// full list from upstream.
	switch tmp {
	case "<br>", "<br/>", "</br>":
		return "\n"
	}

	// Opening and closing forms of the replTags2CR tags become a line break;
	// every other tag is removed. The tag pattern guarantees len(tmp) >= 2 and,
	// when tmp[1] is '/', len(tmp) >= 3, so the slices below cannot panic.
	var name string
	if tmp[1] == '/' {
		name = tmp[2 : len(tmp)-1]
	} else {
		name = tmp[1 : len(tmp)-1]
	}
	for _, v := range c.replTags2CR {
		if v == name {
			return "\n"
		}
	}
	return ""
}

// isHasBoder reports whether con contains a match of reg whose numeric capture
// group (the even-numbered groups 2, 4, 6, ...) converts to a value greater than
// zero. It is used to decide whether a table should be drawn with borders.
func isHasBoder(con string, reg *regexp.Regexp) bool {
	for _, m := range reg.FindAllStringSubmatch(con, -1) {
		for k := 2; k < len(m); k += 2 {
			if IntAll(m[k]) > 0 {
				return true
			}
		}
	}
	return false
}