zhangjinkun 6 år sedan
förälder
incheckning
a84524a200
2 ändrade filer med 20 tillägg och 5 borttagningar
  1. 13 1
      src/jy/extract/extract.go
  2. 7 4
      src/jy/util/clearHtml.go

+ 13 - 1
src/jy/extract/extract.go

@@ -1,6 +1,7 @@
 package extract
 
 import (
+	//"encoding/json"
 	"fmt"
 	"jy/clear"
 	db "jy/mongodbutil"
@@ -8,6 +9,7 @@ import (
 	ju "jy/util"
 	"log"
 	qu "qfw/util"
+	"regexp"
 	"strconv"
 	"strings"
 	"sync"
@@ -207,6 +209,7 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 				ExtRegBack(j, v, e.TaskInfo)
 			}
 			//log.Println("抽取-后置规则", tmp)
+
 		}
 		//全局后置规则
 		for _, v := range e.RuleBacks {
@@ -498,7 +501,16 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
 			}
 		}
 	} else {
-		val := v.RegCore.Reg.ReplaceAllString(text, "")
+		pos := v.RegCore.Reg.FindStringIndex(text)
+		val := ""
+		if len(pos) == 2 {
+			text = text[pos[1]:]
+			rs := regexp.MustCompile("[^\r\n\t]+")
+			tmp := rs.FindAllString(text, -1)
+			if len(tmp) > 0 {
+				val = tmp[0]
+			}
+		}
 		if val != "" {
 			tmps := []map[string]interface{}{}
 			tmp := map[string]interface{}{

+ 7 - 4
src/jy/util/clearHtml.go

@@ -9,6 +9,7 @@ import (
 type Cut struct {
 	tag           *regexp.Regexp
 	scripttag     *regexp.Regexp
+	inputag       *regexp.Regexp
 	styletag      *regexp.Regexp
 	colstag       *regexp.Regexp
 	rowstag       *regexp.Regexp
@@ -27,13 +28,14 @@ func NewCut() *Cut {
 	//sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
 	//ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
 	scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>")
+	input := regexp.MustCompile(`<\s*input.*value=("|')(.*)("|')/?>(</>)?`)
 	cols, _ := regexp.Compile(`colspan="\d+"`)
 	rows, _ := regexp.Compile(`rowspan="\d+"`)
 	dis, _ := regexp.Compile(`display:none`)
 	return &Cut{
-		tag:       t,
-		scripttag: scs,
-		//styletag:      ss,
+		tag:           t,
+		scripttag:     scs,
+		inputag:       input,
 		colstag:       cols,
 		rowstag:       rows,
 		display:       dis,
@@ -52,7 +54,8 @@ func (c *Cut) ClearHtml(src string) string {
 	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
 	//清script,style
 	src = c.scripttag.ReplaceAllString(src, "")
-	//
+	//清理input
+	src = c.inputag.ReplaceAllString(src, "$2")
 	//换结束标签
 	src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
 		tmp = strings.Replace(tmp, " ", "", -1)