浏览代码

抽取逻辑和日志

zhangjinkun 6 年之前
父节点
当前提交
df9b731176

+ 22 - 22
src/jy/admin/rule.go

@@ -16,16 +16,16 @@ import (
 var Date_Short_Layout = "2006-01-02"
 
 const (
-	FE  = "FE"  // pre-filter regex rule
-	FL  = "FL"  // pre-filter script rule
-	BE  = "BE"  // post-filter regex rule
-	BL  = "BL"  // post-filter script rule
-	EFE = "EFE" // extraction pre-filter regex rule
-	EFL = "EFL" // extraction pre-filter script rule
-	EBE = "EBE" // extraction post-filter regex rule
-	EBL = "EBL" // extraction post-filter script rule
-	ECE = "ECE" // extraction logic regex rule
-	ECL = "ECL" // extraction logic script rule
+	F_E  = "F_E"  // pre-filter regex rule
+	F_L  = "F_L"  // pre-filter script rule
+	B_E  = "B_E"  // post-filter regex rule
+	B_L  = "B_L"  // post-filter script rule
+	E_FE = "E_FE" // extraction pre-filter regex rule
+	E_FL = "E_FL" // extraction pre-filter script rule
+	E_BE = "E_BE" // extraction post-filter regex rule
+	E_BL = "E_BL" // extraction post-filter script rule
+	E_CE = "E_CE" // extraction logic regex rule
+	E_CL = "E_CL" // extraction logic script rule
 )
 
 func init() {
@@ -111,9 +111,9 @@ func RulePreSave(c *gin.Context) {
 		s_type := data["s_type"]
 		code := ""
 		if s_type == "0" { //前置正则
-			code = util.GetSyncIndex(FE)
+			code = util.GetSyncIndex(F_E)
 		} else { //前置脚本
-			code = util.GetSyncIndex(FL)
+			code = util.GetSyncIndex(F_L)
 		}
 		data["s_code"] = code
 		b = Mgo.Save("rule_pre", data) != ""
@@ -165,9 +165,9 @@ func RuleBackSave(c *gin.Context) {
 		s_type := data["s_type"]
 		code := ""
 		if s_type == "0" { //前置正则
-			code = util.GetSyncIndex(BE)
+			code = util.GetSyncIndex(B_E)
 		} else { //前置lua脚本
-			code = util.GetSyncIndex(BL)
+			code = util.GetSyncIndex(B_L)
 		}
 		data["s_code"] = code
 		b = Mgo.Save("rule_back", data) != ""
@@ -245,10 +245,10 @@ func RuleLogicPreSave(c *gin.Context) {
 		data["s_username"] = sessions.Default(c).Get("username")
 		s_type := data["s_type"]
 		code := ""
-		if s_type == "0" { //抽取前置正则EFE
-			code = util.GetSyncIndex(EFE)
+		if s_type == "0" { //抽取前置正则E_FE
+			code = util.GetSyncIndex(E_FE)
 		} else { //抽取前置脚本
-			code = util.GetSyncIndex(EFL)
+			code = util.GetSyncIndex(E_FL)
 		}
 		data["s_code"] = code
 		b = Mgo.Save("rule_logicpre", data) != ""
@@ -279,10 +279,10 @@ func RuleLogicBackSave(c *gin.Context) {
 		data["s_username"] = sessions.Default(c).Get("username")
 		s_type := data["s_type"]
 		code := ""
-		if s_type == "0" { //抽取后置正则EFE
-			code = util.GetSyncIndex(EBE)
+		if s_type == "0" { //抽取后置正则E_FE
+			code = util.GetSyncIndex(E_BE)
 		} else { //抽取后置脚本
-			code = util.GetSyncIndex(EBL)
+			code = util.GetSyncIndex(E_BL)
 		}
 		data["s_code"] = code
 		b = Mgo.Save("rule_logicback", data) != ""
@@ -314,9 +314,9 @@ func RuleLogicCoreSave(c *gin.Context) {
 		s_type := data["s_type"]
 		code := ""
 		if s_type == "0" { //抽取逻辑正则
-			code = util.GetSyncIndex(ECE)
+			code = util.GetSyncIndex(E_CE)
 		} else { //抽取逻辑脚本
-			code = util.GetSyncIndex(ECL)
+			code = util.GetSyncIndex(E_CL)
 		}
 		data["s_code"] = code
 		b = Mgo.Save("rule_logicore", data) != ""

+ 76 - 44
src/jy/extract/extract.go

@@ -55,10 +55,15 @@ type ExtractTask struct {
 	RuleCores []*RuleCore   //抽取规则
 }
 
-var lock sync.RWMutex
-var ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-var saveLimit = 200                                //抽取日志批量保存
-var TaskList map[string]*ExtractTask               //任务列表
+var (
+	lock     sync.RWMutex
+	cut      = ju.NewCut()                          // shared cleaner used by PreInfo to strip HTML from the body text
+	ExtLogs  map[*TaskInfo][]map[string]interface{} // extraction logs, keyed by task
+	TaskList map[string]*ExtractTask                // task list
+
+	saveLimit = 200                     // batch size used when saving extraction logs
+	nfields   = []string{"contenthtml"} // fields excluded from the "before" snapshot stored in logs
+)
 
 func init() {
 	TaskList = make(map[string]*ExtractTask)
@@ -100,18 +105,37 @@ func RunExtractTask(ext *ExtractTask) {
 	if !ext.IsRun {
 		return
 	}
+	var fields = `{"title":1,"detail":1,"contenthtml":1}`
 	query := bson.M{"_id": bson.M{"$gt": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
-	list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, `{"title":1,"detail":1,"contenthtml":1}`, false, -1, -1)
+	list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, fields, false, -1, -1)
 	for _, v := range *list {
 		if !ext.IsRun {
 			break
 		}
+		v = PreInfo(v)
 		ext.TaskInfo.ProcessPool <- true
 		go ext.ExtractProcess(v)
 	}
 	time.AfterFunc(30*time.Minute, func() { RunExtractTask(ext) })
 }
 
+// PreInfo pre-processes doc in place: it keeps the longer of "detail"/"contenthtml" (by byte length, ties favor "detail"), decodes entities, strips HTML, stores the result in "detail", drops "contenthtml" and returns the same map.
+func PreInfo(doc map[string]interface{}) map[string]interface{} {
+	detail := ""
+	d1 := doc["detail"].(string)      // NOTE(review): unchecked assertion, panics if the field is missing or not a string
+	d2 := doc["contenthtml"].(string) // NOTE(review): same as above for "contenthtml"
+	if len(d1) >= len(d2) || d2 == "" {
+		detail = d1
+	} else {
+		detail = d2
+	}
+	detail = ju.CutLableStr(detail)
+	detail = cut.ClearHtml(detail)
+	doc["detail"] = detail
+	delete(doc, "contenthtml")
+	return doc
+}
+
 //加载任务信息
 func (e *ExtractTask) InitTaskInfo() {
 	task, _ := db.Mgo.FindById("task", e.Id, nil)
@@ -296,40 +320,40 @@ func (e *ExtractTask) ExtractProcess(doc map[string]interface{}) {
 		for _, v := range e.RulePres {
 			doc = ExtRegPre(doc, v, e.TaskInfo)
 		}
-		log.Println("全局前置规则", doc)
+		//log.Println("全局前置规则", doc)
 		//抽取规则
 		for _, vc := range e.RuleCores {
+			tmp := ju.DeepCopy(doc, []string{}).(map[string]interface{})
 			//是否进入逻辑
-			if !ju.Logic(vc.LuaLogic, doc) {
+			if !ju.Logic(vc.LuaLogic, tmp) {
 				continue
 			}
-			data := map[string]interface{}{}
 			//抽取-前置规则
-			tmpdoc := map[string]interface{}{}
 			for _, v := range vc.RulePres {
-				tmpdoc = ExtRegPre(doc, v, e.TaskInfo)
+				tmp = ExtRegPre(tmp, v, e.TaskInfo)
 			}
-			log.Println("抽取-前置规则", tmpdoc)
+			//log.Println("抽取-前置规则", tmp)
+
 			//抽取-规则
 			for _, v := range vc.RuleCores {
-				data = ExtRegCore(tmpdoc, v, e.TaskInfo)
+				tmp = ExtRegCore(tmp, v, e.TaskInfo)
 			}
-			log.Println("抽取-规则", data)
+			//log.Println("抽取-规则", tmp)
 
 			//抽取-后置规则
 			for _, v := range vc.RuleBacks {
-				data = ExtRegBack(data, v, e.TaskInfo)
+				tmp = ExtRegBack(tmp, v, e.TaskInfo)
 			}
-			log.Println("抽取-后置规则", data)
+			//log.Println("抽取-后置规则", tmp)
 			//全局后置规则
 			for _, v := range e.RuleBacks {
-				data = ExtRegBack(data, v, e.TaskInfo)
+				tmp = ExtRegBack(tmp, v, e.TaskInfo)
 			}
-			log.Println("全局后置规则", data)
+			//log.Println("全局后置规则", tmp)
 
 			//抽取结果赋值
-			for k, v := range data {
-				if k == "_id" {
+			for k, v := range tmp {
+				if k == "_id" || k == "detail" || k == "contenthtml" {
 					continue
 				}
 				if result[k] == nil {
@@ -352,37 +376,40 @@ func (e *ExtractTask) ExtractProcess(doc map[string]interface{}) {
 
 // ExtRegPre applies one pre-filter rule to doc (mutated in place and returned): a Lua rule merges the script result into doc, a regex rule deletes every match from v.Field (default "detail"); the change is logged via AddExtLog.
 func ExtRegPre(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[string]interface{} {
+	before := ju.DeepCopy(doc, []string{}).(map[string]interface{})
+	extinfo := map[string]interface{}{}
 	if v.IsLua {
 		lua := ju.LuaScript{Code: v.Code, Name: v.Name, Doc: doc, Script: v.RuleText}
-		data := lua.RunScript()
-		AddExtLog(doc, data, v, t) // extraction log
-		for k, v := range data {
+		extinfo = lua.RunScript()
+		for k, v := range extinfo { // results overwrite the original doc
 			doc[k] = v
 		}
+		AddExtLog(before, extinfo, v, t) // extraction log
 	} else {
-		tmp := doc
 		key := qu.If(v.Field == "", "detail", v.Field).(string)
 		text := qu.ObjToString(doc[key])
-		doc[key] = v.RegPreBac.Reg.ReplaceAllString(text, "")
-		AddExtLog(tmp, doc, v, t) // extraction log
+		extinfo[key] = v.RegPreBac.Reg.ReplaceAllString(text, "")
+		doc[key] = extinfo[key]          // result overwrites the original doc
+		AddExtLog(before, extinfo, v, t) // extraction log
 	}
 	return doc
 }
 
 //抽取-规则
 func ExtRegCore(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[string]interface{} {
+	before := ju.DeepCopy(doc, nfields).(map[string]interface{})
+	extinfo := map[string]interface{}{}
 	if v.IsLua {
 		lua := ju.LuaScript{Code: v.Code, Name: v.Name, Doc: doc, Script: v.RuleText}
-		data := lua.RunScript()
-		AddExtLog(doc, data, v, t) //抽取日志
-		for k, v := range data {
-			doc[k] = v
+		extinfo = lua.RunScript()
+		for k, v := range extinfo {
+			doc[k] = v //结果覆盖原doc
 		}
+		AddExtLog(before, extinfo, v, t) //抽取日志
 	} else {
 		if v.Field == "" {
 			return doc
 		}
-		tmp := doc
 		text := qu.ObjToString(doc["detail"])
 		if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
 			apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
@@ -390,47 +417,51 @@ func ExtRegCore(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[stri
 				pos := apos[0]
 				for k, p := range v.RegCore.ExtractPos {
 					if len(pos) > p {
-						doc[k] = text[pos[p]:pos[p+1]]
-						//log.Println(k, doc[k])
+						extinfo[k] = text[pos[p]:pos[p+1]]
+						doc[k] = extinfo[k] //结果覆盖原doc
 					}
 				}
 			}
 		} else {
-			doc[v.Field] = v.RegCore.Reg.ReplaceAllString(text, "")
+			extinfo[v.Field] = v.RegCore.Reg.ReplaceAllString(text, "")
+			doc[v.Field] = extinfo[v.Field] //结果覆盖原doc
 		}
-		AddExtLog(tmp, doc, v, t) //抽取日志
+		AddExtLog(before, extinfo, v, t) //抽取日志
 	}
 	return doc
 }
 
 // ExtRegBack applies one post-filter rule to doc (mutated in place and returned): a Lua rule merges the script result into doc; a regex rule rewrites v.Field when it is set and non-empty, otherwise every non-empty field except "_id"; the change is logged via AddExtLog.
 func ExtRegBack(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[string]interface{} {
+	before := ju.DeepCopy(doc, nfields).(map[string]interface{})
+	extinfo := map[string]interface{}{}
 	if v.IsLua {
 		lua := ju.LuaScript{Code: v.Code, Name: v.Name, Doc: doc, Script: v.RuleText}
-		data := lua.RunScript()
-		AddExtLog(doc, data, v, t) // extraction log
-		for k, v := range data {
+		extinfo = lua.RunScript()
+		for k, v := range extinfo { // results overwrite the original doc
 			doc[k] = v
 		}
+		AddExtLog(before, extinfo, v, t) // extraction log
 	} else {
-		tmp := doc
 		if v.Field != "" && qu.ObjToString(doc[v.Field]) != "" {
-			doc[v.Field] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(doc[v.Field]), v.RegPreBac.Replace)
+			extinfo[v.Field] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(doc[v.Field]), v.RegPreBac.Replace)
+			doc[v.Field] = extinfo[v.Field]
 		} else {
 			for k, val := range doc {
-				if k == "_id" || k == "detail" || qu.ObjToString(val) == "" {
+				if k == "_id" || qu.ObjToString(val) == "" {
 					continue
 				}
-				doc[k] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(val), v.RegPreBac.Replace)
+				extinfo[k] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(val), v.RegPreBac.Replace)
+				doc[k] = extinfo[k] // result overwrites the original doc
 			}
 		}
-		AddExtLog(tmp, doc, v, t) // extraction log
+		AddExtLog(before, extinfo, v, t) // extraction log
 	}
 	return doc
 }
 
 //抽取日志
-func AddExtLog(before, extinfo map[string]interface{}, v *RegLuaInfo, t *TaskInfo) {
+func AddExtLog(before map[string]interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
 	if !t.IsEtxLog {
 		return
 	}
@@ -439,6 +470,7 @@ func AddExtLog(before, extinfo map[string]interface{}, v *RegLuaInfo, t *TaskInf
 		"name":       v.Name,
 		"ruletext":   v.RuleText,
 		"islua":      v.IsLua,
+		"field":      v.Field,
 		"version":    t.Version,
 		"taskname":   t.Name,
 		"before":     before,
@@ -475,5 +507,5 @@ func SaveExtLog() {
 			}
 		}
 	}
-	time.AfterFunc(1*time.Minute, SaveExtLog)
+	time.AfterFunc(10*time.Second, SaveExtLog)
 }

+ 1 - 0
src/jy/extract/extract_test.go

@@ -3,6 +3,7 @@ package extract
 
 import (
 	. "jy/mongodbutil"
+
 	"testing"
 	"time"
 )

+ 54 - 0
src/jy/util/clear.go

@@ -0,0 +1,54 @@
+// clear
+package util
+
+import (
+	"strings"
+)
+
+var at = rune('&') // rune that opens an HTML entity
+var ed = rune(';') // rune that closes an HTML entity
+var lableMap = map[string]rune{ // entities that CutLableStr decodes, mapped to their replacement rune
+	"&amp;":  rune('&'),
+	"&nbsp;": rune(' '),
+	"&gt;":   rune('>'),
+	"&lt;":   rune('<'),
+}
+
+// CutLableStr decodes the HTML entities listed in lableMap, repeating for up to three passes so nested encodings such as "&amp;nbsp;" are resolved. NOTE(review): a "&..." run still open when the input ends is never flushed, so that trailing text is dropped.
+func CutLableStr(con string) string {
+	for i := 0; i < 3; i++ {
+		runes := []rune{}
+		pools := []rune{}
+		bpool := false
+		strings.IndexFunc(con, func(s rune) bool { // used only to visit every rune; the callback always returns false
+			if !bpool && s == at {
+				bpool = true
+				pools = []rune{}
+			}
+			if bpool {
+				pools = append(pools, s)
+				if s == ed { // end of the candidate entity
+					lb := lableMap[string(pools)]
+					if lb != 0 {
+						runes = append(runes, lb)
+					} else {
+						runes = append(runes, pools...)
+					}
+					bpool = false
+				} else if len(pools) > 6 { // longer than any known entity: emit the buffered runes unchanged
+					bpool = false
+					runes = append(runes, pools...)
+				}
+			} else {
+				runes = append(runes, s)
+			}
+			return false
+		})
+		str1 := string(runes)
+		if i > 0 && con == str1 { // nothing changed in this pass, stop early
+			break
+		}
+		con = str1
+	}
+	return con
+}

+ 108 - 0
src/jy/util/clearHtml.go

@@ -0,0 +1,108 @@
+package util
+
+import (
+	"regexp"
+	"strings"
+)
+
+// Cut holds the compiled patterns and tag lists used by ClearHtml.
+type Cut struct {
+	tag           *regexp.Regexp // matches any HTML tag
+	scripttag     *regexp.Regexp // matches script/style elements
+	styletag      *regexp.Regexp // not populated by NewCut
+	colstag       *regexp.Regexp // matches a colspan attribute
+	rowstag       *regexp.Regexp // matches a rowspan attribute
+	display       *regexp.Regexp // matches "display:none"
+	multiCR       *regexp.Regexp // matches runs of line breaks
+	replBlankLine *regexp.Regexp // matches whitespace followed by a line break
+	replStartWrap *regexp.Regexp // matches leading/trailing whitespace of the text
+	replTags2CR   []string       // tags replaced by a newline
+	retainTags2CR []string       // tags kept in the output
+}
+
+// NewCut builds a Cut with its regular expressions compiled and its tag lists filled in. NOTE(review): errors from regexp.Compile are discarded, and styletag is left nil.
+func NewCut() *Cut {
+	t, _ := regexp.Compile("<[^>]+>")
+	m, _ := regexp.Compile("([\r\n][\u3000\u2003\u00a0\\s]*)+|[\r\n]+")
+	//sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
+	//ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
+	scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>") // NOTE(review): the body part is [^>]+, so script/style content containing '>' is not matched
+	cols, _ := regexp.Compile(`colspan="\d+"`)
+	rows, _ := regexp.Compile(`rowspan="\d+"`)
+	dis, _ := regexp.Compile(`display:none`)
+	return &Cut{
+		tag:       t,
+		scripttag: scs,
+		//styletag:      ss,
+		colstag:       cols,
+		rowstag:       rows,
+		display:       dis,
+		multiCR:       m,
+		replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
+		replStartWrap: regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$"),
+		replTags2CR:   []string{"div", "p", "br", "h1", "h2", "h3", "h4", "h5"},
+		retainTags2CR: []string{"table", "thead", "tfoot", "tbody", "th", "td", "tr"},
+	}
+}
+
+// ClearHtml strips HTML markup from src: tags are lowercased, script/style elements removed, table-related tags kept (retaining only colspan, rowspan and display:none), <br> and bare block tags turned into newlines, all other tags dropped, and surplus whitespace/newlines collapsed.
+func (c *Cut) ClearHtml(src string) string {
+	src = strings.Replace(src, ">\n", ">", -1)
+	// lowercase the full text of every tag
+	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
+	// remove script and style elements
+	src = c.scripttag.ReplaceAllString(src, "")
+	//
+	// rewrite each remaining tag
+	src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
+		tmp = strings.Replace(tmp, " ", "", -1)
+		// keep these tags
+		for _, v := range c.retainTags2CR {
+			if "<"+v+">" == tmp || "</"+v+">" == tmp {
+				if tmp == "</table>" {
+					return tmp + "\n"
+				}
+				return tmp
+			}
+			if strings.HasPrefix(tmp, "<"+v) {
+				dispstrs := c.display.FindAllString(tmp, -1)
+				rowstrs := c.rowstag.FindAllString(tmp, -1)
+				colstrs := c.colstag.FindAllString(tmp, -1)
+				c := "<" + v // note: this local string shadows the receiver c for the rest of the branch
+				if len(colstrs) > 0 { // keep the column-merge (colspan) attribute
+					c += " " + colstrs[0]
+				}
+				if len(rowstrs) > 0 { // keep the row-merge (rowspan) attribute
+					c += " " + rowstrs[0]
+				}
+				if len(dispstrs) > 0 {
+					c += " style=\"" + dispstrs[0] + "\""
+				}
+				return c + ">"
+
+			}
+		}
+		if tmp == "<br>" || tmp == "<br/>" {
+			return "\n"
+		}
+		if tmp[1] != 47 { // start tag (47 is '/')
+			for _, v := range c.replTags2CR {
+				if v == tmp[1:len(tmp)-1] { // only attribute-less tags compare equal here
+					return "\n"
+				}
+			}
+			return ""
+		}
+		for _, v := range c.replTags2CR {
+			if v == tmp[2:len(tmp)-1] {
+				return "\n"
+			}
+		}
+		return ""
+	})
+	src = c.replStartWrap.ReplaceAllString(src, "")
+	src = c.replBlankLine.ReplaceAllString(src, "\n")
+	// collapse redundant line breaks
+	return c.multiCR.ReplaceAllString(src, "\n")
+	//return strings.Replace(src, "\n", "<br/>", -1)
+}

+ 27 - 0
src/jy/util/util.go

@@ -53,3 +53,30 @@ func GetSyncIndex(code string) string {
 	}
 	return tmp
 }
+
+// DeepCopy recursively copies map[string]interface{} and []interface{} values and returns any other value as-is; keys of the top-level map that appear in nfields (the do-not-copy field set) are omitted from the copy.
+func DeepCopy(value interface{}, nfields []string) interface{} {
+	if valueMap, ok := value.(map[string]interface{}); ok {
+		newMap := make(map[string]interface{})
+		for k, v := range valueMap {
+			iscop := true
+			for _, fn := range nfields {
+				if k == fn {
+					iscop = false
+					break
+				}
+			}
+			if iscop {
+				newMap[k] = DeepCopy(v, []string{}) // the exclusion list is not passed down to nested values
+			}
+		}
+		return newMap
+	} else if valueSlice, ok := value.([]interface{}); ok {
+		newSlice := make([]interface{}, len(valueSlice))
+		for k, v := range valueSlice {
+			newSlice[k] = DeepCopy(v, []string{})
+		}
+		return newSlice
+	}
+	return value // any other type is returned without copying
+}

+ 1 - 1
src/web/templates/admin/rule_backlist.html

@@ -187,7 +187,7 @@ function del(_id){
 			data:{"_id":_id},
 			success:function(r){
 				if(r.rep){				
-					ttable.ajax.reload();
+					window.location.href="/admin/ruleback?version={{.version}}"	
 				}else{
 					showTip("删除失败", 1000, function() {});
 				}

+ 1 - 1
src/web/templates/admin/rule_logicbacklist.html

@@ -187,7 +187,7 @@ function del(_id){
 			data:{"_id":_id},
 			success:function(r){
 				if(r.rep){				
-					ttable.ajax.reload();
+					window.location.href="/admin/logicback?version={{.version}}&sid={{.sid}}"
 				}else{
 					showTip("删除失败", 1000);
 				}

+ 2 - 2
src/web/templates/admin/rule_logiclist.html

@@ -139,7 +139,7 @@ $(function () {
 								if (bcon){								
 									$.post("/admin/rulelogic/save",obj,function(data){
 										if(data&&data.rep){
-											window.location.reload()
+											window.location.href="/admin/rulelogic?version={{.version}}"	
 										}else{
 											showTip(data.msg,1000)
 										}
@@ -172,7 +172,7 @@ function use(_id,utype){
 			data:{"_id":_id,"isuse":utype},
 			success:function(r){
 				if(r.rep){				
-					window.location.reload()
+					window.location.href="/admin/rulelogic?version={{.version}}"
 				}else{
 					showTip("启用失败", 1000, function() {});
 				}

+ 1 - 1
src/web/templates/admin/rule_prelist.html

@@ -187,7 +187,7 @@ function del(_id){
 			data:{"_id":_id},
 			success:function(r){
 				if(r.rep){				
-					ttable.ajax.reload();
+					window.location.href="/admin/rulepre?version={{.version}}"
 				}else{
 					showTip("删除失败", 1000);
 				}