Răsfoiți Sursa

Merge branch 'dev3.1.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.1.2

unknown 6 ani în urmă
părinte
comite
04852f3a79

+ 26 - 5
src/jy/admin/rulecheck.go

@@ -6,9 +6,11 @@ import (
 	"jy/extract"
 	. "jy/mongodbutil"
 	ju "jy/util"
+	"log"
 	qu "qfw/util"
 	"qfw/util/elastic"
 	"regexp"
+	"strconv"
 	"strings"
 
 	"github.com/gin-gonic/gin"
@@ -194,11 +196,17 @@ func checkPreReg(content, ruleText string) string {
 	tmpstr := ""
 	qu.Try(func() {
 		tmp := strings.Split(ruleText, "__")
+		var pattern string
+		if strings.Contains(tmp[0], "\\u") {
+			pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+		} else {
+			pattern = tmp[0]
+		}
 		if len(tmp) == 2 {
-			reg := regexp.MustCompile(tmp[0])
+			reg := regexp.MustCompile(pattern)
 			tmpstr = reg.ReplaceAllString(content, tmp[1])
 		} else {
-			reg := regexp.MustCompile(tmp[0])
+			reg := regexp.MustCompile(pattern)
 			tmpstr = reg.ReplaceAllString(content, "")
 		}
 	}, func(err interface{}) {
@@ -212,13 +220,20 @@ func checkBackReg(content, ruleText string) string {
 	tmpstr := ""
 	qu.Try(func() {
 		tmp := strings.Split(ruleText, "__")
+		var pattern string
+		if strings.Contains(tmp[0], "\\u") {
+			pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+		} else {
+			pattern = tmp[0]
+		}
 		if len(tmp) == 2 {
-			reg := regexp.MustCompile(tmp[0])
+			reg := regexp.MustCompile(pattern)
 			tmpstr = reg.ReplaceAllString(content, tmp[1])
 		} else {
-			reg := regexp.MustCompile(tmp[0])
+			reg := regexp.MustCompile(pattern)
 			tmpstr = reg.ReplaceAllString(content, "")
 		}
+		log.Println(tmp[0])
 	}, func(err interface{}) {
 		tmpstr = fmt.Sprint(err)
 	})
@@ -241,7 +256,13 @@ func checkCoreReg(field, content, ruleText string) map[string]string {
 					posm[field] = qu.IntAll(ks[0])
 				}
 			}
-			reg := regexp.MustCompile(tmp[0])
+			var pattern string
+			if strings.Contains(tmp[0], "\\u") {
+				pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+			} else {
+				pattern = tmp[0]
+			}
+			reg := regexp.MustCompile(pattern)
 			apos := reg.FindAllStringSubmatchIndex(content, -1)
 			if len(apos) > 0 {
 				pos := apos[0]

+ 1 - 1
src/jy/extract/extpackage.go

@@ -23,7 +23,7 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 				sonJobResult["origin"] = pkg.Origin
 				sonJobResult["type"] = pkg.Type
 				sonJobResult["winnerorder"] = pkg.WinnerOrder
-				//分包结果暂时不用
+				//分包暂不参与选举
 				/*
 					for k, tags := range e.Tag {
 					L:

+ 13 - 10
src/jy/extract/extract.go

@@ -101,8 +101,8 @@ func StartExtractTaskId(taskId string) bool {
 		ext.Id = taskId
 		ext.InitTaskInfo()
 	}
-	ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
-	ext.TaskInfo.TDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
+	ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+	ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
 	ext.InitRulePres()
 	ext.InitRuleBacks()
 	ext.InitRuleCore()
@@ -165,14 +165,15 @@ func RunExtractTask(taskId string) {
 			if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 				continue
 			}
-			//log.Println(v["_id"])
+			_id := qu.BsonIdToSId(v["_id"])
+			log.Println(_id)
 			if !ext.IsRun {
 				break
 			}
 			j := PreInfo(v)
 			ext.TaskInfo.ProcessPool <- true
 			go ext.ExtractProcess(j)
-			ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
+			ext.TaskInfo.LastExtId = _id
 		}
 		db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
 		if !ext.IsRun {
@@ -303,12 +304,14 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
 				v.Value = data[0]
 				//清理特殊符号
+				lock.Lock()
 				if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
 					clear.MesField[key] != nil {
 					text := qu.ObjToString(v.Value)
 					text = clear.OtherClean(key, text)
 					v.Value = text
 				}
+				lock.Unlock()
 			}
 		}
 		PackageDetail(j, e) //处理分包信息
@@ -316,11 +319,10 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 		//		log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
 		//分析抽取结果并保存 todo
 		AnalysisSaveResult(j, e)
-		<-e.TaskInfo.ProcessPool
 	}, func(err interface{}) {
-		log.Println("ExtractProcess err", err, (*j.Data)["_id"])
-		<-e.TaskInfo.ProcessPool
+		log.Println("ExtractProcess err", err)
 	})
+	<-e.TaskInfo.ProcessPool
 }
 
 //前置过滤
@@ -648,9 +650,10 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 				tmp := j.Result[in.Field]
 				exts := []interface{}{}
 				for k, v := range tmp {
-					if v.Type == "table" && v.Field != "projectname" { //table抽取到的数据不清理
-						continue
-					}
+					//table抽取到的数据不清理
+					//					if v.Type == "table" && v.Field != "projectname" {
+					//						continue
+					//					}
 					text := qu.ObjToString(v.Value)
 					if text != "" {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)

+ 49 - 12
src/jy/extract/extractInit.go

@@ -8,6 +8,7 @@ import (
 	qu "qfw/util"
 	"regexp"
 	"sort"
+	"strconv"
 	"strings"
 	"time"
 )
@@ -201,10 +202,16 @@ func (e *ExtractTask) InitRulePres() {
 			qu.Try(func() {
 				rinfo.RuleText = v["s_rule"].(string)
 				tmp := strings.Split(rinfo.RuleText, "__")
+				var pattern string
+				if strings.Contains(tmp[0], "\\u") {
+					pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+				} else {
+					pattern = tmp[0]
+				}
 				if len(tmp) == 2 {
-					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 				} else {
-					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 				}
 				e.RulePres = append(e.RulePres, rinfo)
 			}, func(err interface{}) {
@@ -232,10 +239,16 @@ func (e *ExtractTask) InitRuleBacks() {
 			qu.Try(func() {
 				rinfo.RuleText = v["s_rule"].(string)
 				tmp := strings.Split(rinfo.RuleText, "__")
+				var pattern string
+				if strings.Contains(tmp[0], "\\u") {
+					pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+				} else {
+					pattern = tmp[0]
+				}
 				if len(tmp) == 2 {
-					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 				} else {
-					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 				}
 				e.RuleBacks = append(e.RuleBacks, rinfo)
 			}, func(err interface{}) {
@@ -282,10 +295,16 @@ func (e *ExtractTask) InitRuleCore() {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						} else {
+							pattern = tmp[0]
+						}
 						if len(tmp) == 2 {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 						}
 						rulePres = append(rulePres, rinfo)
 					}, func(err interface{}) {
@@ -312,10 +331,16 @@ func (e *ExtractTask) InitRuleCore() {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						} else {
+							pattern = tmp[0]
+						}
 						if len(tmp) == 2 {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 						}
 						ruleBacks = append(ruleBacks, rinfo)
 					}, func(err interface{}) {
@@ -347,6 +372,12 @@ func (e *ExtractTask) InitRuleCore() {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						} else {
+							pattern = tmp[0]
+						}
 						if len(tmp) == 2 {
 							epos := strings.Split(tmp[1], ",")
 							posm := map[string]int{}
@@ -358,9 +389,9 @@ func (e *ExtractTask) InitRuleCore() {
 									posm[rinfo.Field] = qu.IntAll(ks[0])
 								}
 							}
-							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
+							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
 						} else {
-							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
+							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
 						}
 						ruleCores = append(ruleCores, rinfo)
 					}, func(err interface{}) {
@@ -412,10 +443,16 @@ func (e *ExtractTask) InitPkgCore() {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						} else {
+							pattern = tmp[0]
+						}
 						if len(tmp) == 2 {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 						}
 						ruleBacks = append(ruleBacks, rinfo)
 					}, func(err interface{}) {

+ 10 - 7
src/jy/extract/extractudp.go

@@ -88,7 +88,7 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 	ext := &ExtractTask{}
 	ext.Id = qu.ObjToString(ju.Config["udptaskid"])
 	ext.InitTaskInfo()
-	ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+	ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
 	ext.InitRulePres()
 	ext.InitRuleBacks()
 	ext.InitRuleCore()
@@ -140,11 +140,12 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 						continue
 					}
-					//log.Println(v["_id"])
+					_id := qu.BsonIdToSId(v["_id"])
+					log.Println(_id)
 					j := PreInfo(v)
 					ext.TaskInfo.ProcessPool <- true
 					go ext.ExtractProcess(j)
-					sid = qu.BsonIdToSId(v["_id"])
+					sid = _id
 				}
 				db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
 					map[string]interface{}{"$set": map[string]interface{}{
@@ -159,11 +160,12 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 						continue
 					}
-					//log.Println(v["_id"])
+					_id := qu.BsonIdToSId(v["_id"])
+					log.Println(_id)
 					j := PreInfo(v)
 					ext.TaskInfo.ProcessPool <- true
 					go ext.ExtractProcess(j)
-					sidback = qu.BsonIdToSId(v["_id"])
+					sidback = _id
 				}
 				db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
 					map[string]interface{}{"$set": map[string]interface{}{
@@ -192,11 +194,12 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 				if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 					continue
 				}
-				//log.Println(v["_id"])
+				_id := qu.BsonIdToSId(v["_id"])
+				log.Println(_id)
 				j := PreInfo(v)
 				ext.TaskInfo.ProcessPool <- true
 				go ext.ExtractProcess(j)
-				sid = qu.BsonIdToSId(v["_id"])
+				sid = _id
 			}
 
 		}

+ 9 - 1
src/jy/extract/score.go

@@ -7,6 +7,7 @@ import (
 	"log"
 	qu "qfw/util"
 	"regexp"
+	"strconv"
 	"strings"
 )
 
@@ -16,12 +17,19 @@ func init() {
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
 	//实例化正则
 	for _, tmp := range SoreConfig {
+		log.Println(tmp)
 		if tmp["type"] == "string" {
 			if positions, ok := tmp["position"].([]interface{}); ok {
 				for _, position := range positions {
 					if p, ok := position.(map[string]interface{}); ok {
 						qu.Try(func() {
-							p["regexp"] = regexp.MustCompile(qu.ObjToString(p["regstr"]))
+							strReq, _ := p["regstr"].(string)
+							if strings.Contains(strReq, "\\u") {
+								pattern, _ := strconv.Unquote(`"` + strReq + `"`)
+								p["regexp"] = regexp.MustCompile(pattern)
+							} else {
+								p["regexp"] = regexp.MustCompile(strReq)
+							}
 						}, func(err interface{}) {
 							log.Println(err)
 						})

+ 3 - 2
src/jy/pretreated/tablev2.go

@@ -185,7 +185,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 						tb1.AddKey(k, v)
 					} else {
 						bp := tb1.Map[k].(*u.BlockPackage)
-						if v1.TableKV != nil && v1.TableKV.Kv != nil {
+						if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
 							for k2, v2 := range v1.TableKV.Kv {
 								if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
 									bp.TableKV.Kv[k2] = v2
@@ -753,7 +753,9 @@ strtype 1全文 2块文本
 func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
 	defer qutil.Catch()
 	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
+	cons := doc.Text()
 	tables := doc.Find("table")
+	doc = nil
 	if tables.Size() > 0 {
 		tabs = []*goquery.Selection{}
 		for i := 0; i < tables.Size(); i++ {
@@ -769,7 +771,6 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 			}
 		}
 		tlen := 0
-		cons := doc.Text()
 		for _, t := range tabs {
 			tlen += len(t.Text())
 		}

+ 1 - 1
src/jy/util/script.go

@@ -103,7 +103,7 @@ func (s *LuaScript) RunScript(stype string) map[string]interface{} {
 			}
 		}
 	}, func(err interface{}) {
-		log.Println("lua err:", data["err"])
+		log.Println("lua err:", err)
 	})
 	return data
 }

+ 12 - 0
src/main_test.go

@@ -1,16 +1,28 @@
 package main
 
 import (
+	"fmt"
 	"jy/admin/track"
 	"jy/clear"
 	"jy/extract"
 	. "jy/mongodbutil"
 	"log"
 	"regexp"
+	"strconv"
 	"testing"
 	"time"
 )
 
+// Test_han checks that a rule pattern written with \uXXXX escapes compiles and
+// matches CJK text once the escapes have been decoded with strconv.Unquote.
+// Go's regexp syntax does not accept a raw `\u` escape, so compiling the
+// undecoded text would panic inside regexp.MustCompile.
+func Test_han(t *testing.T) {
+	str := `[\u4e00-\u9fa5]` // raw rule text: literal backslash-u escapes, not decoded runes
+	// Decode the escapes by treating the rule text as a double-quoted Go string.
+	pattern, err := strconv.Unquote(`"` + str + `"`)
+	if err != nil {
+		t.Fatalf("unquote %q: %v", str, err)
+	}
+	// Compile the decoded pattern; compiling str itself is the failure this test guards against.
+	rg := regexp.MustCompile(pattern)
+	fmt.Println(rg.MatchString("汉"))
+	if !rg.MatchString("汉") {
+		t.Fatalf("pattern %q should match a CJK character", pattern)
+	}
+	if rg.MatchString("abc") {
+		t.Fatalf("pattern %q should not match ASCII text", pattern)
+	}
+}
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")

+ 8 - 3
src/res/fieldscore.json

@@ -143,7 +143,7 @@
         "position": [
             {
                 "describe": "出现中文汉字",
-                "regstr": "[\\p{Han}]",
+                "regstr": "[\\u4e00-\\u9fa5]",
                 "score": -1
             }
         ],
@@ -165,8 +165,13 @@
         "position": [
             {
                 "describe": "全为中文汉字",
-                "regstr": "^[\\p{Han}]+$",
-                "score": -1
+                "regstr": "^[\\u4e00-\\u9fa5]+$",
+                "score": -20
+            },
+            {
+                "describe": "包含负分",
+                "regstr": "(月|日|天|[,,\\.。、::“”‘’\"])",
+                "score": -20
             }
         ],
         "length": [