Browse Source

正则中文支持

zhangjinkun 6 years ago
parent
commit
74e837b548
5 changed files with 80 additions and 21 deletions
  1. 20 5
      src/jy/admin/rulecheck.go
  2. 37 12
      src/jy/extract/extractInit.go
  3. 9 1
      src/jy/extract/score.go
  4. 6 0
      src/main_test.go
  5. 8 3
      src/res/fieldscore.json

+ 20 - 5
src/jy/admin/rulecheck.go

@@ -6,9 +6,11 @@ import (
 	"jy/extract"
 	. "jy/mongodbutil"
 	ju "jy/util"
+	"log"
 	qu "qfw/util"
 	"qfw/util/elastic"
 	"regexp"
+	"strconv"
 	"strings"
 
 	"github.com/gin-gonic/gin"
@@ -194,11 +196,15 @@ func checkPreReg(content, ruleText string) string {
 	tmpstr := ""
 	qu.Try(func() {
 		tmp := strings.Split(ruleText, "__")
+		var pattern string
+		if strings.Contains(tmp[0], "\\u") {
+			pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+		}
 		if len(tmp) == 2 {
-			reg := regexp.MustCompile(tmp[0])
+			reg := regexp.MustCompile(pattern)
 			tmpstr = reg.ReplaceAllString(content, tmp[1])
 		} else {
-			reg := regexp.MustCompile(tmp[0])
+			reg := regexp.MustCompile(pattern)
 			tmpstr = reg.ReplaceAllString(content, "")
 		}
 	}, func(err interface{}) {
@@ -212,13 +218,18 @@ func checkBackReg(content, ruleText string) string {
 	tmpstr := ""
 	qu.Try(func() {
 		tmp := strings.Split(ruleText, "__")
+		var pattern string
+		if strings.Contains(tmp[0], "\\u") {
+			pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+		}
 		if len(tmp) == 2 {
-			reg := regexp.MustCompile(tmp[0])
+			reg := regexp.MustCompile(pattern)
 			tmpstr = reg.ReplaceAllString(content, tmp[1])
 		} else {
-			reg := regexp.MustCompile(tmp[0])
+			reg := regexp.MustCompile(pattern)
 			tmpstr = reg.ReplaceAllString(content, "")
 		}
+		log.Println(tmp[0])
 	}, func(err interface{}) {
 		tmpstr = fmt.Sprint(err)
 	})
@@ -241,7 +252,11 @@ func checkCoreReg(field, content, ruleText string) map[string]string {
 					posm[field] = qu.IntAll(ks[0])
 				}
 			}
-			reg := regexp.MustCompile(tmp[0])
+			var pattern string
+			if strings.Contains(tmp[0], "\\u") {
+				pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+			}
+			reg := regexp.MustCompile(pattern)
 			apos := reg.FindAllStringSubmatchIndex(content, -1)
 			if len(apos) > 0 {
 				pos := apos[0]

+ 37 - 12
src/jy/extract/extractInit.go

@@ -8,6 +8,7 @@ import (
 	qu "qfw/util"
 	"regexp"
 	"sort"
+	"strconv"
 	"strings"
 	"time"
 )
@@ -201,10 +202,14 @@ func (e *ExtractTask) InitRulePres() {
 			qu.Try(func() {
 				rinfo.RuleText = v["s_rule"].(string)
 				tmp := strings.Split(rinfo.RuleText, "__")
+				var pattern string
+				if strings.Contains(tmp[0], "\\u") {
+					pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+				}
 				if len(tmp) == 2 {
-					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 				} else {
-					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 				}
 				e.RulePres = append(e.RulePres, rinfo)
 			}, func(err interface{}) {
@@ -232,10 +237,14 @@ func (e *ExtractTask) InitRuleBacks() {
 			qu.Try(func() {
 				rinfo.RuleText = v["s_rule"].(string)
 				tmp := strings.Split(rinfo.RuleText, "__")
+				var pattern string
+				if strings.Contains(tmp[0], "\\u") {
+					pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+				}
 				if len(tmp) == 2 {
-					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 				} else {
-					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 				}
 				e.RuleBacks = append(e.RuleBacks, rinfo)
 			}, func(err interface{}) {
@@ -282,10 +291,14 @@ func (e *ExtractTask) InitRuleCore() {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						}
 						if len(tmp) == 2 {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 						}
 						rulePres = append(rulePres, rinfo)
 					}, func(err interface{}) {
@@ -312,10 +325,14 @@ func (e *ExtractTask) InitRuleCore() {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						}
 						if len(tmp) == 2 {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 						}
 						ruleBacks = append(ruleBacks, rinfo)
 					}, func(err interface{}) {
@@ -347,6 +364,10 @@ func (e *ExtractTask) InitRuleCore() {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						}
 						if len(tmp) == 2 {
 							epos := strings.Split(tmp[1], ",")
 							posm := map[string]int{}
@@ -358,9 +379,9 @@ func (e *ExtractTask) InitRuleCore() {
 									posm[rinfo.Field] = qu.IntAll(ks[0])
 								}
 							}
-							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
+							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
 						} else {
-							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
+							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
 						}
 						ruleCores = append(ruleCores, rinfo)
 					}, func(err interface{}) {
@@ -412,10 +433,14 @@ func (e *ExtractTask) InitPkgCore() {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						}
 						if len(tmp) == 2 {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
-							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
 						}
 						ruleBacks = append(ruleBacks, rinfo)
 					}, func(err interface{}) {

+ 9 - 1
src/jy/extract/score.go

@@ -7,6 +7,7 @@ import (
 	"log"
 	qu "qfw/util"
 	"regexp"
+	"strconv"
 	"strings"
 )
 
@@ -16,12 +17,19 @@ func init() {
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
 	//实例化正则
 	for _, tmp := range SoreConfig {
+		log.Println(tmp)
 		if tmp["type"] == "string" {
 			if positions, ok := tmp["position"].([]interface{}); ok {
 				for _, position := range positions {
 					if p, ok := position.(map[string]interface{}); ok {
 						qu.Try(func() {
-							p["regexp"] = regexp.MustCompile(qu.ObjToString(p["regstr"]))
+							strReq, _ := p["regstr"].(string)
+							if strings.Contains(strReq, "\\u") {
+								pattern, _ := strconv.Unquote(`"` + strReq + `"`)
+								p["regexp"] = regexp.MustCompile(pattern)
+							} else {
+								p["regexp"] = regexp.MustCompile(strReq)
+							}
 						}, func(err interface{}) {
 							log.Println(err)
 						})

+ 6 - 0
src/main_test.go

@@ -1,6 +1,7 @@
 package main
 
 import (
+	"fmt"
 	"jy/admin/track"
 	"jy/clear"
 	"jy/extract"
@@ -11,6 +12,11 @@ import (
 	"time"
 )
 
+func Test_han(t *testing.T) { // Test_han is a manual check that a CJK range class matches text containing Han characters.
+	str := "撒地方12334sf" // sample input mixing Han characters, digits and ASCII letters
+	var rg = regexp.MustCompile("[\u4e00-\u9fa5]") // interpreted string literal: the Go compiler expands \u4e00 and \u9fa5, so regexp receives the actual runes
+	fmt.Println(rg.MatchString(str)) // only prints the match result; nothing is asserted, so this test cannot fail
+}
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")

+ 8 - 3
src/res/fieldscore.json

@@ -143,7 +143,7 @@
         "position": [
             {
                 "describe": "出现中文汉字",
-                "regstr": "[\\p{Han}]",
+                "regstr": "[\\u4e00-\\u9fa5]",
                 "score": -1
             }
         ],
@@ -165,8 +165,13 @@
         "position": [
             {
                 "describe": "全为中文汉字",
-                "regstr": "^[\\p{Han}]+$",
-                "score": -1
+                "regstr": "^[\\u4e00-\\u9fa5]+$",
+                "score": -20
+            },
+            {
+                "describe": "包含负分",
+                "regstr": "(月|日|天|[,，\\.。、:：“”‘’\"])",
+                "score": -20
             }
         ],
         "length": [