Selaa lähdekoodia

时间清理、打分正则

zhangjinkun 6 vuotta sitten
vanhempi
commit
8f178f67be

+ 9 - 2
src/jy/admin/rulecheck.go

@@ -198,6 +198,8 @@ func checkPreReg(content, ruleText string) string {
 		tmp := strings.Split(ruleText, "__")
 		var pattern string
 		if strings.Contains(tmp[0], "\\u") {
+			tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+			tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 			pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 		} else {
 			pattern = tmp[0]
@@ -222,6 +224,8 @@ func checkBackReg(content, ruleText string) string {
 		tmp := strings.Split(ruleText, "__")
 		var pattern string
 		if strings.Contains(tmp[0], "\\u") {
+			tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+			tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 			pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 		} else {
 			pattern = tmp[0]
@@ -258,10 +262,13 @@ func checkCoreReg(field, content, ruleText string) map[string]string {
 			}
 			var pattern string
 			if strings.Contains(tmp[0], "\\u") {
+				tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+				tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 				pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 			} else {
 				pattern = tmp[0]
 			}
+			log.Println("pattern", pattern)
 			reg := regexp.MustCompile(pattern)
 			apos := reg.FindAllStringSubmatchIndex(content, -1)
 			if len(apos) > 0 {
@@ -313,7 +320,7 @@ func checkBackScript(table, code, name, version, infoid, script string, alone bo
 		doc = extract.ExtRegPre(doc, j, v, e.TaskInfo)
 	}
 	//抽取规则
-	if j.CategorySecond==""{
+	if j.CategorySecond == "" {
 		for _, vc1 := range e.RuleCores[j.Category] {
 			for _, vc := range vc1 {
 				tmp := ju.DeepCopy(doc).(map[string]interface{})
@@ -331,7 +338,7 @@ func checkBackScript(table, code, name, version, infoid, script string, alone bo
 				}
 			}
 		}
-	}else{
+	} else {
 		for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
 			for _, vc := range vc1 {
 				tmp := ju.DeepCopy(doc).(map[string]interface{})

+ 27 - 3
src/jy/clear/totimestamp.go

@@ -4,11 +4,12 @@ package clear
 import (
 	"fmt"
 	"regexp"
+	"strconv"
 	"strings"
 	"time"
 )
 
-var reg, regA, regB, regC, regAfter *regexp.Regexp
+var reg, regA, regB, regC, regD, regAfter *regexp.Regexp
 
 const (
 	T = 365 * 86400
@@ -22,10 +23,12 @@ var item = map[string]string{
 }
 
 func init() { // Compiles the package-level regexes at package initialization; Compile errors are discarded, so an invalid pattern would leave its variable nil.
+	// Sample of the Chinese-numeral date-time handled via regD: 二〇一五年十一月四日十五时 (2015-11-04, 15 o'clock)
 	reg, _ = regexp.Compile(`\d+`) // one or more ASCII digits
 	regA, _ = regexp.Compile(`[一|二|三|四|五|六|七|八|九|十|零|〇|1|2|3|4|5|6|7|8|9|0]`) // a single Chinese numeral or ASCII digit; NOTE(review): inside [...] the `|` is a literal member, so a bare "|" matches too
 	regB, _ = regexp.Compile(`\d+年\d+月\d+日((上|下)午)?\s*\d+[::时]\d+分?[-—]\d+[::时]\d+时?分?`) // numeric 年/月/日 date, optional 上午/下午, then a start-end time range
 	regC, _ = regexp.Compile(`\s*\d+[::时]\d+分?[-—]`) // optional whitespace plus the start time of a range, up to and including the dash
+	regD, _ = regexp.Compile(`([一|二|三|四|五|六|七|八|九|十|零|〇]{4})年([一|二|三|四|五|六|七|八|九|十]{1,2})月([一|二|三|四|五|六|七|八|九|十]{1,3})日([一|二|三|四|五|六|七|八|九|十]{1,3})时`) // Chinese-numeral 年/月/日/时 with four capture groups: year (4 numerals), month, day, hour
 	regAfter, _ = regexp.Compile(`(下午D?\d{1,2}[时|:|:|h|H])`) // 下午, an optional literal "D", 1-2 digits, then an hour marker; NOTE(review): `D?` may have been meant as `\D?` — confirm
 }
 
@@ -41,6 +44,27 @@ func init() {
 */
 func ObjToTimestamp(data []interface{}) []interface{} {
 	tmp := fmt.Sprint(data[0])
+	//处理类似:二〇一五年十一月四日十五时
+	cht := regD.FindStringSubmatch(tmp)
+	if len(cht) == 5 {
+		y := chineseToNumber(cht[1])
+		m := 0
+		for _, v := range []rune(cht[2]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			m += it
+		}
+		d := 0
+		for _, v := range []rune(cht[3]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			d += it
+		}
+		M := 0
+		for _, v := range []rune(cht[4]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			M += it
+		}
+		tmp = fmt.Sprintf("%s年%d月%d日%d时", y, m, d, M)
+	}
 	//2016年12月7日上午9:00-11:30时 时间范围处理 取后面的时间
 	if regB.MatchString(tmp) {
 		tmp = regC.ReplaceAllString(tmp, "")
@@ -97,8 +121,8 @@ func ObjToTimestamp(data []interface{}) []interface{} {
 		t, _ := time.ParseInLocation("2006-01-02 15:04", timestr, time.Local)
 		timestamp = t.Unix()
 	}
-	if timestamp < 0 || timestamp > (time.Now().Unix()+T) {
-		data[0] = 0
+	if timestamp <= 0 || timestamp > (time.Now().Unix()+T) {
+		data[0] = ""
 	} else {
 		if addreptime > 0 {
 			timestamp += addreptime

+ 12 - 0
src/jy/extract/extractInit.go

@@ -214,6 +214,8 @@ func (e *ExtractTask) InitRulePres() {
 				tmp := strings.Split(rinfo.RuleText, "__")
 				var pattern string
 				if strings.Contains(tmp[0], "\\u") {
+					tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+					tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 					pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 				} else {
 					pattern = tmp[0]
@@ -251,6 +253,8 @@ func (e *ExtractTask) InitRuleBacks() {
 				tmp := strings.Split(rinfo.RuleText, "__")
 				var pattern string
 				if strings.Contains(tmp[0], "\\u") {
+					tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+					tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 					pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 				} else {
 					pattern = tmp[0]
@@ -335,6 +339,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -371,6 +377,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -414,6 +422,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -485,6 +495,8 @@ func (e *ExtractTask) InitPkgCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]

+ 9 - 5
src/jy/extract/score.go

@@ -17,7 +17,7 @@ func init() {
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
 	//实例化正则
 	for _, tmp := range SoreConfig {
-		log.Println(tmp)
+		//log.Println(tmp)
 		if tmp["type"] == "string" {
 			if positions, ok := tmp["position"].([]interface{}); ok {
 				for _, position := range positions {
@@ -25,8 +25,10 @@ func init() {
 						qu.Try(func() {
 							strReq, _ := p["regstr"].(string)
 							if strings.Contains(strReq, "\\u") {
-								pattern, _ := strconv.Unquote(`"` + strReq + `"`)
-								p["regexp"] = regexp.MustCompile(pattern)
+								strReq = strings.Replace(strReq, "\\", "\\\\", -1)
+								strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
+								strReq, _ = strconv.Unquote(`"` + strReq + `"`)
+								p["regexp"] = regexp.MustCompile(strReq)
 							} else {
 								p["regexp"] = regexp.MustCompile(strReq)
 							}
@@ -42,8 +44,10 @@ func init() {
 						qu.Try(func() {
 							strReq, _ := p["regstr"].(string)
 							if strings.Contains(strReq, "\\u") {
-								pattern, _ := strconv.Unquote(`"` + strReq + `"`)
-								p["regexp"] = regexp.MustCompile(pattern)
+								strReq = strings.Replace(strReq, "\\", "\\\\", -1)
+								strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
+								strReq, _ = strconv.Unquote(`"` + strReq + `"`)
+								p["regexp"] = regexp.MustCompile(strReq)
 							} else {
 								p["regexp"] = regexp.MustCompile(strReq)
 							}

+ 3 - 3
src/jy/util/config.go

@@ -12,9 +12,9 @@ import (
 var FormatTextMap map[string][]map[string]interface{}
 
 func init() {
-	//loadFormatText()
-	//LoadTagDb("./res/tagdb")
-	//LoadTagDb("./res/blocktagdb")
+	loadFormatText()
+	LoadTagDb("./res/tagdb")
+	LoadTagDb("./res/blocktagdb")
 }
 
 //加载格式化正文配置

+ 1 - 1
src/res/fieldscore.json

@@ -172,7 +172,7 @@
         "position": [
             {
                 "describe": "全为中文汉字或符号",
-                "regstr": "^[\\u4e00-\\u9fa5()()【】\\\\[\\\\],,。、::《》]+$",
+                "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",
                 "score": -20
             },
             {