瀏覽代碼

Merge branch 'master' of http://192.168.3.207:10080/qmx/jy-data-extract

unknown 6 年之前
父節點
當前提交
e262b5c74e
共有 5 個文件被更改,包括 53 次插入和 16 次刪除
  1. 1 1
      src/jy/clear/cutspace.go
  2. 36 9
      src/jy/extract/extpackage.go
  3. 6 0
      src/jy/extract/extract.go
  4. 4 0
      src/jy/extract/score.go
  5. 6 6
      src/res/fieldscore.json

+ 1 - 1
src/jy/clear/cutspace.go

@@ -9,7 +9,7 @@ import (
 var cutSpace *regexp.Regexp
 var cutAllSpace *regexp.Regexp
 var catSymbol *regexp.Regexp
-var spaces = []string{"\u3000", "\u2003", "\u00a0"}
+var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}
 
 func init() {
 	cutSpace, _ = regexp.Compile(`^\s*|\s*$`)

+ 36 - 9
src/jy/extract/extpackage.go

@@ -5,6 +5,7 @@ import (
 	"jy/clear"
 	ju "jy/util"
 	qu "qfw/util"
+	"reflect"
 )
 
 //处理分包信息
@@ -27,7 +28,14 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 						for key, val := range pkg.TableKV.Kv {
 							if tag.Key == key {
 								clearmap[k] = false
-								sonJobResult[k] = val
+								var tmpval interface{}
+								if len(e.ClearFn[k]) > 0 {
+									data := clear.DoClearFn(e.ClearFn[k], []interface{}{val, j.Content})
+									tmpval = data[0]
+								} else {
+									tmpval = val
+								}
+								sonJobResult[k] = tmpval
 								if packagenum == 1 {
 									field := &ju.ExtField{
 										Field:     k,
@@ -36,7 +44,7 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 										Type:      "table",
 										MatchType: "tag_string",
 										ExtFrom:   "package",
-										Value:     val,
+										Value:     tmpval,
 										Score:     0,
 									}
 									j.Result[k] = append(j.Result[k], field)
@@ -49,7 +57,14 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 						for key, val := range pkg.ColonKV.Kv {
 							if tag.Key == key {
 								clearmap[k] = true
-								sonJobResult[k] = val
+								var tmpval interface{}
+								if len(e.ClearFn[k]) > 0 {
+									data := clear.DoClearFn(e.ClearFn[k], []interface{}{val, j.Content})
+									tmpval = data[0]
+								} else {
+									tmpval = val
+								}
+								sonJobResult[k] = tmpval
 								if packagenum == 1 {
 									field := &ju.ExtField{
 										Field:     k,
@@ -58,7 +73,7 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 										Type:      "colon",
 										MatchType: "tag_string",
 										ExtFrom:   "package",
-										Value:     val,
+										Value:     tmpval,
 										Score:     0,
 									}
 									j.Result[k] = append(j.Result[k], field)
@@ -71,7 +86,14 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 						for key, val := range pkg.SpaceKV.Kv {
 							if tag.Key == key {
 								clearmap[k] = true
-								sonJobResult[k] = val
+								var tmpval interface{}
+								if len(e.ClearFn[k]) > 0 {
+									data := clear.DoClearFn(e.ClearFn[k], []interface{}{val, j.Content})
+									tmpval = data[0]
+								} else {
+									tmpval = val
+								}
+								sonJobResult[k] = tmpval
 								if packagenum == 1 {
 									field := &ju.ExtField{
 										Field:     k,
@@ -80,7 +102,7 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 										Type:      "space",
 										MatchType: "tag_string",
 										ExtFrom:   "package",
-										Value:     val,
+										Value:     tmpval,
 										Score:     0,
 									}
 									j.Result[k] = append(j.Result[k], field)
@@ -116,7 +138,7 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 func extRegBackPack(j *ju.Job, e *ExtractTask) {
 	//正则清理
 	for _, rc := range e.RuleCores {
-		for _, pack := range j.PackageInfo {
+		for pk, pack := range j.PackageInfo {
 			clear, _ := pack["clear"].(map[string]interface{})
 			for k, val := range pack {
 				if b, ok := clear[k].(bool); ok && b {
@@ -131,6 +153,7 @@ func extRegBackPack(j *ju.Job, e *ExtractTask) {
 					}
 				}
 			}
+			j.PackageInfo[pk] = pack
 		}
 	}
 	//通用正则清理
@@ -150,8 +173,12 @@ func extRegBackPack(j *ju.Job, e *ExtractTask) {
 	//函数清理
 	for _, pack := range j.PackageInfo {
 		for key, val := range pack {
-			data := clear.DoClearFn(e.ClearFn[key], []interface{}{val, j.Content})
-			pack[key] = data[0]
+			if reflect.TypeOf(val).String() == "float64" || reflect.TypeOf(val).String() == "int64" {
+				continue
+			} else {
+				data := clear.DoClearFn(e.ClearFn[key], []interface{}{val, j.Content})
+				pack[key] = data[0]
+			}
 		}
 	}
 }

+ 6 - 0
src/jy/extract/extract.go

@@ -724,10 +724,16 @@ func AnalysisSaveResult(j *ju.Job, task *TaskInfo) {
 		fieldValue := map[string][]interface{}{}
 		if iscore { //走打分
 			for _, v := range val {
+				if len(fmt.Sprint(v.Value)) < 1 {
+					continue //去除空串
+				}
 				fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
 			}
 		} else { //不走打分,按出现频次
 			for _, v := range val {
+				if len(fmt.Sprint(v.Value)) < 1 {
+					continue //去除空串
+				}
 				if fieldValue[fmt.Sprint(v.Value)] == nil {
 					fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
 				} else {

+ 4 - 0
src/jy/extract/score.go

@@ -2,6 +2,7 @@
 package extract
 
 import (
+	"fmt"
 	ju "jy/util"
 	"log"
 	qu "qfw/util"
@@ -41,6 +42,9 @@ func ScoreFields(result map[string][]*ju.ExtField) map[string][]*ju.ExtField {
 		extractype := SoreConfig["extractype"]
 		fieldtype := scoreRule["type"]
 		for _, v := range tmps {
+			if len(fmt.Sprint(v.Value)) < 1 {
+				continue //空串跳过
+			}
 			//类型打分
 			if v.ExtFrom == "title" {
 				v.Score += qu.IntAll(extractype["title"])

+ 6 - 6
src/res/fieldscore.json

@@ -1,6 +1,6 @@
 {
     "extractype": {
-         "describe": "抽取类型打分",
+        "describe": "抽取类型打分",
         "title": 3,
         "table": 5,
         "colon": 3,
@@ -17,7 +17,7 @@
             },
             {
                 "describe": "以*结尾",
-                "regstr": "(项目|工程|采购)$",
+                "regstr": ".{2,100}(项目|工程|采购)$",
                 "score": 3
             }
         ],
@@ -39,7 +39,7 @@
         "position": [
             {
                 "describe": "以*结尾",
-                "regstr": "(委员会|办公室|局|中心|协会|公司|政府|大学|学校|医院|集团|银行)$",
+                "regstr": ".{2,100}(委员会|办公室|局|中心|协会|公司|政府|大学|学校|医院|集团|银行)$",
                 "score": 3
             },
             {
@@ -66,7 +66,7 @@
         "position": [
             {
                 "describe": "以*结尾",
-                "regstr": "(公司|合作社)$",
+                "regstr": ".{2,100}(公司|合作社)$",
                 "score": 3
             }
         ],
@@ -93,7 +93,7 @@
             },
             {
                 "describe": "以*结尾",
-                "regstr": "(公司|事务所)$",
+                "regstr": ".{2,100}(公司|事务所)$",
                 "score": 1
             }
         ],
@@ -115,7 +115,7 @@
         "position": [
             {
                 "describe": "以*结尾",
-                "regstr": "(工|老师|经理|女士|先生|主任|科长)$",
+                "regstr": ".{2,100}(工|老师|经理|女士|先生|主任|科长)$",
                 "score": 1
             }
         ],