Browse Source

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

maxiaoshan 6 years ago
parent
commit
58ca10adf6

+ 9 - 3
src/jy/clear/specialsymbols.go

@@ -354,8 +354,9 @@ func AnotherRemoveStart(text []rune) []rune {
 			removeLength = firstOpposite + 1
 			nb = nb + removeLength
 		}
-		lastOpposite := pairedIndex[length-1]                            //最后一个符号
-		if lastOpposite > 0 && SymmetricDelete[string(text[length-1])] { //删除结尾由对称符号包括的内容:新城地(区公共厕)所及附属(改造发斯蒂芬)-->新城地(区公共厕)所及附属
+		lastOpposite := pairedIndex[length-1] //最后一个符号
+		lenew := len(text)
+		if lastOpposite > 0 && lenew > 0 && SymmetricDelete[string(text[lenew-1])] { //删除结尾由对称符号包括的内容:新城地(区公共厕)所及附属(改造发斯蒂芬)-->新城地(区公共厕)所及附属
 			//na = length - lastOpposite
 			text = text[:lastOpposite-removeLength]
 		} else if surplusMax == length-1 { //没有对称,只删除最后一个反符号
@@ -385,7 +386,12 @@ func AnotherRemoveStart(text []rune) []rune {
 		}
 		if len(delstrarr) > 0 {
 			for _, rep := range delstrarr {
-				if strings.HasPrefix(rep, string(text[0])) || strings.HasSuffix(rep, string(text[len(text)-1])) { //要清理的内容是开头和结尾部分,清理
+				lenew := len(text)
+				if lenew > 0 && strings.HasPrefix(rep, string(text[0])) { //要清理的内容是开头和结尾部分,清理
+					text = []rune(strings.Replace(string(text), rep, "", -1))
+				}
+				lenew = len(text)
+				if lenew > 0 && strings.HasSuffix(rep, string(text[lenew-1])) {
 					text = []rune(strings.Replace(string(text), rep, "", -1))
 				}
 			}

+ 39 - 41
src/jy/extract/extract.go

@@ -443,9 +443,6 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 		}
 		//函数清理
 		for key, val := range j.Result {
-			tmpExtFields := make([]*ju.ExtField, 0)
-			tmpWeight := -999 //记录最大权重
-			tmpIndex := -999  //记录最大权重下标
 			for _, v := range val {
 				lockclear.Lock()
 				cfn := e.ClearFn[key]
@@ -470,21 +467,24 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				lockclear.Unlock()
 			}
 			//项目编号,采购单位权重清理
-			if (key == "projectcode" || key == "buyer") && len(val) > 1 {
-				for i, v := range val {
-					if v.Weight == 0 {
-						tmpExtFields = append(tmpExtFields, v)
-						continue
-					} else if v.Weight > tmpWeight {
-						tmpWeight = v.Weight
-						tmpIndex = i
-					}
-				}
-				if tmpIndex != -999 {
-					tmpExtFields = append(tmpExtFields, val[tmpIndex])
-					j.Result[key] = tmpExtFields
-				}
-			}
+			//          tmpExtFields := make([]*ju.ExtField, 0)
+			//			tmpWeight := -999 //记录最大权重
+			//			tmpIndex := -999  //记录最大权重下标
+			//			if (key == "projectcode" || key == "buyer") && len(val) > 1 {
+			//				for i, v := range val {
+			//					if v.Weight == 0 {
+			//						tmpExtFields = append(tmpExtFields, v)
+			//						continue
+			//					} else if v.Weight > tmpWeight {
+			//						tmpWeight = v.Weight
+			//						tmpIndex = i
+			//					}
+			//				}
+			//				if tmpIndex != -999 {
+			//					tmpExtFields = append(tmpExtFields, val[tmpIndex])
+			//					j.Result[key] = tmpExtFields
+			//				}
+			//			}
 		}
 		PackageDetail(j, e) //处理分包信息
 		//		bs, _ := json.Marshal(j.Result)
@@ -644,37 +644,35 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 	if !b {
 		return
 	}
+	kvMap := getKvByLuaFields(extfrom, j, in, et.Tag)
 	if in.IsLua {
 		lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
-		lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
+		lua.KvMap = kvMap
 		lua.Block = j.Block
 		extinfo := lua.RunScript("core")
-		for k, v := range extinfo {
-			if k == in.Field {
-				if j.Result[k] == nil {
-					j.Result[k] = [](*ju.ExtField){}
-				}
-				if tmps, ok := v.([]map[string]interface{}); ok {
-					for _, tmp := range tmps {
-						field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
-						if tmp["blocktag"] != nil {
-							btag := make(map[string]string)
-							for k := range tmp["blocktag"].(map[string]bool) {
-								if TagConfigDesc[k] != "" {
-									btag[k] = TagConfigDesc[k]
-								}
-							}
-							field.BlockTag = btag
-						}
-						j.Result[k] = append(j.Result[k], field)
+		if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
+			kvMap[in.Field] = tmps
+		}
+	}
+	if len(kvMap) > 0 {
+		if j.Result[in.Field] == nil {
+			j.Result[in.Field] = [](*ju.ExtField){}
+		}
+		for _, tmp := range kvMap[in.Field] {
+			field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: in.Field, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+			if tmp["blocktag"] != nil {
+				btag := make(map[string]string)
+				for k := range tmp["blocktag"].(map[string]bool) {
+					if TagConfigDesc[k] != "" {
+						btag[k] = TagConfigDesc[k]
 					}
 				}
+				field.BlockTag = btag
 			}
+			j.Result[in.Field] = append(j.Result[in.Field], field)
 		}
-		if len(extinfo) > 0 {
-			AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
-		}
-	} else {
+		AddExtLog("extract", j.SourceMid, nil, kvMap, in, et.TaskInfo) //抽取日志
+	} else if !in.IsLua {
 		//全文正则
 		//text := qu.ObjToString(doc[extfrom])
 		//if in.Field != "" {

+ 14 - 28
src/jy/extract/score.go

@@ -136,7 +136,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 						qz = TagConfig[key][field]
 					}
 				}
-				tmps[tmpsindex].Score += BlockScore * qz //乘以权重系数
+				tmps[tmpsindex].Score += ju.FloatFormat(BlockScore*qz, 4) //乘以权重系数
 				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
 			} else {
 				//没有段标签,走其他
@@ -151,39 +151,24 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 					tmps[tmpsindex].Score += fieldscore["title"]
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["title"]})
 				}
-				if strings.Contains(tmpsvalue.Type, "colon") {
-					tmps[tmpsindex].Score += fieldscore["colon"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["colon"]})
-				} else if strings.Contains(tmpsvalue.Type, "space") {
-					tmps[tmpsindex].Score += fieldscore["space"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["space"]})
-				} else if strings.Contains(tmpsvalue.Type, "table") {
-					tmps[tmpsindex].Score += fieldscore["table"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["table"]})
-				} else if strings.Contains(tmpsvalue.Type, "regexp") {
-					tmps[tmpsindex].Score += fieldscore["regexp"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore["regexp"]})
-				}
+				tmps[tmpsindex].Score += fieldscore[tmpsvalue.Type]
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: fieldscore[tmpsvalue.Type]})
 			} else { //通用抽取属性打分配置
 				if tmpsvalue.ExtFrom == "title" { //标题打分初始化
 					tmps[tmpsindex].Score += CommonScore["title"]
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["title"]})
 				}
-				if strings.Contains(tmpsvalue.Type, "colon") {
-					tmps[tmpsindex].Score += CommonScore["colon"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["colon"]})
-				} else if strings.Contains(tmpsvalue.Type, "space") {
-					tmps[tmpsindex].Score += CommonScore["space"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["space"]})
-				} else if strings.Contains(tmpsvalue.Type, "table") {
-					tmps[tmpsindex].Score += CommonScore["table"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["table"]})
-				} else if strings.Contains(tmpsvalue.Type, "regexp") {
-					tmps[tmpsindex].Score += CommonScore["regexp"]
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore["regexp"]})
-				}
+				tmps[tmpsindex].Score += CommonScore[tmpsvalue.Type]
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: tmpsvalue.Type, Code: "fieldscore." + tmpsvalue.Type, RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: CommonScore[tmpsvalue.Type]})
+			}
+			//kv权重打分
+			if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+				weightscore := ju.FloatFormat(1+float64(tmps[tmpsindex].Weight)/1000, 4)
+				tmps[tmpsindex].Score += weightscore
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
+			} else {
+				//正则权重,暂不考虑
 			}
-
 			scoreRule := SoreConfig[field]
 			if scoreRule == nil {
 				continue
@@ -324,6 +309,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				v.Score += score
 				tmps[index].ScoreItem = append(tmps[index].ScoreItem, &ju.ScoreItem{Des: "重复次数打分repeat", Code: field + ".repeat", RuleText: "repeat:" + fmt.Sprint(v.ValRepeat), ScoreFrom: "fieldscore.json." + field, Value: v.Value, Score: score})
 			}
+			v.Score = ju.FloatFormat(v.Score, 4)
 		}
 	}
 	return result

+ 3 - 0
src/jy/pretreated/colonkv.go

@@ -769,6 +769,9 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 							if strings.TrimSpace(nextval) == "" {
 								continue
 							}
+							if GetAppointTags(nextval, tagdbs).Len() > 0 || GetAppointTags(k, tagdbs).Len() > 0{
+								continue
+							}
 						}
 					}
 					kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: nextval, Weight: tk.Weight})

+ 10 - 0
src/jy/util/util.go

@@ -4,6 +4,7 @@ import (
 	"fmt"
 	. "jy/mongodbutil"
 	qu "qfw/util"
+	"strconv"
 
 	. "gopkg.in/mgo.v2/bson"
 )
@@ -137,3 +138,12 @@ func InitBrand() {
 	BrandGet = &DFA{}
 	BrandGet.AddWord(BrandConfig...)
 }
+
+func FloatFormat(val float64, length int) float64 { // FloatFormat rounds val to `length` digits after the decimal point by formatting it as fixed-point text and parsing that text back into a float64.
+	tmp, err := strconv.ParseFloat(strconv.FormatFloat(val, 'f', length, 64), 64) // 'f' = plain decimal notation without exponent; `length` is the precision passed to FormatFloat
+	if err != nil {
+		return 0 // a parse failure is not reported to the caller; 0 is returned instead
+	} else {
+		return tmp // the value re-parsed from the formatted text
+	}
+}

+ 4 - 4
src/res/fieldscore.json

@@ -346,22 +346,22 @@
                 "describe": "[gt,lte,score]",
                 "range": [
                     0,
-                    3,
+                    4,
                     -5
                 ]
             },
             {
                 "describe": "[gt,lte,score]",
                 "range": [
-                    3,
-                    30,
+                    4,
+                    35,
                     3
                 ]
             },
             {
                 "describe": "[gt,∞,score]",
                 "range": [
-                    30,
+                    35,
                     -1,
                     -1
                 ]