瀏覽代碼

段标签、打分

fengweiqiang 6 年之前
父節點
當前提交
00b4ba5cfa
共有 7 個文件被更改,包括 613 次插入和 205 次删除
  1. 2 1
      src/config.json
  2. 105 29
      src/jy/extract/extract.go
  3. 285 113
      src/jy/extract/score.go
  4. 34 8
      src/jy/util/article.go
  5. 16 0
      src/jy/util/script.go
  6. 66 54
      src/res/fieldscore.json
  7. 105 0
      src/res/tagscore.json

+ 2 - 1
src/config.json

@@ -39,5 +39,6 @@
         "LaunchTemplateId8": "lt-2zeidqiydzusn7hw7lt8",
         "VSwitchId": "vsw-2ze23am2bl9e3v6rnyhfb"
     },
-    "filelength": 100000
+    "filelength": 100000,
+    "saveblock": true
 } 

+ 105 - 29
src/jy/extract/extract.go

@@ -22,13 +22,13 @@ import (
 )
 
 var (
-	lock          sync.RWMutex
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 200                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	lock    sync.RWMutex
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 200                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -274,7 +274,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 		}
 	}
 	qu.Try(func() {
-		pretreated.AnalyStart(j)
+		pretreated.AnalyStart(j) //job.Block分块
 		if isextFile {
 			pretreated.AnalyStart(jf)
 		}
@@ -331,9 +331,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 	qu.Try(func() {
 		doc := *j.Data
 		//全局前置规则,结果覆盖doc属性
-		for _, v := range e.RulePres {
-			doc = ExtRegPre(doc, j, v, e.TaskInfo)
-		}
+		//for _, v := range e.RulePres {
+		//	doc = ExtRegPre(doc, j, v, e.TaskInfo)
+		//}
 		if j.CategorySecond == "" {
 			//抽取规则
 			tmprules := map[string][]*RuleCore{}
@@ -349,10 +349,10 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if !ju.Logic(vc.LuaLogic, tmp) {
 						continue
 					}
-					//抽取-前置规则
-					for _, v := range vc.RulePres {
-						tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
-					}
+					////抽取-前置规则
+					//for _, v := range vc.RulePres {
+					//	tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
+					//}
 					// log.Debug("抽取-前置规则", tmp)
 
 					//抽取-规则
@@ -364,7 +364,14 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					//项目名称未能抽取到,标题来凑
 					if vc.Field == "projectname" {
 						if len(j.Result[vc.Field]) < 1 {
-							j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
+							if tmp["blocktag"] != nil {
+								j.Result[vc.Field] = append(j.Result[vc.Field],
+									&ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
+							} else {
+								j.Result[vc.Field] = append(j.Result[vc.Field],
+									&ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
+							}
+							//j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
 						}
 					}
 
@@ -398,7 +405,12 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					//项目名称未能抽取到,标题来凑
 					if vc.Field == "projectname" {
 						if len(j.Result[vc.Field]) < 1 {
-							j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
+							if tmp["blocktag"] != nil {
+								j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
+							} else {
+								j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
+							}
+							//j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
 						}
 					}
 
@@ -651,8 +663,13 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
-						j.Result[k] = append(j.Result[k],
-							&ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
+						if tmp["blocktag"] != nil {
+							j.Result[k] = append(j.Result[k],
+								&ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
+						} else {
+							j.Result[k] = append(j.Result[k],
+								&ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0})
+						}
 					}
 				}
 			}
@@ -680,9 +697,12 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 		lock.Lock()
 		tags := t[field] //获取对应标签库
 		lock.Unlock()
+		if tags == nil {
+			continue
+		}
 		for _, bl := range j.Block {
 			//冒号kv
-			if bl.ColonKV != nil {
+			if bl.ColonKV != nil && len(bl.ColonKV.Kvs) > 0 {
 				kvs := bl.ColonKV.Kvs
 				kvs2 := bl.ColonKV.Kvs_2
 				// log.Debug("ColonKV1", kvs)
@@ -701,6 +721,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 										"value":     text,
 										"type":      "colon1",
 										"matchtype": "tag_string",
+										"blocktag":  bl.Tag,
 									})
 								}
 								break
@@ -717,6 +738,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 										"value":     text,
 										"type":      "colon1",
 										"matchtype": "tag_regexp",
+										"blocktag":  bl.Tag,
 									})
 								}
 								break
@@ -736,6 +758,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 										"value":     text,
 										"type":      "colon2",
 										"matchtype": "tag_string",
+										"blocktag":  bl.Tag,
 									})
 								}
 								break
@@ -752,6 +775,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 										"value":     text,
 										"type":      "colon2",
 										"matchtype": "tag_regexp",
+										"blocktag":  bl.Tag,
 									})
 								}
 								break
@@ -761,7 +785,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 				}
 			}
 			//空格kv
-			if bl.SpaceKV != nil {
+			if bl.SpaceKV != nil && len(bl.SpaceKV.Kvs) > 0 {
 				kvs := bl.SpaceKV.Kvs
 				// log.Debug("SpaceKV", kvs)
 				for _, tag := range tags {
@@ -778,6 +802,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 										"value":     text,
 										"type":      "space",
 										"matchtype": "tag_string",
+										"blocktag":  bl.Tag,
 									})
 								}
 								break
@@ -794,6 +819,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 										"value":     text,
 										"type":      "space",
 										"matchtype": "tag_regexp",
+										"blocktag":  bl.Tag,
 									})
 								}
 								break
@@ -803,7 +829,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 				}
 			}
 			//表格kv
-			if bl.TableKV != nil {
+			if bl.TableKV != nil && len(bl.TableKV.Kv) > 0 {
 				tkv := bl.TableKV
 				// log.Debug("tkv", tkv)
 				for k, v := range tkv.Kv {
@@ -823,6 +849,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 								"value":     v,
 								"type":      "table",
 								"matchtype": "tag_string",
+								"blocktag":  bl.Tag,
 							})
 						} else { //涉及其他待处理
 							// log.Debug(tags)
@@ -862,10 +889,12 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
 					tmps = append(tmps, tmp)
 					extinfo[k] = tmps
 					if val != "" {
-						if j.Result[v.Field] == nil {
-							j.Result[k] = [](*ju.ExtField){}
+						if tmp["blocktag"] != nil {
+							j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+						} else {
+							j.Result[k] = append(j.Result[k], &ju.ExtField{nil, k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
 						}
-						j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+						//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
 					}
 				}
 			}
@@ -897,7 +926,11 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
-			j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+			if tmp["blocktag"] != nil{
+				j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+			}else {
+				j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{nil, v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+			}
 		}
 	}
 	return extinfo
@@ -917,7 +950,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 			if tmps, ok := v.([]map[string]interface{}); ok {
 				j.Result[k] = [](*ju.ExtField){}
 				for _, tmp := range tmps {
-					j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
+					j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
 				}
 			}
 		}
@@ -1145,6 +1178,24 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			}
 			// log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
 		}
+		//分包和标签
+		if ju.Config["saveblock"].(bool) {
+			blocks := make([]ju.BlockAndTag, 0)
+			for _, v := range j.Block {
+				xx,_:=json.Marshal(v)
+				tmpblock := new(ju.TmpBlock)
+				err:= json.Unmarshal(xx,&tmpblock)
+				if err != nil{
+					if v.BPackage!= nil{
+						bpb, _ := json.Marshal(v.BPackage)
+						tmpblock.BPackage = string(bpb)
+					}
+					tmpblock = rangeBlockToJson(v,*tmpblock)
+				}
+				blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
+			}
+			tmp["blocks"] = blocks
+		}
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				for field, _ := range e.Fields {
@@ -1191,7 +1242,32 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		log.Debug("AnalysisSaveResult err", err)
 	})
 }
-
+func rangeBlockToJson(j *ju.Block,tmpblock ju.TmpBlock)(b *ju.TmpBlock){ // rangeBlockToJson fills tmpblock (received by value, so a local copy) from j and returns a pointer to that copy.
+	if j == nil{
+		return nil // no source block, nothing to convert
+	}
+	if len(j.Block)>0{
+		for i,v := range j.Block{ // convert every child block recursively
+			rangetmp := new(ju.TmpBlock)
+			vb,_:=json.Marshal(v) // the marshal error is discarded
+			json.Unmarshal(vb,&rangetmp) // the unmarshal error is discarded; rangetmp seeds the child conversion
+			tmpblock.Block[i]=rangeBlockToJson(v,*rangetmp) // NOTE(review): assumes tmpblock.Block already holds at least len(j.Block) entries, otherwise this index panics - confirm against the caller
+		}
+	}
+	if j.ColonKV!= nil {
+		cb,_ := json.Marshal(j.ColonKV)
+		tmpblock.ColonKV = string(cb) // colon KV is stored as its JSON text
+	}
+	if j.SpaceKV != nil{
+		sb,_ := json.Marshal(j.SpaceKV)
+		tmpblock.SpaceKV = string(sb) // space KV is stored as its JSON text
+	}
+	if j.TableKV != nil{
+		tb,_ := json.Marshal(j.TableKV)
+		tmpblock.TableKV = string(tb) // table KV is stored as its JSON text
+	}
+	return &tmpblock
+}
 //去重冗余字段
 func delFiled(k string) bool {
 	return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo"
@@ -1286,7 +1362,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 285 - 113
src/jy/extract/score.go

@@ -9,17 +9,39 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"unicode/utf8"
 )
 
 var SoreConfig map[string]map[string]interface{}
+var TagConfig map[string]map[string]float64
 
 func init() {
+	qu.ReadConfig("./res/tagscore.json",&TagConfig)
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
 	//实例化正则
 	for _, tmp := range SoreConfig {
 		//log.Println(tmp)
 		if tmp["type"] == "string" {
-			if positions, ok := tmp["position"].([]interface{}); ok {
+			if positions, ok := tmp["positivewords"].([]interface{}); ok {
+				for _, position := range positions {
+					if p, ok := position.(map[string]interface{}); ok {
+						qu.Try(func() {
+							strReq, _ := p["regstr"].(string)
+							if strings.Contains(strReq, "\\u") {
+								strReq = strings.Replace(strReq, "\\", "\\\\", -1)
+								strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
+								strReq, _ = strconv.Unquote(`"` + strReq + `"`)
+								p["regexp"] = regexp.MustCompile(strReq)
+							} else {
+								p["regexp"] = regexp.MustCompile(strReq)
+							}
+						}, func(err interface{}) {
+							log.Println(err)
+						})
+					}
+				}
+			}
+			if positions, ok := tmp["negativewords"].([]interface{}); ok {
 				for _, position := range positions {
 					if p, ok := position.(map[string]interface{}); ok {
 						qu.Try(func() {
@@ -64,134 +86,284 @@ func init() {
 //结果打分
 func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 	result := j.Result
-	qu.Try(func() {
-		//打分
-		for field, tmps := range result {
+	qu.Catch()
+	for field, tmps := range result {
+		for tmpsindex, tmpsvalue := range tmps {
+			//是否有段标签
+			if len(tmpsvalue.BlockTag) > 0 {
+				//有标签段
+				var qz float64 = 0.0 //取权重最高的
+				for key := range tmpsvalue.BlockTag {
+					//key = "其他"//TODO 测试用
+					if TagConfig[key][field] > qz {
+						qz = TagConfig[key][field]
+					}
+				}
+				tmps[tmpsindex].Score += 2 * qz //乘以权重系数
+			} else {
+				//没有段标签,走其他
+				qz := TagConfig["其他"][field]
+				tmps[tmpsindex].Score += 2 * qz //乘以权重系数
+			}
+
+			//是否有kv值
+			if strings.Contains(tmpsvalue.Type, "colon") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
+			} else if strings.Contains(tmpsvalue.Type, "space") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
+			} else if strings.Contains(tmpsvalue.Type, "table") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
+			}
+
+			//正则
+			if strings.Contains(tmpsvalue.Type, "regexp") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
+			}
+
 			scoreRule := SoreConfig[field]
 			if scoreRule == nil {
 				continue
 			}
-			extractype := SoreConfig["extractype"]
-			fieldtype := scoreRule["type"]
-			for _, v := range tmps {
-				if len(fmt.Sprint(v.Value)) < 1 {
-					continue //空串跳过
-				}
-				//长度超过100个字,直接负分
-				vlen := len([]rune(qu.ObjToString(v.Value)))
-				if vlen > 100 && field != "projectscope" {
-					v.Score = -1
-				} else {
-					//类型打分
-					if v.ExtFrom == "title" {
-						v.Score += qu.IntAll(extractype["title"])
-					} else {
-						if strings.Contains(v.Type, "table") {
-							v.Score += qu.IntAll(extractype["table"])
-						} else if strings.Contains(v.Type, "colon") {
-							v.Score += qu.IntAll(extractype["colon"])
-						} else if strings.Contains(v.Type, "space") {
-							v.Score += qu.IntAll(extractype["space"])
-						} else if strings.Contains(v.Type, "regexp") {
-							v.Score += qu.IntAll(extractype["regexp"])
-						} else if strings.Contains(v.Type, "winnerorder") {
-							v.Score += qu.IntAll(extractype["winnerorder"])
-						}
-					}
-					//字符型打分
-					if fieldtype == "string" {
-						//位置打分
-						if positions, ok := scoreRule["position"].([]interface{}); ok {
-							for _, position := range positions {
-								if p, ok := position.(map[string]interface{}); ok {
-									qu.Try(func() {
-										if p["regexp"] != nil {
-											reg := p["regexp"].(*regexp.Regexp)
-											if reg.MatchString(qu.ObjToString(v.Value)) {
-												v.Score += qu.IntAll(p["score"])
-											}
-										}
-									}, func(err interface{}) {
-										log.Println(err)
-									})
-								}
+			if scoreRule["type"] == "string" {
+				//1.长度打分
+				valueLen := utf8.RuneCountInString(fmt.Sprint(tmpsvalue.Value))
+				if lengths, ok := scoreRule["length"].([]interface{}); ok {
+					for _, tmp := range lengths {
+						if length, ok := tmp.(map[string]interface{}); ok {
+							min := qu.IntAll(length["min"])
+							max := qu.IntAll(length["max"])
+							scores, _ := length["score"].([]interface{})
+							if len(scores) < 3 {
+								continue
 							}
-						}
-						//长度打分
-						if lengths, ok := scoreRule["length"].([]interface{}); ok {
-							for _, tmp := range lengths {
-								if length, ok := tmp.(map[string]interface{}); ok {
-									min := qu.IntAll(length["min"])
-									max := qu.IntAll(length["max"])
-									scores, _ := length["score"].([]interface{})
-									if len(scores) < 3 {
-										continue
-									}
-									if vlen < min {
-										v.Score += qu.IntAll(scores[0])
-									} else if vlen > max {
-										v.Score += qu.IntAll(scores[2])
-									} else {
-										v.Score += qu.IntAll(scores[1])
-									}
-								}
+							if valueLen < min {
+								tmps[tmpsindex].Score += qu.Float64All(scores[0])
+							} else if valueLen > max {
+								tmps[tmpsindex].Score += qu.Float64All(scores[2])
+							} else {
+								tmps[tmpsindex].Score += qu.Float64All(scores[1])
 							}
 						}
-						//
-						if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
-							for _, winnerorder := range winnerorders {
-								if p, ok := winnerorder.(map[string]interface{}); ok {
-									qu.Try(func() {
-										if p["regexp"] != nil {
-											reg := p["regexp"].(*regexp.Regexp)
-											if reg.MatchString(qu.ObjToString(v.Value)) {
-												v.Score += qu.IntAll(p["score"])
-											}
-										}
-									}, func(err interface{}) {
-										log.Println(err)
-									})
+					}
+				}
+				//2.负面词打分
+				if positions, ok := scoreRule["negativewords"].([]interface{}); ok {
+					for _, position := range positions {
+						if p, ok := position.(map[string]interface{}); ok {
+							qu.Try(func() {
+								if p["regexp"] != nil {
+									reg := p["regexp"].(*regexp.Regexp)
+									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
+										tmps[tmpsindex].Score += qu.Float64All(p["score"])
+									}
 								}
-							}
+							}, func(err interface{}) {
+								log.Println(err)
+							})
 						}
 					}
-					//float类型打分
-					if fieldtype == "float" {
-						min := qu.IntAll(scoreRule["min"])
-						max := qu.IntAll(scoreRule["max"])
-						val := qu.IntAll(v.Value)
-						scores, _ := scoreRule["score"].([]interface{})
-						if len(scores) < 3 {
-							continue
-						}
-						if val < min && 0 < val {
-							v.Score += qu.IntAll(scores[0])
-						} else if val > max {
-							v.Score += qu.IntAll(scores[2])
-						} else if val <= max && val >= min {
-							v.Score += qu.IntAll(scores[1])
+				}
+				//3.正面词打分
+				if positions, ok := scoreRule["positivewords"].([]interface{}); ok {
+					for _, position := range positions {
+						if p, ok := position.(map[string]interface{}); ok {
+							qu.Try(func() {
+								if p["regexp"] != nil {
+									reg := p["regexp"].(*regexp.Regexp)
+									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
+										tmps[tmpsindex].Score += qu.Float64All(p["score"])
+									}
+								}
+							}, func(err interface{}) {
+								log.Println(err)
+							})
 						}
 					}
-					//decimal
-					if fieldtype == "decimal" {
-						min := qu.IntAll(scoreRule["min"])
-						max := qu.IntAll(scoreRule["max"])
-						val := qu.IntAll(v.Value)
-						scores, _ := scoreRule["score"].([]interface{})
-						if len(scores) < 3 {
-							continue
-						}
-						if val > max {
-							v.Score += qu.IntAll(scores[2])
-						} else if val <= max && val > min {
-							v.Score += qu.IntAll(scores[1])
+				}
+				//4.位置打分
+				if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
+					for _, winnerorder := range winnerorders {
+						if p, ok := winnerorder.(map[string]interface{}); ok {
+							qu.Try(func() {
+								if p["regexp"] != nil {
+									reg := p["regexp"].(*regexp.Regexp)
+									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
+										tmps[tmpsindex].Score += qu.Float64All(p["score"])
+									}
+								}
+							}, func(err interface{}) {
+								log.Println(err)
+							})
 						}
 					}
 				}
 			}
+			//5.数据范围打分
+			if scoreRule["type"] == "float"  {
+				min := qu.IntAll(scoreRule["min"])
+				max := qu.IntAll(scoreRule["max"])
+				val := qu.IntAll(tmpsvalue.Value)
+				scores, _ := scoreRule["score"].([]interface{})
+				if len(scores) < 3 {
+					continue
+				}
+				if val < min && 0 < val {
+					tmps[tmpsindex].Score += qu.Float64All(scores[0])
+				} else if val > max {
+					tmps[tmpsindex].Score += qu.Float64All(scores[2])
+				} else if val <= max && val >= min {
+					tmps[tmpsindex].Score += qu.Float64All(scores[1])
+				}
+			}
+			//其他打分配置
+			// decimal
+			if scoreRule["type"] == "decimal" {
+				min := qu.IntAll(scoreRule["min"])
+				max := qu.IntAll(scoreRule["max"])
+				val := qu.IntAll(tmpsvalue.Value)
+				scores, _ := scoreRule["score"].([]interface{})
+				if len(scores) < 3 {
+					continue
+				}
+				if val > max {
+					tmps[tmpsindex].Score += qu.Float64All(scores[2])
+				} else if val <= max && val > min {
+					tmps[tmpsindex].Score += qu.Float64All(scores[1])
+				}
+			}
 		}
-	}, func(err interface{}) {
-		log.Println("ScoreFields err", err)
-	})
+	}
+
 	return result
+	//qu.Try(func() {
+	//	//打分
+	//	for field, tmps := range result {
+	//		scoreRule := SoreConfig[field]
+	//		if scoreRule == nil {
+	//			continue
+	//		}
+			//extractype := SoreConfig["extractype"]
+			//fieldtype := scoreRule["type"]
+	//		for _, v := range tmps {
+	//			fmt.Println(v)
+				//	if len(fmt.Sprint(v.Value)) < 1 {
+				//		continue //空串跳过
+				//	}
+				//	//长度超过100个字,直接负分
+				//	vlen := len([]rune(qu.ObjToString(v.Value)))
+				//	if vlen > 100 && field != "projectscope" {
+				//		v.Score = -1
+				//	} else {
+				//		//类型打分
+				//		if v.ExtFrom == "title" {
+				//			v.Score += qu.IntAll(extractype["title"])
+				//		} else {
+				//			if strings.Contains(v.Type, "table") {
+				//				v.Score += qu.IntAll(extractype["table"])
+				//			} else if strings.Contains(v.Type, "colon") {
+				//				v.Score += qu.IntAll(extractype["colon"])
+				//			} else if strings.Contains(v.Type, "space") {
+				//				v.Score += qu.IntAll(extractype["space"])
+				//			} else if strings.Contains(v.Type, "regexp") {
+				//				v.Score += qu.IntAll(extractype["regexp"])
+				//			} else if strings.Contains(v.Type, "winnerorder") {
+				//				v.Score += qu.IntAll(extractype["winnerorder"])
+				//			}
+				//		}
+				//		//字符型打分
+				//		if fieldtype == "string" {
+				//			//位置打分
+				//			if positions, ok := scoreRule["position"].([]interface{}); ok {
+				//				for _, position := range positions {
+				//					if p, ok := position.(map[string]interface{}); ok {
+				//						qu.Try(func() {
+				//							if p["regexp"] != nil {
+				//								reg := p["regexp"].(*regexp.Regexp)
+				//								if reg.MatchString(qu.ObjToString(v.Value)) {
+				//									v.Score += qu.IntAll(p["score"])
+				//								}
+				//							}
+				//						}, func(err interface{}) {
+				//							log.Println(err)
+				//						})
+				//					}
+				//				}
+				//			}
+				//			//长度打分
+				//			if lengths, ok := scoreRule["length"].([]interface{}); ok {
+				//				for _, tmp := range lengths {
+				//					if length, ok := tmp.(map[string]interface{}); ok {
+				//						min := qu.IntAll(length["min"])
+				//						max := qu.IntAll(length["max"])
+				//						scores, _ := length["score"].([]interface{})
+				//						if len(scores) < 3 {
+				//							continue
+				//						}
+				//						if vlen < min {
+				//							v.Score += qu.IntAll(scores[0])
+				//						} else if vlen > max {
+				//							v.Score += qu.IntAll(scores[2])
+				//						} else {
+				//							v.Score += qu.IntAll(scores[1])
+				//						}
+				//					}
+				//				}
+				//			}
+				//			//
+				//			if winnerorders, ok := scoreRule["winnerorder"].([]interface{}); ok {
+				//				for _, winnerorder := range winnerorders {
+				//					if p, ok := winnerorder.(map[string]interface{}); ok {
+				//						qu.Try(func() {
+				//							if p["regexp"] != nil {
+				//								reg := p["regexp"].(*regexp.Regexp)
+				//								if reg.MatchString(qu.ObjToString(v.Value)) {
+				//									v.Score += qu.IntAll(p["score"])
+				//								}
+				//							}
+				//						}, func(err interface{}) {
+				//							log.Println(err)
+				//						})
+				//					}
+				//				}
+				//			}
+				//		}
+				//		//float类型打分
+				//		if fieldtype == "float" {
+				//			min := qu.IntAll(scoreRule["min"])
+				//			max := qu.IntAll(scoreRule["max"])
+				//			val := qu.IntAll(v.Value)
+				//			scores, _ := scoreRule["score"].([]interface{})
+				//			if len(scores) < 3 {
+				//				continue
+				//			}
+				//			if val < min && 0 < val {
+				//				v.Score += qu.IntAll(scores[0])
+				//			} else if val > max {
+				//				v.Score += qu.IntAll(scores[2])
+				//			} else if val <= max && val >= min {
+				//				v.Score += qu.IntAll(scores[1])
+				//			}
+				//		}
+				//		//decimal
+				//		if fieldtype == "decimal" {
+				//			min := qu.IntAll(scoreRule["min"])
+				//			max := qu.IntAll(scoreRule["max"])
+				//			val := qu.IntAll(v.Value)
+				//			scores, _ := scoreRule["score"].([]interface{})
+				//			if len(scores) < 3 {
+				//				continue
+				//			}
+				//			if val > max {
+				//				v.Score += qu.IntAll(scores[2])
+				//			} else if val <= max && val > min {
+				//				v.Score += qu.IntAll(scores[1])
+				//			}
+				//		}
+				//	}
+			//}
+		//}
+	//}, func(err interface{}) {
+	//	log.Println("ScoreFields err", err)
+	//})
+	//return result
 }

+ 34 - 8
src/jy/util/article.go

@@ -33,14 +33,15 @@ type Job struct {
 }
 
 type ExtField struct {
-	Field     string      //属性
-	Code      string      //匹配标签(字符串、正则)、正则或lua代码
-	RuleText  string      //内容
-	Type      string      //kv(细类:colon1,colon2,space,table)、正则(regexp)
-	MatchType string      //匹配类型:1:标签库类型(tag_string,tag_regexp),2:全文正则regcontent
-	ExtFrom   string      //抽取来源(title,detail)
-	Value     interface{} //抽取结果
-	Score     int         //得分
+	BlockTag  map[string]bool // block tags
+	Field     string          // field (attribute) name
+	Code      string          // matched tag (string or regexp), regexp, or lua code
+	RuleText  string          // content
+	Type      string          // kv (subtypes: colon1, colon2, space, table) or regexp
+	MatchType string          // match type: 1: tag-library type (tag_string, tag_regexp), 2: full-text regexp (regcontent)
+	ExtFrom   string          // extraction source (title, detail)
+	Value     interface{}     // extracted value
+	Score     float64         // score
 }
 
 //分块规则
@@ -68,6 +69,31 @@ type Block struct {
 	Winnerorder []map[string]interface{} //块中,中标候选人排序
 }
 
+// TmpBlock is a block whose ColonKV, TableKV, SpaceKV and BPackage fields are held as strings.
+type TmpBlock struct {
+	Tags        []Tags                   // labels applied to the block; can serve as a basis for data extraction
+	Title       string                   // block title
+	Titles      []string                 // the multiple block titles produced by splitting
+	Index       int                      // block index
+	Text        string                   // block content
+	Start       int                      // start index
+	End         int                      // end index
+	ColonKV     string                   // colon kv (the corresponding KV values that were split out)
+	TableKV     string                   // table kv (the corresponding KV values that were split out)
+	SpaceKV     string                   // space kv (the corresponding KV values that were split out)
+	BPackage    string                   // sub-package information
+	Tag         map[string]bool          // block tags
+	Block       []*TmpBlock              // child blocks
+	Category    string                   // block category
+	Winnerorder []map[string]interface{} // ranking of winning-bid candidates within the block
+}
+
+// BlockAndTag holds sub-packages and tags: a block's tag set together with the block content.
+type BlockAndTag struct {
+	Tag   map[string]bool // block tags
+	Block interface{}     // block content
+}
+
 //段落
 type Segment struct {
 	Index int    //段落索引

+ 16 - 0
src/jy/util/script.go

@@ -152,6 +152,14 @@ func MapToLuaTable(l *lua.LState, obj map[string]interface{}) *lua.LTable {
 			tab.RawSet(lua.LString(k), lua.LNumber(val))
 		} else if val, ok := v.(bool); ok {
 			tab.RawSet(lua.LString(k), lua.LBool(val))
+		} else if val ,ok := v.(map[string]bool);ok{//把map的value值为true的存储为数组
+			tb := l.NewTable()
+			var i int
+			for k2,_ := range val{
+				tb.Insert(i, lua.LString(k2))
+				i+=1
+			}
+			tab.RawSet(lua.LString(k), tb)
 		} else if val, ok := v.(map[string]interface{}); ok {
 			tab.RawSet(lua.LString(k), MapToLuaTable(l, val))
 		} else if val, ok := v.([]string); ok {
@@ -193,6 +201,14 @@ func LuaTableToMap(param *lua.LTable) map[string]interface{} {
 		if v, ok := val.(lua.LString); ok {
 			tmp[kk] = string(v)
 		} else if v, ok := val.(*lua.LTable); ok {
+			if kk == "blocktag"{//分块
+				t := map[string]bool{}
+				v.ForEach(func(value lua.LValue, value2 lua.LValue) {
+					t[value2.String()]=true
+				})
+				tmp[kk] = t
+				return
+			}
 			t := []map[string]interface{}{}
 			v.ForEach(func(k, inv lua.LValue) {
 				if vv, ok := inv.(*lua.LTable); ok {

+ 66 - 54
src/res/fieldscore.json

@@ -1,35 +1,37 @@
 {
     "extractype": {
         "describe": "抽取类型打分",
-        "title": 3,
-        "table": 5,
-        "colon": 3,
-        "space": 3,
+        "title": 4,
+        "table": 3,
+        "colon": 2,
+        "space": 2,
         "regexp": 2,
         "winnerorder": 3
     },
     "projectname": {
         "type": "string",
-        "position": [
-            {
-                "describe": "以*开头",
-                "regstr": "关于|\\[|【",
-                "score": -1
-            },
+        "positivewords": [
             {
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(项目|工程|采购)$",
                 "score": 3
             }
         ],
+        "negativewords": [
+            {
+            "describe": "以*开头",
+            "regstr": "关于|\\[|【",
+            "score": -10
+            }
+        ],
         "length": [
             {
                 "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
                 "min": 4,
                 "max": 35,
                 "score": [
-                    -6,
-                    1,
+                    -5,
+                    3,
                     -1
                 ]
             }
@@ -37,16 +39,18 @@
     },
     "buyer": {
         "type": "string",
-        "position": [
+        "positivewords": [
             {
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(委员会|办公室|幼儿园|动物园|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|中心|协会|公司|政府|初中|集团|银行|\\[大中小\\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场)$",
-                "score": 5
-            },
+                "score": 3
+            }
+        ],
+        "negativewords": [
             {
-                "describe": "包含负分",
-                "regstr": "(附件|招标失败|交易中心|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
-                "score": -20
+            "describe": "包含负分",
+            "regstr": "(附件|招标失败|交易中心|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+            "score": -10
             }
         ],
         "length": [
@@ -55,8 +59,8 @@
                 "min": 5,
                 "max": 20,
                 "score": [
-                    -6,
-                    1,
+                    -5,
+                    3,
                     -1
                 ]
             }
@@ -64,16 +68,18 @@
     },
     "winner": {
         "type": "string",
-        "position": [
+        "positivewords": [
             {
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(集团|公司|学校|中心|家具城|门诊|\\[大中小\\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$",
-                "score": 5
-            },
+                "score": 3
+            }
+        ],
+        "negativewords": [
             {
                 "describe": "包含负分",
                 "regstr": "(附件|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
-                "score": -20
+                "score": -10
             }
         ],
         "length": [
@@ -82,8 +88,8 @@
                 "min": 5,
                 "max": 20,
                 "score": [
-                    -6,
-                    1,
+                    -5,
+                    3,
                     -1
                 ]
             }
@@ -98,26 +104,27 @@
     },
     "agency": {
         "type": "string",
-        "position": [
+        "positivewords": [
             {
                 "describe": "以*中部",
                 "regstr": "^.*(工程管理|招标代理|咨询|项目管理).*$",
-                "score": 1
+                "score": 3
             },
             {
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(公司|事务所)$",
-                "score": 1
+                "score": 3
             }
         ],
+        "negativewords": [],
         "length": [
             {
                 "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
                 "min": 4,
                 "max": 30,
                 "score": [
-                    -6,
-                    1,
+                    -5,
+                    3,
                     -1
                 ]
             }
@@ -125,21 +132,22 @@
     },
     "buyerperson": {
         "type": "string",
-        "position": [
+        "positivewords": [
             {
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(工|老师|经理|女士|先生|主任|科长)$",
-                "score": 1
+                "score": 3
             }
         ],
+        "negativewords": [],
         "length": [
             {
                 "describe": "长度打分min>val:0,min<=val<=max:1,max<val:-1",
                 "min": 2,
                 "max": 6,
                 "score": [
-                    0,
-                    1,
+                    -5,
+                    3,
                     -1
                 ]
             }
@@ -147,11 +155,13 @@
     },
     "buyertel": {
         "type": "string",
-        "position": [
+        "positivewords": [
+        ],
+        "negativewords": [
             {
                 "describe": "出现中文汉字",
                 "regstr": "[\\u4e00-\\u9fa5]",
-                "score": -1
+                "score": -10
             }
         ],
         "length": [
@@ -160,8 +170,8 @@
                 "min": 7,
                 "max": 14,
                 "score": [
-                    -1,
-                    1,
+                    -5,
+                    3,
                     -1
                 ]
             }
@@ -169,21 +179,23 @@
     },
     "projectcode": {
         "type": "string",
-        "position": [
+        "positivewords": [
+        ],
+        "negativewords": [
             {
                 "describe": "全为中文汉字或符号",
                 "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",
-                "score": -20
+                "score": -10
             },
             {
                 "describe": "以两个汉字以上结束",
                 "regstr": "[\\u4e00-\\u9fa5]{2,}$",
-                "score": -20
+                "score": -10
             },
             {
                 "describe": "包含负分",
                 "regstr": "(勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,\\.。、::“”‘’\"])",
-                "score": -20
+                "score": -10
             }
         ],
         "length": [
@@ -192,9 +204,9 @@
                 "min": 3,
                 "max": 30,
                 "score": [
-                    -10,
-                    1,
-                    -10
+                    -5,
+                    3,
+                    -1
                 ]
             }
         ]
@@ -205,9 +217,9 @@
         "min": 1000,
         "max": 1000000000,
         "score": [
-            1,
-            3,
-            1
+            -3,
+            2,
+            -3
         ]
     },
     "budget": {
@@ -216,9 +228,9 @@
         "min": 1000,
         "max": 1000000000,
         "score": [
-            1,
-            3,
-            1
+            -3,
+            2,
+            -3
         ]
     },
     "supervisorrate": {
@@ -227,8 +239,8 @@
         "min": 0,
         "max": 1,
         "score": [
-            0,
-            3,
+            -3,
+            2,
             -3
         ]
     }

+ 105 - 0
src/res/tagscore.json

@@ -0,0 +1,105 @@
+{
+  "招标条件": {
+    "buyer": 1,
+    "projectname": 1,
+    "agency": 1,
+    "approvalno": 1
+  },
+  "项目概况/采购需求": {
+    "projectscope": 1,
+    "projectcode": 1,
+    "projectaddr": 1,
+    "buyer": 1,
+    "budget": 1,
+    "projectscale": 1,
+    "funds": 1
+  },
+  "投标文件的递交": {
+    "bidopendate": 1,
+    "bidopentime": 1
+  },
+  "开标信息": {
+    "bidopendate": 1,
+    "bidopentime": 1
+  },
+  "联系方式": {
+    "buyer": 1,
+    "buyerperson": 1,
+    "buyertel": 1,
+    "buyeraddr": 1,
+    "agency": 1,
+    "agencyperson": 1,
+    "agencytel": 1,
+    "agencyaddr": 1
+  },
+  "项目信息": {
+    "projectcode": 1,
+    "projectname": 1
+  },
+  "采购单位信息": {
+    "buyer": 1,
+    "buyeraddr": 1,
+    "buyerperson": 1,
+    "buyertel": 1
+  },
+  "招标代理机构信息": {
+    "agency": 1,
+    "agencyaddr": 1,
+    "agencyperson": 1,
+    "agencytel": 1
+  },
+  "中标供应商": {
+    "winner": 1,
+    "winneraddr": 1,
+    "winnerperson": 1,
+    "winnertel": 1
+  },
+  "成交信息": {
+    "projectname": 1,
+    "projectcode": 1,
+    "bidamount": 1,
+    "currency": 1,
+    "winner": 1,
+    "winneraddr": 1,
+    "experts": 1,
+    "purchasinglist": 1
+  },
+  "评标委员会": {
+    "experts": 1
+  },
+  "报价明细": {
+    "purchasinglist": 1,
+    "bidamount": 1
+  },
+  "合同金额": {
+    "bidamount": 1
+  },
+  "其他": {
+    "buyer": 1,
+    "projectname": 1,
+    "agency": 1,
+    "approvalno": 1,
+    "projectscope": 1,
+    "projectcode": 1,
+    "projectaddr": 1,
+    "budget": 1,
+    "projectscale": 1,
+    "funds": 1,
+    "bidopendate": 1,
+    "bidopentime": 1,
+    "buyerperson": 1,
+    "buyertel": 1,
+    "buyeraddr": 1,
+    "agencyperson": 1,
+    "agencytel": 1,
+    "agencyaddr": 1,
+    "winner": 1,
+    "winneraddr": 1,
+    "winnerperson": 1,
+    "winnertel": 1,
+    "bidamount": 1,
+    "currency": 1,
+    "experts": 1,
+    "purchasinglist": 1
+  }
+}