Browse Source

合并冲突

wcj 6 years ago
parent
commit
d8482d2354

+ 2 - 3
src/config.json

@@ -9,13 +9,12 @@
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
     "saveresult": true,
-    "fieldscore": true,
     "qualityaudit": false,
     "saveblock": true,
     "filelength": 100000,
     "iscltlog": false,
     "brandgoods": false,
-    "udptaskid": "5be107e600746bf92debf080",
+    "udptaskid": "5cdd3025698414032c8322b1",
     "udpport": "1484",
     "nextNode": [
         {
@@ -25,7 +24,7 @@
         },
         {
             "addr": "127.0.0.1",
-            "port": 1481,
+            "port": 1486,
             "stype": "hangye",
             "memo": "行业分类"
         }

+ 2 - 2
src/jy/clear/specialsymbols.go

@@ -57,7 +57,7 @@ func init() {
 	MesReg = regexp.MustCompile(messycodeStr)
 	SymInterCon = qu.ObjArrToStringArr(SpecialSymbols["symintercon"].([]interface{}))
 
-	//	text := []rune("(阳江市海陵岛经济开发试验区环境卫生管理所)(阳江市海陵岛经济开发试验区环境卫生管理所果皮箱、垃圾桶采购项目)(GX2015YJ1010GK125)的综合评分法中标公告广东")
+	//	text := []rune("2019年大兴新城地区公共厕所及附属设施项目(改造-施工)")
 	//	for i := 1; i <= 2; i++ {
 	//		text = AnotherRemoveStart(text)
 	//		qu.Debug(string(text))
@@ -371,7 +371,7 @@ func DelContext(pairedIndex map[int]int, text []rune) ([]rune, bool) {
 			tmp = text[s+1 : e]
 			if len(tmp) > 2 { //排除对称符号中只有["工程","项目","采购","服务","监理","施工","设计"]
 				for _, r := range SymInterCon {
-					if strings.HasSuffix(string(tmp), r) && len(tmp) > length {
+					if strings.HasSuffix(string(tmp), r) && len(tmp) > length && len([]rune(strings.Replace(string(tmp), r, "", -1))) > 4 {
 						ismatch = true
 						result = tmp
 						length = len(tmp)

+ 71 - 34
src/jy/extract/clearesult.go

@@ -3,10 +3,11 @@ package extract
 import (
 	db "jy/mongodbutil"
 	ju "jy/util"
-	"log"
 	qu "qfw/util"
 	"qfw/util/elastic"
 	"time"
+
+	log "github.com/donnie4w/go-logger/logger"
 )
 
 var CltLogs []map[string]interface{} //清理日志
@@ -66,7 +67,7 @@ func (c *ClearTask) ClearProcess(doc *map[string]interface{}) {
 		c.UpdateResult = append(c.UpdateResult, updatearr)
 		lock.Unlock()
 	}, func(err interface{}) {
-		log.Println((*doc)["_id"], err)
+		log.Debug((*doc)["_id"], err)
 		<-c.ClearTaskInfo.ProcessPool
 	})
 	<-c.ClearTaskInfo.ProcessPool
@@ -99,38 +100,74 @@ func SaveCltLog() {
 }
 
 //批量更新抽取结果的值
-func (c *ClearTask) UpdateResultVal() {
+func (c *ClearTask) UpdateResultVal(init bool) {
 	defer qu.Catch()
-	c.ClearChannel = make(chan bool, 5)
-	c.UpdateResult = [][]map[string]interface{}{}
-	for {
-		if len(c.UpdateResult) > 500 {
-			c.ClearChannel <- true
-			arr := c.UpdateResult[:500]
-			go func(tmp *[][]map[string]interface{}) {
-				qu.Try(func() {
-					c.ClearTaskInfo.FDB.UpdateBulk(c.ClearTaskInfo.FromColl, *tmp...)
-					<-c.ClearChannel
-				}, func(err interface{}) {
-					log.Println(err)
-					<-c.ClearChannel
-				})
-			}(&arr)
-			c.UpdateResult = c.UpdateResult[500:]
-		} else {
-			c.ClearChannel <- true
-			arr := c.UpdateResult
-			func(tmp *[][]map[string]interface{}) {
-				qu.Try(func() {
-					c.ClearTaskInfo.FDB.UpdateBulk(c.ClearTaskInfo.FromColl, *tmp...)
-					<-c.ClearChannel
-				}, func(err interface{}) {
-					log.Println(err)
-					<-c.ClearChannel
-				})
-			}(&arr)
-			c.UpdateResult = [][]map[string]interface{}{}
-			time.Sleep(10 * time.Second)
-		}
+	if c.UpdateResult == nil {
+		c.UpdateResult = [][]map[string]interface{}{}
+	}
+	if init {
+		go func() {
+			for {
+				if len(c.UpdateResult) > 500 {
+					arr := c.UpdateResult[:500]
+					c.UpdateResult = c.UpdateResult[500:]
+					qu.Try(func() {
+						c.ClearTaskInfo.FDB.UpdateBulk(c.ClearTaskInfo.FromColl, arr...)
+					}, func(err interface{}) {
+						log.Debug(err)
+					})
+				} else {
+					arr := c.UpdateResult
+					c.UpdateResult = [][]map[string]interface{}{}
+					qu.Try(func() {
+						c.ClearTaskInfo.FDB.UpdateBulk(c.ClearTaskInfo.FromColl, arr...)
+					}, func(err interface{}) {
+						log.Debug(err)
+					})
+					time.Sleep(10 * time.Second)
+				}
+			}
+		}()
+	} else {
+		arr := c.UpdateResult
+		c.UpdateResult = [][]map[string]interface{}{}
+		qu.Try(func() {
+			c.ClearTaskInfo.FDB.UpdateBulk(c.ClearTaskInfo.FromColl, arr...)
+		}, func(err interface{}) {
+			log.Debug(err)
+		})
+		time.Sleep(1 * time.Second)
 	}
+	//	c.ClearChannel = make(chan bool, 5)
+	//	c.UpdateResult = [][]map[string]interface{}{}
+	//	for {
+	//		if len(c.UpdateResult) > 500 {
+	//			c.ClearChannel <- true
+	//			arr := c.UpdateResult[:500]
+	//			go func(tmp *[][]map[string]interface{}) {
+	//				qu.Try(func() {
+	//					c.ClearTaskInfo.FDB.UpdateBulk(c.ClearTaskInfo.FromColl, *tmp...)
+	//					<-c.ClearChannel
+	//				}, func(err interface{}) {
+	//					log.Println(err)
+	//					<-c.ClearChannel
+	//				})
+	//			}(&arr)
+	//			c.UpdateResult = c.UpdateResult[500:]
+	//		} else {
+	//			c.ClearChannel <- true
+	//			arr := c.UpdateResult
+	//			func(tmp *[][]map[string]interface{}) {
+	//				qu.Try(func() {
+	//					c.ClearTaskInfo.FDB.UpdateBulk(c.ClearTaskInfo.FromColl, *tmp...)
+	//					<-c.ClearChannel
+	//				}, func(err interface{}) {
+	//					log.Println(err)
+	//					<-c.ClearChannel
+	//				})
+	//			}(&arr)
+	//			c.UpdateResult = [][]map[string]interface{}{}
+	//			time.Sleep(10 * time.Second)
+	//		}
+	//	}
 }

+ 29 - 12
src/jy/extract/clearudp.go

@@ -2,13 +2,15 @@ package extract
 
 import (
 	"encoding/json"
+	"fmt"
 	db "jy/mongodbutil"
 	ju "jy/util"
-	"log"
 	mu "mfw/util"
 	"net"
 	qu "qfw/util"
+	"sync"
 
+	log "github.com/donnie4w/go-logger/logger"
 	"gopkg.in/mgo.v2/bson"
 )
 
@@ -27,24 +29,24 @@ func clearProcessUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 		var rep map[string]interface{}
 		err := json.Unmarshal(data, &rep)
 		if err != nil {
-			log.Println(err)
+			log.Debug(err)
 			Udpclient.WriteUdp([]byte("false"), mu.OP_NOOP, ra) //回应上一个节点
 		} else {
 			sid, _ := rep["gtid"].(string)
 			eid, _ := rep["lteid"].(string)
 			//stype, _ := rep["stype"].(string)
-			log.Println("======", sid, eid)
-			Udpclient.WriteUdp([]byte("ok"), mu.OP_NOOP, ra) //回应上一个节点
+			log.Debug("======", sid, eid)
+			go Udpclient.WriteUdp([]byte("ok"), mu.OP_NOOP, ra) //回应上一个节点
 			ClearByUdp(sid, eid)
-			log.Println("udp通知清理完成,eid=", eid)
+			log.Debug("udp通知清理完成,eid=", eid)
 		}
 	case mu.OP_NOOP: //下个节点回应
 		var rep map[string]interface{}
 		err := json.Unmarshal(data, &rep)
 		if err != nil {
-			log.Println(err)
+			log.Debug(err)
 		} else {
-			log.Println(rep)
+			log.Debug(rep)
 		}
 	}
 }
@@ -59,28 +61,43 @@ func ClearByUdp(sid, eid string) {
 	//初始化脚本信息
 	clt.InitClearLuas()
 	//更新结果表清理后的字段值
-	go clt.UpdateResultVal()
+	go clt.UpdateResultVal(true)
 
 	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
-	log.Println("query---", query, clt.ClearTaskInfo.FromColl)
+	log.Debug("query---", query, clt.ClearTaskInfo.FromColl)
 	count := clt.ClearTaskInfo.FDB.Count(clt.ClearTaskInfo.FromColl, query)
-	log.Println("count---", count)
+	log.Debug("count---", count)
 	pageNum := (count + PageSize - 1) / PageSize
 	limit := PageSize
 	if count < PageSize {
 		limit = count
 	}
 	//一次查询5000条数据进行清理
+	index := 0
+	wg := sync.WaitGroup{}
 	for i := 0; i < pageNum; i++ {
 		query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid)}}
-		log.Printf("page=%d,query=%v", i+1, query)
+		fmt.Printf("page=%d,query=%v", i+1, query)
 		list, _ := clt.ClearTaskInfo.FDB.Find(clt.ClearTaskInfo.FromColl, query, nil, Fields2, false, 0, limit)
 		for _, v := range *list {
+			_id := qu.BsonIdToSId(v["_id"])
 			clt.ClearTaskInfo.ProcessPool <- true
+			wg.Add(1)
 			go func(val map[string]interface{}) {
+				defer wg.Done()
 				clt.ClearProcess(&val)
 			}(v)
-			sid = qu.BsonIdToSId(v["_id"])
+			//sid = qu.BsonIdToSId(v["_id"])
+			index++
+			if index%1000 == 0 {
+				log.Debug("index:", index, ",页码:", i+1, ",_id:", _id)
+			}
+			sid = _id
+			if sid >= eid {
+				break
+			}
 		}
 	}
+	wg.Wait()
+	clt.UpdateResultVal(false)
 }

+ 33 - 94
src/jy/extract/extract.go

@@ -30,7 +30,7 @@ var (
 	ClearTaskList map[string]*ClearTask                  //清理任务列表
 	saveLimit     = 200                                  //抽取日志批量保存
 	PageSize      = 5000                                 //查询分页
-	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -231,10 +231,10 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 	} else {
 		detail = d2
 	}
-	detail = ju.CutLableStr(detail)
-	detail = cut.ClearHtml(detail)
+	d3, _ := doc["summary"].(string)
+	detail = ju.CutLableStr(d3 + "\n" + detail)
+	detail = cut.ClearHtml(d3 + "\n" + detail)
 	doc["detail"] = detail
-
 	if isextFile {
 		file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 	}
@@ -385,11 +385,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				//项目名称未能抽取到,标题来凑
 				if vc.Field == "projectname" {
 					//if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
-					items := make([]*ju.ScoreItem, 1)
-					items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
-					field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
+					field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
 					if tmp["blocktag"] != nil {
-						field.BlockTag = tmp["blocktag"].(map[string]bool)
+						btag := make(map[string]string)
+						for k := range tmp["blocktag"].(map[string]bool) {
+							btag[k] = TagConfigDesc[k]
+						}
+						field.BlockTag = btag
 					}
 					j.Result[vc.Field] = append(j.Result[vc.Field], field)
 					//}
@@ -625,23 +627,13 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
-						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
-						if extfrom == "title" {
-							field.Score = 4
-						}
+						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
 						if tmp["blocktag"] != nil {
-							field.BlockTag = tmp["blocktag"].(map[string]bool)
-						}
-						item := &ju.ScoreItem{Des: "初始化", Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
-						if extfrom == "title" {
-							item.Score = 4
-						}
-						if tmp["scoreitem"] == nil {
-							scoreItems := make([]*ju.ScoreItem, 0)
-							scoreItems = append(scoreItems, item)
-							field.ScoreItem = scoreItems
-						} else {
-							field.ScoreItem = append(field.ScoreItem, item)
+							btag := make(map[string]string)
+							for k := range tmp["blocktag"].(map[string]bool) {
+								btag[k] = TagConfigDesc[k]
+							}
+							field.BlockTag = btag
 						}
 						j.Result[k] = append(j.Result[k], field)
 					}
@@ -663,13 +655,17 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 		//块抽取
 		if in.Field != "" {
 			if extfrom == "title" {
-				extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]bool{"title": true}, j, in)
+				extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in)
 				if len(extinfo) > 0 {
 					AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 				}
 			} else {
 				for _, v := range j.Block {
-					extinfo := extRegCoreToResult(extfrom, v.Text, &v.Classify, j, in)
+					btag := make(map[string]string)
+					for k := range v.Classify {
+						btag[k] = TagConfigDesc[k]
+					}
+					extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
 					if len(extinfo) > 0 {
 						AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 					}
@@ -720,7 +716,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 }
 
 //正则提取结果
-func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
+func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
 	defer qu.Catch()
 	extinfo := map[string][]map[string]interface{}{}
 	if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
@@ -733,6 +729,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						continue
 					}
 					val := text[pos[p]:pos[p+1]]
+					sourcevalue := val
 					if val == "招标公告" {
 						return extinfo
 					}
@@ -756,27 +753,9 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if v.RegCore.NumSign == -1 { //正负值修正
 							val = "-" + val
 						}
-						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
-						if extfrom == "title" {
-							exfield.Score = 4
-						}
+						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: sourcevalue, Value: val}
 						if tmp["blocktag"] != nil {
-							exfield.BlockTag = tmp["blocktag"].(map[string]bool)
-						}
-						item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
-						if extfrom == "title" {
-							item.Score = 4
-						}
-						if strings.Contains(val, "\n") {
-							item.Score -= 1
-							exfield.Score -= 1
-						}
-						if tmp["scoreitem"] == nil {
-							sitems := make([]*ju.ScoreItem, 0)
-							sitems = append(sitems, &item)
-							exfield.ScoreItem = sitems
-						} else {
-							exfield.ScoreItem = append(exfield.ScoreItem, &item)
+							exfield.BlockTag = tmp["blocktag"].(map[string]string)
 						}
 						j.Result[k] = append(j.Result[k], &exfield)
 						//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
@@ -797,7 +776,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 							}
 							tmp := map[string]interface{}{
 								"field":     v.Field,
-								"code":      v.Code + "去除__*后",
+								"code":      v.Code,
 								"ruletext":  regArr[0],
 								"extfrom":   extfrom,
 								"value":     value,
@@ -807,28 +786,9 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 							}
 							tmps = append(tmps, tmp)
 							extinfo[v.Field] = tmps
-
 							exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
-							if extfrom == "title" {
-								exfield.Score = 4
-							}
 							if tmp["blocktag"] != nil {
-								exfield.BlockTag = tmp["blocktag"].(map[string]bool)
-							}
-							item := ju.ScoreItem{Des: "初始化抽取规则去除__*", Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: value}
-							if extfrom == "title" {
-								item.Score = 4
-							}
-							if strings.Contains(value, "\n") {
-								item.Score -= 1
-								exfield.Score -= 1
-							}
-							if tmp["scoreitem"] == nil {
-								sitems := make([]*ju.ScoreItem, 0)
-								sitems = append(sitems, &item)
-								exfield.ScoreItem = sitems
-							} else {
-								exfield.ScoreItem = append(exfield.ScoreItem, &item)
+								exfield.BlockTag = tmp["blocktag"].(map[string]string)
 							}
 							j.Result[v.Field] = append(j.Result[v.Field], &exfield)
 							//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
@@ -866,22 +826,8 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
 			field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
-			if extfrom == "title" {
-				field.Score = 4
-			}
 			if tmp["blocktag"] != nil {
-				field.BlockTag = tmp["blocktag"].(map[string]bool)
-			}
-			item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
-			if extfrom == "title" {
-				item.Score = 4
-			}
-			if tmp["scoreitem"] == nil {
-				sitems := make([]*ju.ScoreItem, 0)
-				sitems = append(sitems, &item)
-				field.ScoreItem = sitems
-			} else {
-				field.ScoreItem = append(field.ScoreItem, &item)
+				field.BlockTag = tmp["blocktag"].(map[string]string)
 			}
 			j.Result[v.Field] = append(j.Result[v.Field], field)
 		}
@@ -905,15 +851,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 				for _, tmp := range tmps {
 					field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
 					if tmp["blocktag"] != nil {
-						field.BlockTag = tmp["blocktag"].(map[string]bool)
-					}
-					item := ju.ScoreItem{Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
-					if tmp["scoreitem"] == nil {
-						scoreItems := make([]*ju.ScoreItem, 0)
-						scoreItems = append(scoreItems, &item)
-						field.ScoreItem = scoreItems
-					} else {
-						field.ScoreItem = append(field.ScoreItem, &item)
+						field.BlockTag = tmp["blocktag"].(map[string]string)
 					}
 					j.Result[k] = append(j.Result[k], field)
 					//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
@@ -935,7 +873,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 					//						continue
 					//					}
 					text := qu.ObjToString(v.Value)
-					if text != "" && v.ExtFrom != "title" {
+					if text != "" {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[in.Field][k].Value = text
@@ -1189,6 +1127,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if len(blocks) > 0 {
 			tmp["blocks"] = blocks
 		}
+		//tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				for field, _ := range e.Fields {
@@ -1326,7 +1265,7 @@ func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
 
 //去重冗余字段
 func delFiled(k string) bool {
-	return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
+	return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
 func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {

+ 6 - 160
src/jy/extract/extractInit.go

@@ -31,6 +31,7 @@ type ExtReg struct {
 	NumSign    int //正负修正标记,例如浮动率(上浮正1、下浮负-1)
 }
 type RuleCore struct {
+	Id        string        //id
 	Field     string        //逻辑字段
 	LuaLogic  string        //进入逻辑
 	ExtFrom   string        //从哪个字段抽取
@@ -133,7 +134,7 @@ type ClearTask struct {
 	ClearLuas     map[string][]*ClearLua //清理脚本
 
 	UpdateResult [][]map[string]interface{} //清理后结果
-	ClearChannel chan bool
+	//ClearChannel chan bool
 }
 
 func init() {
@@ -308,7 +309,7 @@ func (e *ExtractTask) InitRuleCore() {
 			if b, _ := vv["isuse"].(bool); !b {
 				continue
 			}
-			rcore := &RuleCore{}
+			rcore := &RuleCore{Id: qu.BsonIdToSId(vv["_id"])}
 			rcore.Field = s_field
 			rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
 			rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
@@ -369,11 +370,12 @@ func (e *ExtractTask) InitRuleCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
 							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]
 						}
-						pattern, _ = strconv.Unquote(`"` + pattern + `"`)
 						if len(tmp) == 2 {
 							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
 						} else {
@@ -413,12 +415,12 @@ func (e *ExtractTask) InitRuleCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
 							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]
 						}
-						pattern, _ = strconv.Unquote(`"` + pattern + `"`)
 						if len(tmp) == 2 {
 							epos := strings.Split(tmp[1], ",")
 							posm := map[string]int{}
@@ -984,162 +986,6 @@ func (e *ExtractTask) InitAreaCode() {
 	}
 }
 
-//初始化城市省份敏感词
-//func (e *ExtractTask) InitCityDFA() {
-//	defer qu.Catch()
-//	e.CityAllGet = &ju.DFA{}
-//	e.DistrictGet = &ju.DFA{}
-//	e.AreaProvinceGet = &ju.DFA{}
-//	e.StreetGet = &ju.DFA{}
-//	//初始化map
-//	if e.ProvinceMap == nil {
-//		e.ProvinceMap = make(map[string]string)
-//	}
-//	if e.CityBriefMap == nil {
-//		e.CityBriefMap = make(map[string]*City)
-//	}
-//	if e.ProvinceBriefMap == nil {
-//		e.ProvinceBriefMap = make(map[string]*Province)
-//	}
-//	if e.AreaToCityMap == nil {
-//		e.AreaToCityMap = make(map[string][]*City)
-//	}
-//	if e.DistrictCityMap == nil {
-//		e.DistrictCityMap = make(map[string]*City)
-//	}
-//	if e.StreetDistrictMap == nil {
-//		e.StreetDistrictMap = make(map[string]*District)
-//	}
-//	//初始化省
-//	fn1 := InitProvince(e.TaskInfo.Version)
-//	for k, v := range fn1 {
-//		for _, p := range v.([]interface{}) {
-//			p1, _ := p.(string)
-//			e.AreaProvinceGet.AddWord(p1) //华中科技大学
-//			e.ProvinceMap[p1] = k         //华中科技大学:湖北
-//		}
-//	}
-
-//	//初始化城市全称
-//	fn2 := InitCityAll(e.TaskInfo.Version)
-//	for k, v := range fn2 {
-//		e.AreaProvinceGet.AddWord(k) //加入省全称dfa(k:浙江省)
-//		p := &Province{}
-//		p.Name = k                      //省全称
-//		p.Brief = v["brief"].(string)   //省简称
-//		e.ProvinceMap[k] = p.Brief      //浙江省:浙江
-//		e.ProvinceBriefMap[p.Brief] = p //浙江:省信息
-//		p.Cap = v["captial"].(string)   //省会(杭州)
-//		city, _ := v["city"].(map[string]interface{})
-//		//
-//		for k1, v1 := range city {
-//			v1m, _ := v1.(map[string]interface{})
-//			c := &City{}
-//			c.Name = k1
-//			c.Brief = v1m["brief"].(string)
-//			e.CityBriefMap[c.Brief] = c
-//			c.P = p
-//			if c.Brief == p.Cap {
-//				p.Captial = c
-//			}
-//			//加入到城市map中
-//			//
-//			cs := e.AreaToCityMap[k1]
-//			e.CityAllGet.AddWord(k1) //市全称
-//			if cs != nil {
-//				cs = append(cs, c)
-//			} else {
-//				cs = []*City{c}
-//			}
-//			e.AreaToCityMap[k1] = cs
-//			//区县
-//			districtmap := v1m["area"].(map[string]interface{}) //区或县
-//			for district, streetarr := range districtmap {
-//				d := &District{}
-//				d.Name = district
-//				d.C = c
-//				e.DistrictGet.AddWord(district) //加入区或县敏感词
-//				ctmp := e.DistrictCityMap[district]
-//				if ctmp == nil {
-//					e.DistrictCityMap[district] = c
-//				}
-//				//街道
-//				for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
-//					e.StreetGet.AddWord(s) //加入街道敏感词
-//					dtmp := e.StreetDistrictMap[s]
-//					if dtmp == nil {
-//						e.StreetDistrictMap[s] = d
-//					}
-//				}
-//			}
-//		}
-//	}
-//	//初始化城市简称
-//	fn3 := InitCitySim(e.TaskInfo.Version)
-//	e.CitySimGet = &ju.DFA{}
-//	for k, v := range fn3 {
-//		pb := v["brief"].(string)
-//		p := e.ProvinceBriefMap[pb]
-//		//加载
-//		for _, ss := range []string{k, pb} { //省全称和省简称
-//			cs := e.AreaToCityMap[ss]
-//			if cs != nil {
-//				cs = append(cs, p.Captial)
-//			} else {
-//				cs = []*City{p.Captial}
-//			}
-//			e.AreaToCityMap[ss] = cs
-//			e.CitySimGet.AddWord(ss)
-//		}
-//		city, _ := v["city"].(map[string]interface{})
-//		for k1, v1 := range city {
-//			v1m, _ := v1.(map[string]interface{})
-//			if v1m["brief"] == nil {
-//			}
-//			cb := v1m["brief"].(string)
-//			c := e.AreaToCityMap[k1][0]
-//			//加入到城市map中
-//			for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州  浙江杭州
-//				e.CitySimGet.AddWord(ss)
-//				cs := e.AreaToCityMap[ss]
-//				if cs != nil {
-//					cs = append(cs, c)
-//				} else {
-//					cs = []*City{c}
-//				}
-//				e.AreaToCityMap[ss] = cs
-//			}
-//			arr := v1m["area"].([]interface{})
-//			for _, k2 := range arr {
-//				s := k2.(string)
-//				for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
-//					cs := e.AreaToCityMap[ss]
-//					e.CitySimGet.AddWord(ss)
-//					if cs != nil {
-//						cs = append(cs, c)
-//					} else {
-//						cs = []*City{c}
-//					}
-//					e.AreaToCityMap[ss] = cs
-
-//					//只加入简称
-//					if n == 0 {
-//						d := &District{}
-//						d.Name = ss
-//						d.C = c
-//						e.DistrictGet.AddWord(ss) //加入区或县简称敏感词
-//						ctmp := e.DistrictCityMap[ss]
-//						if ctmp == nil {
-//							e.DistrictCityMap[ss] = c
-//						}
-//					}
-
-//				}
-//			}
-//		}
-//	}
-//}
-
 //保存抽取详情数据
 func (e *ExtractTask) ResultSave(init bool) {
 	defer qu.Catch()

+ 70 - 54
src/jy/extract/score.go

@@ -12,12 +12,25 @@ import (
 	"unicode/utf8"
 )
 
-var SoreConfig map[string]map[string]interface{}
-var TagConfig map[string]map[string]float64
+var (
+	SoreConfig    map[string]map[string]interface{}
+	TagConfig     map[string]map[string]float64
+	TagConfigDesc map[string]string
+
+	TitleScore, RepeatScore, BlockScore float64
+)
 
 func init() {
+	qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
 	qu.ReadConfig("./res/tagscore.json", &TagConfig)
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
+	TitleScore = qu.Float64All(SoreConfig["extractype"]["title"])
+	if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok {
+		RepeatScore = qu.Float64All(repeat["score"])
+	}
+	if block, ok := SoreConfig["other"]["block"].(map[string]interface{}); ok {
+		BlockScore = qu.Float64All(block["score"])
+	}
 	//实例化正则
 	for _, tmp := range SoreConfig {
 		//log.Println(tmp)
@@ -85,52 +98,45 @@ func init() {
 
 //结果打分
 func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
-	result := j.Result
 	qu.Catch()
+	result := j.Result
 	for field, tmps := range result {
 		for tmpsindex, tmpsvalue := range tmps {
+			if tmpsvalue.ExtFrom == "title" { //标题打分初始化
+				tmps[tmpsindex].Score += TitleScore
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "title初始化", Code: "fieldscore.title", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: TitleScore})
+			}
 			//是否有段标签
 			if len(tmpsvalue.BlockTag) > 0 {
 				//有标签段
 				var qz float64 = 0.0 //取权重最高的
-				var tgk string
 				for key := range tmpsvalue.BlockTag {
 					//key = "其他"//TODO 测试用
 					if TagConfig[key][field] > qz {
 						qz = TagConfig[key][field]
-						tgk = key
 					}
 				}
-				tmps[tmpsindex].Score += 2 * qz //乘以权重系数
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", Type: tgk + field, ExtFrom: "tagscore.json", Value: tmpsvalue.Value, Score: 2 * qz})
+				tmps[tmpsindex].Score += BlockScore * qz //乘以权重系数
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", ScoreFrom: "tagscore.json", Value: tmpsvalue.Value, Score: BlockScore * qz})
 			} else {
 				//没有段标签,走其他
 				//qz := TagConfig["其他"][field]
 				//tmps[tmpsindex].Score += 2 * qz //乘以权重系数
 			}
-			if tmpsvalue.ExtFrom != "title" { //非标题抽取
-				//是否有kv值
-				if strings.Contains(tmpsvalue.Type, "colon") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "colonkv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
-				} else if strings.Contains(tmpsvalue.Type, "space") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "spacekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
-				} else if strings.Contains(tmpsvalue.Type, "table") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "tablekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
-				}
-			}
-			if tmpsvalue.ExtFrom != "title" { //非标题抽取
-				if strings.Contains(tmpsvalue.Type, "regexp") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
-				}
-			} else {
-				if strings.Contains(tmpsvalue.Type, "regexp") {
-					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"]) + 1})
-				}
+
+			//抽取类型打分
+			if strings.Contains(tmpsvalue.Type, "colon") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
+			} else if strings.Contains(tmpsvalue.Type, "space") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
+			} else if strings.Contains(tmpsvalue.Type, "table") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
+			} else if strings.Contains(tmpsvalue.Type, "regexp") {
+				tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
 			}
 			scoreRule := SoreConfig[field]
 			if scoreRule == nil {
@@ -144,26 +150,23 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 				if valueLen > 100 && field != "projectscope" {
 					tmps[tmpsindex].Score = -99
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Type: "length", Value: tmpsvalue.Value, Score: -99})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Value: tmpsvalue.Value, Score: -99})
 				}
 				if lengths, ok := scoreRule["length"].([]interface{}); ok {
 					for _, tmp := range lengths {
 						if length, ok := tmp.(map[string]interface{}); ok {
-							min := qu.IntAll(length["min"])
-							max := qu.IntAll(length["max"])
-							scores, _ := length["score"].([]interface{})
-							if len(scores) < 3 {
-								continue
-							}
-							if valueLen < min {
-								tmps[tmpsindex].Score += qu.Float64All(scores[0])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, "<", min), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
-							} else if valueLen > max {
-								tmps[tmpsindex].Score += qu.Float64All(scores[2])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", max), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
-							} else {
-								tmps[tmpsindex].Score += qu.Float64All(scores[1])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen, ">", min, "&&", valueLen, "<", max), Type: field, ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
+							if ranges, ok := length["range"].([]interface{}); ok {
+								gt := qu.IntAll(ranges[0])
+								lte := qu.IntAll(ranges[1])
+								if lte < 0 { //∞
+									lte = 999999
+								}
+								score := qu.Float64All(ranges[2])
+								if valueLen > gt && valueLen <= lte {
+									tmps[tmpsindex].Score += score
+									tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(gt, "<", valueLen, "<=", lte), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: score})
+									break
+								}
 							}
 						}
 					}
@@ -177,7 +180,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
-										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: field + ".negativewords", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分" + fmt.Sprint(p["describe"]), Code: "negativewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.negativewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -195,7 +198,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
-										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: field + ".positivewords", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分" + fmt.Sprint(p["describe"]), Code: "positivewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.positivewords", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -213,7 +216,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
-										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder" + fmt.Sprint(p["describe"]), Code: field + ".winnerorder", RuleText: reg.String(), Type: "regexp", ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder" + fmt.Sprint(p["describe"]), Code: "winnerorder", RuleText: reg.String(), ScoreFrom: "fieldscore.json.winnerorder", Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -234,13 +237,13 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 				if val < min && 0 < val {
 					tmps[tmpsindex].Score += qu.Float64All(scores[0])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, "<", min, "&&", 0, "<", val), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
 				} else if val > max {
 					tmps[tmpsindex].Score += qu.Float64All(scores[2])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
 				} else if val <= max && val >= min {
 					tmps[tmpsindex].Score += qu.Float64All(scores[1])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field + ".float", RuleText: fmt.Sprintln(val, "<=", max, "&&", val, ">=", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
 				}
 			}
 			//其他打分配置
@@ -255,13 +258,26 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 				if val > max {
 					tmps[tmpsindex].Score += qu.Float64All(scores[2])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, ">", max), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
 				} else if val <= max && val > min {
 					tmps[tmpsindex].Score += qu.Float64All(scores[1])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ExtFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field + ".decimal", RuleText: fmt.Sprint(val, "<=", max, "&&", val, ">", min), ScoreFrom: "fieldscore.json." + field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
 				}
 			}
 		}
+		//计算重复值,并加分=重复数量*乘系数
+		valrepeat := map[string]int{}
+		for _, v := range tmps {
+			valrepeat[fmt.Sprint(v.Value)] += 1
+		}
+		for index, v := range tmps {
+			v.ValRepeat = valrepeat[fmt.Sprint(v.Value)] - 1
+			if v.ValRepeat > 0 {
+				score := RepeatScore * float64(v.ValRepeat)
+				v.Score += score
+				tmps[index].ScoreItem = append(tmps[index].ScoreItem, &ju.ScoreItem{Des: "重复次数打分repeat", Code: field + ".repeat", RuleText: "repeat:" + fmt.Sprint(v.ValRepeat), ScoreFrom: "fieldscore.json." + field, Value: v.Value, Score: score})
+			}
+		}
 	}
 	return result
 }

+ 11 - 12
src/jy/pretreated/analytable.go

@@ -131,7 +131,9 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (kvTags map[s
 	if sv, sok := v.(string); sok { //取KV
 		v1 = sv
 	} else if sv, sok := v.([]string); sok { //是数组先默认取第一个
-		v1 = sv[0]
+		if len(sv) >= 1 {
+			v1 = sv[0]
+		}
 	}
 	//对值单位的处理   (预算|费|价|额|规模|投资)
 	if moneyreg.MatchString(k) {
@@ -228,15 +230,7 @@ func (table *Table) KVFilter() {
 			if tag != "" && table.Tag == "" {
 				table.Tag = tag
 			}
-			for kk, vv := range kvTags { //根据关键词,过滤table.SortKV到table.StandKV和table.StandKVWeight
-				table.StandKV[kk] = append(table.StandKV[kk], vv...)
-				//					 else if k2 == "中标金额" {
-				//						//						u.Debug(qutil.Float64All(v1), qutil.Float64All(table.StandKV[k2]))
-				//						if qutil.Float64All(v1) > qutil.Float64All(table.StandKV[k2]) {
-				//							table.StandKV[k2] = v1
-				//						}
-				//					}
-			}
+			MergeKvTags(table.StandKV, kvTags)
 		} else {
 			//u.Debug(k, v, "---------")
 			as.AddKey(k, v)
@@ -320,6 +314,9 @@ func (table *Table) KVFilter() {
 func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 	checkKey := map[int]bool{}
 	for kn, k := range as.Keys { //遍历table.SortKV.value为数组的key
+		if len(table.StandKV[k]) == 0 || strings.TrimSpace(table.StandKV[k][0].Value) != "" {
+			continue
+		}
 		v := as.Map[k]
 		if vm, ok := v.([]map[string]interface{}); ok && k == NullTxtBid {
 			if table.WinnerOrder == nil {
@@ -2030,7 +2027,6 @@ func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int)
 			//}
 			for _, vcgdw := range k1tags {
 				if vcgdw.Value == "采购单位" {
-					tn.SortKV.RemoveKey(k1)
 				}
 			}
 		} else if val, bvs := v1.(string); bvs && len(index) == 1 {
@@ -2044,7 +2040,10 @@ func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int)
 					}
 					hasValid = true
 				}
-				if hasValid && regexp.MustCompile("^(项目|开标|采购单位|招标机构)").MatchString(kvTag_k) {
+				if !hasValid {
+					continue
+				}
+				if !(len(kvTags) > 0 && regexp.MustCompile("^(项目|开标|采购单位|招标机构)").MatchString(kvTag_k)) {
 					tn.SortKV.RemoveKey(k1)
 					tn.assemblePackage(k1, val, index[0])
 					//log.Println("remove", k1, val)

+ 13 - 10
src/jy/pretreated/tablev2.go

@@ -63,6 +63,7 @@ type TD struct {
 	Val            string             //值
 	Text           string             //原始串
 	SortKV         *SortMap           //存放kv值
+	SortKVWeight   map[string]int     //存放kv值权重
 	Html           string             //html值
 	BH             bool               //是否是表头
 	MustBH         bool               //不能修改的表头
@@ -90,11 +91,12 @@ var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿
 func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	defer qutil.Catch()
 	td := &TD{
-		ArrVal:  []string{},
-		Goquery: Goquery,
-		SonTds:  []*TD{},
-		TR:      tr,
-		SortKV:  NewSortMap(),
+		ArrVal:       []string{},
+		Goquery:      Goquery,
+		SonTds:       []*TD{},
+		TR:           tr,
+		SortKV:       NewSortMap(),
+		SortKVWeight: map[string]int{},
 	}
 	colspan, rowspan := 0, 0
 	col, bcol := td.Goquery.Attr("colspan")
@@ -150,11 +152,13 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		ckv := GetKVAll(proCode, "", nil, 1)
 		for k, v := range ckv.KvTags {
 			td.SortKV.AddKey(k, v)
+			td.SortKVWeight[k] = -99
 		}
 	} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
 		ckv := GetKVAll(proCode, "", nil, 1)
 		for k, v := range ckv.KvTags {
 			td.SortKV.AddKey(k, v)
+			td.SortKVWeight[k] = -99
 		}
 	}
 	if proCode = jsonReg.FindString(text); proCode != "" {
@@ -162,6 +166,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		json.Unmarshal([]byte(proCode), &jsonMap)
 		for k, v := range jsonMap {
 			td.SortKV.AddKey(k, v)
+			td.SortKVWeight[k] = -99
 		}
 	}
 	//对td单元格值判断是否是表头和根据td内容长度进行分块处理
@@ -218,12 +223,10 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR) {
 
 				//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
 				td.BH = false
-				for k, v := range sonts.KvTags {
-					if td.TR.Table.TableResult == nil {
-						td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
-					}
-					td.TR.Table.TableResult.KvTags[k] = append(td.TR.Table.TableResult.KvTags[k], v...)
+				if td.TR.Table.TableResult == nil {
+					td.TR.Table.TableResult = NewTableResult(sonts.Id, sonts.Toptype, sonts.BlockTag, sonts.Html, sonts.Itype, sonts.RuleBlock)
 				}
+				MergeKvTags(td.TR.Table.TableResult.KvTags, sonts.KvTags)
 				td.SonTableResult = sonts
 				//for _, k := range sonts.SortKV.Keys {
 				//u.Debug(k, sonts.SortKV.Map[k])

+ 17 - 21
src/jy/util/article.go

@@ -38,17 +38,18 @@ type Job struct {
 }
 
 type ExtField struct {
-	BlockTag    map[string]bool //块标签
-	Field       string          //属性
-	Code        string          //匹配标签(字符串、正则)、正则或lua代码
-	RuleText    string          //内容
-	Type        string          //kv(细类:colon1,colon2,space,table)、正则(regexp)
-	MatchType   string          //匹配类型:1:标签库类型(tag_string,tag_regexp),2:全文正则regcontent
-	ExtFrom     string          //抽取来源(title,detail)
-	SourceValue interface{}     //抽取结果--未清理
-	Value       interface{}     //抽取结果
-	Score       float64         //得分
-	ScoreItem   []*ScoreItem    //打分项
+	BlockTag    map[string]string //块标签
+	Field       string            //属性
+	Code        string            //匹配标签(字符串、正则)、正则或lua代码
+	RuleText    string            //内容
+	Type        string            //kv(细类:colon1,colon2,space,table)、正则(regexp)
+	MatchType   string            //匹配类型:1:标签库类型(tag_string,tag_regexp),2:全文正则regcontent
+	ExtFrom     string            //抽取来源(title,detail)
+	SourceValue interface{}       //抽取结果--未清理
+	Value       interface{}       //抽取结果
+	Score       float64           //得分
+	ScoreItem   []*ScoreItem      //打分项
+	ValRepeat   int               //结果值重复次数,打分参考
 }
 
 //打分项
@@ -56,9 +57,7 @@ type ScoreItem struct {
 	Des       string      //分数说明
 	Code      string      //匹配标签(字符串、正则)、正则或lua代码
 	RuleText  string      //内容
-	Type      string      //kv(细类:colon1,colon2,space,table)、正则(regexp)
-	MatchType string      //匹配类型:1:标签库类型(tag_string,tag_regexp),2:全文正则regcontent
-	ExtFrom   string      //抽取来源(title,detail)
+	ScoreFrom string      //打分来源
 	Value     interface{} //抽取结果
 	Score     float64     //得分结果
 }
@@ -155,18 +154,15 @@ type Kv struct {
 
 //最终放到job上的kv
 type JobKv struct {
-	Kvs   []*Kv //有序的冒号kv
-	Kvs_2 []*Kv //有序的冒号kv
-	//Kv      map[string]*Tag   //table kv (分出的对应的KV值)
-	//KvIndex map[string]int    //kv_index(流程)
+	Kvs    []*Kv             //有序的冒号kv
+	Kvs_2  []*Kv             //有序的冒号kv
 	KvTags map[string][]*Tag //带权重的kv
 }
 
 func NewJobKv() *JobKv {
 	return &JobKv{
-		Kvs:   []*Kv{},
-		Kvs_2: []*Kv{},
-		//Kv:     map[string]*Tag{},
+		Kvs:    []*Kv{},
+		Kvs_2:  []*Kv{},
 		KvTags: map[string][]*Tag{},
 	}
 }

+ 205 - 59
src/res/fieldscore.json

@@ -1,43 +1,91 @@
 {
     "extractype": {
         "describe": "抽取类型打分",
-        "title": 4,
+        "title": 2,
         "table": 3,
-        "colon": 2,
-        "space": 2,
+        "colon": 3,
+        "space": 3,
         "regexp": 2,
         "winnerorder": 3
     },
+    "other": {
+        "block": {
+            "describe": "块属性基础分值",
+            "score": 0.5
+        },
+        "repeat": {
+            "describe": "重复项:分值*重复次数",
+            "score": 0.1
+        }
+    },
     "projectname": {
         "type": "string",
         "positivewords": [
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(项目|工程|采购)$",
+                "regstr": ".{2,100}(项目|工程|施工|服务|设备|采购|设计|系统)$",
                 "score": 3
             }
         ],
         "negativewords": [
             {
                 "describe": "以*开头",
-                "regstr": "^(关于|\\[|【)",
-                "score": -10
+                "regstr": "^(\\[|【)",
+                "score": -2
             },
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件)$",
-                "score": -2
+                "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件|交易中心)$",
+                "score": -5
             }
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 35,
-                "score": [
-                    -10,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
                     3,
-                    -1
+                    -2
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    3,
+                    5,
+                    0
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    5,
+                    10,
+                    1
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    10,
+                    35,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    35,
+                    45,
+                    1
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    45,
+                    -1,
+                    -2
                 ]
             }
         ]
@@ -47,25 +95,39 @@
         "positivewords": [
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(委员会|办公室|幼儿园|动物园|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|中心|协会|公司|政府|初中|集团|银行|\\[大中小\\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场)$",
+                "regstr": ".{2,100}(委员会|办公室|幼儿园|动物园|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|中心|协会|公司|政府|初中|集团|银行|[大中小]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场)$",
                 "score": 3
             }
         ],
         "negativewords": [
             {
                 "describe": "包含负分",
-                "regstr": "(附件|招标失败|交易中心|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+                "regstr": "(附件|招标失败|交易中心|不足|公告|变更|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
                 "score": -10
             }
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 20,
-                "score": [
-                    -10,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    4,
+                    -10
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    4,
+                    25,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    25,
+                    -1,
                     -1
                 ]
             }
@@ -89,12 +151,26 @@
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 20,
-                "score": [
-                    -10,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    4,
+                    -10
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    4,
+                    35,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    35,
+                    -1,
                     -1
                 ]
             }
@@ -124,12 +200,26 @@
         "negativewords": [],
         "length": [
             {
-                "describe": "长度打分min>val:-6,min<=val<=max:1,max<val:-1",
-                "min": 4,
-                "max": 30,
-                "score": [
-                    -10,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    4,
+                    -10
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    4,
+                    30,
+                    3
+                ]
+            },
+            {
+                "describe": "长度[gt,∞,score]",
+                "range": [
+                    30,
+                    -1,
                     -1
                 ]
             }
@@ -147,12 +237,26 @@
         "negativewords": [],
         "length": [
             {
-                "describe": "长度打分min>val:0,min<=val<=max:1,max<val:-1",
-                "min": 2,
-                "max": 6,
-                "score": [
-                    -5,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    1,
+                    -5
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    1,
+                    7,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    7,
+                    -1,
                     -1
                 ]
             }
@@ -170,12 +274,26 @@
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-1,min<=val<=max:1,max<val:-1",
-                "min": 7,
-                "max": 14,
-                "score": [
-                    -5,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    6,
+                    -5
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    6,
+                    14,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    14,
+                    -1,
                     -1
                 ]
             }
@@ -197,8 +315,8 @@
             },
             {
                 "describe": "包含负分",
-                "regstr": "(勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,。、::“”‘’\"])",
-                "score": -10
+                "regstr": "(null|勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,。、::“”‘’_\"])",
+                "score": -1
             },
             {
                 "describe": "标段编号匹配-2",
@@ -208,12 +326,26 @@
         ],
         "length": [
             {
-                "describe": "长度打分min>val:-1,min<=val<=max:1,max<val:-1",
-                "min": 3,
-                "max": 30,
-                "score": [
-                    -5,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
                     3,
+                    -5
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    3,
+                    30,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    30,
+                    -1,
                     -1
                 ]
             }
@@ -256,12 +388,26 @@
         "type": "string",
         "length": [
             {
-                "describe": "项目范围字数",
-                "min": 2,
-                "max": 500,
-                "score": [
-                    -10,
-                    3,
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    2,
+                    -10
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    2,
+                    500,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    500,
+                    -1,
                     -1
                 ]
             }

+ 27 - 0
src/res/tagscoredesc.json

@@ -0,0 +1,27 @@
+{
+  "bidcondition": "招标条件",
+  "projectoverview": "项目概况/采购需求",
+  "bidder_requirement": "投标人资格要求",
+  "examineway": "资格审查方式",
+  "biddingsignup": "投标报名",
+  "biddingfile_obtain": "招标文件的获取",
+  "bidfile_submit": "投标文件的递交",
+  "purchasepolicy": "采购项目需要落实的政府采购政策",
+  "noticemedia": "公告媒体",
+  "superviseway": "监督方式",
+  "contactway": "联系方式",
+  "bidbond": "投标保证金",
+  "bidder_inforeg": "投标人信息注册",
+  "bid_note": "投标注意事项",
+  "projectinfo": "项目信息",
+  "buyerinfo": "采购单位信息",
+  "bidagencyinfo": "招标代理机构信息",
+  "winner": "中标供应商",
+  "dealinfo": "成交信息",
+  "servicecharge": "采购代理服务费收取",
+  "bidevaluat_result": "评标结果公示",
+  "bidevaluat_committee": "评标委员会",
+  "offerdetail": "报价明细",
+  "contractamount": "合同金额",
+  "payway": "付款方式"
+}

+ 1 - 1
versioncomparison/config.json

@@ -17,7 +17,7 @@
         "buyertel",
         "buyeraddr",
         "agencyperson",
-        "agencytel",5d39d253a5cb26b9b7404ae1,5d3b23aaa5cb26b9b7c1ec59
+        "agencytel",
         "agencyaddr"
     ]
 }

+ 16 - 10
versioncomparison/main.go

@@ -15,13 +15,14 @@ import (
 )
 
 var (
-	SysConfig map[string]interface{}
-	Premgo    *mongodbutil.Pool //上个版本库
-	Newmgo    *mongodbutil.Pool //当前版本库
-	FieldData map[string]map[string]*Data
-	Compares  map[string]*Compare
-	Sid, Eid  string
-	Fields    []string
+	SysConfig   map[string]interface{}
+	Premgo      *mongodbutil.Pool //上个版本库
+	Newmgo      *mongodbutil.Pool //当前版本库
+	FieldData   map[string]map[string]*Data
+	Compares    map[string]*Compare
+	Sid, Eid    string
+	Fields      []string
+	FieldsQuery string
 )
 
 type Compare struct {
@@ -44,8 +45,13 @@ func init() {
 	Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
 	Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"]))
 	tmp, _ := SysConfig["fields"].([]interface{})
-	for _, v := range tmp {
+	for k, v := range tmp {
 		Fields = append(Fields, qu.ObjToString(v))
+		if k < (len(tmp) - 1) {
+			FieldsQuery += `"` + qu.ObjToString(v) + `":1,`
+		} else {
+			FieldsQuery += `"` + qu.ObjToString(v) + `":1`
+		}
 	}
 	FieldData = map[string]map[string]*Data{}
 	Compares = map[string]*Compare{}
@@ -110,7 +116,7 @@ func createXlsx() {
 func getVersionData() {
 	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}}
 	log.Println(qu.ObjToString(SysConfig["prec"]), query)
-	list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{}`, false, -1, -1)
+	list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
 	for _, v := range *list1 {
 		for _, key := range Fields {
 			rd := FieldData[key]
@@ -126,7 +132,7 @@ func getVersionData() {
 	}
 	log.Println("pre version 加载完成")
 
-	list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{}`, false, -1, -1)
+	list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
 	for _, v := range *list2 {
 		for _, field := range Fields {
 			rd := FieldData[field]