wcj 6 年之前
父節點
當前提交
111e9712d7

+ 55 - 73
src/jy/extract/extract.go

@@ -10,7 +10,6 @@ import (
 	ju "jy/util"
 	qu "qfw/util"
 	"qfw/util/redis"
-	"reflect"
 	"regexp"
 	"strconv"
 	"strings"
@@ -376,7 +375,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 						if len(j.Result[vc.Field]) < 1 {
 							items := make([]*ju.ScoreItem, 1)
 							items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
-							field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
+							field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
 							if tmp["blocktag"] != nil {
 								field.BlockTag = tmp["blocktag"].(map[string]bool)
 							}
@@ -422,7 +421,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if vc.Field == "projectname" {
 						items := make([]*ju.ScoreItem, 1)
 						items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
-						field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
+						field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
 						if len(j.Result[vc.Field]) < 1 {
 							if tmp["blocktag"] != nil {
 								field.BlockTag = tmp["blocktag"].(map[string]bool)
@@ -727,7 +726,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 			} else {
 				for _, v := range j.Block {
-					extinfo := extRegCoreToResult(extfrom, v.Text, &v.Tag, j, in)
+					extinfo := extRegCoreToResult(extfrom, v.Text, &v.Classify, j, in)
 					if len(extinfo) > 0 {
 						AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 					}
@@ -750,7 +749,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 		}
 		for _, bl := range j.Block {
 			//冒号kv
-			if bl.ColonKV != nil && len(bl.ColonKV.Kvs) > 0 {
+			if bl.ColonKV != nil {
 				kvs := bl.ColonKV.Kvs
 				kvs2 := bl.ColonKV.Kvs_2
 				// log.Debug("ColonKV1", kvs)
@@ -833,7 +832,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 				}
 			}
 			//空格kv
-			if bl.SpaceKV != nil && len(bl.SpaceKV.Kvs) > 0 {
+			if bl.SpaceKV != nil {
 				kvs := bl.SpaceKV.Kvs
 				// log.Debug("SpaceKV", kvs)
 				for _, tag := range tags {
@@ -877,7 +876,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 				}
 			}
 			//表格kv
-			if bl.TableKV != nil && len(bl.TableKV.Kv) > 0 {
+			if bl.TableKV != nil {
 				tkv := bl.TableKV
 				// log.Debug("tkv", tkv)
 				for k, v := range tkv.Kv {
@@ -947,7 +946,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if v.RegCore.NumSign == -1 { //正负值修正
 							val = "-" + val
 						}
-						exfield := ju.ExtField{Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
+						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
 						if extfrom == "title" {
 							exfield.Score = 4
 						}
@@ -958,6 +957,10 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if extfrom == "title" {
 							item.Score = 4
 						}
+						if strings.Contains(val, "\n") {
+							item.Score -= 1
+							exfield.Score -= 1
+						}
 						if tmp["scoreitem"] == nil {
 							sitems := make([]*ju.ScoreItem, 0)
 							sitems = append(sitems, &item)
@@ -999,7 +1002,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
-			field := &ju.ExtField{Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
+			field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
 			if extfrom == "title" {
 				field.Score = 4
 			}
@@ -1203,14 +1206,16 @@ type FieldValue struct {
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
-		doc, result, _id, values := funcAnalysis(j)
+		doc, result, _id := funcAnalysis(j)
 		go otherNeedSave(j, result, e)
+		auxinfo := auxInfo(j)
 		//从排序结果中取值
 		tmp := map[string]interface{}{} //抽取值
-		for key, val := range values {
+		tmp["fieldall"] = auxinfo
+		for _, val := range result {
 			for _, v := range val { //取第一个非负数
-				if v.Key != "" && v.Value > -1 {
-					tmp[key] = v.Object
+				if v.Score > -1 {
+					tmp[v.Field] = v.Value
 					break
 				}
 			}
@@ -1223,14 +1228,15 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		}
 		//处理附件
 		var resultf map[string][]*ju.ExtField
-		var filevalues map[string][]*ju.SortObject
 		if jf != nil {
-			_, resultf, _, filevalues = funcAnalysis(jf)
+			_, resultf, _ = funcAnalysis(jf)
+			auxinfof := auxInfo(jf)
+			tmp["fieldallf"] = auxinfof
 			ffield := map[string]interface{}{}
-			for key, val := range filevalues {
+			for _, val := range resultf {
 				for _, v := range val { //取第一个非负数
-					if v.Key != "" && v.Value > -1 {
-						ffield[key] = v.Object
+					if v.Score > -1 {
+						ffield[v.Field] = v.Value
 						break
 					}
 				}
@@ -1417,72 +1423,48 @@ func delFiled(k string) bool {
 	return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
-func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string, map[string][]*ju.SortObject) {
+func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) { // scores j's fields with ScoreFields, sorts every field's candidates with ju.Sort, and returns the job data, the scored result and the document id
 	defer qu.Catch()
 	doc := j.Data
 	result := j.Result // replaced unconditionally by the ScoreFields call below
 	_id := qu.BsonIdToSId((*doc)["_id"])
-	iscore, _ := ju.Config["fieldscore"].(bool)
-	if iscore { //打分
-		result = ScoreFields(j)
-	}
+	result = ScoreFields(j) // scoring no longer depends on the "fieldscore" config switch
+
 	// Sort the candidates of every field
-	values := map[string][]*ju.SortObject{}
-	for key, val := range result {
-		fieldValue := map[string][]interface{}{}
-		//cfscore := make(map[string]float64) //重复匹配加分
-		if iscore { //走打分
-			for _, v := range val {
-				if len(fmt.Sprint(v.Value)) < 1 {
-					continue //去除空串
-				}
-				//if v.Score >0 {
-				//	cfscore[fmt.Sprint(v.Value)] += 1
-				//}
-				if fieldValue[fmt.Sprint(v.Value)+v.Type] == nil {
-					fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
-				} else if fieldValue[fmt.Sprint(v.Value)+v.Type][0].(float64) < v.Score {
-					fieldValue[fmt.Sprint(v.Value)+v.Type][0] = v.Score
-				}
-			}
-			//for key := range fieldValue {
-			//	for cfkey, cfv := range cfscore {
-			//		if strings.Contains(key, cfkey) {
-			//			fieldValue[key][0] = fieldValue[key][0].(float64) + cfv
-			//		}
-			//	}
-			//}
-		} else { //不走打分,按出现频次
-			for _, v := range val {
-				if len(fmt.Sprint(v.Value)) < 1 {
-					continue //去除空串
-				}
-				if fieldValue[fmt.Sprint(v.Value)] == nil {
-					fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
-				} else {
-					fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
+	for _, val := range result {
+		ju.Sort(val) // NOTE(review): presumably orders the slice in place by score — confirm against ju.Sort
+	}
+	return doc, result, _id
+}
+
+// auxInfo collects, for every field in j.Result, each candidate's value, type, score, block tags, source value and a "standardized" flag; it does not sort (the ju.Sort call below is commented out).
+func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
+	fieldalls := map[string][]map[string]interface{}{}
+	for field, val := range j.Result {
+		//ju.Sort(val)
+		sfields := []map[string]interface{}{}
+		for _, v := range val {
+			standardized := false
+			if field == "buyer" || field == "winner" || field == "agency" { // only these three fields are looked up in redis
+				i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value)) // key is "<field>_<value>"
+				if i > 0 {
+					standardized = true
 				}
 			}
-		}
-		objects := []*ju.SortObject{}
-		for k, v := range fieldValue {
-			ValueStr := "" //第二排序
-			if reflect.TypeOf(v[1]).String() == "string" {
-				ValueStr = qu.ObjToString(v[1])
-			}
-			tmp := &ju.SortObject{
-				Key:      k,
-				Value:    qu.IntAll(v[0]),
-				Object:   v[1],
-				ValueStr: ValueStr,
+			sfield := map[string]interface{}{
+				"val":          v.Value,
+				"type":         v.Type,
+				"score":        v.Score,
+				"blocktag":     v.BlockTag,
+				"sourceval":    v.SourceValue,
+				"standardized": standardized,
 			}
-			objects = append(objects, tmp)
+			sfields = append(sfields, sfield)
 		}
-		values[key] = ju.ExtSort(objects)
+		fieldalls[field] = sfields
 	}
-	return doc, result, _id, values
+	return fieldalls
 }
-
 func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 	defer qu.Catch()
 	//获取审核字段

+ 181 - 0
src/jy/extract/extractInit.go

@@ -291,6 +291,187 @@ func (e *ExtractTask) InfoTypeList() {
 
 // InitRuleCore loads the extraction rules of e.TaskInfo.VersionId through db.Mgo.Find and fills e.Fields (fields seen in core rules) and e.RuleCores (class key -> field -> rules).
 func (e *ExtractTask) InitRuleCore() {
+	defer qu.Catch() // NOTE(review): presumably recovers panics, so a bad row below aborts the whole load silently — confirm
+	e.Fields = map[string]int{}
+	e.RuleCores = make(map[string]map[string][]*RuleCore)
+
+	fieldrules := map[string][]*RuleCore{} // s_field -> rules assembled from the rule_logic rows
+	vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1) // NOTE(review): error discarded; vinfos is dereferenced below without a nil check
+	for _, vinfo := range *vinfos {
+		if b, _ := vinfo["isuse"].(bool); !b { // skip entries whose isuse is not true
+			continue
+		}
+		s_field := qu.ObjToString(vinfo["s_field"])
+		pid := qu.BsonIdToSId(vinfo["_id"])
+		list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
+		for _, vv := range *list {
+			if b, _ := vv["isuse"].(bool); !b { // skip disabled logic rows
+				continue
+			}
+			rcore := &RuleCore{}
+			rcore.Field = s_field
+			rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) // logic Lua script text taken from s_luascript
+			rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string) // NOTE(review): unchecked assertion panics when extfrom is absent or not a bool
+			// Pre-processing rules (rule_logicpre rows of this logic rule)
+			rulePres := []*RegLuaInfo{}
+			plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
+			for _, v := range *plist {
+				rinfo := &RegLuaInfo{
+					Field: qu.ObjToString(v["s_field"]),
+					Code:  v["s_code"].(string),
+					Name:  v["s_name"].(string),
+					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), // s_type "1" marks a Lua rule
+				}
+				if rinfo.IsLua {
+					rinfo.RuleText = v["s_luascript"].(string)
+					rulePres = append(rulePres, rinfo)
+				} else {
+					qu.Try(func() { // NOTE(review): presumably a panic in here (bad s_rule, invalid regexp) goes to the handler below and the rule is skipped — confirm
+						rinfo.RuleText = v["s_rule"].(string)
+						tmp := strings.Split(rinfo.RuleText, "__") // rule text is "pattern" or "pattern__replacement"
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") { // pattern carries literal \uXXXX escapes
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) // double every backslash ...
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) // ... then restore \u so Unquote decodes only the unicode escapes
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						} else {
+							pattern = tmp[0]
+						}
+						if len(tmp) == 2 {
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
+						} else {
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
+						}
+						rulePres = append(rulePres, rinfo)
+					}, func(err interface{}) {
+						log.Debug(rinfo.Code, rinfo.Field, err)
+					})
+				}
+			}
+			rcore.RulePres = rulePres
+
+			// Post-processing rules (rule_logicback rows), built the same way as the pre-processing rules
+			ruleBacks := []*RegLuaInfo{}
+			blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
+			for _, v := range *blist {
+				rinfo := &RegLuaInfo{
+					Field: qu.ObjToString(v["s_field"]),
+					Code:  v["s_code"].(string),
+					Name:  v["s_name"].(string),
+					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
+				}
+				if rinfo.IsLua {
+					rinfo.RuleText = v["s_luascript"].(string)
+					ruleBacks = append(ruleBacks, rinfo)
+				} else {
+					qu.Try(func() {
+						rinfo.RuleText = v["s_rule"].(string)
+						tmp := strings.Split(rinfo.RuleText, "__")
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						} else {
+							pattern = tmp[0]
+						}
+						if len(tmp) == 2 {
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
+						} else {
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
+						}
+						ruleBacks = append(ruleBacks, rinfo)
+					}, func(err interface{}) {
+						log.Debug(rinfo.Code, rinfo.Field, err)
+					})
+				}
+			}
+			rcore.RuleBacks = ruleBacks
+
+			// Core extraction rules (rule_logicore rows)
+			ruleCores := []*RegLuaInfo{}
+			clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
+			for _, v := range *clist {
+				if b, _ := v["isuse"].(bool); !b {
+					continue
+				}
+				field := qu.ObjToString(v["s_field"])
+				e.Fields[field] = 1 // record the field in e.Fields for later use
+				rinfo := &RegLuaInfo{
+					Field: field,
+					Code:  v["s_code"].(string),
+					Name:  v["s_name"].(string),
+					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
+				}
+				if rinfo.IsLua {
+					rinfo.RuleText = v["s_luascript"].(string)
+					// Lua core rules get the complete field list from getALLFields
+					rinfo.LFields = getALLFields()
+					ruleCores = append(ruleCores, rinfo)
+				} else {
+					qu.Try(func() {
+						rinfo.RuleText = v["s_rule"].(string)
+						tmp := strings.Split(rinfo.RuleText, "__") // "pattern" or "pattern__positions"
+						var pattern string
+						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
+							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
+						} else {
+							pattern = tmp[0]
+						}
+						if len(tmp) == 2 {
+							epos := strings.Split(tmp[1], ",") // comma-separated position entries
+							posm := map[string]int{}
+							for _, v := range epos {
+								ks := strings.Split(v, ":")
+								if len(ks) == 2 { // "position:field" form, e.g. (.*)招标公告(.*)__2:projectname,4:area
+									posm[ks[1]] = qu.IntAll(ks[0])
+								} else { // bare position form, e.g. (.*)招标公告__2 — stored under the rule's own field
+									posm[rinfo.Field] = qu.IntAll(ks[0])
+								}
+							}
+							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
+						} else {
+							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
+						}
+						ruleCores = append(ruleCores, rinfo)
+					}, func(err interface{}) {
+						log.Debug(rinfo.Code, rinfo.Field, err)
+					})
+				}
+			}
+			rcore.RuleCores = ruleCores
+			// Group the assembled rule under its s_field
+			if fieldrules[s_field] == nil {
+				fieldrules[s_field] = []*RuleCore{}
+			}
+			fieldrules[s_field] = append(fieldrules[s_field], rcore)
+		}
+	}
+
+	// Field configuration: map every infotype class to the rules of its fields
+	infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
+	for _, v := range *infolist {
+		topclass := qu.ObjToString(v["topclass"])
+		if v["subclass"] == nil { // no subclass: rules are keyed by topclass alone
+			e.RuleCores[topclass] = make(map[string][]*RuleCore)
+			for attr, _ := range v["fields"].(map[string]interface{}) {
+				e.RuleCores[topclass][attr] = fieldrules[attr]
+			}
+		} else {
+			for ca, fs := range v["subclass"].(map[string]interface{}) { // one entry per subclass, keyed "topclass_subclass"
+				e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
+				for field, _ := range fs.(map[string]interface{}) {
+					e.RuleCores[topclass+"_"+ca][field] = fieldrules[field]
+				}
+			}
+		}
+	}
+}
+
+//加载抽取规则
+func (e *ExtractTask) InitRuleCore2() {
 	defer qu.Catch()
 	e.Fields = map[string]int{}
 	infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)

+ 20 - 18
src/jy/extract/score.go

@@ -93,14 +93,16 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 			if len(tmpsvalue.BlockTag) > 0 {
 				//有标签段
 				var qz float64 = 0.0 //取权重最高的
+				var tgk string
 				for key := range tmpsvalue.BlockTag {
 					//key = "其他"//TODO 测试用
 					if TagConfig[key][field] > qz {
 						qz = TagConfig[key][field]
+						tgk = key
 					}
 				}
 				tmps[tmpsindex].Score += 2 * qz //乘以权重系数
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: 2 * qz})
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "匹配段标签权重", Code: "权重系数乘以2", RuleText: "BlockTag", Type: tgk+field,  ExtFrom: "tagscore.json", Value: tmpsvalue.Value, Score: 2 * qz})
 			} else {
 				//没有段标签,走其他
 				//qz := TagConfig["其他"][field]
@@ -110,24 +112,24 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				//是否有kv值
 				if strings.Contains(tmpsvalue.Type, "colon") {
 					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["colon"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "colonkv", Code: "fieldscore.colon", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "colonkv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["colon"])})
 				} else if strings.Contains(tmpsvalue.Type, "space") {
 					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["space"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "spacekv", Code: "fieldscore.space", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "spacekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["space"])})
 				} else if strings.Contains(tmpsvalue.Type, "table") {
 					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["table"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "tablekv", Code: "fieldscore.table", RuleText: qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "tablekv", ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["table"])})
 				}
 			}
 			if tmpsvalue.ExtFrom != "title" { //非标题抽取
 				if strings.Contains(tmpsvalue.Type, "regexp") {
 					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText:  qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp",  ExtFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])})
 				}
 			} else {
 				if strings.Contains(tmpsvalue.Type, "regexp") {
 					tmps[tmpsindex].Score += qu.Float64All(SoreConfig["extractype"]["regexp"])+ 1
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])+ 1 })
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "regexp", Code: "fieldscore.regexp", RuleText:  qu.ObjToString(SoreConfig["extractype"]["describe"]), Type: "regexp", ExtFrom:"fieldscore.json", Value: tmpsvalue.Value, Score: qu.Float64All(SoreConfig["extractype"]["regexp"])+ 1 })
 				}
 			}
 			scoreRule := SoreConfig[field]
@@ -142,7 +144,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 				if valueLen > 100 && field != "projectscope" {
 					tmps[tmpsindex].Score = -99
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: -99})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `valueLen > 100 && field != "projectscope"直接-99分`, Code: field, Type: "length", Value: tmpsvalue.Value, Score: -99})
 				}
 				if lengths, ok := scoreRule["length"].([]interface{}); ok {
 					for _, tmp := range lengths {
@@ -155,13 +157,13 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 							}
 							if valueLen < min {
 								tmps[tmpsindex].Score += qu.Float64All(scores[0])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
+								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen,"<",min), Type: field,  ExtFrom: "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
 							} else if valueLen > max {
 								tmps[tmpsindex].Score += qu.Float64All(scores[2])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
+								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen,">",max), Type: field, ExtFrom: "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
 							} else {
 								tmps[tmpsindex].Score += qu.Float64All(scores[1])
-								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
+								tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "长度打分", Code: fmt.Sprint(valueLen,">",min,"&&",valueLen,"<",max), Type: field,  ExtFrom: "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
 							}
 						}
 					}
@@ -175,7 +177,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
-										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "负面词打分"+fmt.Sprint(p["describe"]), Code:field+".negativewords" , RuleText: reg.String(), Type: "regexp",  ExtFrom: "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -193,7 +195,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
-										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "正面词打分"+fmt.Sprint(p["describe"]), Code: field+".positivewords", RuleText: reg.String(), Type: "regexp",  ExtFrom:  "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -211,7 +213,7 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 									reg := p["regexp"].(*regexp.Regexp)
 									if reg.MatchString(qu.ObjToString(tmpsvalue.Value)) {
 										tmps[tmpsindex].Score += qu.Float64All(p["score"])
-										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "winnerorder", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
+										tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "位置打分winnerorder"+fmt.Sprint(p["describe"]), Code:  field+".winnerorder", RuleText:reg.String(), Type:  "regexp", ExtFrom:"fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(p["score"])})
 									}
 								}
 							}, func(err interface{}) {
@@ -232,13 +234,13 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 				if val < min && 0 < val {
 					tmps[tmpsindex].Score += qu.Float64All(scores[0])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code:field+".float", RuleText: fmt.Sprint(val ,"<",min,"&&",0,"<",val),  ExtFrom: "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[0])})
 				} else if val > max {
 					tmps[tmpsindex].Score += qu.Float64All(scores[2])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: field+".float", RuleText: fmt.Sprint(val,">",max),  ExtFrom: "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
 				} else if val <= max && val >= min {
 					tmps[tmpsindex].Score += qu.Float64All(scores[1])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "数据范围打分", Code:field+".float", RuleText: fmt.Sprintln(val,"<=", max,"&&", val,">=", min ), ExtFrom: "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
 				}
 			}
 			//其他打分配置
@@ -253,10 +255,10 @@ func ScoreFields(j *ju.Job) map[string][]*ju.ExtField {
 				}
 				if val > max {
 					tmps[tmpsindex].Score += qu.Float64All(scores[2])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field+".decimal", RuleText: fmt.Sprint(val ,">", max),  ExtFrom: "fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[2])})
 				} else if val <= max && val > min {
 					tmps[tmpsindex].Score += qu.Float64All(scores[1])
-					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: tmpsvalue.Code, RuleText: tmpsvalue.RuleText, Type: tmpsvalue.Type, MatchType: tmpsvalue.MatchType, ExtFrom: tmpsvalue.ExtFrom, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "其他打分配置decimal", Code: field+".decimal", RuleText:fmt.Sprint(val ,"<=", max ,"&&", val,">", min), ExtFrom:"fieldscore.json."+field, Value: tmpsvalue.Value, Score: qu.Float64All(scores[1])})
 				}
 			}
 		}

+ 44 - 2
src/jy/pretreated/analystep.go

@@ -4,6 +4,7 @@
 package pretreated
 
 import (
+	"encoding/json"
 	"jy/util"
 	qutil "qfw/util"
 	"strings"
@@ -29,7 +30,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 {                                                //有分块
+	if len(blockArrays) > 0 { //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -76,7 +77,8 @@ func AnalyStart(job *util.Job) {
 		if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
 		}
-
+		FindProjectCode(newCon, job) //匹配项目编号
+		bl.Text = newCon
 		//调用kv解析
 		bl.ColonKV = GetKVAll(newCon, "", nil, 1)
 		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
@@ -93,6 +95,46 @@ func AnalyStart(job *util.Job) {
 	}
 }
 
+//匹配项目编号
+func FindProjectCode(newCon string, job *util.Job) {
+	newCon = TextAfterRemoveTable(newCon)
+	if strings.TrimSpace(newCon) == "" {
+		return
+	}
+	var proCode string
+	proCode = projectcodeReg.FindString(newCon)
+	blCode := &util.Block{}
+	blCode.Text = proCode
+	if proCode != "" {
+		ckv := GetKVAll(proCode, job.Title, nil, 1)
+		blCode.ColonKV = ckv
+		job.Block = append(job.Block, blCode)
+	}else if proCode = projectcodeReg2.FindString(newCon);proCode !=""{
+		ckv := GetKVAll(proCode, job.Title, nil, 1)
+		blCode.ColonKV = ckv
+		job.Block = append(job.Block, blCode)
+	}else if proCode = projectcodeReg3.FindString(newCon) ;proCode !=""{
+		ckv := GetKVAll(proCode, job.Title, nil, 1)
+		blCode.ColonKV = ckv
+		job.Block = append(job.Block, blCode)
+	}
+	if proCode = jsonReg.FindString(newCon);proCode != ""{
+		jsonMap := make(map[string]string)
+		json.Unmarshal([]byte(proCode),&jsonMap)
+		jobKv := util.NewJobKv()
+		for k,v := range jsonMap{
+			tmpkv := new(util.Kv)
+			tmpkv.Line = k+v
+			tmpkv.Key = k
+			tmpkv.Value = v
+			jobKv.Kvs = append(jobKv.Kvs, tmpkv)
+		}
+		jobKv.Kv = jsonMap
+		blCode.ColonKV = jobKv
+		job.Block = append(job.Block, blCode)
+	}
+}
+
 //分析table解析结果
 func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 	//解析结果中的kv

+ 29 - 7
src/jy/pretreated/analytable.go

@@ -107,6 +107,11 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
+	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(编号|项目编号|标段编号){1}(:|:)(.){4,30}()|\)|\])`)
+	projectcodeReg2             = regexp.MustCompile(`((?:^|\n)编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9a-zA-Z]`)
+	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
+	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
+	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -246,6 +251,11 @@ func (table *Table) KVFilter() {
 					//						}
 					//					}
 				}
+			} else {
+				if table.StandKV[k] == "" && qutil.ObjToString(v) != "" {
+					table.StandKV[k] = qutil.ObjToString(v)
+					table.StandKVWeight[k] = 0
+				}
 			}
 		} else {
 			//u.Debug(k, v, "---------")
@@ -580,6 +590,12 @@ func (table *Table) MergerToTableresult() {
 			if table.TableResult.SortKV.Map[k] == nil {
 				table.TableResult.SortKV.AddKey(k, v) //父集
 			} else {
+				if k == "项目编号" { //项目编号存在,又匹配到全为中文跳过
+
+					if regHz.MatchString(v) {
+						continue
+					}
+				}
 				table.TableResult.SortKV.ReplaceKey(k, v, k)
 			}
 			table.TableResult.SortKVWeight[k] = table.StandKVWeight[k]
@@ -643,7 +659,9 @@ func (ts *TableResult) Analy() {
 		//核心模块
 		ts := tn.Analy(contactFormat)
 		for _, tab := range ts {
-			tabs = append(tabs, tab)
+			if len(tab.TRs) > 0{
+				tabs = append(tabs, tab)
+			}
 			//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
 		}
 		//tn.SonTables = append(tn.SonTables, tn)
@@ -750,7 +768,8 @@ func (table *Table) createTabe(trs *goquery.Selection) {
 		//遍历每行的td
 		tds := sel.ChildrenFiltered("td,th")
 		TR := NewTR(table)
-		tdTextIsNull := true
+		tdTextIsNull := false
+		var empty int
 		tds.Each(func(m int, selm *goquery.Selection) {
 			//对隐藏列不处理!!!
 			if IsHide(selm) {
@@ -760,11 +779,14 @@ func (table *Table) createTabe(trs *goquery.Selection) {
 			td := NewTD(selm, TR, table) //初始化td,kv处理,td中有table处理,td的方向
 			//num++
 			TR.AddTD(td)
-			if td.Val != "" { //删除一个tr,tr中所有td是空值的
-				tdTextIsNull = false
+			if td.Val == "" && td.SonTableResult == nil { //删除一个tr,tr中所有td是空值的
+				empty++
+				if tds.Size() == empty {
+					tdTextIsNull = true
+				}
 			}
 		})
-		//tr中所有td的内容为空 将tr删除
+		//向table添加每行不为空的tr
 		if !tdTextIsNull {
 			table.AddTR(TR)
 		}
@@ -1763,7 +1785,7 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 						bfind = true
 					}
 				}
-				if bvalfind && varrpos >-1{
+				if bvalfind && varrpos > -1 && len(vals) > varrpos {
 					vals[varrpos] = td.Val // += "__" + td.Val
 				} else {
 					//添加时候去除空值和nil
@@ -3147,7 +3169,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 71 - 4
src/jy/pretreated/tablev2.go

@@ -3,6 +3,7 @@ package pretreated
 //定义表格对象
 
 import (
+	"encoding/json"
 	"fmt"
 	u "jy/util"
 	"log"
@@ -120,14 +121,68 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	//子table处理合并
 	if ht.Size() > 0 {
 		//qutil.Debug("有子表格")
+		//格式化正文
 		txt = TextAfterRemoveTable(td.Html)
 		td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理
+		//处理table外内容
+		var ub []*u.Block
+		ub, _ = DivideBlock("",txt, 2, table.TableResult.RuleBlock)
+		//看是否划块
+		if len(ub) > 0 {
+			colonKvWeight := map[string]int{}
+			spaceKvWeight := map[string]int{}
+			for _, bl := range ub {
+				//冒号kv
+				for bl_ck, bl_cv := range bl.ColonKV.Kv {
+					if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
+						colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
+						td.SortKV.AddKey(bl_ck, bl_cv)
+					}
+				}
+				//空格kv
+				for bl_sk, bl_sv := range bl.SpaceKV.Kv {
+					if td.SortKV.Map[bl_sk] == nil || bl.SpaceKV.KvTag[bl_sk].Weight >= spaceKvWeight[bl_sk] {
+						spaceKvWeight[bl_sk] = bl.SpaceKV.KvTag[bl_sk].Weight
+						td.SortKV.AddKey(bl_sk, bl_sv)
+					}
+				}
+			}
+		}
 	} else {
 		txt = strings.TrimSpace(td.Goquery.Text())
 	}
 	text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
 	td.Val = text //值
 	td.Text = txt //原始串
+	//调用kv解析
+	cKV := GetKVAll(text, "", nil, 1)
+	for k,v :=range cKV.Kv{
+		td.SortKV.AddKey(k,v)
+	}
+	sKV := SspacekvEntity.Entrance(text, "", nil)
+	for k,v :=range sKV.Kv{
+		td.SortKV.AddKey(k,v)
+	}
+	//抽取不到走正则抽
+	proCode := projectcodeReg.FindString(text)
+	if proCode != "" {
+		ckv := GetKVAll(proCode, "", nil, 1)
+		for k,v :=range ckv.Kv{
+			td.SortKV.AddKey(k,v)
+		}
+	}else if proCode = projectcodeReg2.FindString(text);proCode !=""{
+		ckv := GetKVAll(proCode, "", nil, 1)
+		for k,v :=range ckv.Kv{
+			td.SortKV.AddKey(k,v)
+		}
+	}
+	if proCode = jsonReg.FindString(text);proCode != ""{
+		jsonMap := make(map[string]string)
+		json.Unmarshal([]byte(proCode),&jsonMap)
+		for k,v := range jsonMap{
+			td.SortKV.AddKey(k,v)
+		}
+	}
 	//对td单元格值判断是否是表头和根据td内容长度进行分块处理
 	td.tdIsHb(tr, table, bsontable)
 	bhead := false
@@ -174,7 +229,13 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
 			}
 			sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
 			td.BH = false
-			td.SonTableResult = sonts
+			for k,v := range sonts.SortKV.Map{
+				if td.TR.Table.TableResult == nil{
+					td.TR.Table.TableResult = NewTableResult(sonts.Id,sonts.Toptype,sonts.BlockTag,sonts.Html,sonts.Itype,sonts.RuleBlock)
+				}
+				td.TR.Table.TableResult.SortKV.AddKey(k,v)
+			}
+			//td.SonTableResult = sonts
 			//for _, k := range sonts.SortKV.Keys {
 			//u.Debug(k, sonts.SortKV.Map[k])
 			//				td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
@@ -374,8 +435,10 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
 			}
 		*/
 
-		td.SortKV = FindKv(td.Val, "", 2)
-
+		fSortKV := FindKv(td.Val, "", 2)
+		for k,v := range fSortKV.Map{
+			td.SortKV.AddKey(k,v)
+		}
 		//		td.LeftNode.Val
 		//		for _, vvv := range *td.TR {
 		//			u.Debug(">>>>>")
@@ -818,7 +881,11 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 	**/
 	return
 }
-
+// HtmlToText parses con as HTML and returns the document text. NOTE(review): the parse error is discarded, so doc2.Text() is called even if parsing failed - confirm.
+func HtmlToText(con string) string {
+	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
+	return doc2.Text()
+}
 //取出排除表格之外的文本
 func TextAfterRemoveTable(con string) string {
 	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))

+ 11 - 10
src/jy/util/article.go

@@ -38,16 +38,17 @@ type Job struct {
 }
 
 type ExtField struct {
-	BlockTag  map[string]bool //块标签
-	Field     string          //属性
-	Code      string          //匹配标签(字符串、正则)、正则或lua代码
-	RuleText  string          //内容
-	Type      string          //kv(细类:colon1,colon2,space,table)、正则(regexp)
-	MatchType string          //匹配类型:1:标签库类型(tag_string,tag_regexp),2:全文正则regcontent
-	ExtFrom   string          //抽取来源(title,detail)
-	Value     interface{}     //抽取结果
-	Score     float64         //得分
-	ScoreItem []*ScoreItem    //打分项
+	BlockTag    map[string]bool // block tags
+	Field       string          // field (attribute) name
+	Code        string          // matching tag (string or regexp), regexp, or lua code
+	RuleText    string          // content
+	Type        string          // kv (subtypes: colon1, colon2, space, table) or regexp
+	MatchType   string          // match type: 1: tag-library type (tag_string, tag_regexp), 2: full-text regexp (regcontent)
+	ExtFrom     string          // extraction source (title, detail)
+	SourceValue interface{}     // extraction result, not yet cleaned
+	Value       interface{}     // extraction result
+	Score       float64         // score
+	ScoreItem   []*ScoreItem    // scoring items
 }
 
 //打分项

+ 33 - 0
src/jy/util/sort.go

@@ -2,9 +2,11 @@
 package util
 
 import (
+	"fmt"
 	"sort"
 )
 
+/*
 type SortObject struct {
 	Key      string
 	Value    int
@@ -39,3 +41,34 @@ func ExtSort(list []*SortObject) []*SortObject {
 	sort.Sort(ls)
 	return ls
 }
+
+*/
+
+// results is a slice of *ExtField used for sorting; its Len/Less/Swap methods implement sort.Interface.
+type results []*ExtField
+
+func (list results) Len() int { // Len reports the number of fields in the list.
+	return len(list)
+}
+
+func (list results) Less(i, j int) bool { // Less orders by Score descending; equal scores fall back to the printed Value, also descending.
+	if list[i].Score > list[j].Score {
+		return true
+	} else if list[i].Score < list[j].Score {
+		return false
+	} else {
+		return fmt.Sprint(list[i].Value) > fmt.Sprint(list[j].Value) // tie-break compares the fmt.Sprint string form, not the raw values
+	}
+}
+
+func (list results) Swap(i, j int) { // Swap exchanges the elements at indexes i and j.
+	var temp *ExtField = list[i]
+	list[i] = list[j]
+	list[j] = temp
+}
+
+func Sort(list []*ExtField) []*ExtField { // Sort orders list with the results ordering and returns it.
+	ls := results(list) // conversion shares the backing array, so the caller's slice is reordered in place
+	sort.Sort(ls)
+	return ls
+}

+ 10 - 5
src/res/fieldscore.json

@@ -185,17 +185,22 @@
             {
                 "describe": "全为中文汉字或符号",
                 "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",
-                "score": -10
+                "score": -2
             },
             {
-                "describe": "以个汉字以上结束",
-                "regstr": "[\\u4e00-\\u9fa5]{2,}$",
-                "score": -10
+                "describe": "以个汉字以上结束",
+                "regstr": "[\\u4e00-\\u9fa5]{1,}$",
+                "score": -1
             },
             {
                 "describe": "包含负分",
-                "regstr": "(勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,\\.。、::“”‘’\"])",
+                "regstr": "(勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,。、::“”‘’\"])",
                 "score": -10
+            },
+            {
+                "describe": "标段编号匹配-2",
+                "regstr": "/.{2}",
+                "score": -2
             }
         ],
         "length": [

+ 14 - 19
versioncomparison/config.json

@@ -1,21 +1,16 @@
 {
-  "extractmgo": "192.168.3.207:27081",
-  "extractdb": "qfw",
-  "extractc": "bidding",
-  "previousmgo": "192.168.3.207:27081",
-  "previousdb": "qfw",
-  "previousc": "result_v3",
-  "newmgo": "192.168.3.207:27081",
-  "newdb": "extract_v3",
-  "newc": "result_data2",
-  "keyfield": {
-    "bidamount": 1,
-    "budget": 1,
-    "winner": 1,
-    "projectcode":1,
-    "buyer": 1,
-    "projectname": 1
-  },
-  "queryNum": 1000,
-  "querySid": "57319151edbcdc7b27000aec"
+    "premgo": "192.168.3.207:27081",
+    "predb": "qfw",
+    "prec": "result_v3",
+    "newmgo": "192.168.3.207:27081",
+    "newdb": "extract_v3",
+    "newc": "result_data",
+    "fields": [
+        "projectname",
+        "projectcode",
+        "buyer",
+        "bidamount",
+        "budget",
+        "winner"
+    ]
 }

+ 423 - 0
versioncomparison/main.bak

@@ -0,0 +1,423 @@
+package main
+
+import (
+	"fmt"
+	"gopkg.in/mgo.v2"
+	"gopkg.in/mgo.v2/bson"
+	"jy/mongodbutil"
+	"log"
+	"qfw/common/src/github.com/tealeg/xlsx"
+	"qfw/util"
+	"strings"
+)
+
+var (
+	SysConfig   map[string]interface{}
+	Extractmgo  *mgo.Session      // extraction source
+	Previousmgo *mongodbutil.Pool // previous extraction results
+	Newmgo      *mongodbutil.Pool // latest extraction results
+)
+
+/**
+init reads SysConfig and opens the source session and the previous/new result pools, for comparison against the previous extraction version.
+ */
+func init() {
+	util.ReadConfig(&SysConfig)
+	if len(SysConfig) < 1 {
+		log.Println("配置文件读取失败")
+		return // config is empty: Extractmgo, Previousmgo and Newmgo are left unset
+	}
+	session, e := mgo.Dial(util.ObjToString(SysConfig["extractmgo"]))
+	if e != nil {
+		log.Fatal(e)
+	}
+	Extractmgo = session
+	Previousmgo = mongodbutil.MgoFactory(2, 5, 120, util.ObjToString(SysConfig["previousmgo"]), util.ObjToString(SysConfig["previousdb"]))
+	Newmgo = mongodbutil.MgoFactory(2, 5, 120, util.ObjToString(SysConfig["newmgo"]), util.ObjToString(SysConfig["newdb"]))
+}
+
+type versionComparison struct { // versionComparison holds the id/url pair embedded in every per-field diff record.
+	Id  interface{} `json:"_id"` // Query stores the document id as a hex string here
+	Url string      `json:"url"` // article URL that Query builds from the id
+}
+
+func main() { // main runs Query with the queryNum and querySid values from SysConfig.
+	Query(util.IntAll(SysConfig["queryNum"]), util.ObjToString(SysConfig["querySid"]))
+}
+
+func Query(num int, sid string) { // Query loads up to num document ids (from sid onwards when sid is set), compares each configured key field between the previous and new result collections, and writes counts and differing values into resultdata.xlsx.
+	xf, err := xlsx.OpenFile("抽取结果对比.xlsx")
+	if err != nil {
+		log.Println("读取文件", err)
+		return
+	}
+	var projectcodenum, bidamountnum, winnernum, buyernum, budgetnum, projectnamenum int // per-field counters: values differ
+	var projectcodenumXT, bidamountnumXT, winnernumXT, buyernumXT, budgetnumXT, projectnamenumXT int      // per-field counters: values equal
+	var pcodeNotNilNumP, bidamountNotNilNumP, winnerNotNilNumP, buyerNotNilNumP, budgetNotNilNumP, pnameNotNilNumP int // per-field counters: previous version has a value
+	var pcodeNotNilNumN, bidamountNotNilNumN, winnerNotNilNumN, buyerNotNilNumN, budgetNotNilNumN, pnameNotNilNumN int // per-field counters: new version has a value
+
+	log.Println(num, sid)
+	if num < 1 {
+		log.Println("查询数量应该大于0")
+		return
+	}
+	sum := num // requested total, kept for the log line below
+	//if strings.TrimSpace(gteid) == "" {
+	//	gteid = "386cd3000000000000000000"
+	//}
+	var iter *mgo.Iter
+	if sid == "" {
+		iter = Extractmgo.DB(util.ObjToString(SysConfig["extractdb"])).C(util.ObjToString(SysConfig["extractc"])).Find(nil).Select(bson.M{"_id": 1}).Iter()
+	} else {
+		iter = Extractmgo.DB(util.ObjToString(SysConfig["extractdb"])).C(util.ObjToString(SysConfig["extractc"])).Find(bson.M{"_id": bson.M{
+			"$gte": bson.ObjectIdHex(sid)},
+		}).Select(bson.M{"_id": 1}).Iter()
+	}
+	defer log.Println("关闭 iter:", iter.Close()) // NOTE(review): defer evaluates its arguments immediately, so iter.Close() runs here, before the loop below; only the Println is deferred
+	var data map[string]bson.ObjectId
+	getdata := make([]bson.ObjectId, 0)
+	for iter.Next(&data) {
+		if num == 0 {
+			break
+		}
+		getdata = append(getdata, data["_id"])
+		num--
+	}
+	log.Println(sum, "条数据加载完成")
+	projectnames := make([]*Projectname, 0)
+	buyers := make([]*Buyer, 0)
+	projectcodes := make([]*Projectcode, 0)
+	winners := make([]*Winner, 0)
+	budgets := make([]*Budget, 0)
+	bidamounts := make([]*Bidamount, 0)
+	for _, gv := range getdata {
+		log.Println(gv)
+		gvid := gv.Hex()
+		pdata, b := Previousmgo.FindById(util.ObjToString(SysConfig["previousc"]), gvid, SysConfig["keyfield"])
+		if !b || len(*pdata) == 0 {
+			log.Println("oldId不存在")
+			continue
+		}
+		log.Println("pdata:", pdata)
+
+		ndata, b := Newmgo.FindById(util.ObjToString(SysConfig["newc"]), gvid, SysConfig["keyfield"])
+		if !b || len(*ndata) == 0 {
+			log.Println("nweId不存在")
+			continue
+		}
+		log.Println("ndata:", ndata)
+
+		versioncomparison := new(versionComparison)
+		versioncomparison.Id = gvid
+		versioncomparison.Url = "https://www.jianyu360.com/article/content/" + util.CommonEncodeArticle("content", gvid) + ".html"
+		for k := range SysConfig["keyfield"].(map[string]interface{}) {
+			var pd interface{}
+			var nd interface{}
+			if k == "budget" || k == "bidamount" { // amount fields are compared as float64; a value > 0 counts as present
+				pd = util.Float64All((*pdata)[k])
+				nd = util.Float64All((*ndata)[k])
+				if pd.(float64) > 0 {
+					switch k {
+					case "budget":
+						budgetNotNilNumP++
+					case "bidamount":
+						bidamountNotNilNumP++
+					}
+				}
+				if nd.(float64) > 0 {
+					switch k {
+					case "budget":
+						budgetNotNilNumN++
+					case "bidamount":
+						bidamountNotNilNumN++
+					}
+				}
+			} else { // every other field is compared as a trimmed string
+				pd = strings.TrimSpace(util.ObjToString((*pdata)[k]))
+				nd = strings.TrimSpace(util.ObjToString((*ndata)[k]))
+				if strings.TrimSpace(pd.(string)) != "" {
+					switch k {
+					case "projectname":
+						pnameNotNilNumP++
+					case "buyer":
+						buyerNotNilNumP++
+					case "projectcode":
+						pcodeNotNilNumP++
+					case "winner":
+						winnerNotNilNumP++
+					}
+				}
+				if strings.TrimSpace(nd.(string)) != "" {
+					switch k {
+					case "projectname":
+						pnameNotNilNumN++
+					case "buyer":
+						buyerNotNilNumN++
+					case "projectcode":
+						pcodeNotNilNumN++
+					case "winner":
+						winnerNotNilNumN++
+					}
+				}
+			}
+			if pd != nd {
+				//log.Println(k)
+				switch k {
+				case "projectname":
+					projectname := new(Projectname)
+					projectname.versionComparison = *versioncomparison
+					pd = strings.Trim(fmt.Sprint(pd), "项目") // strings.Trim takes a cutset: any leading/trailing 项 or 目 is stripped, not just the word "项目"
+					pd = strings.Trim(strings.TrimSpace(fmt.Sprint(pd)), "采购")
+					nd = strings.Trim(fmt.Sprint(nd), "项目")
+					nd = strings.Trim(strings.TrimSpace(fmt.Sprint(nd)), "采购")
+					if pd != nd{
+						projectname.ProjectnameOld = fmt.Sprint(pd)
+						projectname.ProjectnameNew = fmt.Sprint(nd)
+						projectnames = append(projectnames, projectname)
+						projectnamenum++
+					}else {
+							projectnamenumXT++
+					}
+				case "buyer":
+					buyer := new(Buyer)
+					buyer.versionComparison = *versioncomparison
+					buyer.BuyerOld = fmt.Sprint(pd)
+					buyer.BuyerNew = fmt.Sprint(nd)
+					buyers = append(buyers, buyer)
+					buyernum++
+				case "projectcode":
+					projectcode := new(Projectcode)
+					projectcode.ProjectcodeOld = fmt.Sprint(pd)
+					projectcode.ProjectcodeNew = fmt.Sprint(nd)
+					projectcode.versionComparison = *versioncomparison
+					projectcodes = append(projectcodes, projectcode)
+					projectcodenum++
+				case "winner":
+					winner := new(Winner)
+					winner.WinnerOld = fmt.Sprint(pd)
+					winner.WinnerNew = fmt.Sprint(nd)
+					winner.versionComparison = *versioncomparison
+					winners = append(winners, winner)
+					winnernum++
+				case "budget":
+					budget := new(Budget)
+					budget.BudgetOld = fmt.Sprint(pd)
+					budget.BudgetNew = fmt.Sprint(nd)
+					budget.versionComparison = *versioncomparison
+					budgets = append(budgets, budget)
+					budgetnum++
+				case "bidamount":
+					bidamount := new(Bidamount)
+					bidamount.BidamountOld = fmt.Sprint(pd)
+					bidamount.BidamountNew = fmt.Sprint(nd)
+					bidamount.versionComparison = *versioncomparison
+					bidamounts = append(bidamounts, bidamount)
+					bidamountnum++
+				}
+			}else {
+				// values are equal: count them, skipping pairs that are empty or "0"
+				pd = strings.TrimSpace(fmt.Sprint(pd))
+				nd = strings.TrimSpace(fmt.Sprint(nd))
+				if pd == ""||pd == "0" || nd == ""|| nd == "0"{
+					continue
+				}
+				if pd == nd {
+					switch k {
+					case "projectname":
+						projectnamenumXT++
+					case "buyer":
+						buyernumXT++
+					case "projectcode":
+						projectcodenumXT++
+					case "winner":
+						winnernumXT++
+					case "budget":
+						budgetnumXT++
+					case "bidamount":
+						bidamountnumXT++
+					}
+				}
+			}
+		}
+		fmt.Println()
+	}
+
+	//log.Println(projectcodenum, bidamountnum, winnernum, buyernum, budgetnum, projectnamenum)
+	for ins, ivs := range xf.Sheets {
+		for inr, ivr := range ivs.Rows {
+			for _, ivc := range ivr.Cells {
+				// first sheet: summary; rows with index below 3 are skipped
+				if ins == 0 {
+					if inr < 3 {
+						continue
+					}
+					switch strings.TrimSpace(ivc.String()) {
+					case "projectname":
+						style := ivr.Cells[1].GetStyle()
+						style.Font.Color = "000000"
+						ivr.Cells[1].SetValue(pnameNotNilNumP)
+						ivr.Cells[2].SetValue(pnameNotNilNumN)
+						// number of equal results
+						ivr.Cells[3].SetValue(projectnamenumXT)
+						ivr.Cells[3].SetStyle(style)
+						// number of differing results
+						ivr.Cells[4].SetValue(projectnamenum)
+						ivr.Cells[4].SetStyle(style)
+					case "buyer":
+						style := ivr.Cells[1].GetStyle()
+						style.Font.Color = "000000"
+						ivr.Cells[1].SetValue(buyerNotNilNumP)
+						ivr.Cells[2].SetValue(buyerNotNilNumN)
+						// number of equal results
+						ivr.Cells[3].SetValue(buyernumXT)
+						ivr.Cells[3].SetStyle(style)
+						// number of differing results
+						ivr.Cells[4].SetValue(buyernum)
+						ivr.Cells[4].SetStyle(style)
+					case "projectcode":
+						style := ivr.Cells[1].GetStyle()
+						style.Font.Color = "000000"
+						ivr.Cells[1].SetValue(pcodeNotNilNumP)
+						ivr.Cells[2].SetValue(pcodeNotNilNumN)
+						// number of equal results
+						ivr.Cells[3].SetValue(projectcodenumXT)
+						ivr.Cells[3].SetStyle(style)
+						// number of differing results
+						ivr.Cells[4].SetValue(projectcodenum)
+						ivr.Cells[4].SetStyle(style)
+					case "winner":
+						style := ivr.Cells[1].GetStyle()
+						style.Font.Color = "000000"
+						ivr.Cells[1].SetValue(winnerNotNilNumP)
+						ivr.Cells[2].SetValue(winnerNotNilNumN)
+						// number of equal results
+						ivr.Cells[3].SetValue(winnernumXT)
+						ivr.Cells[3].SetStyle(style)
+						// number of differing results
+						ivr.Cells[4].SetValue(winnernum)
+						ivr.Cells[4].SetStyle(style)
+					case "budget":
+						style := ivr.Cells[1].GetStyle()
+						style.Font.Color = "000000"
+						ivr.Cells[1].SetValue(budgetNotNilNumP)
+						ivr.Cells[2].SetValue(budgetNotNilNumN)
+						// number of equal results
+						ivr.Cells[3].SetValue(budgetnumXT)
+						ivr.Cells[3].SetStyle(style)
+						// number of differing results
+						ivr.Cells[4].SetValue(budgetnum)
+						ivr.Cells[4].SetStyle(style)
+					case "bidamount":
+						style := ivr.Cells[1].GetStyle()
+						style.Font.Color = "000000"
+						ivr.Cells[1].SetValue(bidamountNotNilNumP)
+						ivr.Cells[2].SetValue(bidamountNotNilNumN)
+						// number of equal results
+						ivr.Cells[3].SetValue(bidamountnumXT)
+						ivr.Cells[3].SetStyle(style)
+						// number of differing results
+						ivr.Cells[4].SetValue(bidamountnum)
+						ivr.Cells[4].SetStyle(style)
+					}
+				}
+			}
+		}
+		if ins > 0 {
+			if len(ivs.Rows) == 0 {
+				row := ivs.AddRow()
+				row.AddCell().SetValue("ObjectId")
+				row.AddCell().SetValue("dev3.1.2")
+				row.AddCell().SetValue("dev3.2")
+				row.AddCell().SetValue("URL")
+			}
+			//log.Println(ivs.Name)
+			switch strings.TrimSpace(ivs.Name) {
+			case "projectname":
+				for _, v := range projectnames {
+					row := ivs.AddRow()
+					row.AddCell().SetValue(v.Id)
+					row.AddCell().SetValue(v.ProjectnameOld)
+					row.AddCell().SetValue(v.ProjectnameNew)
+					row.AddCell().SetValue(v.Url)
+				}
+			case "buyer":
+				for _, v := range buyers {
+					row := ivs.AddRow()
+					row.AddCell().SetValue(v.Id)
+					row.AddCell().SetValue(v.BuyerOld)
+					row.AddCell().SetValue(v.BuyerNew)
+					row.AddCell().SetValue(v.Url)
+				}
+			case "projectcode":
+				for _, v := range projectcodes {
+					row := ivs.AddRow()
+					row.AddCell().SetValue(v.Id)
+					row.AddCell().SetValue(v.ProjectcodeOld)
+					row.AddCell().SetValue(v.ProjectcodeNew)
+					row.AddCell().SetValue(v.Url)
+				}
+			case "winner":
+				for _, v := range winners {
+					row := ivs.AddRow()
+					row.AddCell().SetValue(v.Id)
+					row.AddCell().SetValue(v.WinnerOld)
+					row.AddCell().SetValue(v.WinnerNew)
+					row.AddCell().SetValue(v.Url)
+				}
+			case "budget":
+				for _, v := range budgets {
+					row := ivs.AddRow()
+					row.AddCell().SetValue(v.Id)
+					row.AddCell().SetValue(v.BudgetOld)
+					row.AddCell().SetValue(v.BudgetNew)
+					row.AddCell().SetValue(v.Url)
+				}
+			case "bidamount":
+				for _, v := range bidamounts {
+					row := ivs.AddRow()
+					row.AddCell().SetValue(v.Id)
+					row.AddCell().SetValue(v.BidamountOld)
+					row.AddCell().SetValue(v.BidamountNew)
+					row.AddCell().SetValue(v.Url)
+				}
+			}
+		}
+	}
+	err = xf.Save("resultdata.xlsx")
+	if err != nil {
+		log.Println("保存xlsx失败:", err)
+		return
+	}
+	log.Println("xlsx保存成功")
+}
+
+type Projectname struct { // Projectname records one projectname mismatch: previous and new value plus the embedded id/url.
+	versionComparison
+	ProjectnameOld string
+	ProjectnameNew string
+}
+type Buyer struct { // Buyer records one buyer mismatch: previous and new value plus the embedded id/url.
+	versionComparison
+	BuyerOld string
+	BuyerNew string
+}
+type Projectcode struct { // Projectcode records one projectcode mismatch: previous and new value plus the embedded id/url.
+	versionComparison
+	ProjectcodeOld string
+	ProjectcodeNew string
+}
+type Winner struct { // Winner records one winner mismatch: previous and new value plus the embedded id/url.
+	versionComparison
+	WinnerOld string
+	WinnerNew string
+}
+type Budget struct { // Budget records one budget mismatch: previous and new value (as printed strings) plus the embedded id/url.
+	versionComparison
+	BudgetOld string
+	BudgetNew string
+}
+type Bidamount struct { // Bidamount records one bidamount mismatch: previous and new value (as printed strings) plus the embedded id/url.
+	versionComparison
+	BidamountOld string
+	BidamountNew string
+}

+ 142 - 385
versioncomparison/main.go

@@ -1,423 +1,180 @@
+/*
+抽取结果对比
+*/
 package main
 
 import (
+	"flag"
 	"fmt"
-	"gopkg.in/mgo.v2"
-	"gopkg.in/mgo.v2/bson"
 	"jy/mongodbutil"
 	"log"
-	"qfw/common/src/github.com/tealeg/xlsx"
-	"qfw/util"
-	"strings"
+	qu "qfw/util"
+
+	"github.com/tealeg/xlsx"
+	"gopkg.in/mgo.v2/bson"
 )
 
 var (
-	SysConfig   map[string]interface{}
-	Extractmgo  *mgo.Session      //抽取
-	Previousmgo *mongodbutil.Pool //之前抽取
-	Newmgo      *mongodbutil.Pool //最新抽取
+	SysConfig map[string]interface{}
+	Premgo    *mongodbutil.Pool //上个版本库
+	Newmgo    *mongodbutil.Pool //当前版本库
+	FieldData map[string]map[string]*Data
+	Compares  map[string]*Compare
+	Sid, Eid  string
+	Fields    []string
 )
 
-/**
-与上个抽取版本做比较
- */
-func init() {
-	util.ReadConfig(&SysConfig)
-	if len(SysConfig) < 1 {
-		log.Println("配置文件读取失败")
-		return
-	}
-	session, e := mgo.Dial(util.ObjToString(SysConfig["extractmgo"]))
-	if e != nil {
-		log.Fatal(e)
-	}
-	Extractmgo = session
-	Previousmgo = mongodbutil.MgoFactory(2, 5, 120, util.ObjToString(SysConfig["previousmgo"]), util.ObjToString(SysConfig["previousdb"]))
-	Newmgo = mongodbutil.MgoFactory(2, 5, 120, util.ObjToString(SysConfig["newmgo"]), util.ObjToString(SysConfig["newdb"]))
+type Compare struct {
+	Field                string // field (attribute) name
+	PreExtNum, NewExtNum int    // number of records with a value in the previous / current version
+	PreNilnum, NewNilnum int    // number of records without a value in the previous / current version
+	EqNum, NEqNum        int    // number of records whose values are equal / not equal
 }
 
-type versionComparison struct {
-	Id  interface{} `json:"_id"`
-	Url string      `json:"url"`
+type Data struct {
+	Id             string // document id as a string
+	PreVal, NewVal string // printed field value from the previous version and, presumably, from the current version - NewVal assignment is not visible here
 }
 
+func init() { // init parses the -sid/-eid flags, reads SysConfig, opens both result pools and prepares the field list and result maps.
+	flag.StringVar(&Sid, "sid", "5d348c0ca5cb26b9b76a4bb8", "开始id")
+	flag.StringVar(&Eid, "eid", "5d34ae22a5cb26b9b7850b43", "结束id")
+	flag.Parse()
+	qu.ReadConfig(&SysConfig)
+	Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
+	Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"]))
+	tmp, _ := SysConfig["fields"].([]interface{}) // a missing or non-array "fields" entry leaves tmp nil, so Fields stays empty
+	for _, v := range tmp {
+		Fields = append(Fields, qu.ObjToString(v))
+	}
+	FieldData = map[string]map[string]*Data{}
+	Compares = map[string]*Compare{}
+}
 func main() {
-	Query(util.IntAll(SysConfig["queryNum"]), util.ObjToString(SysConfig["querySid"]))
+	getVersionData()
+	createXlsx()
 }
-
-func Query(num int, sid string) {
-	xf, err := xlsx.OpenFile("抽取结果对比.xlsx")
+func createXlsx() {
+	xf, err := xlsx.OpenFile("template.xlsx")
 	if err != nil {
-		log.Println("读取文件", err)
-		return
-	}
-	var projectcodenum, bidamountnum, winnernum, buyernum, budgetnum, projectnamenum int //不相等计数器
-	var projectcodenumXT, bidamountnumXT, winnernumXT, buyernumXT, budgetnumXT, projectnamenumXT int      //相等计数器
-	var pcodeNotNilNumP, bidamountNotNilNumP, winnerNotNilNumP, buyerNotNilNumP, budgetNotNilNumP, pnameNotNilNumP int //不相等计数器
-	var pcodeNotNilNumN, bidamountNotNilNumN, winnerNotNilNumN, buyerNotNilNumN, budgetNotNilNumN, pnameNotNilNumN int //不相等计数器
-
-	log.Println(num, sid)
-	if num < 1 {
-		log.Println("查询数量应该大于0")
+		log.Println(err)
 		return
 	}
-	sum := num //总量
-	//if strings.TrimSpace(gteid) == "" {
-	//	gteid = "386cd3000000000000000000"
-	//}
-	var iter *mgo.Iter
-	if sid == "" {
-		iter = Extractmgo.DB(util.ObjToString(SysConfig["extractdb"])).C(util.ObjToString(SysConfig["extractc"])).Find(nil).Select(bson.M{"_id": 1}).Iter()
-	} else {
-		iter = Extractmgo.DB(util.ObjToString(SysConfig["extractdb"])).C(util.ObjToString(SysConfig["extractc"])).Find(bson.M{"_id": bson.M{
-			"$gte": bson.ObjectIdHex(sid)},
-		}).Select(bson.M{"_id": 1}).Iter()
-	}
-	defer log.Println("关闭 iter:", iter.Close())
-	var data map[string]bson.ObjectId
-	getdata := make([]bson.ObjectId, 0)
-	for iter.Next(&data) {
-		if num == 0 {
-			break
+	//生成第一个sheet信息
+	sh := xf.Sheets[0]
+	for i, field := range Fields {
+		for k, row := range sh.Rows {
+			if k > 2+i {
+				style := (*row).Cells[1].GetStyle()
+				style.Font.Color = "000000"
+				(*row).Cells[0].SetString(field)
+				(*row).Cells[1].SetInt(Compares[field].PreExtNum)
+				(*row).Cells[1].SetStyle(style)
+				(*row).Cells[2].SetInt(Compares[field].NewExtNum)
+				(*row).Cells[2].SetStyle(style)
+				(*row).Cells[3].SetInt(Compares[field].EqNum)
+				(*row).Cells[3].SetStyle(style)
+				(*row).Cells[4].SetInt(Compares[field].NEqNum)
+				(*row).Cells[4].SetStyle(style)
+			}
+			sh.Rows[k] = row
 		}
-		getdata = append(getdata, data["_id"])
-		num--
 	}
-	log.Println(sum, "条数据加载完成")
-	projectnames := make([]*Projectname, 0)
-	buyers := make([]*Buyer, 0)
-	projectcodes := make([]*Projectcode, 0)
-	winners := make([]*Winner, 0)
-	budgets := make([]*Budget, 0)
-	bidamounts := make([]*Bidamount, 0)
-	for _, gv := range getdata {
-		log.Println(gv)
-		gvid := gv.Hex()
-		pdata, b := Previousmgo.FindById(util.ObjToString(SysConfig["previousc"]), gvid, SysConfig["keyfield"])
-		if !b || len(*pdata) == 0 {
-			log.Println("oldId不存在")
-			continue
-		}
-		log.Println("pdata:", pdata)
-
-		ndata, b := Newmgo.FindById(util.ObjToString(SysConfig["newc"]), gvid, SysConfig["keyfield"])
-		if !b || len(*ndata) == 0 {
-			log.Println("nweId不存在")
-			continue
+	//生成信息sheet
+	url := "https://www.jianyu360.com/article/content/%s.html"
+	for _, field := range Fields {
+		sh, _ := xf.AddSheet(field)
+		rowh := sh.AddRow()
+		rowh.AddCell().SetString("id")
+		rowh.AddCell().SetString("preval")
+		rowh.AddCell().SetString("newval")
+		rowh.AddCell().SetString("url")
+		tmp := FieldData[field]
+		for k, v := range tmp {
+			if v.NewVal != v.PreVal {
+				row := sh.AddRow()
+				row.AddCell().SetString(k)
+				row.AddCell().SetString(v.PreVal)
+				row.AddCell().SetString(v.NewVal)
+				row.AddCell().SetString(fmt.Sprintf(url, qu.CommonEncodeArticle("content", v.Id)))
+			}
 		}
-		log.Println("ndata:", ndata)
-
-		versioncomparison := new(versionComparison)
-		versioncomparison.Id = gvid
-		versioncomparison.Url = "https://www.jianyu360.com/article/content/" + util.CommonEncodeArticle("content", gvid) + ".html"
-		for k := range SysConfig["keyfield"].(map[string]interface{}) {
-			var pd interface{}
-			var nd interface{}
-			if k == "budget" || k == "bidamount" {
-				pd = util.Float64All((*pdata)[k])
-				nd = util.Float64All((*ndata)[k])
-				if pd.(float64) > 0 {
-					switch k {
-					case "budget":
-						budgetNotNilNumP++
-					case "bidamount":
-						bidamountNotNilNumP++
-					}
-				}
-				if nd.(float64) > 0 {
-					switch k {
-					case "budget":
-						budgetNotNilNumN++
-					case "bidamount":
-						bidamountNotNilNumN++
-					}
-				}
-			} else {
-				pd = strings.TrimSpace(util.ObjToString((*pdata)[k]))
-				nd = strings.TrimSpace(util.ObjToString((*ndata)[k]))
-				if strings.TrimSpace(pd.(string)) != "" {
-					switch k {
-					case "projectname":
-						pnameNotNilNumP++
-					case "buyer":
-						buyerNotNilNumP++
-					case "projectcode":
-						pcodeNotNilNumP++
-					case "winner":
-						winnerNotNilNumP++
-					}
-				}
-				if strings.TrimSpace(nd.(string)) != "" {
-					switch k {
-					case "projectname":
-						pnameNotNilNumN++
-					case "buyer":
-						buyerNotNilNumN++
-					case "projectcode":
-						pcodeNotNilNumN++
-					case "winner":
-						winnerNotNilNumN++
-					}
-				}
+	}
+	err = xf.Save("result.xlsx")
+	if err != nil {
+		log.Println("保存xlsx失败:", err)
+		return
+	}
+	log.Println("xlsx保存成功")
+}
+// getVersionData loads the documents whose _id lies in [Sid, Eid] from the
+// previous-version collection (SysConfig["prec"]) and the new-version
+// collection (SysConfig["newc"]), records every configured field's value per
+// document in FieldData, and then fills Compares with per-field statistics.
+//
+// A field that is absent (nil) in a document is stored as the empty string so
+// that the `== ""` checks in the tallying loop count it as "not extracted";
+// plain fmt.Sprint would have rendered it as "<nil>".
+func getVersionData() {
+	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}}
+	log.Println(qu.ObjToString(SysConfig["prec"]), query)
+	list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{}`, false, -1, -1)
+	if list1 == nil {
+		// NOTE(review): assumes Find may hand back a nil result on failure — confirm against the mongodb helper.
+		log.Println("pre version 加载失败")
+		return
+	}
+	for _, v := range *list1 {
+		id := qu.BsonIdToSId(v["_id"])
+		for _, key := range Fields {
+			rd := FieldData[key]
+			if rd == nil {
+				rd = map[string]*Data{}
 			}
-			if pd != nd {
-				//log.Println(k)
-				switch k {
-				case "projectname":
-					projectname := new(Projectname)
-					projectname.versionComparison = *versioncomparison
-					pd = strings.Trim(fmt.Sprint(pd), "项目")
-					pd = strings.Trim(strings.TrimSpace(fmt.Sprint(pd)), "采购")
-					nd = strings.Trim(fmt.Sprint(nd), "项目")
-					nd = strings.Trim(strings.TrimSpace(fmt.Sprint(nd)), "采购")
-					if pd != nd{
-						projectname.ProjectnameOld = fmt.Sprint(pd)
-						projectname.ProjectnameNew = fmt.Sprint(nd)
-						projectnames = append(projectnames, projectname)
-						projectnamenum++
-					}else {
-							projectnamenumXT++
-					}
-				case "buyer":
-					buyer := new(Buyer)
-					buyer.versionComparison = *versioncomparison
-					buyer.BuyerOld = fmt.Sprint(pd)
-					buyer.BuyerNew = fmt.Sprint(nd)
-					buyers = append(buyers, buyer)
-					buyernum++
-				case "projectcode":
-					projectcode := new(Projectcode)
-					projectcode.ProjectcodeOld = fmt.Sprint(pd)
-					projectcode.ProjectcodeNew = fmt.Sprint(nd)
-					projectcode.versionComparison = *versioncomparison
-					projectcodes = append(projectcodes, projectcode)
-					projectcodenum++
-				case "winner":
-					winner := new(Winner)
-					winner.WinnerOld = fmt.Sprint(pd)
-					winner.WinnerNew = fmt.Sprint(nd)
-					winner.versionComparison = *versioncomparison
-					winners = append(winners, winner)
-					winnernum++
-				case "budget":
-					budget := new(Budget)
-					budget.BudgetOld = fmt.Sprint(pd)
-					budget.BudgetNew = fmt.Sprint(nd)
-					budget.versionComparison = *versioncomparison
-					budgets = append(budgets, budget)
-					budgetnum++
-				case "bidamount":
-					bidamount := new(Bidamount)
-					bidamount.BidamountOld = fmt.Sprint(pd)
-					bidamount.BidamountNew = fmt.Sprint(nd)
-					bidamount.versionComparison = *versioncomparison
-					bidamounts = append(bidamounts, bidamount)
-					bidamountnum++
-				}
-			}else {
-				//相同统计
-				pd = strings.TrimSpace(fmt.Sprint(pd))
-				nd = strings.TrimSpace(fmt.Sprint(nd))
-				if pd == ""||pd == "0" || nd == ""|| nd == "0"{
-					continue
-				}
-				if pd == nd {
-					switch k {
-					case "projectname":
-						projectnamenumXT++
-					case "buyer":
-						buyernumXT++
-					case "projectcode":
-						projectcodenumXT++
-					case "winner":
-						winnernumXT++
-					case "budget":
-						budgetnumXT++
-					case "bidamount":
-						bidamountnumXT++
-					}
-				}
+			rd[id] = &Data{
+				Id:     id,
+				PreVal: versionFieldString(v[key]),
 			}
+			FieldData[key] = rd
 		}
-		fmt.Println()
 	}
+	log.Println("pre version 加载完成", len(*list1))
 
-	//log.Println(projectcodenum, bidamountnum, winnernum, buyernum, budgetnum, projectnamenum)
-	for ins, ivs := range xf.Sheets {
-		for inr, ivr := range ivs.Rows {
-			for _, ivc := range ivr.Cells {
-				//抽取对比
-				if ins == 0 {
-					if inr < 3 {
-						continue
-					}
-					switch strings.TrimSpace(ivc.String()) {
-					case "projectname":
-						style := ivr.Cells[1].GetStyle()
-						style.Font.Color = "000000"
-						ivr.Cells[1].SetValue(pnameNotNilNumP)
-						ivr.Cells[2].SetValue(pnameNotNilNumN)
-						//结果相同数量
-						ivr.Cells[3].SetValue(projectnamenumXT)
-						ivr.Cells[3].SetStyle(style)
-						//结果不同数量
-						ivr.Cells[4].SetValue(projectnamenum)
-						ivr.Cells[4].SetStyle(style)
-					case "buyer":
-						style := ivr.Cells[1].GetStyle()
-						style.Font.Color = "000000"
-						ivr.Cells[1].SetValue(buyerNotNilNumP)
-						ivr.Cells[2].SetValue(buyerNotNilNumN)
-						//结果相同数量
-						ivr.Cells[3].SetValue(buyernumXT)
-						ivr.Cells[3].SetStyle(style)
-						//结果不同数量
-						ivr.Cells[4].SetValue(buyernum)
-						ivr.Cells[4].SetStyle(style)
-					case "projectcode":
-						style := ivr.Cells[1].GetStyle()
-						style.Font.Color = "000000"
-						ivr.Cells[1].SetValue(pcodeNotNilNumP)
-						ivr.Cells[2].SetValue(pcodeNotNilNumN)
-						//结果相同数量
-						ivr.Cells[3].SetValue(projectcodenumXT)
-						ivr.Cells[3].SetStyle(style)
-						//结果不同数量
-						ivr.Cells[4].SetValue(projectcodenum)
-						ivr.Cells[4].SetStyle(style)
-					case "winner":
-						style := ivr.Cells[1].GetStyle()
-						style.Font.Color = "000000"
-						ivr.Cells[1].SetValue(winnerNotNilNumP)
-						ivr.Cells[2].SetValue(winnerNotNilNumN)
-						//结果相同数量
-						ivr.Cells[3].SetValue(winnernumXT)
-						ivr.Cells[3].SetStyle(style)
-						//结果不同数量
-						ivr.Cells[4].SetValue(winnernum)
-						ivr.Cells[4].SetStyle(style)
-					case "budget":
-						style := ivr.Cells[1].GetStyle()
-						style.Font.Color = "000000"
-						ivr.Cells[1].SetValue(budgetNotNilNumP)
-						ivr.Cells[2].SetValue(budgetNotNilNumN)
-						//结果相同数量
-						ivr.Cells[3].SetValue(budgetnumXT)
-						ivr.Cells[3].SetStyle(style)
-						//结果不同数量
-						ivr.Cells[4].SetValue(budgetnum)
-						ivr.Cells[4].SetStyle(style)
-					case "bidamount":
-						style := ivr.Cells[1].GetStyle()
-						style.Font.Color = "000000"
-						ivr.Cells[1].SetValue(bidamountNotNilNumP)
-						ivr.Cells[2].SetValue(bidamountNotNilNumN)
-						//结果相同数量
-						ivr.Cells[3].SetValue(bidamountnumXT)
-						ivr.Cells[3].SetStyle(style)
-						//结果不同数量
-						ivr.Cells[4].SetValue(bidamountnum)
-						ivr.Cells[4].SetStyle(style)
-					}
+	list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{}`, false, -1, -1)
+	if list2 == nil {
+		// NOTE(review): same assumption as for the pre-version load above in this function.
+		log.Println("new version 加载失败")
+		return
+	}
+	for _, v := range *list2 {
+		id := qu.BsonIdToSId(v["_id"])
+		for _, field := range Fields {
+			rd := FieldData[field]
+			if rd == nil {
+				rd = map[string]*Data{}
+			}
+			newVal := versionFieldString(v[field])
+			if d := rd[id]; d != nil {
+				// d is the pointer stored in the map, so updating it in place is enough.
+				d.NewVal = newVal
+			} else {
+				// Document exists only in the new version; record its Id just
+				// like the pre-version entries do.
+				rd[id] = &Data{
+					Id:     id,
+					NewVal: newVal,
 				}
 			}
+			FieldData[field] = rd
 		}
-		if ins > 0 {
-			if len(ivs.Rows) == 0 {
-				row := ivs.AddRow()
-				row.AddCell().SetValue("ObjectId")
-				row.AddCell().SetValue("dev3.1.2")
-				row.AddCell().SetValue("dev3.2")
-				row.AddCell().SetValue("URL")
-			}
-			//log.Println(ivs.Name)
-			switch strings.TrimSpace(ivs.Name) {
-			case "projectname":
-				for _, v := range projectnames {
-					row := ivs.AddRow()
-					row.AddCell().SetValue(v.Id)
-					row.AddCell().SetValue(v.ProjectnameOld)
-					row.AddCell().SetValue(v.ProjectnameNew)
-					row.AddCell().SetValue(v.Url)
-				}
-			case "buyer":
-				for _, v := range buyers {
-					row := ivs.AddRow()
-					row.AddCell().SetValue(v.Id)
-					row.AddCell().SetValue(v.BuyerOld)
-					row.AddCell().SetValue(v.BuyerNew)
-					row.AddCell().SetValue(v.Url)
-				}
-			case "projectcode":
-				for _, v := range projectcodes {
-					row := ivs.AddRow()
-					row.AddCell().SetValue(v.Id)
-					row.AddCell().SetValue(v.ProjectcodeOld)
-					row.AddCell().SetValue(v.ProjectcodeNew)
-					row.AddCell().SetValue(v.Url)
-				}
-			case "winner":
-				for _, v := range winners {
-					row := ivs.AddRow()
-					row.AddCell().SetValue(v.Id)
-					row.AddCell().SetValue(v.WinnerOld)
-					row.AddCell().SetValue(v.WinnerNew)
-					row.AddCell().SetValue(v.Url)
+	}
+	log.Println("new version 加载完成", len(*list2))
+	// Tally per-field statistics. A value present on both sides is either equal
+	// or different; a value present on only one side counts as different; a
+	// value empty on both sides only increments the two nil counters.
+	for k, v := range FieldData {
+		cp := &Compare{Field: k}
+		for _, d := range v {
+			if d.NewVal != "" && d.PreVal != "" {
+				if d.NewVal == d.PreVal {
+					cp.EqNum++
+				} else {
+					cp.NEqNum++
 				}
-			case "budget":
-				for _, v := range budgets {
-					row := ivs.AddRow()
-					row.AddCell().SetValue(v.Id)
-					row.AddCell().SetValue(v.BudgetOld)
-					row.AddCell().SetValue(v.BudgetNew)
-					row.AddCell().SetValue(v.Url)
+				cp.PreExtNum++
+				cp.NewExtNum++
+			} else {
+				if d.NewVal == "" {
+					cp.NewNilnum++
+					if d.PreVal != "" {
+						cp.NEqNum++
+						cp.PreExtNum++
+					}
 				}
-			case "bidamount":
-				for _, v := range bidamounts {
-					row := ivs.AddRow()
-					row.AddCell().SetValue(v.Id)
-					row.AddCell().SetValue(v.BidamountOld)
-					row.AddCell().SetValue(v.BidamountNew)
-					row.AddCell().SetValue(v.Url)
+				if d.PreVal == "" {
+					cp.PreNilnum++
+					if d.NewVal != "" {
+						cp.NewExtNum++
+						cp.NEqNum++
+					}
 				}
 			}
 		}
+		Compares[k] = cp
 	}
-	err = xf.Save("resultdata.xlsx")
-	if err != nil {
-		log.Println("保存xlsx失败:", err)
-		return
-	}
-	log.Println("xlsx保存成功")
-}
-
-type Projectname struct {
-	versionComparison
-	ProjectnameOld string
-	ProjectnameNew string
-}
-type Buyer struct {
-	versionComparison
-	BuyerOld string
-	BuyerNew string
-}
-type Projectcode struct {
-	versionComparison
-	ProjectcodeOld string
-	ProjectcodeNew string
-}
-type Winner struct {
-	versionComparison
-	WinnerOld string
-	WinnerNew string
-}
-type Budget struct {
-	versionComparison
-	BudgetOld string
-	BudgetNew string
-}
-type Bidamount struct {
-	versionComparison
-	BidamountOld string
-	BidamountNew string
 }
+
+// versionFieldString renders a document field for comparison. A nil value
+// (the key is absent from the document) yields "" instead of the "<nil>"
+// that fmt.Sprint would produce.
+func versionFieldString(val interface{}) string {
+	if val == nil {
+		return ""
+	}
+	return fmt.Sprint(val)
+}

二進制
versioncomparison/template.xlsx


二進制
versioncomparison/抽取结果对比.xlsx