fengweiqiang 5 anos atrás
pai
commit
2ff8449801

+ 56 - 125
src/jy/extract/extract.go

@@ -38,26 +38,17 @@ var (
 	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
-func closeDb(ext *ExtractTask) { // no-op: every statement in the body is commented out, so nothing is closed
-	//if ext.TaskInfo.FDB != nil {
-	//	s := ext.TaskInfo.FDB.Get()
-	//	db.Mgo.Close(s)
-	//}
-	//if ext.TaskInfo.TDB != nil {
-	//	s := ext.TaskInfo.TDB.Get()
-	//	db.Mgo.Close(s)
-	//}
-}
-
 //启动测试抽取
 func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
 	defer qu.Catch()
-	ext := &ExtractTask{}
-	ext.Id = taskId
-	ext.IsRun = true
-	ext.InitTestTaskInfo(resultcoll, trackcoll)
-	ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
-	defer closeDb(ext)
+	ext := TaskList[taskId]
+	if ext == nil {
+		ext = &ExtractTask{}
+		ext.Id = taskId
+		ext.InitTestTaskInfo(resultcoll, trackcoll)
+		ext.IsRun = true
+		ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+	}
 	ext.InitSite()
 	ext.InitRulePres()
 	ext.InitRuleBacks(false)
@@ -71,13 +62,13 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ext.InitTag(true)
 	ext.InitClearFn(false)
 	ext.InitClearFn(true)
-	if ext.IsExtractCity { //版本上控制是否开始城市抽取
-		//初始化城市DFA信息
-		ext.InitCityInfo()
-		//ext.InitCityDFA()
-		ext.InitAreaCode()
-		ext.InitPostCode()
-	}
+	//if ext.IsExtractCity { //版本上控制是否开始城市抽取
+	//	//初始化城市DFA信息
+	//	ext.InitCityInfo()
+	//	//ext.InitCityDFA()
+	//	ext.InitAreaCode()
+	//	ext.InitPostCode()
+	//}
 	//质量审核
 	ext.InitAuditFields()
 	ext.InitAuditRule()
@@ -90,6 +81,8 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
 	//附件抽取是否开启
 	ext.InitFile()
+	ext.TaskInfo.TestColl = resultcoll
+	TaskList[taskId] = ext
 	return RunExtractTestTask(ext, startId, num)
 }
 
@@ -106,7 +99,10 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
 		query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
 		list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
 		for _, v := range *list {
-			if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
+			//if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
+			//	continue
+			//}
+			if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl"{ //临时
 				continue
 			}
 			var j, jf *ju.Job
@@ -142,7 +138,6 @@ func StartExtractTaskId(taskId string) bool {
 	}
 	ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
 	ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
-	defer closeDb(ext)
 	ext.InitSite()
 	ext.InitRulePres()
 	ext.InitRuleBacks(false)
@@ -205,7 +200,6 @@ func RunExtractTask(taskId string) {
 	ext := TaskList[taskId]
 	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
 	count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
-	defer closeDb(ext)
 	pageNum := (count + PageSize - 1) / PageSize
 	limit := PageSize
 	if count < PageSize {
@@ -217,7 +211,10 @@ func RunExtractTask(taskId string) {
 		list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
 		// Log the page number, the query and the number of documents fetched.
 		// The format string previously had two verbs for three arguments, which
 		// made fmt append "%!(EXTRA int=...)" to the output.
 		fmt.Printf("page=%d,query=%v,len=%d", i+1, query, len(*list))
 		for _, v := range *list {
-			if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
+			//if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
+			//	continue
+			//}
+			if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl"{ //临时
 				continue
 			}
 			//根据标题判断是否抽取
@@ -256,6 +253,8 @@ func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
 	return (&ExtractTask{}).PreInfo(doc)
 }
 
+var clearnn *regexp.Regexp = regexp.MustCompile("名[\\s]+称") // matches "名" and "称" separated by one or more whitespace characters
+
 //信息预处理-和版本关联
 func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
 	defer qu.Catch()
@@ -412,7 +411,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 func file2text(doc *map[string]interface{}) {
 	tmpstr := ""
 	//if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
-		if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
+	if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
 		for _, attachs := range attach_text {
 			if fileinfos, ok := attachs.(map[string]interface{}); ok {
 				for _, fileinfo := range fileinfos {
@@ -607,7 +606,6 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				lockclear.Lock()
 				var cfn = []string{}
 				if isSite {
-					cfn = e.SiteClearFn[key]
 				} else {
 					cfn = e.ClearFn[key]
 				}
@@ -1144,7 +1142,7 @@ func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]ma
 				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
 					"code":        "winnerorder",
 					"field":       vc.Field,
-					"ruletext":    "中标候选人_" + v["sortstr"].(string),
+					"ruletext":    "中标候选人_" + fmt.Sprint(v["sortstr"]),
 					"extfrom":     v["sortstr"],
 					"sourcevalue": v["price"],
 					"value":       v["price"],
@@ -1168,34 +1166,6 @@ func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]ma
 				return kvmap, false
 			}
 		}
-		//else if vc.Field == "winner" {
-		//	for _, v := range j.Winnerorder {
-		//		kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
-		//			"code":        "winnerorder",
-		//			"field":       vc.Field,
-		//			"ruletext":    "中标候选人",
-		//			"extfrom":     vc.ExtFrom,
-		//			"sourcevalue": "中标候选人",
-		//			"value":       v["entname"],
-		//			"type":        "winnerorder",
-		//			"matchtype":   "winnerorder",
-		//		})
-		//	}
-		//	//候选人中标单位
-		//	if entname := j.Winnerorder[0]["entname"]; entname != nil {
-		//		kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
-		//			"code":        "CL_中标候选人",
-		//			"field":       vc.Field,
-		//			"ruletext":    "中标候选人",
-		//			"extfrom":     vc.ExtFrom,
-		//			"sourcevalue": "中标候选人",
-		//			"value":       entname,
-		//			"type":        "winnerorder",
-		//			"matchtype":   "winnerorder",
-		//		})
-		//		return kvmap, false
-		//	}
-		//}
 	}
 	for fieldname, field := range vc.LFields {
 		if field != vc.Field {
@@ -1716,11 +1686,10 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		tmp["spidercode"] = j.SpiderCode
 		tmp["site"] = j.Site
 		tmp["jsondata"] = j.Jsondata
-		tmp["fieldall"] = auxinfo
 		for _, val := range result {
 			for _, v := range val { //取第一个非负数,项目名称除外
 				//存0是否有效
-				if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue {
+				if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue{
 					tmp[v.Field] = v.Value
 					break
 				}
@@ -1763,7 +1732,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				if qu.Float64All(tmp["budget"]) < tmpBudget {
 					tmp["budget"] = tmpBudget
 				}
-				if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
+				if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/100 > qu.Float64All(tmp["budget"])) {
+					tmp["bidamount"] = tmpBidamount
+				} else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
 					tmp["bidamount"] = tmpBidamount
 				}
 			} else {
@@ -1814,11 +1785,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		}
 		//处理附件
 		var resultf map[string][]*ju.ExtField
+		ffield := map[string]interface{}{}
 		if jf != nil {
 			_, resultf, _ = funcAnalysis(jf, e)
-			auxinfof := auxInfo(jf)
-			tmp["fieldallf"] = auxinfof
-			ffield := map[string]interface{}{}
 			for _, val := range resultf {
 				for _, v := range val { //取第一个非负数
 					if v.Score > -1 {
@@ -1833,16 +1802,15 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			if len(jf.Winnerorder) > 0 { //候选人信息
 				ffield["winnerorder"] = jf.Winnerorder
 			}
-			tmp["ffield"] = ffield
 		}
 		for k, v := range *doc {
 			if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
 				(*doc)[k] = []rune(qu.ObjToString(v))[:100000]
 			}
 			//去重冗余字段
-			//if delFiled(k) {
-			//	continue
-			//}
+			if delFiled(k) {
+				continue
+			}
 			if tmp[k] == nil {
 				tmp[k] = v
 			}
@@ -1854,14 +1822,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if e.IsExtractCity { //城市抽取
 			//e.ExtractCity(j, tmp, _id)
 			e.NewExtractCity(j, &tmp, _id)
-
-			//			b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
-			//			// log.Debug("省份---", p, "城市---", c, "区---", d)
-			//			tmp["district"] = d
-			//			if b {
-			//				tmp["city"] = c
-			//				tmp["area"] = p
-			//			}
 		}
 		//品牌抽取
 		if ju.IsBrandGoods {
@@ -1945,27 +1905,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				}
 			}
 		}
-		//		fmt.Println("=============抽取结果================")
-		//		for k, v := range tmp {
-		//			qu.Debug(k, "---", v)
-		//		}
-		//tmp["extract_content"] = j.Content
 		tmp["dataging"] = j.Dataging
-		if attach_text, ok := (tmp)["new_attach_text"].(map[string]interface{}); ok {
-			//if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
-			for ai, attachs := range attach_text {
-				if fileinfos, ok := attachs.(map[string]interface{}); ok {
-					for fi, fileinfo := range fileinfos {
-						if ff, ok := fileinfo.(map[string]interface{}); ok {
-							attach_url := qu.ObjToString(ff["attach_url"])
-							if utf8.RuneCountInString(attach_url) > qu.IntAllDef(ju.Config["filelength"], 10000) {
-								(tmp)["new_attach_text"].(map[string]interface{})[ai].((map[string]interface{}))[fi].(map[string]interface{})["attach_url"] = "文本过长..."
-							}
-						}
-					}
-				}
-			}
-		} //}budget bidamount
+		//budget bidamount
 		if bg, ok := tmp["budget"].(float64); ok && bg >= 500000000000 {
 			delete(tmp, "budget")
 		}
@@ -1977,6 +1918,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if tmp["projectname"] == nil || tmp["projectname"] == "" {
 			tmp["projectname"] = j.Title
 		}
+		tmp["repeat"] = 0
+
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				/*	if len(e.SiteFields) <= 0 {
@@ -1992,7 +1935,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 						//	}
 						//}
 					}*/
-				tmp["repeat"] = 0
 				tmparr := []map[string]interface{}{
 					map[string]interface{}{
 						"_id": qu.StringTOBsonId(_id),
@@ -2022,6 +1964,16 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		} else { //测试结果
 			delete(tmp, "_id")
 			delete(tmp, "fieldall")
+			auxinfof := auxInfo(jf)
+			if len(auxinfo) > 0 {
+				tmp["fieldall"] = auxinfo
+			}
+			if len(auxinfof) > 0 {
+				tmp["fieldallf"] = auxinfof
+			}
+			if len(ffield) > 0 {
+				tmp["ffield"] = ffield
+			}
 			if len(j.BlockPackage) > 0 { //分包详情
 				if len(j.BlockPackage) > 10 {
 					tmp["epackage"] = "分包异常"
@@ -2047,6 +1999,8 @@ func checkFields(tmp map[string]interface{}) map[string]interface{} {
 	delete(tmp, "detail")
 	if _, ok := tmp["bidamount"].(string); ok {
 		delete(tmp, "bidamount")
+	} else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && fb/100 > qu.Float64All(tmp["budget"]) {
+		delete(tmp, "bidamount")
 	}
 	if _, ok := tmp["budget"].(string); ok {
 		delete(tmp, "budget")
@@ -2215,6 +2169,9 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 //辅助信息,如果没有排序先排序
 func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
 	fieldalls := map[string][]map[string]interface{}{}
+	if j == nil {
+		return fieldalls
+	}
 	qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
 	defer qykredis.Close()
 	db := 0
@@ -2373,17 +2330,7 @@ func resetWinnerorder(j *ju.Job) {
 	//i := 0
 	winners := []*ju.ExtField{}
 	bidamounts := []*ju.ExtField{}
-	//for _, v := range j.Result["winner"] {
-	//	if v.Code == "winnerorder" {
-	//		if maxlen < i {
-	//			continue
-	//		}
-	//		j.Winnerorder[i]["entname"] = v.Value
-	//		i++
-	//	} else {
-	//		winners = append(winners, v)
-	//	}
-	//}
+
 	if maxlen > 0 {
 		winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
 		if j.Winnerorder[0]["price"] != nil {
@@ -2403,22 +2350,6 @@ func resetWinnerorder(j *ju.Job) {
 	} else if len(bidamounts) > 0 {
 		j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
 	}
-	//j.Result["winner"] = winners
-	//中标金额
-	//i = 0
-	//bidamounts := []*ju.ExtField{}
-	//for _, v := range j.Result["bidamount"] {
-	//	if v.Code == "winnerorder" {
-	//		if maxlen < i {
-	//			continue
-	//		}
-	//		j.Winnerorder[i]["price"] = v.Value
-	//		i++
-	//	} else {
-	//		bidamounts = append(bidamounts, v)
-	//	}
-	//}
-	//j.Result["bidamount"] = bidamounts
 
 }
 func RemoveReplicaSliceString(slc []string) []string {

+ 10 - 10
src/jy/extract/score_jsondata.go

@@ -24,7 +24,7 @@ var endOfPunctuationClrear = regexp.MustCompile("[,,.。??;;]+$")
 var keysClrear = regexp.MustCompile("(详见|公告|X|内文|某单位|某部|文件|\\*|暂无|?|\\?)")
 
 //jsondata清理
-func clearJd(jd *map[string]interface{}, e *ExtractTask,spiderCode string) {
+func clearJd(jd *map[string]interface{}, e *ExtractTask, spiderCode string) {
 	for k, v := range *jd {
 		if k == "buyer" || k == "winner" || k == "agency" || k == "projectcode" || k == "projectname" {
 			vstring := util2.ObjToString(v)
@@ -37,7 +37,7 @@ func clearJd(jd *map[string]interface{}, e *ExtractTask,spiderCode string) {
 			cfn := e.ClearFn[k]
 			lockclear.Unlock()
 			if len(cfn) > 0 {
-				data := clear.DoClearFn(cfn, []interface{}{vstring, ""},spiderCode)
+				data := clear.DoClearFn(cfn, []interface{}{vstring, ""}, spiderCode)
 				lockclear.Lock()
 				if clear.AsyField[k] != nil || clear.SymField[k] != nil || clear.MesField[k] != nil {
 					vstring = clear.OtherClean(k, util2.ObjToString(data[0]))
@@ -58,7 +58,7 @@ func clearJd(jd *map[string]interface{}, e *ExtractTask,spiderCode string) {
 			} else {
 				(*jd)[k] = vstring
 			}
-		}else if k == "Detail"{
+		} else if k == "Detail" {
 			delete(*jd, k)
 		}
 	}
@@ -85,7 +85,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if len(cfn) == 0 {
 					continue
 				}
-				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
+				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode)
 				//if util2.IntAll(newNum[0]) != 0 {
 				extFields := make([]*util.ExtField, 0)
 				extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1, IsTrue: newNum[len(newNum)-1].(bool)})
@@ -94,7 +94,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				//AddExtLog("clear", j.SourceMid, (*j.Jsondata)[v], newNum[0], &RegLuaInfo{ "JsonData_"+v, "", v, "", false, nil, nil}, e.TaskInfo) //抽取日志
 				//}
 				continue
-			}else if v == "bidopentime"{
+			} else if v == "bidopentime" {
 				lockclear.Lock()
 				cfn := e.ClearFn[v]
 				lockclear.Unlock()
@@ -102,10 +102,10 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 					continue
 				}
 				extFields := make([]*util.ExtField, 0)
-				if bt,ok :=(*j.Jsondata)[v].(float64);ok && bt>0{
+				if bt, ok := (*j.Jsondata)[v].(float64); ok && bt > 0 {
 					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: bt, Score: 0.1})
-				}else {
-					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
+				} else {
+					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode)
 					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1})
 				}
 				j.Result[v] = extFields
@@ -193,7 +193,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				continue
 			}
 			oneScore := j.Result[v][0].Score
-			if oneScore <0{
+			if oneScore < 0 {
 				oneScore = 0
 			}
 			if v == "budget" || v == "bidamount" {
@@ -203,7 +203,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if len(cfn) == 0 {
 					continue
 				}
-				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
+				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode)
 				//if util2.IntAll(newNum[0]) != 0 {
 				extFields := make([]*util.ExtField, 0)
 				if jdextweight > 1 {

+ 11 - 2
src/jy/pretreated/analystep.go

@@ -6,11 +6,15 @@ package pretreated
 import (
 	"encoding/json"
 	"jy/util"
+	"regexp"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
 )
 
+
+var yjReg *regexp.Regexp = regexp.MustCompile("单位业绩|个人业绩|主要人员相关资料|投标文件格式|项目业绩|否决投标的?情况说明") // headings for performance records, key-personnel material, bid-document format and bid-rejection notes; AnalyStart truncates the block text when this matches
+
 func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 	con := job.Content
 	//全文的需要修复表格
@@ -31,7 +35,9 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite) //分块
 	if len(blockArrays) > 0 {                                                                  //有分块
 		//从块里面找分包
-		job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) //从块里面找分包
+		if !job.IsFile{
+			job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) //从块里面找分包
+		}
 		for _, bl := range blockArrays {
 			//log.Println(bl.Text)
 			if len([]rune(bl.Text)) > 80 {
@@ -72,6 +78,9 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 		bl.Text = HtmlToText(con)
 		//log.Println(bl.Text)
 		FindProjectCode(bl.Text, job) //匹配项目编号
+		// Drop everything from the first appendix-style heading onwards before
+		// key/value parsing. yjReg also matches headings that do not contain
+		// "业绩"; for those strings.Index returns -1 and slicing with it would
+		// panic, so fall back to the start of the regexp match.
+		if loc := yjReg.FindStringIndex(bl.Text); loc != nil {
+			cut := strings.Index(bl.Text, "业绩")
+			if cut < 0 {
+				cut = loc[0]
+			}
+			bl.Text = bl.Text[:cut]
+		}
 		//调用kv解析
 		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1, isSite, codeSite)
 		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil, isSite, codeSite)
@@ -309,7 +318,7 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job, i
 		job.Winnerorder = append(job.Winnerorder, wror...)
 	}
 	//分包
-	if len(tablePackage) > 0 {
+	if len(tablePackage) > 0  {
 		pkgMap := map[string]*util.BlockPackage{}
 		for tk, tv := range tablePackage {
 			bv := job.BlockPackage[tk]

+ 90 - 92
src/jy/pretreated/analytable.go

@@ -4,8 +4,6 @@ import (
 	"fmt"
 	"jy/clear"
 	u "jy/util"
-
-	//"log"
 	qutil "qfw/util"
 	"regexp"
 	"strings"
@@ -32,11 +30,11 @@ var (
 	moneyreg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
 	//根据表格的内容判断是不是表头,如果含有金额则不是表头
 	MoneyReg = regexp.MustCompile("^[\\s  ::0-9.万元()()人民币¥$]+$")
+
 	//判断分包时
 	moneyNum = regexp.MustCompile("[元整¥万]")
 	//对隐藏表格的判断
 	display = regexp.MustCompile("(?i).*?display\\s?[:]\\s?none.*")
-
 	//---------------
 	//求是分包的概率
 	//根据表格的标签对分包进行打分
@@ -45,8 +43,8 @@ var (
 	//在判断分包打分前过虑表格key
 	FilterKey_2 = regexp.MustCompile("招标|投标|项目")
 	//根据表格的key进行分包打分
-	FindKey_2 = regexp.MustCompile("([分子][包标](号)?|标[号项段包](划分)?|包件?[号段名数])")
-	FindKey_3 = regexp.MustCompile("(标段编号)")
+	FindKey_2 = regexp.MustCompile("([分子][包标](号)?|标[号项段包](划分)?|包件?[号段名数]|包组)")
+	FindKey_3 = regexp.MustCompile("(标段编号|标包)")
 	//对值进行分包判断
 	FindVal_1  = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)")
 	FindVal2_1 = regexp.MustCompile("([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)|^(设计|施工|监理|验收)[分子]?[标包]?[段号]?$")
@@ -60,6 +58,7 @@ var (
 
 	//清理表格标签正则
 	ClearTagReg = regexp.MustCompile("<[^>]*?>|[\\s\\n\\r]*$")
+
 	//查找表格标签正则
 	ttagreg = regexp.MustCompile("(?s)([^\\n::。,;\\s\u3000\u2003\u00a0]{2,30})[::]?[^::。;!\\n]{0,35}[\\s\\n]*$")
 
@@ -72,16 +71,16 @@ var (
 
 	//对表格kv的处理
 	//对不能标准化的key做批识
-	filter_tag_zb = regexp.MustCompile("(中标|成交|投标)[\\p{Han}]{0,6}(情况|结果|信息|明细)")
+	filter_tag_zb = regexp.MustCompile("(中标|成交|投标)[\\p{Han}]{0,6}(情况|结果|信息|明细)?")
 	//中标金额
 	//包含以下字眼做标准化处理
 	filter_zbje_k = regexp.MustCompile("(中标|成交|总|拦标|合同|供[应货]商|报)[\\p{Han}、]{0,6}(价|额|[大小]写|[万亿]?元).{0,4}$")
 	//简单判断金额
 	filter_zbje_jd = regexp.MustCompile("^[^(售|保证)]{0,4}(价|额).{0,4}$")
 	//预算金额
-	filter_ysje_jd = regexp.MustCompile("预算")
+	filter_ysje_jd = regexp.MustCompile("(预算|预控价|项目概.|项目信息)")
 	//且排队以下字眼的key
-	filter_zbje_kn = regexp.MustCompile("得分|打分|时间|业绩|须知|分|要求$")
+	filter_zbje_kn = regexp.MustCompile("得分|打分|时间|业绩|须知|分|电话|要求|需求数量|发布规模$|第[2二3三4四5五]|地址|询价保证金|行号")
 	//且值包含以下字眼
 	filter_zbje_v = regexp.MustCompile("[¥$$0-9一二三四五六七八九十,,〇零点..壹贰叁肆伍陆柒捌玖拾百佰千仟万亿億元圆角分整正()::大小写]{2,16}")
 
@@ -93,9 +92,8 @@ var (
 	//简单判断
 	filter_zbdw_jd = regexp.MustCompile("(投标|成交|中标|合同)(供应商|单位|人|名称).{0,4}$")
 	//且不包含以下字眼
-	filter_zbdw_kn = regexp.MustCompile("第[2二3三4四5五]|得分|地址|询价保证金")
-	//且值包含以下字眼
-	filter_zbdw_v = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$|([^购]中心|办公|用品)")
+	filter_zbdw_kn = regexp.MustCompile("第[2二3三4四5五]|得分|地址|询价保证金") //且值包含以下字眼
+	filter_zbdw_v  = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$|([^购]中心|办公|用品)")
 	//且值包含以下字眼
 	filter_zbdw_v2 = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$")
 
@@ -111,15 +109,16 @@ var (
 	ContactType         = map[string]*regexp.Regexp{
 		"采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|招标(服务)?|甲|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心(地址)?|业主|收料人|采购部"),
 		"代理机构": regexp.MustCompile("(代理|受托|集中采购).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"),
-		"中标单位": regexp.MustCompile("^((拟(定)?|预|最终|唯一)?(中标|成交|中选|供(货|应)))[^候选]{0,2}(人|方|单位|公司|(服务|供应)?商|企业)"),
+		"中标单位": regexp.MustCompile("^((拟(定)?|预|最终|唯一)?(中标|成交|中选|供(货|应))((成交))?)[^候选]{0,2}(人|方|单位|公司|(服务|供应)?商|企业)"),
+		"监督部门": regexp.MustCompile("投诉受理部门"),
 	}
 	ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$")
 	MultipleValueSplitReg       = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]")
 	BuyerContacts               = []string{"采购单位联系人", "采购单位联系电话", "采购单位联系地址"}
 	FilterSerial                = regexp.MustCompile(".+[、..::,]")
 	underline                   = regexp.MustCompile("_+$")
-	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
-	nswinnertabletag            = regexp.MustCompile("[评得分估]+|标的|班子成员")
+	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果|磋商情况)")
+	nswinnertabletag            = regexp.MustCompile("评得分估|标的|班子成员|人员")
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 	winnerOrderAndBidResult     = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")
@@ -176,7 +175,7 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}, isSite bool,
 			if winnerOrderAndBidResult.MatchString(tabletag) && t1.Value == "采购单位联系人" { //处理table中项目负责人
 				kvTags[k] = append(kvTags[k], &u.Tag{Key: k, Value: v1, IsInvalid: true})
 			} else if regexp.MustCompile("(中标候选人|名单及其排序|排序)").MatchString(tabletag) && t1.Value == "采购单位" {
-				kvTags[t1.Value] = append(kvTags[t1.Value], &u.Tag{Key: k1, Value: v1, Weight: t1.Weight - 100})
+				kvTags[t1.Value] = append(kvTags[t1.Value], &u.Tag{Key: k1, Value: v1, Weight: t1.Weight - 150})
 			} else {
 				kvTags[t1.Value] = append(kvTags[t1.Value], &u.Tag{Key: k1, Value: v1, Weight: t1.Weight})
 			}
@@ -189,18 +188,15 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}, isSite bool,
 			if tabletag == "" {
 				returntag = "中标情况"
 			}
-			kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: k, Value: v1, Weight: -100})
+			kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: k, Value: v1, Weight: -100, IsInvalid: true})
 		} else if filter_zbdw_ky.MatchString(k) && !filter_zbdw_kn.MatchString(k) &&
 			filter_zbdw_v.MatchString(v1) {
-			kvTags["中标单位"] = append(kvTags["中标单位"], &u.Tag{Key: k, Value: v1, Weight: -100})
+			kvTags["中标单位"] = append(kvTags["中标单位"], &u.Tag{Key: k, Value: v1, Weight: -100, IsInvalid: true})
 			if tabletag == "" {
 				returntag = "中标情况"
 			}
-		} else {
+		} else if !filter_zbje_jd.MatchString(tabletag) && !filter_zbje_jd.MatchString(k) && utf8.RuneCountInString(v1) < 13 {
 			//对上一步没有取到标准化key的进一步处理
-			if tabletag == "" {
-
-			}
 			if filter_tag_zb.MatchString(tabletag) || filter_tag_zb.MatchString(tabledesc) {
 				//u.Debug(v1, k, "-----", filter_zbdw_jd.MatchString(k), filter_zbdw_v.MatchString(v1))
 				if filter_zbje_jd.MatchString(k) && !filter_zbje_kn.MatchString(k) && filter_zbje_v.MatchString(v1) {
@@ -210,17 +206,16 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}, isSite bool,
 						kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: k, Value: v1, Weight: -100})
 					}
 
-				} /*else if filter_zbdw_jd.MatchString(k) && filter_zbdw_v.MatchString(v1) {
-					k1 = append(k1, "中标单位")
-					weight = append(weight, -100)
-					b = true
-				}*/
+				}
 			}
 		}
 	}
 	return
 }
 
+var glRex *regexp.Regexp = regexp.MustCompile("(成交|中标|候选|排名|名次|供应商排序|中标候选人|名单及其排序|排序)") // winner/ranking wording in a table key; KVFilter appends "成交结果," to table.Desc for each key that matches
+var djReg *regexp.Regexp = regexp.MustCompile("^单价") // keys starting with "单价" (unit price); KVFilter skips such keys
+
 //对解析后的表格的kv进行过滤
 func (table *Table) KVFilter(isSite bool, codeSite string) {
 	//1.标准化值查找
@@ -235,37 +230,22 @@ func (table *Table) KVFilter(isSite bool, codeSite string) {
 	//遍历table.sortkv,进行过滤处理,并放入标准化KV中,如果值是数组跳到下一步处理
 	for _, k := range table.SortKV.Keys {
 		//表格描述处理,对成交结果的处理
-		if regexp.MustCompile("(成交|中标|候选|排名|名次|供应商排序)").MatchString(k) {
+		if glRex.MatchString(k) {
 			table.Desc += "成交结果,"
 		}
-		if regexp.MustCompile("^单价").MatchString(k) {
+		if djReg.MatchString(k) {
 			continue
 		}
 		v := table.SortKV.Map[k]
 		if _, ok := v.(string); ok { //table.SortKV.Value为字符串,匹配抽取关键词table.SortKV.Key,匹配到添加k,v到table.StandKV,table.StandKVWeight
 			k = pkgFilter.ReplaceAllString(k, "")
 			k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "")
-			if k == "2、建设规模" {
-				k = "预算"
-			}
-			if k == `中标价(万元)\费率(%)` {
-				k = "中标价(万元)"
-			}
 			kvTags, tag := CommonDataAnaly(k, table.Tag, table.Desc, v, isSite, codeSite) //对key标准化处理,没有找到会走中标
-			//qutil.Debug(k, v, k1, w1, v1, tag, b)
 			if tag != "" && table.Tag == "" {
 				table.Tag = tag
 			}
 			MergeKvTags(table.StandKV, kvTags)
 		} else {
-			//u.Debug(k, v, "---------")
-			if strings.Contains(k, "总价") {
-				if vvvv, ok := v.([]string); ok && len(vvvv) > 0 {
-					as.RemoveKey("报价")
-					as.AddKey(k, vvvv[len(vvvv)-1])
-					continue
-				}
-			}
 			as.AddKey(k, v)
 		}
 	}
@@ -372,10 +352,13 @@ func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
 							if !res {
 								kt := u.GetTags(filterThText.ReplaceAllString(ClearKey(k, 2), ""), isSite, codeSite)
 								if kt.Len() > 0 {
-									kv = kt[0].Value
+									if kt[0].Value == "单品报价" && winnertag {
+										kv = "中标金额"
+									} else {
+										kv = kt[0].Value
+									}
 								}
 							}
-							//qutil.Debug(k, res, repl, kv, "--", vs)
 							if !res && kv == "" { //key未验证出,验证数组的val值
 								checkKey[kn+kn1] = true
 								if winnertag { //如果是中标信息 在根据val数组信息解析候选人
@@ -465,7 +448,6 @@ func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
 						//if hadSort { //有排序,再添加entname和price
 						if len(tmpEntname) > 0 && n < len(tmpEntname) && tmpEntname[n] != "" {
 							smap_v["entname"] = tmpEntname[n]
-
 							if len(tmpPrice) > 0 && n < len(tmpPrice) && tmpPrice[n] != "" {
 								smap_v["price"] = tmpPrice[n]
 							}
@@ -509,12 +491,6 @@ func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
 						}
 					}
 				}
-				//table.StandKV[kk] = append(table.StandKV[kk], vv...)
-				//				else if k2 == "中标金额" {
-				//					if qutil.Float64All(v1) > qutil.Float64All(table.StandKV[k2]) {
-				//						table.StandKV[k2] = v1
-				//					}
-				//				}
 			}
 		}
 	}
@@ -825,6 +801,10 @@ func (table *Table) Analy(contactFormat *u.ContactFormat, isSite bool, codeSite
 	if trs.Size() == 0 {
 		trs = table.Goquery.ChildrenFiltered("tr")
 	}
+	ztb := table.Goquery.Find("table").Size()
+	if ztb >= 10 {
+		return []*Table{}
+	}
 	//遍历节点,初始化table 结构
 	table.createTabe(trs, isSite, codeSite)
 	if len(table.TRs) == 0 {
@@ -1455,23 +1435,7 @@ func (table *Table) ComputeRowColIsKeyRation(isSite bool, codeSite string) {
 				} else if table.Tag == "" && k == 0 && len(tr.TDs[0].Val) > 11 {
 					table.Tag = tr.TDs[0].Val
 				}
-
-				//			subVal := tdval_reg.FindAllStringSubmatch(tr.TDs[0].Val, -1)
-				//			//u.Debug(tr.TDs[0].Val, subVal)
-				//			if len(subVal) > 0 {
-				//				for _, subv1 := range subVal {
-				//					if len(subv1) == 3 {
-				//						table.SortKV.AddKey(subv1[1], subv1[2])
-				//					}
-				//				}
-				//			} else if k == 0 && len(tr.TDs[0].Val) > 11 {
-				//				table.Tag = tr.TDs[0].Val
-				//			}
-
 			}
-			//		for _, td := range tr.TDs {
-			//			u.Debug(td.BH, td.Val, "----")
-			//		}
 		}
 	}
 }
@@ -1489,6 +1453,7 @@ func (table *Table) FindKV(isSite bool, codeSite string) {
 		bodirect := 0
 		//控制中标人排序数值
 		sort := 1
+
 		//开始抽取
 		for _, tr := range table.TRs {
 			bcon = trSingleColumn(tr, bcon, table) //tr单列,是否丢弃内容
@@ -1509,16 +1474,6 @@ func (table *Table) FindKV(isSite bool, codeSite string) {
 				}
 			}
 			for _, td := range tr.TDs {
-				/**
-				rt := table.StartAndEndRation[fmtkey("r", td.StartCol, td.EndCol)]
-				if rt != nil {
-					r, t := rt.GetTDRation(td)
-					u.Debug(td.BH, td.Val, r, t)
-				}
-				**/
-				//				if td.Val == "电视" || td.Val == "电话机" || td.Val == "传真机" || td.Val == "音响" {
-				//qutil.Debug("----td.Valtype", td.Valtype, "td.BH:", td.BH, "KVDirect:", td.KVDirect, "Val:", td.Val, "direct:", direct, "vdirect:", vdirect)
-				//				}
 				if !td.BH && td.KVDirect < 3 {
 					if !table.FindTdVal(td, direct, vdirect) { //table.FindTdVal()存储了table.SortKV
 						if !table.FindTdVal(td, vdirect, direct) {
@@ -1541,6 +1496,7 @@ func (table *Table) FindKV(isSite bool, codeSite string) {
 					//fmt.Println("td:", td.Val, td.BH, td.HeadTd, td.KVDirect)
 				}
 			}
+
 		}
 		//qutil.Debug("FindKV", table.SortKV.Map)
 	} else if len(table.TRs) > 0 { //没有表头的表格处理,默认纵向吧
@@ -1779,19 +1735,41 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 		return
 	}
 	near := table.FindNear(td, direct)
-	//	if near != nil {
-	//		fmt.Println("near----", near.Val, td.Val)
-	//	}
-	//	qutil.Debug(near != nil)
-	//	qutil.Debug(near.BH)
-	//	qutil.Debug(near.KeyDirect == vdirect, near.KeyDirect == 0)
-	//	qutil.Debug(near.KVDirect == direct, near.KVDirect == 0)
-	//	qutil.Debug(near.KVDirect < 3)
 	if near != nil && near.BH && (near.KeyDirect == vdirect || near.KeyDirect == 0) && (near.KVDirect == direct || near.KVDirect == 0) && near.KVDirect < 3 {
 		near.KVDirect = direct
 		near.KeyDirect = vdirect
 		td.KVDirect = direct
 		key := repSpace.ReplaceAllString(near.Val, "")
+		if key == "名称" && near.StartCol == 0 && near.Rowspan > 0 {
+			for _, vn := range table.TRs[near.Rowspan-1].TDs {
+				if strings.Contains(vn.Val, "代理") {
+					key = "代理机构"
+					break
+				} else if strings.Contains(vn.Val, "招标") {
+					key = "采购单位"
+					break
+				} else if strings.Contains(vn.Val, "中标") {
+					key = "中标单位"
+					break
+				}
+			}
+		} else if strings.Contains(key, "中标候选人") && strings.Contains(td.Val, "公司") {
+			key = "中标单位"
+		} else if key == "单位名称" {
+			tmpnewnear := table.FindNear(near, 2)
+			if tmpnewnear != nil {
+				if tmpnewnear.MustBH || tmpnewnear.BH {
+					key = tmpnewnear.Val + near.Val
+				}
+			} else {
+				tmpnewnear = table.FindNear(near, 1)
+				if tmpnewnear != nil {
+					if tmpnewnear.MustBH || tmpnewnear.BH {
+						key = tmpnewnear.Val + near.Val
+					}
+				}
+			}
+		}
 		if near.Val == "" {
 			key = fmtkey("k", near.TR.RowPos, near.ColPos)
 		}
@@ -1802,11 +1780,19 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 			curpos := table.SortKV.Index[key]
 			thistr := table.kTD[curpos]
 			if thistr != near {
-				near.Val += "_"
-				for table.SortKV.Map[near.Val] != nil {
-					near.Val += "_"
+				if strings.TrimSpace(near.Val) == "名称" && near.TR != nil && len(near.TR.TDs) > 0 && near.ColPos-1 >= 0 {
+					rv := near.TR.TDs[near.ColPos-1].Val
+					if near.ColPos > 0 && (strings.Contains(rv, "招标") || strings.Contains(rv, "代理") || strings.Contains(rv, "采购") || strings.Contains(rv, "中标")) {
+						near = near.TR.TDs[near.ColPos-1]
+					}
+				} else {
+					//near.Val += "_"
+					//for table.SortKV.Map[near.Val] != nil {
+					//	near.Val += "_"
+					//}
+					//key = near.Val //之前这个地方没有重置,导致把之前结果覆盖了
+					bthiskey = true
 				}
-				key = near.Val //之前这个地方没有重置,导致把之前结果覆盖了
 			} else {
 				bthiskey = true
 			}
@@ -1862,6 +1848,7 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 				}
 				if bvalfind && varrpos > -1 && len(vals) > varrpos {
 					vals = append(vals, td.Val) // 累加
+					val = vals
 					//vals[varrpos] = td.Val // += "__" + td.Val
 				} else {
 					//添加时候去除空值和nil
@@ -1911,7 +1898,6 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 			table.SortKV.AddKey(key, val)
 			//if table.SortKV.Map[key] != nil {
 			pos := table.SortKV.Index[key]
-			//qutil.Debug("=========", "key:", key, "val:", val, "pos:", pos)
 			if barr {
 				mval := table.kvscope[pos]
 				if mval != nil {
@@ -1932,7 +1918,6 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 				}
 				table.kTD[pos] = near
 			}
-			//}
 		}
 		b = true
 	}
@@ -2027,6 +2012,7 @@ func (tn *Table) CheckMultiPackageByTable(isSite bool, codeSite string) (b bool,
 	oldIndex := []string{} //存放包的原始值
 	brepeat := map[string]bool{}
 	for k, v := range index {
+
 		v = u.PackageNumberConvert(v)
 		if !brepeat[v] {
 			brepeat[v] = true
@@ -2052,6 +2038,7 @@ func (tn *Table) CheckMultiPackageByTable(isSite bool, codeSite string) (b bool,
 		//多包解析
 		if b {
 			tn.BPackage = true
+			//pnum := len(index)
 			//根据数组index分包长度添加table.BlockPackage子包数组
 			for nk, v := range index {
 				if tn.BlockPackage.Map[v] == nil {
@@ -2284,6 +2271,7 @@ func (tn *Table) isGoonNext(isSite bool, codeSite string) {
 		} else {
 			str += fmt.Sprintf("%s:%s\n", nk, v)
 		}
+
 		if excludeKey2.MatchString(str) {
 			continue
 		}
@@ -2682,6 +2670,7 @@ func isHasOnePkgAndNoKv(v1 string) (bool, string) {
 
 //替换分包中混淆的词
 func replPkgConfusion(v1 string) string {
+
 	v1 = PreReg.ReplaceAllString(v1, "")
 	v1 = PreReg1.ReplaceAllString(v1, "")
 	v1 = PreCon.ReplaceAllString(v1, "")
@@ -2731,6 +2720,7 @@ func (tn *Table) TdContactFormat(contactFormat *u.ContactFormat, isSite bool, co
 							for _, this := range thisTdKvs {
 								if str := ContactInfoVagueReg.FindString(this.Key); str != "" {
 									td.SortKV.AddKey(tdType+str, this.Value)
+
 								}
 							}
 						}
@@ -3920,11 +3910,19 @@ func hasBrand(table *Table, data ...string) ([]string, bool) {
 	return brandArr, allNull
 }
 
+var clearnn *regexp.Regexp = regexp.MustCompile("([\\d.]*)[\\n\\s]*[\\((][\\d.]+[)\\)]")
+
 //过滤td值
 func filterVal(val ...string) ([]string, bool) {
 	defer qutil.Catch()
 	n := 0 //记录被过滤的个数
 	for i, v := range val {
+		if len(clearnn.FindStringSubmatch(v)) > 0 {
+			tmpv := clearnn.FindStringSubmatch(v)[1]
+			if tmpv != "" {
+				v = tmpv
+			}
+		}
 		afterFilter := tabletdclear.ReplaceAllString(v, "")
 		afterFilter = NullVal.ReplaceAllString(afterFilter, "")
 		if afterFilter == "" {

+ 1 - 7
src/jy/pretreated/multipackage.go

@@ -3,7 +3,6 @@ package pretreated
 import (
 	"regexp"
 	"sort"
-	"strings"
 )
 
 var (
@@ -21,7 +20,7 @@ var (
 	//替换容易混淆的词
 	PreCon1 = regexp.MustCompile("(\\d+\\.?)+万?元")
 	//提取分包标识
-	MultiReg = regexp.MustCompile("(([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-])+(包|标段))[::]?|(?:^|\\n)([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+(包|标段))|[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)#?((子|合同|分|施工|监理)?(标段?|合同段|标包))|((子|分|合同|分|施工|监理)?(标|包件?)(段|号|项|组)?)[     ]*((\\d[.])+\\d|[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)[::]?|(子项目[0-9]+)")
+	MultiReg = regexp.MustCompile("(([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-])+(包|标段))[::]?|(?:^|\\n)([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+(包|标段))|([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)#?((子|合同|分|施工|监理)?(标段?|合同段|标包)))|(((子|分|合同|施工|监理|标包|标|包)(标|包段|项|组)?)[     ]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+))[::]?|(子项目[0-9]+)")
 	//匹配到的包格式分类统计
 	keyregs = []map[*regexp.Regexp]int{
 		map[*regexp.Regexp]int{
@@ -49,10 +48,6 @@ func CheckMultiPackage(con, title string) (content string, m map[string][]string
 	//if TitleReg.MatchString(title) {
 	//log.Println(title+"\n------------------", TitleReg.FindAllStringSubmatch(title, -1))
 	//}
-	if strings.Trim(con,"") == "标包划分:共划分1个标包。" {
-		m["1"]=[]string{"包1","包"}
-		return con, m, true
-	}
 	con = PreReg.ReplaceAllString(con, "")
 	con = PreReg1.ReplaceAllString(con, "")
 	pres := PreCheckMulti.FindStringSubmatch(con)
@@ -72,7 +67,6 @@ func CheckMultiPackage(con, title string) (content string, m map[string][]string
 	con = PreCheckMulti.ReplaceAllString(con, "")
 	con = PreCon.ReplaceAllString(con, "\n")
 	con = PreCon2.ReplaceAllString(con, "")
-	//con = PreCon1.ReplaceAllString(con, "")
 	res := MultiReg.FindAllStringSubmatch(con, -1)
 	if len(res) > 0 { //5 6
 		mindex := map[string]int{}

+ 136 - 54
src/jy/pretreated/tablev2.go

@@ -123,11 +123,17 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSi
 		//qutil.Debug("有子表格")
 		//格式化正文
 		txt = TextAfterRemoveTable(td.Html)
-		td.tdHasTable(&bsontable, tr, isSite, codeSite) //处理td中的table,块标签处理,子表解析集处理
+		td.tdHasTable(&bsontable, tr, isSite, codeSite, txt) //处理td中的table,块标签处理,子表解析集处理
 	} else {
 		txt = strings.TrimSpace(td.Goquery.Text())
 	}
 	text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
+	strs := strings.Split(text, "\n")
+	if len(strs) == 2 {
+		if utf8.RuneCountInString(strs[1]) < 5 {
+			text = strs[0] + strs[1]
+		}
+	}
 	td.Val = text //值
 	td.Text = txt //原始串
 	//处理table外内容
@@ -157,11 +163,11 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSi
 	td.tdIsHb(tr, table, bsontable, isSite, codeSite)
 	bhead := false
 	if td.TR.RowPos == 0 { //第一行
-		if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
+		if utf8.RuneCountInString(td.Val) < 15 && td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
 			bhead = true
 		}
 	}
-	if bhead && !bsontable {
+	if utf8.RuneCountInString(td.Val) < 15 && bhead && !bsontable {
 		td.BH = true
 		td.KeyDirect = 1 //k方向,k纵值横,k横值纵 1横 2纵
 		td.KVDirect = 2  //键-值方向,0未知,1横 2纵//指值和k的方向
@@ -170,31 +176,35 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSi
 }
 
 //处理td中的table,块标签处理,子表解析集处理
-func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite string) {
+func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite, tag string) {
 	ts := td.TR.Table.TableResult
 	tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
 	if len(tabs) > 0 {
 		(*bsontable) = true
 		stag := ts.BlockTag //块标签
 		if stag == "" {
-			var tdleft *TD
-			if len(tr.TDs) > 0 {
-				tdleft = tr.TDs[len(tr.TDs)-1]
-				if tdleft.BH {
-					//u.Debug(tdleft.Val),如果不存在就是上一行的
-					stag = tdleft.Val
-				}
-			} else if len(tr.Table.TRs) > 0 {
-				lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
-				str := ""
-				for _, td3 := range lasttr.TDs {
-					str += td3.Val
-					if len([]rune(str)) > 14 {
-						str = ""
-						break
+			if tag != "" {
+				stag = tag
+			} else {
+				var tdleft *TD
+				if len(tr.TDs) > 0 {
+					tdleft = tr.TDs[len(tr.TDs)-1]
+					if tdleft.BH {
+						//u.Debug(tdleft.Val),如果不存在就是上一行的
+						stag = tdleft.Val
+					}
+				} else if len(tr.Table.TRs) > 0 {
+					lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
+					str := ""
+					for _, td3 := range lasttr.TDs {
+						str += td3.Val
+						if len([]rune(str)) > 14 {
+							str = ""
+							break
+						}
 					}
+					stag = str
 				}
-				stag = str
 			}
 		}
 		if strings.Contains(stag, "开标记录") {
@@ -215,15 +225,6 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite string)
 			}
 			MergeKvTags(td.TR.Table.TableResult.KvTags, sonts.KvTags)
 			td.SonTableResult = sonts
-			//for _, k := range sonts.SortKV.Keys {
-			//u.Debug(k, sonts.SortKV.Map[k])
-			//				td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
-			//				td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
-			//}
-			//增加brand (子表)
-			//fmt.Println("sonsHasKey=============", sonts.HasKey)
-			//fmt.Println("sonsHasGoods========", sonts.HasGoods)
-			//fmt.Println("sonsHasBrand========", sonts.HasBrand)
 			if sonts.HasKey != 0 {
 				td.TR.Table.TableResult.HasKey = sonts.HasKey
 			}
@@ -289,7 +290,10 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite string)
 
 //对td单元格值判断是否是表头和根据td内容长度进行分块处理
 func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite string) {
-	lenval := utf8.RuneCountInString(td.Val)//经过处理的td内容长度
+	if yjReg.MatchString(td.Text) {
+		return
+	}
+	lenval := utf8.RuneCountInString(td.Val) //经过处理的td内容长度
 	//if lentxt > 9 {
 	//td.KV = GetKVAll(txt, "")
 	ub := []*u.Block{}
@@ -432,6 +436,10 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite stri
 			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
 		}
 		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2, isSite, codeSite) //获取冒号kv入口
+		if yjReg.MatchString(kvTitle) {
+			td.SortKV = NewSortMap()
+			return
+		}
 		for k, v := range resm {
 			td.SortKV.AddKey(k, v)
 		}
@@ -676,31 +684,21 @@ func (s *SortMap) RemoveKey(key string) {
 	s.Lock.Lock()
 	defer s.Lock.Unlock()
 	delete(s.Map, key)
-	pos := s.Index[key]
+	//pos := s.Index[key]
 	delete(s.Index, key)
-	if len(s.Keys) > 0 {
-		s.Keys = func() []string {
-			newkeys := []string{}
-			if len(s.Keys) > 1 {
-				if pos == 0 {
-					newkeys = append(newkeys, s.Keys[1:]...)
-					//每一个都减一
-					for k, v := range s.Index {
-						s.Index[k] = v - 1
-					}
-				} else if pos == len(s.Keys) {
-					newkeys = append(newkeys, s.Keys[:pos]...)
-				} else if len(s.Keys) > 1 {
-					tmp := s.Keys[pos+1:]
-					newkeys = append(append(newkeys, s.Keys[:pos]...), tmp...)
-					for _, v := range tmp {
-						s.Index[v] -= 1
-					}
-				}
-			}
-			return newkeys
-		}()
+	s.Keys = removeslice(s.Keys, key)
+}
+func removeslice(slice []string, elem interface{}) []string {
+	if len(slice) == 0 {
+		return slice
+	}
+	for i, v := range slice {
+		if v == elem {
+			slice = append(slice[:i], slice[i+1:]...)
+			return removeslice(slice, elem)
+		}
 	}
+	return slice
 }
 
 //判断表头是key的对象
@@ -729,6 +727,77 @@ var tLock = sync.Mutex{}
 func CheckCommon(txt string, matchStr ...string) (res, must bool, stype, reg, repl string) {
 	txt = filterThText.ReplaceAllString(txt, "")
 	stype = "con"
+	repl = "NOHEAD"
+	if u.NowTimeTest() {
+		json.Unmarshal([]byte(`{
+	"normalhead":[
+		"^((.{2,6}(名称|编号|代码|时间|类型|性质|行政区域|原因|项目|意见|须知|程度))|标段(编号)?|招标金额|规模|统一社会信用代码|拟?中标供应商|质量|(质量)?承诺|地址|招标代理|序号|材料|结构|结构层数|评委|单位|数量|排名|标的|标项|开户银行|邮编|账号|电话|传真|网址|得分|名次|包件?号|职务|(建设|招标|采购|中标|成交|甲|乙)(单位|人|供应商|方|规模).{0,2}|.{0,5}(价格?|额|资金|[预概]算|投资|费用|报价|投标价)(万?元?([大小]写)?))$__M",
+		"^.{0,7}(((单位)?名称|总监|经理|负责人|信息|率|费|期|人|号|码|(价格?|额|资金)(万?元?([大小]写)?)|员|品目|标包|代表|区域|方式|因素|合价|合计|小计|地点|条件|(资质|类别和)等级|类别|状态)|得分|注册专业|方法|家数|全称|简称|邮件|执业或职业资格|证书|部门|事项|来源|划分|长度|规模|保证金|目标)$__",
+		"(名单|证号|名称|要求|时间|日期|地点|单位|条款|机构|范围|情况|概况|品名|规格|参数|标准|指标|型号|限价|数量|方式|等级|依据|明细|概况|内容|次数|产品|性质|地区|地址|币种|主题|详情|说明|代理(公司|机构)|节支率|名单|结果|结果公示)$|^(职称|姓名|级别|职称专业|证书名称|证书编号)$__",
+		"^(联系|评标|单位|公告|采购|商品|附件|质保|用途|公示|机构|评审|品名|规格|参数|指标|型号|数量|证书).{0,10}$__",
+		"(专家|评委|打分)$__",
+		"品牌",
+        "姓名",
+		"起讫桩号",
+		"服务期",
+		"限价",
+		"邮编",
+		"面积",
+		"组织形式",
+		"招标方式",
+		"修建宽度",
+        "类别",
+        "备注",
+		"合计",
+        "电话",
+        "评审",
+		"原因",
+		"行业",
+		"价格",
+		"注册资金"
+	],
+	"jghead":[
+		"^.{0,2}[预拟]?(成交|中标|候选)(供应商|单位|企业|人|机构|价|金额).{0,2}$__M",
+		"^.{0,6}[打得评总](分)$__",
+		"(中标|磋商|投标|报|成交)总?(价|金额)__",
+		"(投标|中标)(人|方|单位|供应商)(名称)?__",
+		"成交",
+		"名次",
+	    "候选",
+		"业绩",
+		"荣誉",
+	    "排序",
+	    "排名",
+		"中标",
+		"供应商"
+	],
+	"con":[
+		"^((子|合同|分|施工|监理)?(标段?|包|合同段|标包|序号)[a-zA-Z0-9\\-一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)__$1",
+		"([a-zA-Z0-9\\-一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+(子|合同|分|施工|监理)?(标段?|包|合同段|标包|号))$__$1",
+		"(^[a-zA-Z0-9\\-一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+$)__$1",
+		"(^.{2,18}(集团|事务所|研究院|事务所|研究所|设计院))__",
+		"(^.{5,}(公司))__",
+		".{2,20}元整|[\\d]+万?元__",
+		".{4,}采购(项目)?__",
+		"(首选|第[一二三四五1-5])(顺序|推荐)?(中标|候选|成交)?(候选)?(人|单位|供应商)__BO"
+	],
+	"abandontable":[
+		"(磋商|谈判|评标(委员会)?)?((小组)?成员|(评审)?专家)(名单)?$__",
+		"(业绩|资质|原因|相关资料)$__",
+		"([废流落]标|评审)(原因|情况)__",
+		"(中标|成交)(候选)*(人|供应商|单位)((类似)*业绩|资质)__",
+		"否决投标情况",
+		"落标供应商及落标原因",
+		"被废标供应商名称",
+		"主要人员",
+		"其他投标人"
+	],
+	"bidorder":[
+		".{0,8}排[序名]$__sort",
+		"(人|供应商|单位)(名称)?$__entname"
+	]
+}`), &u.TableK1)
+	}
 	if len([]rune(txt)) < 30 {
 		tLock.Lock()
 		defer tLock.Unlock()
@@ -833,7 +902,7 @@ func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
 con 文本
 strtype 1全文 2块文本
 **/
-var hisReg = regexp.MustCompile("类似业绩|历史业绩")
+var hisReg = regexp.MustCompile("类似业绩|历史业绩|开标记录")
 
 func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
 	defer qutil.Catch()
@@ -853,6 +922,19 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 			}
 			if !b {
 				if hisReg.MatchString(tmpt.First().Text()) {
+					if html, err := tmpt.Html(); err == nil {
+						lhtml := strings.LastIndex(html, "<")
+						bhtml := html[lhtml:len(html)]
+						bhtmls := hisReg.FindStringSubmatchIndex(html)
+						if len(bhtmls) == 0 {
+							continue
+						}
+						html = html[:bhtmls[0]]
+						html += bhtml
+						if stmpt, err := goquery.NewDocumentFromReader(strings.NewReader(con)); err == nil {
+							tabs = append(tabs, stmpt.Find("table"))
+						}
+					}
 					continue
 				}
 				tabs = append(tabs, tmpt)

+ 15 - 2
src/res/moneyclear.json

@@ -38,7 +38,20 @@
     "descript": "金额除以10000",
     "maxmoney": 10000000000,
     "divisor": 10000
+  },
+  "a_zgzfcgw_zfcghtgg_new": {
+    "descript": "金额除以10000",
+    "maxmoney": 10000000000,
+    "divisor": 10000
+  },
+  "a_zgzfcgw_zfcghtgg": {
+    "descript": "金额除以10000",
+    "maxmoney": 10000000000,
+    "divisor": 10000
+  },
+  "a_zgzfcgw_dfgg_new_4_6": {
+    "descript": "金额除以10000",
+    "maxmoney": 10000000000,
+    "divisor": 10000
   }
-
-
 }