浏览代码

price和number抽取

maxiaoshan 5 年之前
父节点
当前提交
b39f36cecc

+ 2 - 3
src/config.json

@@ -3,7 +3,7 @@
     "mgodb": "192.168.3.207:27092",
     "mgodb": "192.168.3.207:27092",
     "dbsize": 3,
     "dbsize": 3,
     "dbname": "extract_kf",
     "dbname": "extract_kf",
-    "redis": "buyer=127.0.0.1:6379,winner=127.0.0.1:6379,agency=127.0.0.1:6379,qyk_redis=127.0.0.1:6379",
+    "redis": "qyk_redis=127.0.0.1:6379",
     "elasticsearch": "http://127.0.0.1:9800",
     "elasticsearch": "http://127.0.0.1:9800",
     "elasticsearch_index": "winner_enterprise_tmp",
     "elasticsearch_index": "winner_enterprise_tmp",
     "elasticsearch_type": "winnerent",
     "elasticsearch_type": "winnerent",
@@ -28,6 +28,7 @@
     "filelength": 100000,
     "filelength": 100000,
     "iscltlog": false,
     "iscltlog": false,
     "brandgoods": false,
     "brandgoods": false,
+    "pricenumber":true,
     "udptaskid": "5cdd3025698414032c8322b1",
     "udptaskid": "5cdd3025698414032c8322b1",
     "udpport": "1484",
     "udpport": "1484",
     "nextNode": [
     "nextNode": [
@@ -43,8 +44,6 @@
             "memo": "行业分类"
             "memo": "行业分类"
         }
         }
     ],
     ],
-    "clearudpport": "1485",
-    "cleartaskid": "5c9b2bf96984142ffcfc6099",
     "esconfig": {
     "esconfig": {
         "available": false,
         "available": false,
         "AccessID": "",
         "AccessID": "",

+ 12 - 14
src/jy/admin/audit/dataaudit.go

@@ -8,7 +8,8 @@ import (
 	"jy/util"
 	"jy/util"
 	"log"
 	"log"
 	qu "qfw/util"
 	qu "qfw/util"
-	"qfw/util/elastic"
+
+	//"qfw/util/elastic"
 	redis "qfw/util/redis"
 	redis "qfw/util/redis"
 	"strconv"
 	"strconv"
 	"strings"
 	"strings"
@@ -22,7 +23,8 @@ func init() {
 	Admin.GET("/audit/dataaudit", func(c *gin.Context) {
 	Admin.GET("/audit/dataaudit", func(c *gin.Context) {
 		name := c.Query("name")
 		name := c.Query("name")
 		coll := c.Query("coll")
 		coll := c.Query("coll")
-		c.HTML(200, "audit_auditone.html", gin.H{"name": name, "coll": coll})
+		stancoll := c.Query("stancoll")
+		c.HTML(200, "audit_auditone.html", gin.H{"name": name, "coll": coll, "stancoll": stancoll})
 	})
 	})
 	Admin.POST("/audit/auditonefield", AuditOneField)
 	Admin.POST("/audit/auditonefield", AuditOneField)
 	Admin.POST("/audit/addsave", AddSave)     //新增
 	Admin.POST("/audit/addsave", AddSave)     //新增
@@ -99,6 +101,7 @@ func AuditOneField(c *gin.Context) {
 func AllAudit(c *gin.Context) {
 func AllAudit(c *gin.Context) {
 	field, _ := c.GetPostForm("field")
 	field, _ := c.GetPostForm("field")
 	coll, _ := c.GetPostForm("coll")
 	coll, _ := c.GetPostForm("coll")
+	stancoll, _ := c.GetPostForm("stancoll")
 	ids, _ := c.GetPostForm("ids")
 	ids, _ := c.GetPostForm("ids")
 	idsArr := strings.Split(ids, ",")
 	idsArr := strings.Split(ids, ",")
 	log.Println("Audit Ids:", idsArr)
 	log.Println("Audit Ids:", idsArr)
@@ -107,23 +110,19 @@ func AllAudit(c *gin.Context) {
 	if len(idsArr) != len(namesArr) {
 	if len(idsArr) != len(namesArr) {
 		c.JSON(200, gin.H{"rep": false, "msg": "数据错误"})
 		c.JSON(200, gin.H{"rep": false, "msg": "数据错误"})
 	} else { //批量审核
 	} else { //批量审核
-		SaveDb := ""
 		FieldBd := 0
 		FieldBd := 0
 		// ElasticClientIndex := ""
 		// ElasticClientIndex := ""
 		// ElasticClientType := ""
 		// ElasticClientType := ""
 		RedisName := util.QYK_RedisName
 		RedisName := util.QYK_RedisName
 		if field == "winner" {
 		if field == "winner" {
-			SaveDb = util.ElasticClientDB
 			FieldBd = util.WinnerDB
 			FieldBd = util.WinnerDB
 			// ElasticClientIndex = util.ElasticClientIndex
 			// ElasticClientIndex = util.ElasticClientIndex
 			// ElasticClientType = util.ElasticClientType
 			// ElasticClientType = util.ElasticClientType
 		} else if field == "buyer" {
 		} else if field == "buyer" {
-			SaveDb = util.ElasticClientBuyerDB
 			FieldBd = util.BuyerDB
 			FieldBd = util.BuyerDB
 			// ElasticClientIndex = util.ElasticClientBuyerIndex
 			// ElasticClientIndex = util.ElasticClientBuyerIndex
 			// ElasticClientType = util.ElasticClientBuyerType
 			// ElasticClientType = util.ElasticClientBuyerType
 		} else {
 		} else {
-			SaveDb = util.ElasticClientAgencyDB
 			FieldBd = util.AgencyDB
 			FieldBd = util.AgencyDB
 			// ElasticClientIndex = util.ElasticClientAgencyIndex
 			// ElasticClientIndex = util.ElasticClientAgencyIndex
 			// ElasticClientType = util.ElasticClientAgencyType
 			// ElasticClientType = util.ElasticClientAgencyType
@@ -132,11 +131,12 @@ func AllAudit(c *gin.Context) {
 		qykredis := redis.RedisPool[RedisName].Get()
 		qykredis := redis.RedisPool[RedisName].Get()
 		defer qykredis.Close()
 		defer qykredis.Close()
 		//es
 		//es
-		escon := elastic.GetEsConn()
-		defer elastic.DestoryEsConn(escon)
+		// escon := elastic.GetEsConn()
+		// defer elastic.DestoryEsConn(escon)
 		for i, name := range namesArr {
 		for i, name := range namesArr {
 			e := make(map[string]interface{})
 			e := make(map[string]interface{})
 			e["comeintime"] = time.Now().Unix()
 			e["comeintime"] = time.Now().Unix()
+			e["updatetime"] = time.Now().Unix()
 			if field == "winner" {
 			if field == "winner" {
 				e["company_name"] = name
 				e["company_name"] = name
 			} else if field == "buyer" {
 			} else if field == "buyer" {
@@ -144,7 +144,7 @@ func AllAudit(c *gin.Context) {
 			} else {
 			} else {
 				e["agency_name"] = name
 				e["agency_name"] = name
 			}
 			}
-			sid := Mgo.Save(SaveDb, e)
+			sid := Mgo.Save(stancoll, e)
 			if sid == "" {
 			if sid == "" {
 				c.JSON(200, gin.H{"rep": false, "msg": "保存mongo出错"})
 				c.JSON(200, gin.H{"rep": false, "msg": "保存mongo出错"})
 				return
 				return
@@ -177,7 +177,6 @@ func AllAudit(c *gin.Context) {
 	}
 	}
 }
 }
 func DataSave(c *gin.Context) {
 func DataSave(c *gin.Context) {
-	SaveDb := ""
 	FieldBd := 0
 	FieldBd := 0
 	// ElasticClientIndex := ""
 	// ElasticClientIndex := ""
 	// ElasticClientType := ""
 	// ElasticClientType := ""
@@ -187,8 +186,8 @@ func DataSave(c *gin.Context) {
 	field, _ := c.GetPostForm("field")
 	field, _ := c.GetPostForm("field")
 	name, _ := c.GetPostForm("name")
 	name, _ := c.GetPostForm("name")
 	address, _ := c.GetPostForm("address")
 	address, _ := c.GetPostForm("address")
+	stancoll, _ := c.GetPostForm("stancoll")
 	if field == "winner" {
 	if field == "winner" {
-		SaveDb = util.ElasticClientDB
 		FieldBd = util.WinnerDB
 		FieldBd = util.WinnerDB
 		// ElasticClientIndex = util.ElasticClientIndex
 		// ElasticClientIndex = util.ElasticClientIndex
 		// ElasticClientType = util.ElasticClientType
 		// ElasticClientType = util.ElasticClientType
@@ -200,7 +199,6 @@ func DataSave(c *gin.Context) {
 		e["company_name"] = name
 		e["company_name"] = name
 		e["company_address"] = address
 		e["company_address"] = address
 	} else if field == "buyer" {
 	} else if field == "buyer" {
-		SaveDb = util.ElasticClientBuyerDB
 		FieldBd = util.BuyerDB
 		FieldBd = util.BuyerDB
 		// ElasticClientIndex = util.ElasticClientBuyerIndex
 		// ElasticClientIndex = util.ElasticClientBuyerIndex
 		// ElasticClientType = util.ElasticClientBuyerType
 		// ElasticClientType = util.ElasticClientBuyerType
@@ -213,7 +211,6 @@ func DataSave(c *gin.Context) {
 		e["buyer_name"] = name
 		e["buyer_name"] = name
 		e["address"] = address
 		e["address"] = address
 	} else {
 	} else {
-		SaveDb = util.ElasticClientAgencyDB
 		FieldBd = util.AgencyDB
 		FieldBd = util.AgencyDB
 		// ElasticClientIndex = util.ElasticClientAgencyIndex
 		// ElasticClientIndex = util.ElasticClientAgencyIndex
 		// ElasticClientType = util.ElasticClientAgencyType
 		// ElasticClientType = util.ElasticClientAgencyType
@@ -264,8 +261,9 @@ func DataSave(c *gin.Context) {
 	}
 	}
 	//入库时间
 	//入库时间
 	e["comeintime"] = time.Now().Unix()
 	e["comeintime"] = time.Now().Unix()
+	e["updatetime"] = time.Now().Unix()
 	//新数据保存
 	//新数据保存
-	sid := Mgo.Save(SaveDb, e)
+	sid := Mgo.Save(stancoll, e)
 	if sid == "" {
 	if sid == "" {
 		c.JSON(200, gin.H{"rep": false, "msg": "保存mongo出错"})
 		c.JSON(200, gin.H{"rep": false, "msg": "保存mongo出错"})
 		return
 		return

+ 12 - 9
src/jy/clear/tonumber.go

@@ -26,6 +26,9 @@ var moneyChar = map[string]interface{}{ //"〇": "0", "零": "0",
 	"百": float64(100), "佰": float64(100), "千": float64(1000), "仟": float64(1000), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
 	"百": float64(100), "佰": float64(100), "千": float64(1000), "仟": float64(1000), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
 	"零": float64(0), "点": ".", "角": float64(0.1), "分": float64(0.01),
 	"零": float64(0), "点": ".", "角": float64(0.1), "分": float64(0.01),
 }
 }
+// NumChar maps a single Chinese numeral character to its integer value.
+// It is looked up when a quantity cell is written as a Chinese numeral
+// (e.g. "三") instead of Arabic digits. Only single characters are covered;
+// compound numerals such as "十二" are not keys of this map.
+//
+// Fix: every entry used to map to 1, and nine was misspelled as "久",
+// so any Chinese-numeral quantity was stored as 1.
+var NumChar = map[string]interface{}{
+	"一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10,
+}
 var moneyUnit = map[string]float64{
 var moneyUnit = map[string]float64{
 	"元": float64(1), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000), //单位
 	"元": float64(1), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000), //单位
 }
 }
@@ -80,7 +83,7 @@ func ObjToFloat(data []interface{}) []interface{} {
 
 
 //金额转换
 //金额转换
 func ObjToMoney(data []interface{}) []interface{} {
 func ObjToMoney(data []interface{}) []interface{} {
-	isfindUnit := true
+	//isfindUnit := true
 	tmpstr := (data)[0]
 	tmpstr := (data)[0]
 	totmpstr := util.ObjToString(tmpstr)
 	totmpstr := util.ObjToString(tmpstr)
 	if utf8.RuneCountInString(totmpstr) > 20 {
 	if utf8.RuneCountInString(totmpstr) > 20 {
@@ -96,8 +99,8 @@ func ObjToMoney(data []interface{}) []interface{} {
 	}
 	}
 	ret := capitalMoney(data)[0]
 	ret := capitalMoney(data)[0]
 	if ret.(float64) < float64(10000) || ret.(float64) > float64(50000000000) {
 	if ret.(float64) < float64(10000) || ret.(float64) > float64(50000000000) {
-		ret2, b := numMoney(data)
-		isfindUnit = b
+		ret2, _ := numMoney(data)
+		//isfindUnit = b
 		if ret2[0].(float64) > ret.(float64) {
 		if ret2[0].(float64) > ret.(float64) {
 			ret = ret2[0]
 			ret = ret2[0]
 		}
 		}
@@ -107,12 +110,12 @@ func ObjToMoney(data []interface{}) []interface{} {
 	//	f = 0
 	//	f = 0
 	//}
 	//}
 	//若果金额小于50,全文检索单位:万
 	//若果金额小于50,全文检索单位:万
-	if f < 50 && f > 0 && isfindUnit {
-		rep := contentUnit.FindAllStringIndex(fmt.Sprint(data[1]), -1)
-		if len(rep) > 0 {
-			f = f * 10000
-		}
-	}
+	// if f < 50 && f > 0 && isfindUnit {
+	// 	rep := contentUnit.FindAllStringIndex(fmt.Sprint(data[1]), -1)
+	// 	if len(rep) > 0 {
+	// 		f = f * 10000
+	// 	}
+	// }
 	data[0] = f
 	data[0] = f
 	if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(tmpstr)) {
 	if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(tmpstr)) {
 		data = append(data, false)
 		data = append(data, false)

+ 6 - 5
src/jy/extract/exportask.go

@@ -65,8 +65,9 @@ func extractAndExport(v string, t map[string]interface{}) {
 	e.InitBlockRule()
 	e.InitBlockRule()
 	e.InitPkgCore()
 	e.InitPkgCore()
 	//品牌抽取是否开启
 	//品牌抽取是否开启
-	ju.IsBrandGoods = ju.Config["brandgoods"].(bool)
-
+	ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
+	//价格个数抽取是否开启
+	ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
 	//附件抽取是否开启
 	//附件抽取是否开启
 	e.InitFile()
 	e.InitFile()
 
 
@@ -81,11 +82,11 @@ func extractAndExport(v string, t map[string]interface{}) {
 		var isSite bool
 		var isSite bool
 		if e.IsFileField && v["projectinfo"] != nil {
 		if e.IsFileField && v["projectinfo"] != nil {
 			v["isextFile"] = true
 			v["isextFile"] = true
-			j, jf,isSite= e.PreInfo(v)
+			j, jf, isSite = e.PreInfo(v)
 		} else {
 		} else {
-			j, _,isSite = e.PreInfo(v)
+			j, _, isSite = e.PreInfo(v)
 		}
 		}
-		go e.ExtractProcess(j, jf,isSite)
+		go e.ExtractProcess(j, jf, isSite)
 		e.TaskInfo.ProcessPool <- true
 		e.TaskInfo.ProcessPool <- true
 	}
 	}
 }
 }

+ 45 - 3
src/jy/extract/extract.go

@@ -73,6 +73,8 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 
 
 	//品牌抽取是否开启
 	//品牌抽取是否开启
 	ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
 	ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
+	//价格个数抽取是否开启
+	ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
 	//附件抽取是否开启
 	//附件抽取是否开启
 	ext.InitFile()
 	ext.InitFile()
 	return RunExtractTestTask(ext, startId, num)
 	return RunExtractTestTask(ext, startId, num)
@@ -155,6 +157,8 @@ func StartExtractTaskId(taskId string) bool {
 
 
 	//品牌抽取是否开启
 	//品牌抽取是否开启
 	ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
 	ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
+	//价格个数抽取是否开启
+	ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
 	//附件抽取是否开启
 	//附件抽取是否开启
 	ext.InitFile()
 	ext.InitFile()
 
 
@@ -1843,6 +1847,32 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			}
 			}
 			// log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
 			// log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
 		}
 		}
+		//price和number抽取
+		if ju.IsPriceNumber {
+			priceNumberLen := len(j.PriceNumberData)
+			if priceNumberLen > 1 { //table数据去重
+				tmpPriceNumberData := []map[string]interface{}{}
+				tableStrs := map[string]bool{}
+				for _, tb := range j.PriceNumberData {
+					has := false
+					bytes, _ := json.Marshal(tb)
+					str := string(bytes)
+					if len(tableStrs) > 0 && tableStrs[str] {
+						has = true
+					} else {
+						tableStrs[str] = true
+					}
+					if !has {
+						for _, data := range tb {
+							tmpPriceNumberData = append(tmpPriceNumberData, data)
+						}
+					}
+				}
+				tmp["pricenumber"] = tmpPriceNumberData
+			} else if priceNumberLen == 1 {
+				tmp["pricenumber"] = j.PriceNumberData[0]
+			}
+		}
 		//所有kv组成的字符串
 		//所有kv组成的字符串
 		var kvtext bytes.Buffer
 		var kvtext bytes.Buffer
 		blocks := make([]ju.BlockAndTag, 0)
 		blocks := make([]ju.BlockAndTag, 0)
@@ -2066,14 +2096,26 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 //辅助信息,如果没有排序先排序
 //辅助信息,如果没有排序先排序
 func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
 func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
 	fieldalls := map[string][]map[string]interface{}{}
 	fieldalls := map[string][]map[string]interface{}{}
+	qykredis := redis.RedisPool[ju.QYK_RedisName].Get()
+	defer qykredis.Close()
+	db := 0
 	for field, val := range j.Result {
 	for field, val := range j.Result {
 		//ju.Sort(val)
 		//ju.Sort(val)
+		if field == "buyer" {
+			db = ju.BuyerDB
+		} else if field == "winner" {
+			db = ju.WinnerDB
+		} else if field == "agency" {
+			db = ju.AgencyDB
+		}
 		sfields := []map[string]interface{}{}
 		sfields := []map[string]interface{}{}
 		for _, v := range val {
 		for _, v := range val {
 			standardized := false
 			standardized := false
-			if field == "buyer" || field == "winner" || field == "agency" {
-				i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
-				if i > 0 {
+			if _, err := qykredis.Do("SELECT", db); err != nil {
+				fmt.Println("redis select err", err)
+			} else {
+				rep, err := qykredis.Do("GET", v.Value)
+				if rep != nil && err == nil {
 					standardized = true
 					standardized = true
 				}
 				}
 			}
 			}

+ 7 - 1
src/jy/pretreated/analystep.go

@@ -29,7 +29,7 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 		}
 		}
 	}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite) //分块
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite) //分块
-	if len(blockArrays) > 0 { //有分块
+	if len(blockArrays) > 0 {                                                                  //有分块
 		//从块里面找分包
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) //从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) //从块里面找分包
 		for _, bl := range blockArrays {
 		for _, bl := range blockArrays {
@@ -381,6 +381,12 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job, i
 			job.BrandData = append(job.BrandData, v) //加入job
 			job.BrandData = append(job.BrandData, v) //加入job
 		}
 		}
 	}
 	}
+	//加入job
+	if len(tabres.PriceNumberData) > 0 {
+		for _, tabledata := range tabres.PriceNumberData { //校验重复的table对象
+			job.PriceNumberData = append(job.PriceNumberData, tabledata)
+		}
+	}
 }
 }
 
 
 //一行多列 一列多行,按照分块逻辑处理
 //一行多列 一列多行,按照分块逻辑处理

+ 246 - 17
src/jy/pretreated/analytable.go

@@ -20,12 +20,14 @@ import (
 var (
 var (
 	//清理品目中数字
 	//清理品目中数字
 	numclear = regexp.MustCompile("^[\\d一二三四五六七八九十.]+")
 	numclear = regexp.MustCompile("^[\\d一二三四五六七八九十.]+")
+	num1     = regexp.MustCompile("(\\d)")
 	//清理表格title中的不需要的内容
 	//清理表格title中的不需要的内容
-	tabletitleclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/((人民币万元件个公斤))]")
+	tabletitleclear  = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、.,.。、_/((人民币万元件个公斤))]")
+	tabletitleclear2 = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、,。_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕]*")
 	//清理表格中是key中包含的空格或数字等
 	//清理表格中是key中包含的空格或数字等
 	tablekeyclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/]+|^[\\d一二三四五六七八九十]+[、.]*|[((【\\[].*?[))】\\]]")
 	tablekeyclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/]+|^[\\d一二三四五六七八九十]+[、.]*|[((【\\[].*?[))】\\]]")
 	//清理表格td中的符号
 	//清理表格td中的符号
-	tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、,。、_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕¥$]*")
+	tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、,。、_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕¥$]*")
 	//判断key是金额,对万元的处理
 	//判断key是金额,对万元的处理
 	moneyreg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
 	moneyreg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
 	//根据表格的内容判断是不是表头,如果含有金额则不是表头
 	//根据表格的内容判断是不是表头,如果含有金额则不是表头
@@ -653,6 +655,14 @@ func (table *Table) MergerToTableresult() {
 			}
 			}
 		}
 		}
 	}
 	}
+	//抽取price和number 并列table
+	if len(table.PriceNumberData) > 0 {
+		for _, v := range table.PriceNumberData {
+			if len(v) > 0 {
+				table.TableResult.PriceNumberData = append(table.TableResult.PriceNumberData, v)
+			}
+		}
+	}
 	if table.BlockPackage != nil && len(table.BlockPackage.Keys) == 0 {
 	if table.BlockPackage != nil && len(table.BlockPackage.Keys) == 0 {
 		for _, v := range table.BlockPackage.Keys {
 		for _, v := range table.BlockPackage.Keys {
 			if table.BlockPackage.Map[v] != nil {
 			if table.BlockPackage.Map[v] != nil {
@@ -884,6 +894,11 @@ func (tn *Table) AnalyTables(contactFormat *u.ContactFormat, isSite bool, codeSi
 			if u.IsBrandGoods {
 			if u.IsBrandGoods {
 				table.analyBrand()
 				table.analyBrand()
 			}
 			}
+			//table中抽取单价和个数
+			if u.IsPriceNumber {
+				//qutil.Debug("======================抽取price和number===========")
+				table.extractPriceNumber()
+			}
 			res, _, _, _, _ := CheckCommon(table.Tag, "abandontable")
 			res, _, _, _, _ := CheckCommon(table.Tag, "abandontable")
 			if !res {
 			if !res {
 				//过滤、标准化、合并kv,table.StandKV,table.StandKVWeight
 				//过滤、标准化、合并kv,table.StandKV,table.StandKVWeight
@@ -2167,7 +2182,7 @@ func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int,
 							}
 							}
 						}
 						}
 					}
 					}
-					if k1 == "单价"{
+					if k1 == "单价" {
 						continue
 						continue
 					}
 					}
 					tn.assemblePackage(k1, v, sv2, isSite, codeSite)
 					tn.assemblePackage(k1, v, sv2, isSite, codeSite)
@@ -3192,6 +3207,176 @@ func (tn *Table) tdkv(td *TD) []*u.Kv {
 	return thisTdKvs
 	return thisTdKvs
 }
 }
 
 
+// extractPriceNumber gathers price / number / item rows from the table and
+// appends them to table.PriceNumberData, one []map per matched column group.
+//
+// Two kinds of groups come back from initLineMapLineMapArr:
+//   - array groups: each column holds a list of cells. The matched columns are
+//     aligned to a common length and transposed, e.g.
+//     {"price":["123","125"],"number":["1","12"]} becomes
+//     [{"price":"123","number":"1"},{"price":"125","number":"12"}].
+//   - string groups: each column holds a single cell, producing one row.
+//
+// Item texts longer than 50 runes are cut to their first 50 runes.
+func (table *Table) extractPriceNumber() {
+	arrGroups, strGroups := initLineMapLineMapArr(table)
+
+	// fit truncates vals to want entries, or pads it with "" until it has want entries.
+	fit := func(vals []interface{}, want int) []interface{} {
+		if len(vals) > want {
+			return vals[:want]
+		}
+		for len(vals) < want {
+			vals = append(vals, "")
+		}
+		return vals
+	}
+
+	for _, group := range arrGroups {
+		cols := table.matchMapArrPrinceNumber(group)
+		if len(cols) == 0 {
+			continue
+		}
+		numLen, priceLen, itemLen := len(cols["number"]), len(cols["price"]), len(cols["item"])
+		// The number column sets the row count; fall back to price when there is no number column.
+		rowCount := numLen
+		if rowCount == 0 {
+			rowCount = priceLen
+		}
+		// Column lengths may differ: align price to number, then item to the row count.
+		if numLen > 0 && priceLen > 0 && numLen != priceLen {
+			cols["price"] = fit(cols["price"], numLen)
+		}
+		if rowCount > 0 && itemLen > 0 && rowCount != itemLen {
+			cols["item"] = fit(cols["item"], rowCount)
+		}
+
+		// Transpose the aligned columns into one map per row.
+		rows := []map[string]interface{}{}
+		for i := 0; i < rowCount; i++ {
+			row := make(map[string]interface{})
+			if len(cols["number"]) > 0 {
+				row["number"] = cols["number"][i]
+			}
+			if len(cols["price"]) > 0 {
+				row["price"] = cols["price"][i]
+			}
+			if len(cols["item"]) > 0 {
+				item := cols["item"][i]
+				if runes := []rune(qutil.ObjToString(item)); len(runes) > 50 {
+					row["item"] = string(runes[:50])
+				} else {
+					row["item"] = item
+				}
+			}
+			rows = append(rows, row)
+		}
+		if len(rows) > 0 {
+			table.PriceNumberData = append(table.PriceNumberData, rows)
+		}
+	}
+
+	for _, group := range strGroups {
+		row := table.matchMapPrinceNumber(group)
+		if len(row) == 0 {
+			continue
+		}
+		// Keep the row only when it carries a price or a number.
+		if row["price"] == nil && row["number"] == nil {
+			continue
+		}
+		if item := qutil.ObjToString(row["item"]); item != "" {
+			if runes := []rune(item); len(runes) > 50 {
+				row["item"] = string(runes[:50])
+			}
+		}
+		table.PriceNumberData = append(table.PriceNumberData, []map[string]interface{}{row})
+	}
+}
+
+//数组数据抽取price和number
+func (table *Table) matchMapArrPrinceNumber(arrMap *SortMap) map[string][]interface{} {
+	result := make(map[string][]interface{}) //最终存储数据
+	for _, key := range arrMap.Keys {
+		val := arrMap.Map[key].([]string)
+		for f, reg := range u.PriceNumberReg {
+			key = tabletitleclear2.ReplaceAllString(key, "") //过滤
+			if reg.MatchString(key) {                        //匹配成功
+				//qutil.Debug("arr----key", key, val, f)
+				tmp := []interface{}{}
+				if f == "price" {
+					tmp = dealPriceInterface(key, val...) //处理金额,单位转换
+				} else if f == "number" {
+					tmp = dealNumberInterface(val...) //处理数量
+				} else {
+					for _, v := range val {
+						tmp = append(tmp, v)
+					}
+				}
+				if len(tmp) > 0 {
+					result[f] = tmp
+				}
+
+			}
+		}
+	}
+	return result
+}
+
+// matchMapPrinceNumber extracts price / number / other matched fields from one
+// group of single-valued table cells.
+//
+// Each key in strMap.Keys is a cell title; its value is converted to a string
+// with qutil.ObjToString. The cleaned title is tested against every pattern in
+// u.PriceNumberReg and, on a match, stored under the pattern's field name:
+//   - "price":  skipped when the value contains more than 5 regHz matches
+//     (too much Chinese text to be a price); otherwise the first result of
+//     dealPriceInterface
+//   - "number": the first result of dealNumberInterface
+//   - any other field: the raw string value
+func (table *Table) matchMapPrinceNumber(strMap *SortMap) map[string]interface{} {
+	result := make(map[string]interface{})
+	for _, key := range strMap.Keys {
+		val := qutil.ObjToString(strMap.Map[key])
+		// Clean the title once per cell; the patterns below match on the cleaned title.
+		key = tabletitleclear2.ReplaceAllString(key, "")
+		for f, reg := range u.PriceNumberReg {
+			if !reg.MatchString(key) {
+				continue
+			}
+			switch f {
+			case "price":
+				if len(regHz.FindAllString(val, -1)) > 5 {
+					continue // next pattern; the value is treated as bad content
+				}
+				// Guard the index: the helper defers qutil.Catch, which presumably
+				// recovers panics and could then return an empty slice (TODO confirm).
+				if tmp := dealPriceInterface(key, val); len(tmp) > 0 {
+					result[f] = tmp[0]
+				}
+			case "number":
+				if tmp := dealNumberInterface(val); len(tmp) > 0 {
+					result[f] = tmp[0]
+				}
+			default:
+				result[f] = val
+			}
+		}
+	}
+	return result
+}
+
 //table中抽取品牌,table.BrandData
 //table中抽取品牌,table.BrandData
 func (table *Table) analyBrand() {
 func (table *Table) analyBrand() {
 	//5c2d8c05a5cb26b9b782572b
 	//5c2d8c05a5cb26b9b782572b
@@ -3247,7 +3432,7 @@ func (table *Table) analyBrand() {
 					delete(finishKa, "unitprice")
 					delete(finishKa, "unitprice")
 				}
 				}
 				finishData := dealArrData(maxNum, finishKa)
 				finishData := dealArrData(maxNum, finishKa)
-				table.BrandData = append(table.BrandData, finishData) //存储table.BrandData
+				table.BrandData = append(table.BrandData, finishData) //存储table.BrandData
 			}
 			}
 		}
 		}
 	}
 	}
@@ -3422,6 +3607,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 		val := table.SortKV.Map[key]
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
 		key = regReplAllSpace.ReplaceAllString(key, "")
 		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
 		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
+		//qutil.Debug(key, "---------------------------", val)
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 			/*
 				{
 				{
@@ -3762,23 +3948,66 @@ func filterItem(itemval ...string) []string {
 }
 }
 
 
 //处理价格
 //处理价格
-func dealPrice(key string, val ...string) []string {
+func dealPriceInterface(key string, val ...string) (result []interface{}) {
 	defer qutil.Catch()
 	defer qutil.Catch()
-	iswan := strings.Contains(key, "万") //表格title中带有万
-	result := []string{}
-	for _, v := range val { //1.00万元 1元
-		tmparr := strings.Split(v, ".")
-		tmparr[0] = moneyNum.ReplaceAllString(tmparr[0], "")
-		if iswan {
-			result = append(result, tmparr[0]+"0000")
-		} else {
-			if strings.Contains(v, "万") { //价格中带有万
-				result = append(result, tmparr[0]+"0000")
-			} else {
-				result = append(result, tmparr[0])
+	for _, v := range val {
+		if num1.MatchString(v) { //含数字
+			tdIsWan := strings.Contains(v, "万")
+			if !tdIsWan {
+				if strings.Contains(key, "万") {
+					v = v + "万"
+				}
 			}
 			}
+			data := []interface{}{v, ""}
+			money := clear.ObjToMoney(data)[0]
+			result = append(result, money)
+		} else {
+			result = append(result, "")
 		}
 		}
 	}
 	}
+	return
+}
+
+// dealNumberInterface converts quantity cell texts (e.g. "1个", "1.00个") into
+// values, one output entry per input string:
+//   - "" when the text has no leading numeral prefix matched by numclear
+//   - the clear.NumChar entry when the whole prefix is a key of that map
+//   - otherwise qutil.IntAll of the part of the prefix before the first "."
+func dealNumberInterface(val ...string) (result []interface{}) {
+	defer qutil.Catch()
+	for _, cell := range val {
+		prefix := numclear.FindString(cell)
+		if prefix == "" {
+			result = append(result, "")
+			continue
+		}
+		if mapped := clear.NumChar[prefix]; mapped != nil { // Chinese numeral
+			result = append(result, mapped)
+			continue
+		}
+		// Arabic digits: drop any fractional part before converting.
+		intPart := strings.Split(prefix, ".")[0]
+		result = append(result, qutil.IntAll(intPart))
+	}
+	return
+}
+
+//处理价格
+func dealPrice(key string, val ...string) []string {
+	defer qutil.Catch()
+	result := []string{}
+	for _, v := range val {
+		data := []interface{}{v, key}
+		money := clear.ObjToMoney(data)[0]
+		result = append(result, fmt.Sprintf("%v", money))
+	}
+
+	// result := []string{}
+	// for _, v := range val { //1.00万元 1元 2.25元/斤
+	// 	tmparr := strings.Split(v, ".")
+	// 	tmparr[0] = moneyNum.ReplaceAllString(tmparr[0], "")
+	// 	if iswan {
+	// 		result = append(result, tmparr[0]+"0000")
+	// 	} else { //td val值带万
+	// 		if strings.Contains(v, "万") { //价格中带有万
+	// 			result = append(result, tmparr[0]+"0000")
+	// 		} else {
+	// 			result = append(result, tmparr[0])
+	// 		}
+	// 	}
+	// }
 	return result
 	return result
 }
 }
 
 

+ 52 - 42
src/jy/pretreated/tablev2.go

@@ -17,23 +17,24 @@ import (
 
 
 //所有中标候选人只取第一个
 //所有中标候选人只取第一个
 type TableResult struct {
 type TableResult struct {
-	Id             interface{} //信息id
-	Toptype        string      //信息类型
-	Itype          int         //1全文 2是块
-	BlockTag       string      //块标签
-	Html           string
-	Tabs           []*Table            //子表集合,子表中包含标准化kv或原始kv
-	GoqueryTabs    *goquery.Selection  //goquery对象
-	TableSize      int                 //子表的个数0,1,n
-	IsMultiPackage bool                //是否有子包
-	PackageMap     *SortMap            //子包对象的sortmap,含标准化过的
-	KvTags         map[string][]*u.Tag //全局KVmap值,标准化处理过的
-	WinnerOrder    []map[string]interface{}
-	BrandData      [][]map[string]string //品牌抽取结果
-	HasKey         int                   //有key
-	HasBrand       int                   //有品牌
-	HasGoods       int                   //有商品
-	RuleBlock      *u.RuleBlock
+	Id              interface{} //信息id
+	Toptype         string      //信息类型
+	Itype           int         //1全文 2是块
+	BlockTag        string      //块标签
+	Html            string
+	Tabs            []*Table            //子表集合,子表中包含标准化kv或原始kv
+	GoqueryTabs     *goquery.Selection  //goquery对象
+	TableSize       int                 //子表的个数0,1,n
+	IsMultiPackage  bool                //是否有子包
+	PackageMap      *SortMap            //子包对象的sortmap,含标准化过的
+	KvTags          map[string][]*u.Tag //全局KVmap值,标准化处理过的
+	WinnerOrder     []map[string]interface{}
+	BrandData       [][]map[string]string      //品牌抽取结果
+	PriceNumberData [][]map[string]interface{} //单价个数抽取结果
+	HasKey          int                        //有key
+	HasBrand        int                        //有品牌
+	HasGoods        int                        //有商品
+	RuleBlock       *u.RuleBlock
 }
 }
 
 
 //快速创建TableResult对象
 //快速创建TableResult对象
@@ -87,7 +88,7 @@ var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[
 var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)
 var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)
 var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
 var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
 
 
-func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite string) *TD {
+func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSite string) *TD {
 	defer qutil.Catch()
 	defer qutil.Catch()
 	td := &TD{
 	td := &TD{
 		ArrVal:  []string{},
 		ArrVal:  []string{},
@@ -121,7 +122,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
 		//qutil.Debug("有子表格")
 		//qutil.Debug("有子表格")
 		//格式化正文
 		//格式化正文
 		txt = TextAfterRemoveTable(td.Html)
 		txt = TextAfterRemoveTable(td.Html)
-		td.tdHasTable(&bsontable, tr,isSite,codeSite) //处理td中的table,块标签处理,子表解析集处理
+		td.tdHasTable(&bsontable, tr, isSite, codeSite) //处理td中的table,块标签处理,子表解析集处理
 	} else {
 	} else {
 		txt = strings.TrimSpace(td.Goquery.Text())
 		txt = strings.TrimSpace(td.Goquery.Text())
 	}
 	}
@@ -130,7 +131,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
 	td.Text = txt //原始串
 	td.Text = txt //原始串
 	//处理table外内容
 	//处理table外内容
 	var ub []*u.Block
 	var ub []*u.Block
-	ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock,isSite,codeSite)
+	ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock, isSite, codeSite)
 	//看是否划块
 	//看是否划块
 	if len(ub) > 0 {
 	if len(ub) > 0 {
 		for _, bl := range ub {
 		for _, bl := range ub {
@@ -175,7 +176,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
 		}
 		}
 	}
 	}
 	//对td单元格值判断是否是表头和根据td内容长度进行分块处理
 	//对td单元格值判断是否是表头和根据td内容长度进行分块处理
-	td.tdIsHb(tr, table, bsontable,isSite,codeSite)
+	td.tdIsHb(tr, table, bsontable, isSite, codeSite)
 	bhead := false
 	bhead := false
 	if td.TR.RowPos == 0 { //第一行
 	if td.TR.RowPos == 0 { //第一行
 		if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
 		if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
@@ -192,7 +193,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
 }
 }
 
 
 //处理td中的table,块标签处理,子表解析集处理
 //处理td中的table,块标签处理,子表解析集处理
-func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
+func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite string) {
 	ts := td.TR.Table.TableResult
 	ts := td.TR.Table.TableResult
 	tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
 	tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
 	if len(tabs) > 0 {
 	if len(tabs) > 0 {
@@ -219,7 +220,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
 				stag = str
 				stag = str
 			}
 			}
 		}
 		}
-		if strings.Contains(stag,"开标记录"){
+		if strings.Contains(stag, "开标记录") {
 			return
 			return
 		}
 		}
 		for _, tv := range tabs {
 		for _, tv := range tabs {
@@ -228,7 +229,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
 			}
 			}
 			sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
 			sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
 			sonts.GoqueryTabs = tv
 			sonts.GoqueryTabs = tv
-			sonts.Analy(isSite,codeSite)
+			sonts.Analy(isSite, codeSite)
 
 
 			//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
 			//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
 			td.BH = false
 			td.BH = false
@@ -262,6 +263,13 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
 					}
 					}
 				}
 				}
 			}
 			}
+			if sonts.PriceNumberData != nil && len(sonts.PriceNumberData) > 0 {
+				for _, v := range sonts.PriceNumberData {
+					if len(v) > 0 {
+						td.TR.Table.TableResult.PriceNumberData = append(td.TR.Table.TableResult.PriceNumberData, v)
+					}
+				}
+			}
 			if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
 			if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
 				td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
 				td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
 			}
 			}
@@ -303,7 +311,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
 }
 }
 
 
 //对td单元格值判断是否是表头和根据td内容长度进行分块处理
 //对td单元格值判断是否是表头和根据td内容长度进行分块处理
-func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string) {
+func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite string) {
 	lenval := len([]rune(td.Val)) //经过处理的td内容长度
 	lenval := len([]rune(td.Val)) //经过处理的td内容长度
 	//if lentxt > 9 {
 	//if lentxt > 9 {
 	//td.KV = GetKVAll(txt, "")
 	//td.KV = GetKVAll(txt, "")
@@ -311,7 +319,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
 	//经过处理的td内容长度大于50,划块,分包
 	//经过处理的td内容长度大于50,划块,分包
 	if lenval > 50 { //看是否划块
 	if lenval > 50 { //看是否划块
 		//u.Debug(txt)
 		//u.Debug(txt)
-		ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock,isSite,codeSite) //对td的原始值
+		ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock, isSite, codeSite) //对td的原始值
 		//看是否划块
 		//看是否划块
 		if len(ub) > 0 {
 		if len(ub) > 0 {
 			for _, bl := range ub {
 			for _, bl := range ub {
@@ -344,10 +352,10 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
 		}
 		}
 		if isFindPkg {
 		if isFindPkg {
 			if len(ub) > 0 {
 			if len(ub) > 0 {
-				blockPackage = FindPackageFromBlocks(&ub,isSite,codeSite) //从块里面找分包
+				blockPackage = FindPackageFromBlocks(&ub, isSite, codeSite) //从块里面找分包
 			} else {
 			} else {
-				if !excludeKey2.MatchString(td.Val){
-					blockPackage = FindPackageFromText("", td.Val,isSite,codeSite) //从正文里面找分包
+				if !excludeKey2.MatchString(td.Val) {
+					blockPackage = FindPackageFromText("", td.Val, isSite, codeSite) //从正文里面找分包
 				}
 				}
 			}
 			}
 		}
 		}
@@ -389,7 +397,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
 			td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
 			td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
 			td.BH = true
 			td.BH = true
 		}
 		}
-		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3,isSite,codeSite) //td冒号kv
+		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3, isSite, codeSite) //td冒号kv
 		for k, v := range resm {
 		for k, v := range resm {
 			if k != "" && v != "" {
 			if k != "" && v != "" {
 				td.SortKV.AddKey(k, v) //存放kv值
 				td.SortKV.AddKey(k, v) //存放kv值
@@ -410,14 +418,14 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
 			if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
 			if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
 				btw = false
 				btw = false
 			}
 			}
-			if strings.Contains(td.Val, "个项目") ||strings.Contains(td.Val, "奥图码"){
+			if strings.Contains(td.Val, "个项目") || strings.Contains(td.Val, "奥图码") {
 				must = false
 				must = false
 				btw = false
 				btw = false
 			}
 			}
 			td.Valtype = repl
 			td.Valtype = repl
 			td.MustBH = must
 			td.MustBH = must
 			td.BH = btw
 			td.BH = btw
-			if strings.Contains(txt,"年估算额年(万元)"){
+			if strings.Contains(txt, "年估算额年(万元)") {
 				td.MustBH = true
 				td.MustBH = true
 				td.BH = true
 				td.BH = true
 			}
 			}
@@ -450,7 +458,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
 		if len(td.TR.TDs) > 0 {
 		if len(td.TR.TDs) > 0 {
 			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
 			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
 		}
 		}
-		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2,isSite,codeSite) //获取冒号kv入口
+		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2, isSite, codeSite) //获取冒号kv入口
 		for k, v := range resm {
 		for k, v := range resm {
 			td.SortKV.AddKey(k, v)
 			td.SortKV.AddKey(k, v)
 		}
 		}
@@ -496,7 +504,7 @@ func (tr *TR) AddTD(td *TD) {
 		tr.TDs[len(tr.TDs)-1].RightNode = td
 		tr.TDs[len(tr.TDs)-1].RightNode = td
 	}
 	}
 	**/
 	**/
-	if tr==nil|| tr.TDs == nil{
+	if tr == nil || tr.TDs == nil {
 		return
 		return
 	}
 	}
 	td.ColPos = len(tr.TDs)
 	td.ColPos = len(tr.TDs)
@@ -592,12 +600,13 @@ type Table struct {
 	StartAndEndRation      map[string]*TDRationScope //同行或同列的概率,截断的单独起算
 	StartAndEndRation      map[string]*TDRationScope //同行或同列的概率,截断的单独起算
 	StartAndEndRationKSort *SortMap
 	StartAndEndRationKSort *SortMap
 	WinnerOrder            []map[string]interface{}
 	WinnerOrder            []map[string]interface{}
-	BSplit                 bool                  //是否是有一个表拆分成的多个表
-	BHeader                bool                  //拆分表是否有表头
-	BrandData              [][]map[string]string //品牌抽取结果
-	HasKey                 int                   //有key
-	HasBrand               int                   //有品牌
-	HasGoods               int                   //有商品
+	BSplit                 bool                       //是否是有一个表拆分成的多个表
+	BHeader                bool                       //拆分表是否有表头
+	BrandData              [][]map[string]string      //品牌抽取结果
+	HasKey                 int                        //有key
+	HasBrand               int                        //有品牌
+	HasGoods               int                        //有商品
+	PriceNumberData        [][]map[string]interface{} //单价和个数抽取结果
 }
 }
 
 
 func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
 func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {
@@ -851,7 +860,8 @@ func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
 con 文本
 con 文本
 strtype 1全文 2块文本
 strtype 1全文 2块文本
 **/
 **/
-var hisReg =regexp.MustCompile("类似业绩|历史业绩")
+var hisReg = regexp.MustCompile("类似业绩|历史业绩")
+
 func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
 func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
 	defer qutil.Catch()
 	defer qutil.Catch()
 	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
 	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
@@ -869,7 +879,7 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 				}
 				}
 			}
 			}
 			if !b {
 			if !b {
-				if hisReg.MatchString(tmpt.First().Text()){
+				if hisReg.MatchString(tmpt.First().Text()) {
 					continue
 					continue
 				}
 				}
 				tabs = append(tabs, tmpt)
 				tabs = append(tabs, tmpt)

+ 4 - 3
src/jy/util/article.go

@@ -27,7 +27,8 @@ type Job struct {
 	PackageInfo       map[string]map[string]interface{} //分包信息
 	PackageInfo       map[string]map[string]interface{} //分包信息
 	RuleBlock         *RuleBlock                        //分块规则
 	RuleBlock         *RuleBlock                        //分块规则
 	BlockClassify     *BlockClassify                    //块分类
 	BlockClassify     *BlockClassify                    //块分类
-	BrandData         [][]map[string]string             //
+	BrandData         [][]map[string]string             //品牌抽取
+	PriceNumberData   [][]map[string]interface{}        //单价和个数抽取
 	HasTable          int                               //有table
 	HasTable          int                               //有table
 	HasKey            int                               //是否匹配到table中的标题
 	HasKey            int                               //是否匹配到table中的标题
 	HasBrand          int                               //有品牌
 	HasBrand          int                               //有品牌
@@ -134,8 +135,8 @@ type BlockPackage struct {
 	Budget          float64                  //标段(包)预算
 	Budget          float64                  //标段(包)预算
 	IsTrueBudget    bool                     //标段(包)预算0是否有效
 	IsTrueBudget    bool                     //标段(包)预算0是否有效
 	Winner          string                   //标段(包)中标单位
 	Winner          string                   //标段(包)中标单位
-	WinnerTel		string					 //中标单位联系电话
-	WinnerPerson	string					 //中标联系人
+	WinnerTel       string                   //中标单位联系电话
+	WinnerPerson    string                   //中标联系人
 	Bidamount       float64                  //标段(包)中标价
 	Bidamount       float64                  //标段(包)中标价
 	IsTrueBidamount bool                     //标段(包)中标价 0是否有效
 	IsTrueBidamount bool                     //标段(包)中标价 0是否有效
 	Index           string                   //序号 (转换后编号,只有数字或字母)
 	Index           string                   //序号 (转换后编号,只有数字或字母)

+ 10 - 0
src/jy/util/util.go

@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"fmt"
 	. "jy/mongodbutil"
 	. "jy/mongodbutil"
 	qu "qfw/util"
 	qu "qfw/util"
+	"regexp"
 	"strconv"
 	"strconv"
 
 
 	. "gopkg.in/mgo.v2/bson"
 	. "gopkg.in/mgo.v2/bson"
@@ -28,6 +29,10 @@ var BrandRules map[string]map[string]string
 var GoodsConfig []string
 var GoodsConfig []string
 var BrandConfig []string
 var BrandConfig []string
 
 
+var IsPriceNumber bool //是否开启价格和个数抽取
+var PriceNumberConfig map[string]string
+var PriceNumberReg map[string]*regexp.Regexp
+
 var GoodsGet *DFA     //商品
 var GoodsGet *DFA     //商品
 var BrandGet *DFA     //品牌
 var BrandGet *DFA     //品牌
 var IsBrandGoods bool //是否开启品牌抽取
 var IsBrandGoods bool //是否开启品牌抽取
@@ -49,6 +54,11 @@ func UtilInit() {
 	IsSaveTag, _ = Config["iscltlog"].(bool)
 	IsSaveTag, _ = Config["iscltlog"].(bool)
 	SaveBlock, _ = Config["saveblock"].(bool)
 	SaveBlock, _ = Config["saveblock"].(bool)
 	QualityAudit, _ = Config["qualityaudit"].(bool)
 	QualityAudit, _ = Config["qualityaudit"].(bool)
+
+	PriceNumberReg = make(map[string]*regexp.Regexp)
+	for k, v := range PriceNumberConfig {
+		PriceNumberReg[k] = regexp.MustCompile(v)
+	}
 }
 }
 
 
 func GetSyncIndex(code string) string {
 func GetSyncIndex(code string) string {

+ 11 - 7
src/main.go

@@ -24,12 +24,16 @@ func init() {
 	log.SetLevel(log.DEBUG)
 	log.SetLevel(log.DEBUG)
 	log.SetRollingDaily("./", "out.log")
 	log.SetRollingDaily("./", "out.log")
 	qu.ReadConfig(&util.Config)
 	qu.ReadConfig(&util.Config)
-	qu.ReadConfig("./res/brandrule.json", &util.BrandRules)
-	qu.ReadConfig("./res/goods.json", &util.GoodsConfig)
-	qu.ReadConfig("./res/brand.json", &util.BrandConfig)
-	//初始化品牌和商品
-	util.InitBrand()
-	util.InitGoods()
+	//抽取price和number相关
+	qu.ReadConfig("./res/pricenumber.json", &util.PriceNumberConfig)
+	/*
+		qu.ReadConfig("./res/brandrule.json", &util.BrandRules)
+		qu.ReadConfig("./res/goods.json", &util.GoodsConfig)
+		qu.ReadConfig("./res/brand.json", &util.BrandConfig)
+		初始化品牌和商品
+		util.InitBrand()
+		util.InitGoods()
+	*/
 	//初始化util
 	//初始化util
 	util.UtilInit()
 	util.UtilInit()
 	//初始化redis
 	//初始化redis
@@ -67,7 +71,7 @@ func init() {
 
 
 func main() {
 func main() {
 	extract.ExtractUdp() //udp通知抽取
 	extract.ExtractUdp() //udp通知抽取
-	extract.ClearUdp()   //udp通知清理
+	//extract.ClearUdp()   //udp通知清理
 	go extract.Export()
 	go extract.Export()
 	go Router.Run(":" + qu.ObjToString(util.Config["port"]))
 	go Router.Run(":" + qu.ObjToString(util.Config["port"]))
 	go log.Debug("启动..", qu.ObjToString(util.Config["port"]))
 	go log.Debug("启动..", qu.ObjToString(util.Config["port"]))

+ 4 - 4
src/main_test.go

@@ -27,10 +27,10 @@ func Test_han(t *testing.T) {
 	os.Exit(0)
 	os.Exit(0)
 }
 }
 func Test_task(t *testing.T) {
 func Test_task(t *testing.T) {
-	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_dev32")
+	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "5df59ee3e9d1f601e46fc3f9", "1", "mxs_v1", "mxs_v1")
-	//extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5e17e00e85a9271abf0860a6", "1", "mxs_v1", "mxs_v1")
+	//extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "5df59ee3e9d1f601e46fc3f9", "1", "mxs_v1", "mxs_v1")
+	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5df50776e9d1f601e4964179", "1", "mxs_v1", "mxs_v2")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 	time.Sleep(5 * time.Second)
 }
 }
@@ -120,7 +120,7 @@ func Test_buyer(t *testing.T) {
 	for _, d := range *demo {
 	for _, d := range *demo {
 		id1 := util.BsonIdToSId(d["_id"])
 		id1 := util.BsonIdToSId(d["_id"])
 		buyer1 := util.ObjToString(d["buyer"])
 		buyer1 := util.ObjToString(d["buyer"])
-		title := util.ObjToString(d["title"])
+		//title := util.ObjToString(d["title"])
 		for _, r := range *result {
 		for _, r := range *result {
 			id2 := util.BsonIdToSId(r["_id"])
 			id2 := util.BsonIdToSId(r["_id"])
 			buyer2 := util.ObjToString(r["buyer"])
 			buyer2 := util.ObjToString(r["buyer"])

+ 5 - 0
src/res/pricenumber.json

@@ -0,0 +1,5 @@
+{
+	"price":"(单价|^价格|(单个商品|包件)最高限价|(单次服务|控制)+金额|^金额$)",
+	"number":"((采购|需求|预估)(数)?量|^数量|服务次数)",
+	"item":"(^(货物(服务)?|品目|品名|产品|采购|项目|标项|商品|物资|物品|印刷品|物料|材料|设备)[((货物项目设备))]{0,}(名称|种类|内容|服务|描述)?$|^服务(产品|类型)|(采购|机械)(目录|设备)|^名称$)+"
+}

+ 1 - 1
src/res/tablev1.json

@@ -1,7 +1,7 @@
 {
 {
 	"normalhead":[
 	"normalhead":[
 		"^((.{2,6}(名称|编号|代码|时间|类型|性质|行政区域|原因|项目|意见|须知|程度))|标段(编号)?|招标金额|规模|统一社会信用代码|拟?中标供应商|质量|(质量)?承诺|地址|招标代理|序号|材料|结构|结构层数|评委|单位|数量|排名|标的|标项|开户银行|邮编|账号|电话|传真|网址|得分|名次|包件?号|职务|(建设|招标|采购|中标|成交|甲|乙)(单位|人|供应商|方|规模).{0,2}|.{0,5}(价格?|额|资金|[预概]算|投资|费用|报价|投标价)(万?元?([大小]写)?))$__M",
 		"^((.{2,6}(名称|编号|代码|时间|类型|性质|行政区域|原因|项目|意见|须知|程度))|标段(编号)?|招标金额|规模|统一社会信用代码|拟?中标供应商|质量|(质量)?承诺|地址|招标代理|序号|材料|结构|结构层数|评委|单位|数量|排名|标的|标项|开户银行|邮编|账号|电话|传真|网址|得分|名次|包件?号|职务|(建设|招标|采购|中标|成交|甲|乙)(单位|人|供应商|方|规模).{0,2}|.{0,5}(价格?|额|资金|[预概]算|投资|费用|报价|投标价)(万?元?([大小]写)?))$__M",
-		"^.{0,7}(((单位)?名称|总监|经理|负责人|信息|率|费|期|人|号|码|(价格?|额|资金)(万?元?([大小]写)?)|员|品目|标包|代表|区域|方式|因素|合价|合计|小计|地点|条件|(资质|类别和)等级|类别|状态)|得分|注册专业|方法|家数|全称|简称|邮件|执业或职业资格|证书|部门|事项|来源|划分|长度|规模|保证金|目标)$__",
+		"^.{0,7}(((单位)?名称|总监|经理|负责人|信息|率|费|期|人|号|码|(价格?|额|资金)(万?元?([大小]写)?)|员|品目|标包|代表|区域|方式|因素|合价|合计|小计|地点|条件|(资质|类别和)等级|类别|状态)|得分|注册专业|方法|家数|全称|简称|邮件|执业或职业资格|证书|部门|事项|来源|划分|长度|规模|保证金|目标|描述)$__",
 		"(名单|证号|名称|要求|时间|日期|地点|单位|条款|机构|范围|情况|概况|品名|规格|参数|标准|指标|型号|限价|数量|方式|等级|依据|明细|概况|内容|次数|产品|性质|地区|地址|币种|主题|详情|说明|代理(公司|机构)|节支率|名单|结果|结果公示)$|^(职称|姓名|级别|职称专业|证书名称|证书编号)$__",
 		"(名单|证号|名称|要求|时间|日期|地点|单位|条款|机构|范围|情况|概况|品名|规格|参数|标准|指标|型号|限价|数量|方式|等级|依据|明细|概况|内容|次数|产品|性质|地区|地址|币种|主题|详情|说明|代理(公司|机构)|节支率|名单|结果|结果公示)$|^(职称|姓名|级别|职称专业|证书名称|证书编号)$__",
 		"^(联系|评标|单位|公告|采购|商品|附件|质保|用途|公示|机构|评审|品名|规格|参数|指标|型号|数量|证书).{0,10}$__",
 		"^(联系|评标|单位|公告|采购|商品|附件|质保|用途|公示|机构|评审|品名|规格|参数|指标|型号|数量|证书).{0,10}$__",
 		"(专家|评委|打分)$__",
 		"(专家|评委|打分)$__",

+ 3 - 21
src/web/templates/admin/audit_auditone.html

@@ -202,25 +202,6 @@ function audit(text,_id){//单条信息审核
   id = _id;
   id = _id;
 	AddQYKHTML(text)
 	AddQYKHTML(text)
 }
 }
-//审核确认
-function auditsave(parentclass,childclass){
-	var val = $("#auditname").val();
-	$("#modal-info-auditdata").modal("hide");
-	showConfirm("确定通过?", function() {
-		$.ajax({
-			url:"/admin/audit/auditsave",
-			type:"post",
-			data:{"field":field,"val":val,"parentclass":parentclass,"childclass":childclass,"id":id,"eid":eid,"coll":coll},
-			success:function(r){
-				if(r.rep){				
-					ttable.ajax.reload();
-				}else{
-					showTip("审核失败", 1000, function() {});
-				}
-			}
-		})
-	});
-}
 function selectrow(me){
 function selectrow(me){
 		var sel=$(me);
 		var sel=$(me);
 		var isSelected=sel.prop('checked');
 		var isSelected=sel.prop('checked');
@@ -352,9 +333,10 @@ function Add() {
 	obj["coll"] = {{.coll}};
 	obj["coll"] = {{.coll}};
 	obj["_id"] = id;
 	obj["_id"] = id;
   obj["field"] = {{.name}};
   obj["field"] = {{.name}};
+  obj["stancoll"] = {{.stancoll}};
 	$.post("/admin/audit/datasave",obj,function(data){
 	$.post("/admin/audit/datasave",obj,function(data){
 		if(data&&data.rep){
 		if(data&&data.rep){
-       $("#info_data").modal("hide");
+      $("#info_data").modal("hide");
       ttable.ajax.reload();
       ttable.ajax.reload();
 		}else{
 		}else{
 			showTip(data.msg,1000)
 			showTip(data.msg,1000)
@@ -378,7 +360,7 @@ function allAudit(){
 		$.ajax({
 		$.ajax({
 			url:"/admin/audit/allaudit",
 			url:"/admin/audit/allaudit",
 			type:"post",
 			type:"post",
-			data:{"ids":ids.join(","),"names":names.join(","),"coll":{{.coll}},"field":{{.name}}},
+			data:{"ids":ids.join(","),"names":names.join(","),"coll":{{.coll}},"stancoll":{{.stancoll}},"field":{{.name}}},
 			success:function(r){
 			success:function(r){
 				if(r.rep){	
 				if(r.rep){	
 					ttable.ajax.reload();			
 					ttable.ajax.reload();			

+ 2 - 1
src/web/templates/admin/audit_recogfield.html

@@ -66,7 +66,7 @@ $(function () {
 			{ "data": "s_user"},
 			{ "data": "s_user"},
 			{ "data": "_id","width":"30%",render:function(val,a,row){
 			{ "data": "_id","width":"30%",render:function(val,a,row){
 				 return	 '<a class="btn btn-sm btn-info opr" opr="edit">编辑</a>'+
 				 return	 '<a class="btn btn-sm btn-info opr" opr="edit">编辑</a>'+
-					'&nbsp;&nbsp;<a class="btn btn-sm btn-success" href="/admin/audit/dataaudit?name='+row["s_recogfield"]+'&coll='+row["s_coll"]+'">数据审核</a>'+
+					'&nbsp;&nbsp;<a class="btn btn-sm btn-success" href="/admin/audit/dataaudit?name='+row["s_recogfield"]+'&coll='+row["s_coll"]+'&stancoll='+row["s_stancoll"]+'">数据审核</a>'+
 					'&nbsp;&nbsp;<a class="btn btn-sm btn-warning" href="/admin/rulemanager/getrulelist?id='+val+'&fname='+row["s_recogfield"]+'">编辑规则</a>'+
 					'&nbsp;&nbsp;<a class="btn btn-sm btn-warning" href="/admin/rulemanager/getrulelist?id='+val+'&fname='+row["s_recogfield"]+'">编辑规则</a>'+
 					'&nbsp;&nbsp;<a class="btn btn-sm btn-danger" onclick="del(\''+val+'\')">删除</a>'
 					'&nbsp;&nbsp;<a class="btn btn-sm btn-danger" onclick="del(\''+val+'\')">删除</a>'
 			}}
 			}}
@@ -86,6 +86,7 @@ $(function () {
 					{label:"审核字段",s_label:"s_recogfield",must:true},
 					{label:"审核字段",s_label:"s_recogfield",must:true},
 					//{label:"信息库",s_label:"s_lib",must:true},
 					//{label:"信息库",s_label:"s_lib",must:true},
 					{label:"信息表",s_label:"s_coll",must:true},
 					{label:"信息表",s_label:"s_coll",must:true},
+          {label:"标准表",s_label:"s_stancoll",must:true},
 					//{label:"异常标记",s_label:"s_tagattr",must:true},
 					//{label:"异常标记",s_label:"s_tagattr",must:true},
 					//{label:"审核标记",s_label:"s_auditattr",type:"tpl_list_local",must:true,list:[{"s_name":"正确","_id":"ok"},{"s_name":"异常","_id":"err"}],default:"ok"},
 					//{label:"审核标记",s_label:"s_auditattr",type:"tpl_list_local",must:true,list:[{"s_name":"正确","_id":"ok"},{"s_name":"异常","_id":"err"}],default:"ok"},
 					{s_label:"_id",type:"tpl_hidden"},
 					{s_label:"_id",type:"tpl_hidden"},