unknown 6 жил өмнө
parent
commit
d4ff95ef44

+ 4 - 4
src/config.json

@@ -8,10 +8,10 @@
     "elasticPoolSize": 30,
 	"mergetable":"projectset",
 	"mergetablealias":"projectset_v1",
-    "saveresult": true,
-    "fieldscore": true,
-    "qualityaudit": true,
-	"iscltlog":true,
+    "saveresult": false,
+    "fieldscore": false,
+    "qualityaudit": false,
+	"iscltlog":false,
 	"brandgoods":true,
     "udptaskid": "5be107e600746bf92debf080",
     "udpip": "127.0.0.1",

+ 1 - 1
src/jy/admin/audit/rulemanager.go

@@ -97,7 +97,7 @@ func SaveRecogField(c *gin.Context) {
 		if len(*d) > 0 {
 			c.JSON(200, gin.H{"msg": "已存在!"})
 		} else {
-			data["l_lasttime"] = time.Now().Unix()
+			data["l_createtime"] = time.Now().Unix()
 			//data["l_date"] = time.Now().Unix()
 			data["s_user"] = session.Get("username")
 			data["delete"] = false

+ 45 - 23
src/jy/pretreated/analytable.go

@@ -21,7 +21,7 @@ var (
 	//清理表格中是key中包含的空格或数字等
 	tablekeyclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/]+|^[\\d一二三四五六七八九十]+[、.]*|[((【\\[].*?[))】\\]]")
 	//清理表格td中的符号
-	tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、,。、_??;;~\\-#\\\\附(件|图)]|^*")
+	tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、,。、_??;;~\\-#\\\\]+|(详?见)附(件|图)") // "+" rather than "*": an empty-matching first alternative always wins under leftmost-first matching and would make the 附件/附图 branch unreachable
 	//判断key是金额,对万元的处理
 	moneyreg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
 	//根据表格的内容判断是不是表头,如果含有金额则不是表头
@@ -658,8 +658,6 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 		TR := NewTR(table)
 		tdTextIsNull := true
 		tds.Each(func(m int, selm *goquery.Selection) {
-			//			t, _ := selm.Html()
-			//			fmt.Println("t---------", t)
 			//对隐藏列不处理!!!
 			if IsHide(selm) {
 				return
@@ -667,7 +665,6 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 			//进入每一个单元格
 			td := NewTD(selm, TR, table)
 			//num++
-			//fmt.Println("------", td.SortKV.Keys, td.SortKV.Map)
 			TR.AddTD(td)
 			if td.Val != "" { //删除一个tr,tr中所有td是空值的
 				tdTextIsNull = false
@@ -680,6 +677,11 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 	})
 	//重置行列
 	table.ComputeRowColSpan()
+	//	for n, tr := range table.TRs {
+	//		for m, td := range tr.TDs {
+	//			qutil.Debug(td.BH, n, m, td.Text, td.StartRow, td.EndRow, td.StartCol, td.EndCol)
+	//		}
+	//	}
 
 	tm := []map[string]interface{}{}
 	tmk := map[string]bool{}
@@ -738,7 +740,7 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 			//删除尾部空行
 			for len(table.TRs) > 0 {
 				npos := len(table.TRs)
-				tailTR := table.TRs[npos-1]
+				tailTR := table.TRs[npos-1] //最后一个tr
 				bspace := true
 				for _, v := range tailTR.TDs {
 					if v.Val != "" || v.SonTableResult != nil || len(v.SortKV.Keys) > 0 {
@@ -782,6 +784,7 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 			table.TdContactFormat(contactFormat)
 			//开始查找kv,核心模块
 			table.FindKV()
+			qutil.Debug(table.SortKV.Map)
 			//table中抽取品牌
 			if u.IsBrandGoods {
 				table.analyBrand1()
@@ -827,16 +830,16 @@ func (table *Table) Adjust() {
 	table.RowNum = len(table.TRs)
 	//		for k1, tr := range table.TRs {
 	//			for k2, td := range tr.TDs {
-	//				u.Debug(k1, k2, td.Val, td.Rowspan, td.Colspan, td.ColPos, tr.RowPos)
+	//				qutil.Debug(k1, k2, td.Val, td.Rowspan, td.Colspan, td.ColPos, tr.RowPos)
 	//			}
 	//		}
 	//计算行列起止位置,跨行跨列处理
 	table.ComputeRowColSpan()
-	//		for k1, tr := range table.TRs {
-	//			for k2, td := range tr.TDs {
-	//				u.Debug(k1, k2, td.Val, td.StartRow, td.EndRow, td.StartCol, td.EndCol)
-	//			}
+	//	for k1, tr := range table.TRs {
+	//		for k2, td := range tr.TDs {
+	//			qutil.Debug(k1, k2, td.Val, td.StartRow, td.EndRow, td.StartCol, td.EndCol)
 	//		}
+	//	}
 	//大概计算每个起止行列的概率
 	table.GetKeyRation()
 	/*
@@ -847,7 +850,7 @@ func (table *Table) Adjust() {
 				for _, td := range v.Tdmap[v1] {
 					str += "__" + td.Val + fmt.Sprintf("%d_%d_%d_%d", td.StartRow, td.EndRow, td.StartCol, td.EndCol)
 				}
-				u.Debug(k, k1, string(bs), v.Rationmap[v1], str)
+				qutil.Debug(k, k1, string(bs), v.Rationmap[v1], str)
 			}
 		}
 	*/
@@ -862,7 +865,6 @@ func (table *Table) Adjust() {
 			}
 		}
 	}
-
 	if float32(count)/float32(table.TDNum) < 0.85 {
 		//精确计算起止行列是表头的概率
 		table.ComputeRowColIsKeyRation()
@@ -871,7 +873,7 @@ func (table *Table) Adjust() {
 		for i, tr := range table.TRs {
 			for _, td := range tr.TDs {
 				if td.BH {
-					//u.Debug("----=====---", td.Val, len(table.TRs[len(table.TRs)-1].TDs), i, len(table.TRs)-1)
+					//qutil.Debug("----=====---", td.Val, len(table.TRs[len(table.TRs)-1].TDs), i, len(table.TRs)-1)
 					if i == len(table.TRs)-1 && len(table.TRs[len(table.TRs)-1].TDs) == 2 {
 						res, _, _, _, _ := CheckCommon(td.Val, "abandontable")
 						if res {
@@ -896,7 +898,7 @@ func (table *Table) ComputeRowColSpan() {
 	for k, v := range table.TRs {
 		nk := 0 //nk列的起始,k行的起始||如果有合并,起始就不是0
 		ball := true
-		rowspans := v.TDs[0].Rowspan
+		rowspans := v.TDs[0].Rowspan //某一行第一个td的rowspan
 		for _, v1 := range v.TDs {
 			if v1.Rowspan != rowspans {
 				ball = false
@@ -995,16 +997,20 @@ func (table *Table) FindTag() {
 //计算r/c_start_end的概率
 func (table *Table) GetKeyRation() {
 	for _, vn := range table.StartAndEndRationKSort.Keys {
+		qutil.Debug("vn:", vn)
 		v := table.StartAndEndRation[vn]
 		for _, v1 := range v.Poss {
 			count := 0
 			n := 0
+			qutil.Debug("len:", len(v.Tdmap[v1]))
 			for _, td := range v.Tdmap[v1] {
 				n++
 				if td.BH {
+					qutil.Debug("val:", td.Val)
 					count++
 				}
 			}
+			qutil.Debug(float32(count), float32(n), float32(count)/float32(n))
 			v.Rationmap[v1] = float32(count) / float32(n)
 		}
 	}
@@ -1020,11 +1026,15 @@ func (table *Table) ComputeRowColIsKeyRation() {
 		checkCompute := map[string]bool{}
 		for k, tr := range table.TRs {
 			rk := fmtkey("r", tr.TDs[0].StartRow, tr.TDs[0].EndRow)
+			qutil.Debug("rk", rk)
 			if k == 0 { //第1行的概率
 				ck := fmtkey("c", tr.TDs[0].StartCol, tr.TDs[0].EndCol)
+				qutil.Debug("ck", ck)
 				//u.Debug(table.BFirstRow, "--", table.StartAndEndRation[rk], table.StartAndEndRation[ck])
 				ration1, _ := table.StartAndEndRation[rk].GetTDRation(tr.TDs[0])
 				ration2, _ := table.StartAndEndRation[ck].GetTDRation(tr.TDs[0])
+				qutil.Debug("ration1:", ration1, "ration2:", ration2)
+				qutil.Debug(len(tr.TDs) == 2 && ration2 < 0.55, len(tr.TDs) == 2 && ration1 > 0.5)
 				if (len(tr.TDs) == 2 && ration2 < 0.55) && (len(tr.TDs) == 2 && ration1 > 0.5) { //第一行为key
 					bkeyfirstrow = true
 					ball := true
@@ -1061,6 +1071,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
 						}
 					}
 				}
+				qutil.Debug("bkeyfirstrow:", bkeyfirstrow, "bkeyfirstcol:", bkeyfirstcol)
 				if !bkeyfirstrow && !bkeyfirstcol {
 					if len(tr.TDs) > 1 && ration1 > ration2 && ration1 > 0.5 {
 						bkeyfirstrow = true
@@ -1091,6 +1102,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
 					}
 				}
 			} else {
+				qutil.Debug("bkeyfirstrow", bkeyfirstrow)
 				if bkeyfirstrow {
 					//第一列的概率
 					ration1, _ := table.StartAndEndRation[rk].GetTDRation(tr.TDs[0])
@@ -1105,6 +1117,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
 					} //else {for _, td := range tr.TDs {}}
 				} else {
 					//列在起作用
+					qutil.Debug("bkeyfirstcol", bkeyfirstcol)
 					if bkeyfirstcol {
 						for _, td := range tr.TDs {
 							ck := fmtkey("c", td.StartCol, td.EndCol)
@@ -1142,20 +1155,25 @@ func (table *Table) ComputeRowColIsKeyRation() {
 			}
 		}
 	}
+	//qutil.Debug("table.Brule", table.Brule, !bkeyfirstcol && !bkeyfirstrow)
 	if !table.Brule || (!bkeyfirstcol && !bkeyfirstrow) {
 		//断行问题,虽然同列或同行,但中间被跨行截断,表格方向调整
 		for _, k := range table.StartAndEndRationKSort.Keys {
+			qutil.Debug("k:", k)
 			v := table.StartAndEndRation[k]
 			//横向判断,要判断最多的方向,否则会出现不定的情况(map遍历问题)
 			k1 := k[:1]
 			for _, v2 := range v.Poss {
 				lentds := len(v.Tdmap[v2])
+				qutil.Debug(v2.Max, v2.Min, "len", lentds)
 				if v.Rationmap[v2] > checkval {
 					for _, td := range v.Tdmap[v2] {
+						qutil.Debug("td:", td.Val)
 						if td.KeyDirect == 0 && !MoneyReg.MatchString(td.Val) {
 							if k1 == "r" {
 								ck := fmtkey("c", td.StartCol, td.EndCol)
 								rt := table.StartAndEndRation[ck]
+								qutil.Debug("ck:", ck, "rt:", rt)
 								//clen := 0
 								var fv float32
 								var tdn []*TD
@@ -1164,6 +1182,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
 									//clen = len(tdn)
 								}
 								if lentds > 1 {
+									qutil.Debug((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil)
 									if ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" {
 										td.KeyDirect = 1
 										td.KVDirect = 2
@@ -1173,6 +1192,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
 							} else {
 								ck := fmtkey("r", td.StartRow, td.EndRow)
 								rt := table.StartAndEndRation[ck]
+								qutil.Debug("ck:", ck, "rt:", rt)
 								var fv float32
 								var tdn []*TD
 								//clen := 0
@@ -1181,6 +1201,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
 									//clen = len(tdn)
 								}
 								if lentds > 1 {
+									qutil.Debug(tdn != nil, v.Rationmap[v2] > fv, tdn == nil)
 									if ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" {
 										td.KeyDirect = 2
 										td.KVDirect = 1
@@ -1188,12 +1209,13 @@ func (table *Table) ComputeRowColIsKeyRation() {
 									}
 								}
 							}
-
+							qutil.Debug(td.Val, td.BH, td.KeyDirect, td.KVDirect)
 						} else {
 							break
 						}
 					}
 				} else if v.Rationmap[v2] < 0.5 && len(v.Tdmap[v2]) > 3 {
+					qutil.Debug("================================")
 					for _, td := range v.Tdmap[v2] {
 						//						u.Debug(td.Val, "-----", td.BH)
 						if td.KeyDirect == 0 && td.BH && !td.MustBH {
@@ -2755,7 +2777,7 @@ func (table *Table) analyBrand1() {
 			arrcount1 := 0 //记录key是否存在必须title(数组数据)
 			arrcount2 := 0
 			ka := make(map[string][]string) //最终存储数据
-			//qutil.Debug(k, "aMap.Keys----", aMap.Keys)
+			//qutil.Debug("aMap.Keys----", aMap.Keys)
 			for _, k0 := range aMap.Keys {
 				v0 := aMap.Map[k0].([]string)
 				//qutil.Debug("k0:", k0, "v0:", v0)
@@ -3202,13 +3224,13 @@ func assembleData(m interface{}, n int) []map[string]string {
 			datas[i] = data
 		}
 		//end
-		for _, fdv := range datas { //清除空数据和只含特殊符号的数据
-			for fmk, fmv := range fdv {
-				if tabletdclear.ReplaceAllString(fmv, "") == "" {
-					delete(fdv, fmk)
-				}
-			}
-		}
+		//		for _, fdv := range datas { //清除空数据和只含特殊符号的数据
+		//			for fmk, fmv := range fdv {
+		//				if tabletdclear.ReplaceAllString(fmv, "") == "" {
+		//					delete(fdv, fmk)
+		//				}
+		//			}
+		//		}
 	} else { //字符串数据
 		realTypeM := m.(map[string]string)
 		datas = append(datas, realTypeM)

+ 12 - 13
src/jy/pretreated/tablev2.go

@@ -29,10 +29,10 @@ type TableResult struct {
 	SortKV         *SortMap             //全局KVmap值,标准化处理过的
 	SortKVWeight   map[string]int       //全局KVmap值,标准化处理过的
 	WinnerOrder    []map[string]interface{}
-	BrandData      [][]map[string]string
-	HasKey         int //有key
-	HasBrand       int //有品牌
-	HasGoods       int //有商品
+	BrandData      [][]map[string]string //品牌抽取结果
+	HasKey         int                   //有key
+	HasBrand       int                   //有品牌
+	HasGoods       int                   //有商品
 }
 
 //快速创建TableResult对象
@@ -116,6 +116,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	bsontable := false
 	txt := ""
 	if ht.Size() > 0 {
+		//qutil.Debug("有子表格")
 		txt = TextAfterRemoveTable(td.Html)
 		ts := td.TR.Table.TableResult
 		tabs, _ := ComputeConRatio(td.Html, 2)
@@ -194,9 +195,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 						}
 					}
 				}
-
 				//u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
-
 			}
 		}
 	} else {
@@ -471,7 +470,7 @@ type Table struct {
 	Brule                  bool //是否规则
 	TRs                    []*TR
 	BFirstRow              bool
-	RowNum                 int                       //
+	RowNum                 int                       //行
 	ColNum                 int                       //列数
 	TDNum                  int                       //td个数
 	BPackage               bool                      //是否有包
@@ -491,12 +490,12 @@ type Table struct {
 	StartAndEndRation      map[string]*TDRationScope //同行或同列的概率,截断的单独起算
 	StartAndEndRationKSort *SortMap
 	WinnerOrder            []map[string]interface{}
-	BSplit                 bool //是否是有一个表拆分成的多个表
-	BHeader                bool //拆分表是否有表头
-	BrandData              [][]map[string]string
-	HasKey                 int //有key
-	HasBrand               int //有品牌
-	HasGoods               int //有商品
+	BSplit                 bool                  //是否是有一个表拆分成的多个表
+	BHeader                bool                  //拆分表是否有表头
+	BrandData              [][]map[string]string //品牌抽取结果
+	HasKey                 int                   //有key
+	HasBrand               int                   //有品牌
+	HasGoods               int                   //有商品
 }
 
 func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {

+ 3 - 3
src/main_test.go

@@ -14,7 +14,7 @@ import (
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")
-	extract.StartExtractTestTask("5c528686698414055c47b115", "5c2a439aa5cb26b9b76405de", "1", "mxs_v2", "mxs_v2")
+	extract.StartExtractTestTask("5c528686698414055c47b115", "5a524c3d40d2d9bbe8e9cef0", "1", "mxs_v2", "mxs_v2")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }
@@ -66,12 +66,12 @@ func Test_reg3(t *testing.T) {
 	text := []rune("(法撒旦法士大夫发的发)生(的]发的法旦法士大夫三发的)")
 	for i := 1; i <= 2; i++ {
 		if len(text) > 0 {
-			text = aa12(i, text)
+			text = gl(i, text)
 		}
 	}
 	log.Println("finish--", string(text))
 }
-func aa12(i int, text []rune) []rune {
+func gl(i int, text []rune) []rune {
 	pairedIndex := make(map[int]int)
 	surplusMax := -1  //记录多余的反符号最大值
 	positiveMax := -1 //记录多余的正符号最大值

+ 1 - 1
src/res/brandrule.json

@@ -1,6 +1,6 @@
 {
 	"must":{
-		"itemname":"((^(货物|品目|产品|商品|物资|印刷品|物料|材料|采购项目|设备|成交标(的)?)(名称|种类|内容|服务)+|服务产品|(采购|机械)(目录|设备)|^(品名|品目)$)和?)+",
+		"itemname":"((^(货物|品目|产品|标项|商品|物资|印刷品|物料|材料|设备|成交标(的)?)(名称|种类|内容|服务)+|服务产品|(采购|机械)(目录|设备)|^(品名|品目)$)和?)+",
 		"brandname":"^(品牌(名称)?|厂家)",
 		"modal":"^(规格)?(型号|参数)|规格$|技术规格", 
 		"unitprice":"单价|^价格|(预算|采购预算)(金额)?$|(单个商品|包件)最高限价|(中标成交|单次服务|控制)+金额|^金额$"