wcj 6 éve
szülő
commit
78faed2edd

+ 10 - 2
src/jy/extract/extract.go

@@ -289,7 +289,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 			pretreated.AnalyStart(jf)
 		}
 	}, func(err interface{}) {
-		log.Debug("pretreated.AnalyStart", err)
+		log.Debug("pretreated.AnalyStart", err, j.SourceMid)
 	})
 	return j, jf
 }
@@ -678,8 +678,16 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 //lua脚本根据属性设置提取kv值
 func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
 	kvmap := map[string][]map[string]interface{}{}
+	blocks := []*ju.Block{}
+	for _, bl := range j.Block {
+		if len(bl.Block) > 0 {
+			blocks = append(blocks, bl.Block...)
+		} else {
+			blocks = append(blocks, bl)
+		}
+	}
 	for fieldname, field := range in.LFields {
-		for _, bl := range j.Block {
+		for _, bl := range blocks {
 			tp := ""
 			for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
 				if k == 0 {

+ 10 - 4
src/jy/pretreated/analystep.go

@@ -56,7 +56,8 @@ func AnalyStart(job *util.Job) {
 			for i := 0; i < len(tabs); i++ {
 				//添加标识:文本中有table
 				tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-				processTableResult(tabres, bl, job)                                                     //分析table解析结果
+				processTableResult(tabres, bl, job)
+				//分析table解析结果
 			}
 			//			for k, v := range bl.TableKV.Kv {
 			//				log.Println("bl.TableKV.Kv", k, v)
@@ -139,7 +140,10 @@ func FindProjectCode(newCon string, job *util.Job) {
 //分析table解析结果
 func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 	//解析结果中的kv
-	block.TableKV = &util.JobKv{KvTags: tabres.KvTags}
+	if block.TableKV == nil {
+		block.TableKV = util.NewJobKv()
+	}
+	MergeKvTags(block.TableKV.KvTags, tabres.KvTags)
 	//分包
 	tablePackage := map[string]*util.BlockPackage{}
 	if tabres.IsMultiPackage {
@@ -152,7 +156,7 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 			//解析kv
 			//找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
 			labelKVs := []*util.Kv{}
-			if blockPackage.TableKV != nil && len(blockPackage.TableKV.KvTags) > 0 {
+			if blockPackage.TableKV != nil {
 				for tk, tv := range blockPackage.TableKV.KvTags {
 					for _, tvv := range tv {
 						if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
@@ -163,8 +167,10 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 						}
 					}
 				}
+			} else {
+				blockPackage.TableKV = util.NewJobKv()
 			}
-			blockPackage.TableKV.KvTags = GetKvTags(labelKVs, "", nil)
+			MergeKvTags(blockPackage.TableKV.KvTags, GetKvTags(labelKVs, "", nil))
 			tablePackage[k] = blockPackage
 		}
 	}

+ 11 - 1
src/jy/pretreated/analytable.go

@@ -3,6 +3,7 @@ package pretreated
 import (
 	"fmt"
 	u "jy/util"
+	"log"
 	qutil "qfw/util"
 	"regexp"
 	"strings"
@@ -708,6 +709,7 @@ func (ts *TableResult) Analy() {
 		//		for k, v := range table.TableResult.SortKV.Map {
 		//			qutil.Debug(k, "=====", v)
 		//		}
+		MergeKvTags(ts.KvTags, table.TableResult.KvTags)
 	}
 }
 
@@ -788,6 +790,7 @@ func (tn *Table) AnalyTables(contactFormat *u.ContactFormat) []*Table {
 			table.TdContactFormat(contactFormat) //contactFormat,处理采购单位,代理机构
 			//开始查找kv,核心模块,table.SortKV
 			table.FindKV()
+			log.Println(table.SortKV.Map)
 			//table中抽取品牌,table.BrandData
 			if u.IsBrandGoods {
 				table.analyBrand()
@@ -1487,7 +1490,11 @@ func (table *Table) FindKV() {
 			}
 			if len(kmap) > 0 {
 				for _, k := range kmapkeys {
-					table.SortKV.AddKey(k, kmap[k])
+					if len(kmap[k]) == 1 {
+						table.SortKV.AddKey(k, kmap[k][0])
+					} else if len(kmap[k]) > 1 {
+						table.SortKV.AddKey(k, kmap[k])
+					}
 				}
 			}
 		}
@@ -2082,6 +2089,9 @@ func (tn *Table) isGoonNext() {
 					if bp.TableKV == nil {
 						bp.TableKV = u.NewJobKv()
 					}
+					if bp.SpaceKV == nil {
+						bp.SpaceKV = u.NewJobKv()
+					}
 					for k2, v2 := range mv.ColonKV.KvTags {
 						for _, v2v := range v2 {
 							isExists := false

+ 19 - 14
src/jy/pretreated/tablev2.go

@@ -63,7 +63,6 @@ type TD struct {
 	Val            string             //值
 	Text           string             //原始串
 	SortKV         *SortMap           //存放kv值
-	SortKVWeight   map[string]int     //存放kv值权重
 	Html           string             //html值
 	BH             bool               //是否是表头
 	MustBH         bool               //不能修改的表头
@@ -91,12 +90,11 @@ var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿
 func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	defer qutil.Catch()
 	td := &TD{
-		ArrVal:       []string{},
-		Goquery:      Goquery,
-		SonTds:       []*TD{},
-		TR:           tr,
-		SortKV:       NewSortMap(),
-		SortKVWeight: map[string]int{},
+		ArrVal:  []string{},
+		Goquery: Goquery,
+		SonTds:  []*TD{},
+		TR:      tr,
+		SortKV:  NewSortMap(),
 	}
 	colspan, rowspan := 0, 0
 	col, bcol := td.Goquery.Attr("colspan")
@@ -145,20 +143,28 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 				td.SortKV.AddKey(bl_sk, bl_sv)
 			}
 		}
+	} else {
+		//for _, v := range GetKVAll(txt, "", nil, 2).KvTags {
+		//for _, vv := range v {
+		//td.SortKV.AddKey(vv.Key, vv.Value)
+		//}
+		//}
 	}
 	//抽取不到走正则抽
 	proCode := projectcodeReg.FindString(text)
 	if proCode != "" {
 		ckv := GetKVAll(proCode, "", nil, 1)
-		for k, v := range ckv.KvTags {
-			td.SortKV.AddKey(k, v)
-			td.SortKVWeight[k] = -99
+		for _, v := range ckv.KvTags {
+			for _, vv := range v {
+				td.SortKV.AddKey(vv.Key, vv.Value)
+			}
 		}
 	} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
 		ckv := GetKVAll(proCode, "", nil, 1)
-		for k, v := range ckv.KvTags {
-			td.SortKV.AddKey(k, v)
-			td.SortKVWeight[k] = -99
+		for _, v := range ckv.KvTags {
+			for _, vv := range v {
+				td.SortKV.AddKey(vv.Key, vv.Value)
+			}
 		}
 	}
 	if proCode = jsonReg.FindString(text); proCode != "" {
@@ -166,7 +172,6 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		json.Unmarshal([]byte(proCode), &jsonMap)
 		for k, v := range jsonMap {
 			td.SortKV.AddKey(k, v)
-			td.SortKVWeight[k] = -99
 		}
 	}
 	//对td单元格值判断是否是表头和根据td内容长度进行分块处理