浏览代码

table解析被覆盖

fengweiqiang 6 年之前
父节点
当前提交
fd22c41ae3
共有 4 个文件被更改,包括 51 次插入和 21 次删除
  1. 1 1
      src/jy/extract/extract.go
  2. 3 3
      src/jy/pretreated/analystep.go
  3. 14 6
      src/jy/pretreated/analytable.go
  4. 33 11
      src/jy/pretreated/tablev2.go

+ 1 - 1
src/jy/extract/extract.go

@@ -957,7 +957,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if extfrom == "title" {
 							item.Score = 4
 						}
-						if strings.Contains(val,"\n"){
+						if strings.Contains(val,"\n") {
 							item.Score -=1
 							exfield.Score-=1
 						}

+ 3 - 3
src/jy/pretreated/analystep.go

@@ -77,7 +77,8 @@ func AnalyStart(job *util.Job) {
 		if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
 		}
-		findProjectCode(newCon, job) //匹配项目编号
+		FindProjectCode(newCon, job) //匹配项目编号
+		bl.Text = newCon
 		//调用kv解析
 		bl.ColonKV = GetKVAll(newCon, "", nil, 1)
 		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
@@ -95,12 +96,11 @@ func AnalyStart(job *util.Job) {
 }
 
 //匹配项目编号
-func findProjectCode(newCon string, job *util.Job) {
+func FindProjectCode(newCon string, job *util.Job) {
 	newCon = TextAfterRemoveTable(newCon)
 	if strings.TrimSpace(newCon) == "" {
 		return
 	}
-
 	var proCode string
 	proCode = projectcodeReg.FindString(newCon)
 	blCode := &util.Block{}

+ 14 - 6
src/jy/pretreated/analytable.go

@@ -108,8 +108,9 @@ var (
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
 	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(编号|项目编号|标段编号){1}(:|:)(.){4,30}()|\)|\])`)
-	projectcodeReg2             = regexp.MustCompile(`(编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9]`)
-	jsonReg						= regexp.MustCompile(`\{.+:[^}]*\} `)//  \{".*\":\".+\"}
+	projectcodeReg2             = regexp.MustCompile(`((?:^|\n)编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9]`)
+	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
+	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -249,6 +250,11 @@ func (table *Table) KVFilter() {
 					//						}
 					//					}
 				}
+			} else {
+				if table.StandKV[k] == "" && qutil.ObjToString(v) != "" {
+					table.StandKV[k] = qutil.ObjToString(v)
+					table.StandKVWeight[k] = 0
+				}
 			}
 		} else {
 			//u.Debug(k, v, "---------")
@@ -583,9 +589,9 @@ func (table *Table) MergerToTableresult() {
 			if table.TableResult.SortKV.Map[k] == nil {
 				table.TableResult.SortKV.AddKey(k, v) //父集
 			} else {
-				if k == "项目编号"{
-					reg := regexp.MustCompile("[\u4e00-\u9fa5]")
-					if reg.MatchString(v){
+				if k == "项目编号" { // key already present: skip any new value that contains a Chinese character
+
+					if regHz.MatchString(v) {
 						continue
 					}
 				}
@@ -652,7 +658,9 @@ func (ts *TableResult) Analy() {
 		//核心模块
 		ts := tn.Analy(contactFormat)
 		for _, tab := range ts {
-			tabs = append(tabs, tab)
+			if len(tab.TRs) > 0{
+				tabs = append(tabs, tab)
+			}
 			//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
 		}
 		//tn.SonTables = append(tn.SonTables, tn)

+ 33 - 11
src/jy/pretreated/tablev2.go

@@ -3,6 +3,7 @@ package pretreated
 //定义表格对象
 
 import (
+	"encoding/json"
 	"fmt"
 	u "jy/util"
 	"log"
@@ -146,16 +147,6 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 					}
 				}
 			}
-		}else {
-			//调用kv解析
-			cKV := GetKVAll(txt, "", nil, 1)
-			for k,v :=range cKV.Kv{
-				td.SortKV.AddKey(k,v)
-			}
-			sKV := SspacekvEntity.Entrance(txt, "", nil)
-			for k,v :=range sKV.Kv{
-				td.SortKV.AddKey(k,v)
-			}
 		}
 	} else {
 		txt = strings.TrimSpace(td.Goquery.Text())
@@ -163,6 +154,33 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
 	td.Val = text //值
 	td.Text = txt //原始串
+	//调用kv解析
+	cKV := GetKVAll(text, "", nil, 1)
+	for k,v :=range cKV.Kv{
+		td.SortKV.AddKey(k,v)
+	}
+	sKV := SspacekvEntity.Entrance(text, "", nil)
+	for k,v :=range sKV.Kv{
+		td.SortKV.AddKey(k,v)
+	}
+	proCode := projectcodeReg.FindString(text)
+	if proCode != "" {
+		ckv := GetKVAll(proCode, "", nil, 1)
+		for k,v :=range ckv.Kv{
+			td.SortKV.AddKey(k,v)
+		}
+	}else if proCode = projectcodeReg2.FindString(text);proCode !=""{
+		ckv := GetKVAll(proCode, "", nil, 1)
+		for k,v :=range ckv.Kv{
+			td.SortKV.AddKey(k,v)
+		}
+	}else if proCode = jsonReg.FindString(text);proCode != ""{
+		jsonMap := make(map[string]string)
+		json.Unmarshal([]byte(proCode),&jsonMap)
+		for k,v := range jsonMap{
+			td.SortKV.AddKey(k,v)
+		}
+	}
 	//对td单元格值判断是否是表头和根据td内容长度进行分块处理
 	td.tdIsHb(tr, table, bsontable)
 	bhead := false
@@ -859,7 +877,11 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 	**/
 	return
 }
-
+// HtmlToText parses con as HTML with goquery and returns the text content of the resulting document.
+func HtmlToText(con string) string {
+	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) // NOTE(review): the parse error is discarded — confirm doc2 cannot be nil when parsing fails
+	return doc2.Text()
+}
 //取出排除表格之外的文本
 func TextAfterRemoveTable(con string) string {
 	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))