Răsfoiți Sursa

不规则表格-是否是规则的表格,单元格个数=行数*列数

zhengkun 3 ani în urmă
părinte
comite
f5848b6e4b

+ 97 - 12
src/jy/extract/extract.go

@@ -440,6 +440,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		RuleBlock:     e.RuleBlock,
 		Dataging:      qu.IntAll(doc["dataging"]),
 		IsClearnMoney: isClearnMoneystr,
+		IsUnRulesTab :	false,
 	}
 	if isextFile {
 		jf = &ju.Job{
@@ -460,6 +461,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 			IsFile:         isextFile,
 			Dataging:       qu.IntAll(doc["dataging"]),
 			IsClearnMoney:  isClearnMoneystr,
+			IsUnRulesTab :	false,
 		}
 	}
 	codeSite := j.SpiderCode
@@ -1903,6 +1905,66 @@ var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:
 var unPackageWinnerReg = regexp.MustCompile("(重新招标)")
 
 
+// calculateAbnormalMoney scans the candidates in val that have IsTrue set and Score > -1; when they carry exactly two distinct values, one being 10000 times the other and the larger at least 1e9, it returns true plus the index in val of the first candidate holding the smaller amount, otherwise (false, 0).
+func calculateAbnormalMoney(val []*ju.ExtField) (bool,int) {
+	// Only the case of exactly two distinct amounts is handled: a 10000x ratio with the larger amount at or above 1e9 (one billion).
+	moneyIndex := []int{} // position in val of each kept candidate, parallel to moneyArr
+	moneyArr := []float64{} // qu.Float64All of each kept candidate's Value
+	difValue := map[string]interface{}{} // used as a set of the distinct value keys seen so far
+	for k, v := range val { // collect every candidate that passes the filter below
+		if  v.IsTrue && v.Score > -1 {
+			moneyArr = append(moneyArr,qu.Float64All(v.Value))
+			moneyIndex = append(moneyIndex,k)
+			key := ""
+			if m,ok := v.Value.(float64);ok { // float64 values are keyed by their %f text
+				key = fmt.Sprintf("%f",m)
+			}else { // any other type is keyed by qu.ObjToString
+				key = qu.ObjToString(v.Value)
+			}
+			if difValue[key]==nil {
+				difValue[key] = 1
+			}
+			if len(difValue)>2 { // more than two distinct keys: stop early, no ratio pair
+				return false,0
+			}
+		}
+	}
+	// With exactly two distinct keys, pick the two amounts and test the 10000x ratio.
+	if len(difValue)==2 {
+		money_1,money_2 := float64(0),float64(0)
+		for k,v := range moneyArr{
+			if k==0 { // money_1 is the first kept amount
+				money_1=v
+			}else {
+				if v!=money_1 { // money_2 is the first amount that differs from money_1
+					money_2=v
+					break
+				}
+			}
+		}
+		isRatio,new_money:= false,float64(0)  // isRatio: the two amounts are in the 10000x relation; new_money: the smaller one
+		if money_1!=float64(0)&&money_2!=float64(0) {
+			if money_1 == money_2*float64(10000) && money_1>=1000000000{ // NOTE(review): exact float equality; confirm rounding cannot break the match
+				isRatio = true
+				new_money = money_2
+			}
+			if money_2 == money_1*float64(10000) && money_2>=1000000000{
+				isRatio = true
+				new_money = money_1
+			}
+
+			if isRatio { // adopt the smaller amount
+				for k,v := range moneyArr{
+					if v==new_money {
+						return true,moneyIndex[k] // index in val of the first candidate with that amount
+					}
+				}
+			}
+		}
+	}
+	return false,0 // no ratio pair found; the index carries no meaning in this case
+}
+
 
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
@@ -1989,8 +2051,19 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			}
 
 			//预算-中标金额字段-特殊情况特殊处理
-
-
+			if k=="bidamount" || k=="budget" {
+				b,index :=calculateAbnormalMoney(val)
+				if b {
+					new_v := val[index]
+					tmp[new_v.Field] = new_v.Value
+					fieldSource[new_v.Field] = map[string]interface{}{
+						"ext_type":new_v.Type,
+						"ext_from":new_v.ExtFrom,
+					}
+					tmp["is_dif_ratioMoney"] = true
+					continue
+				}
+			}
 
 			for _, v := range val { //取第一个非负数,项目名称除外
 				//存0是否有效
@@ -2047,11 +2120,16 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					}
 				}
 				if qu.Float64All(tmp["budget"]) < tmpBudget {
-					fieldSource["budget"] = map[string]interface{}{
-						"ext_type":"",
-						"ext_from":"package",
+					if tmpBudget == qu.Float64All(tmp["budget"])*float64(10000) &&
+						tmpBudget>=1000000000 && qu.Float64All(tmp["budget"])>0{
+						tmp["is_dif_ratioMoney"] = true
+					}else {
+						fieldSource["budget"] = map[string]interface{}{
+							"ext_type":"",
+							"ext_from":"package",
+						}
+						tmp["budget"] = tmpBudget
 					}
-					tmp["budget"] = tmpBudget
 				}
 				if qu.Float64All(tmp["agencyfee"]) < tmpAgencyfee {
 					fieldSource["agencyfee"] = map[string]interface{}{
@@ -2067,11 +2145,16 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					}
 					tmp["bidamount"] = tmpBidamount
 				} else if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
-					fieldSource["bidamount"] = map[string]interface{}{
-						"ext_type":"",
-						"ext_from":"package",
+					if tmpBidamount == qu.Float64All(tmp["bidamount"])*float64(10000) &&
+						tmpBidamount>=1000000000 && qu.Float64All(tmp["bidamount"])>0{
+						tmp["is_dif_ratioMoney"] = true
+					}else {
+						fieldSource["bidamount"] = map[string]interface{}{
+							"ext_type":"",
+							"ext_from":"package",
+						}
+						tmp["bidamount"] = tmpBidamount
 					}
-					tmp["bidamount"] = tmpBidamount
 				}
 			} else {
 				//包数等于1,tmp没有值取包里的值
@@ -2201,8 +2284,10 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 
 		//添加字段来源
 		tmp["field_source"] = fieldSource
-		//添加字段来源
-
+		//是否为不规则表格字段
+		if j.IsUnRulesTab {
+			tmp["is_UnRules_Tab"]= j.IsUnRulesTab
+		}
 		for k, v := range *doc {
 			if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
 				(*doc)[k] = []rune(qu.ObjToString(v))[:100000]

+ 11 - 10
src/jy/pretreated/analystep.go

@@ -60,8 +60,6 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 			ration = newration
 		}
 	}*/
-	//计算是否规则-不规则
-
 
 	job.BlockPackage = map[string]*util.BlockPackage{}
 	//分块+处理每块kv
@@ -80,8 +78,10 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 			}
 			FindProjectCode(bl.Text, job) //匹配项目编号
 			//表格找分包相关---
-			processTableInBlock(bl, job, isSite, codeSite) //处理表格
-
+			isUnRulesTab := processTableInBlock(bl, job, isSite, codeSite) //处理表格
+			if isUnRulesTab { //是否不规则表格
+				job.IsUnRulesTab = isUnRulesTab
+			}
 			//新加 未分块table中未能解析到中标候选人,从正文中解析-全文匹配一次
 			if (job.Winnerorder == nil || len(job.Winnerorder) == 0 ) || len(job.Winnerorder) > 10 {
 				//表格没有划分时候:-纯文本匹配
@@ -122,9 +122,7 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 				//blockTag - 块标签
 				//处理表格
 				tabres := AnalyTableV2(tabs[i], job.Category, blockTag, con, 1, job.SourceMid, job.RuleBlock, isSite, codeSite) //解析表格入口 返回:汇总表格对象
-				if tabres.TableSize>0 {
-
-				}
+				job.IsUnRulesTab = tabres.isUnRulesTab
 				processTableResult(tabres, bl, job, isSite, codeSite)
 			}
 		} else {
@@ -172,10 +170,11 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 	}
 }
 
-func processTableInBlock(bl *util.Block, job *util.Job, isSite bool, codeSite string) {
+func processTableInBlock(bl *util.Block, job *util.Job, isSite bool, codeSite string) bool {
 	//块中再查找表格(块,处理完把值赋到块)
 	//bl.Text = formatText(bl.Text, "biangeng")
 	tabs, _ := ComputeConRatio(bl.Text, 2)
+	isUnRulesTab := false
 	for i, tab := range tabs {
 		job.HasTable = 1
 		tmptag := ""
@@ -190,14 +189,16 @@ func processTableInBlock(bl *util.Block, job *util.Job, isSite bool, codeSite st
 		//	tabres.PackageMap = nil
 		//	tabres.IsMultiPackage = false
 		//}
-		if tabres.TableSize>0 {
-
+		if tabres.isUnRulesTab { //是否为不规则表格
+			isUnRulesTab = true
 		}
 		processTableResult(tabres, bl, job, isSite, codeSite) //分析table解析结果
 		if bl.Title == "" && tabres.BlockTag != "" {
 			bl.Title = tabres.BlockTag
 		}
 	}
+
+	return isUnRulesTab
 }
 
 //匹配项目编号

+ 3 - 3
src/jy/pretreated/analytable.go

@@ -847,10 +847,10 @@ func (ts *TableResult) Analy(isSite bool, codeSite string) {
 	}
 	for _, table := range tabs {
 		table.MergerToTableresult()
-		//		for k, v := range table.TableResult.SortKV.Map {
-		//			qutil.Debug(k, "=====", v)
-		//		}
 		MergeKvTags(ts.KvTags, table.TableResult.KvTags)
+		if !table.Brule {
+			ts.isUnRulesTab = true
+		}
 	}
 }
 

+ 2 - 0
src/jy/pretreated/tablev2.go

@@ -36,6 +36,7 @@ type TableResult struct {
 	HasBrand        int                        //有品牌
 	HasGoods        int                        //有商品
 	RuleBlock       *u.RuleBlock
+	isUnRulesTab    bool
 }
 
 //快速创建TableResult对象
@@ -51,6 +52,7 @@ func NewTableResult(Id interface{}, Toptype, BlockTag, con string, Itype int, ru
 		PackageMap:  NewSortMap(),
 		KvTags:      map[string][]*u.Tag{},
 		RuleBlock:   ruleBlock,
+		isUnRulesTab:false,
 	}
 }
 

+ 1 - 0
src/jy/util/article.go

@@ -42,6 +42,7 @@ type Job struct {
 	SimDistrictScore  map[string]float64                //简称district得分
 	Dataging          int
 	IsClearnMoney     string //站点清理金额
+	IsUnRulesTab      bool //是否为不规则表格
 }
 
 type ExtField struct {