Browse Source

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

fengweiqiang 6 years ago
parent
commit
b628b6b457

+ 68 - 24
src/jy/extract/extract.go

@@ -392,17 +392,19 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				// log.Debug("抽取-后置规则", tmp)
 
 				//项目名称未能抽取到,标题来凑
-				if vc.Field == "projectname" && vc.ExtFrom == "title" {
-					isextitle := true
-					for _, v := range j.Result[vc.Field] {
-						if len([]rune(qu.ObjToString(v.Value))) > 5 {
-							isextitle = false
-							break
+				if vc.Field == "projectname" {
+					if vc.ExtFrom == "title" {
+						isextitle := true
+						for _, v := range j.Result[vc.Field] {
+							if len([]rune(qu.ObjToString(v.Value))) > 5 {
+								isextitle = false
+								break
+							}
+						}
+						if isextitle { //标题加入选举
+							field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
+							j.Result[vc.Field] = append(j.Result[vc.Field], field)
 						}
-					}
-					if isextitle { //标题加入选举
-						field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
-						j.Result[vc.Field] = append(j.Result[vc.Field], field)
 					}
 					for i := 0; i < 3; i++ {
 						for _, v := range vc.RuleBacks {
@@ -578,26 +580,68 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
 			ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e)
 		}
 	}
-	for k, v := range kvMap {
-		if j.Result[k] == nil {
-			j.Result[k] = [](*ju.ExtField){}
+	//如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
+	if vc.Field == "budget" && len(kvMap) == 0 {
+		if len(j.BlockPackage) == 1 {
+			for _, bp := range j.BlockPackage {
+				for fieldname, field := range vc.LFields {
+					if field != vc.Field {
+						continue
+					}
+					tp := ""
+					for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
+						if k == 0 {
+							tp = "colon"
+						} else if k == 1 {
+							tp = "space"
+						} else if k == 2 {
+							tp = "table"
+						}
+						if v == nil || v.KvTags == nil {
+							continue
+						}
+						for _, vv := range v.KvTags[fieldname] {
+							text := ju.TrimLRSpace(vv.Value, "")
+							if text != "" {
+								j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{
+									Field:       vc.Field,
+									Code:        "CL_分包",
+									Type:        tp,
+									MatchType:   "package",
+									RuleText:    bp.Text,
+									SourceValue: vv.Key,
+									Value:       text,
+								})
+							}
+						}
+					}
+				}
+				break
+			}
 		}
-		for _, tmp := range v {
-			field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
-			if tmp["blocktag"] != nil {
-				btag := make(map[string]string)
-				for k := range tmp["blocktag"].(map[string]bool) {
-					blocktag.Lock()
-					if TagConfigDesc[k] != "" {
-						btag[k] = TagConfigDesc[k]
+	} else {
+		for k, v := range kvMap {
+			if j.Result[k] == nil {
+				j.Result[k] = [](*ju.ExtField){}
+			}
+			for _, tmp := range v {
+				field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+				if tmp["blocktag"] != nil {
+					btag := make(map[string]string)
+					for k := range tmp["blocktag"].(map[string]bool) {
+						blocktag.Lock()
+						if TagConfigDesc[k] != "" {
+							btag[k] = TagConfigDesc[k]
+						}
+						blocktag.Unlock()
 					}
-					blocktag.Unlock()
+					field.BlockTag = btag
 				}
-				field.BlockTag = btag
+				j.Result[k] = append(j.Result[k], field)
 			}
-			j.Result[k] = append(j.Result[k], field)
 		}
 	}
+
 }
 
 //抽取-规则-kv

+ 1 - 1
src/jy/pretreated/analykv.go

@@ -13,7 +13,7 @@ var Key = regexp.MustCompile("[:::]")
 var Time = regexp.MustCompile("[\\d]")
 var dh = regexp.MustCompile("[,,.]")
 var space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+")
-var val = regexp.MustCompile("[^\\s\\n \u3000\u2003\u00a0,,。!;;]")
+var val = regexp.MustCompile("[^\\s\\n \u3000\u2003\u00a0,,。!;;\\-]")
 var matchkh = map[string]string{
 	"(": ")",
 	"(": ")",

+ 44 - 40
src/jy/pretreated/analystep.go

@@ -30,7 +30,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 { //有分块
+	if len(blockArrays) > 0 {                                                //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -38,11 +38,11 @@ func AnalyStart(job *util.Job) {
 			if len([]rune(bl.Text)) > 80 {
 				bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock)
 				for _, bl_bl := range bl.Block {
-					processTableInBlock(bl_bl, job)
+					processTableInBlock(bl_bl, job, false)
 				}
 			}
 			FindProjectCode(bl.Text, job) //匹配项目编号
-			processTableInBlock(bl, job)
+			processTableInBlock(bl, job, true)
 			//新加 未分块table中未能解析到中标候选人,从正文中解析
 			if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
 				bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
@@ -86,14 +86,18 @@ func AnalyStart(job *util.Job) {
 	}
 }
 
-func processTableInBlock(bl *util.Block, job *util.Job) {
+func processTableInBlock(bl *util.Block, job *util.Job, packageFlag bool) {
 	//块中再查找表格(块,处理完把值赋到块)
 	tabs, _ := ComputeConRatio(bl.Text, 2)
 	for _, tab := range tabs {
 		job.HasTable = 1
 		//添加标识:文本中有table
 		tabres := AnalyTableV2(tab, job.Category, bl.Title, tab.Text(), 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-		processTableResult(tabres, bl, job)                                                              //分析table解析结果
+		if packageFlag {
+			tabres.PackageMap = nil
+			tabres.IsMultiPackage = false
+		}
+		processTableResult(tabres, bl, job) //分析table解析结果
 		if bl.Title == "" && tabres.BlockTag != "" {
 			bl.Title = tabres.BlockTag
 		}
@@ -109,43 +113,43 @@ func FindProjectCode(newCon string, job *util.Job) {
 	var proCode string
 	blCode := &util.Block{}
 	/*		if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
-			//5d424bdfa5cb26b9b7ac7a85
-			//5d425a48a5cb26b9b7df5fec
-			//5d425506a5cb26b9b7cd2c3c
-			splitStr := strings.Split(newConTMP, " ")
-			if len(splitStr) >= 2 {
-				if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
-					newCon = "项目编号:" + splitStr[len(splitStr)-1]
-				} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
-					//5d4253f3a5cb26b9b7ca2662
-					newCon = "项目编号:" + tmpstr
-				}
-			} else if len(splitStr) == 1 {
-				if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
-					newCon = "项目编号:" + tmpstr
-				} else if strings.Contains(newConTMP, "、") {
-					tmpstrs := strings.Split(newCon, "、")
-					newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+				//5d424bdfa5cb26b9b7ac7a85
+				//5d425a48a5cb26b9b7df5fec
+				//5d425506a5cb26b9b7cd2c3c
+				splitStr := strings.Split(newConTMP, " ")
+				if len(splitStr) >= 2 {
+					if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
+						newCon = "项目编号:" + splitStr[len(splitStr)-1]
+					} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+						//5d4253f3a5cb26b9b7ca2662
+						newCon = "项目编号:" + tmpstr
+					}
+				} else if len(splitStr) == 1 {
+					if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+						newCon = "项目编号:" + tmpstr
+					} else if strings.Contains(newConTMP, "、") {
+						tmpstrs := strings.Split(newCon, "、")
+						newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+					}
 				}
 			}
-		}
-		proCode = projectcodeReg.FindString(newCon)
-		if proCode != "" {
-			ckv := GetKVAll(proCode, job.Title, nil, 1)
-			blCode.ColonKV = ckv
-			blCode.Text = proCode
-			job.Block = append(job.Block, blCode)
-		} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
-			ckv := GetKVAll(proCode, job.Title, nil, 1)
-			blCode.ColonKV = ckv
-			blCode.Text = proCode
-			job.Block = append(job.Block, blCode)
-		} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
-			ckv := GetKVAll(proCode, job.Title, nil, 1)
-			blCode.Text = proCode
-			blCode.ColonKV = ckv
-			job.Block = append(job.Block, blCode)
-		}*/
+			proCode = projectcodeReg.FindString(newCon)
+			if proCode != "" {
+				ckv := GetKVAll(proCode, job.Title, nil, 1)
+				blCode.ColonKV = ckv
+				blCode.Text = proCode
+				job.Block = append(job.Block, blCode)
+			} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
+				ckv := GetKVAll(proCode, job.Title, nil, 1)
+				blCode.ColonKV = ckv
+				blCode.Text = proCode
+				job.Block = append(job.Block, blCode)
+			} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
+				ckv := GetKVAll(proCode, job.Title, nil, 1)
+				blCode.Text = proCode
+				blCode.ColonKV = ckv
+				job.Block = append(job.Block, blCode)
+			}*/
 	if proCode = jsonReg.FindString(newCon); proCode != "" {
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode), &jsonMap)

+ 1 - 1
src/jy/pretreated/analytable.go

@@ -93,7 +93,7 @@ var (
 	projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$")
 	MhSpilt        = regexp.MustCompile("[::]")
 	//识别采购单位联系人、联系电话、代理机构联系人、联系电话
-	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公)?((电话([//]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|(项目)?经办)人)|采购方代表")
+	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公|单位)?((电话([//]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|(项目)?经办)人)|采购方代表")
 	ContactInfoMustReg  = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$")
 	ContactType         = map[string]*regexp.Regexp{
 		"采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|甲|招标(服务)?|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心地址|业主|收料人|采购部"),

+ 23 - 6
src/jy/pretreated/colonkv.go

@@ -353,12 +353,13 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	//	}
 	startIndex := 0
 	prevKey := ""
-	index, notmatchCount, allMatchCount := 0, 0, 0
+	index, tmpindex, notmatchCount, allMatchCount := 0, 0, 0, 0
 	weightMap := map[string]map[string]interface{}{}     //权重
 	mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
 	kvsTemp := make([]*Kv, len(*kvs))
 	copy(kvsTemp, *kvs)
 	//again := 0
+	ishad := false
 	for kv_index, kv := range *kvs {
 		isBreak := true
 		v := strings.TrimSpace(kv.Value)
@@ -415,6 +416,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			}
 		} else if ascFind {
 			for _, ct_k := range HasOrderContactType(k) {
+				ishad = false
 				//again++
 				if k_length < 3 || k_length > 15 {
 					isBreak = false
@@ -482,6 +484,8 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 				}
 				allMatchCount++
 				if IsMapHasValue(ct_k, indexMap) {
+					ishad = true
+					tmpindex = GetIndex(ct_k, indexMap)
 					isContinue = true
 					continue
 				}
@@ -543,11 +547,15 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		//		} else if index < 2 {
 		//			index++
 		//		}
-		if prevKey != k {
-			prevKey = k
-			index = 1
-		} else if prevKey == k {
-			index++
+		if ishad {
+			index = tmpindex
+		} else {
+			if prevKey != k {
+				prevKey = k
+				index = 1
+			} else if prevKey == k {
+				index++
+			}
 		}
 		//		if startIndex == 0 || startIndex%2 == 1 || index == 0 {
 		//			index = 1
@@ -908,3 +916,12 @@ func GetMoneyUnit(key, val string) string {
 	}
 	return ""
 }
+
+func GetIndex(ct_k string, indexMap map[int]string) int { // GetIndex returns a key of indexMap whose value equals ct_k, or 1 when no value matches.
+	for k, v := range indexMap { // NOTE(review): Go map iteration order is unspecified; if several keys hold ct_k, the key returned may differ between calls — confirm values are unique.
+		if ct_k == v {
+			return k
+		}
+	}
+	return 1 // Fallback when ct_k is not present in indexMap.
+}

+ 10 - 8
src/jy/pretreated/division.go

@@ -158,6 +158,7 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock) ([]*ut
 		if k+1 != index {
 			if k == 0 {
 				returnValue = 3
+				break
 			} else {
 				if currentIndex+1 != index {
 					//如果序号不是连续的,不往下走
@@ -177,7 +178,6 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock) ([]*ut
 			}
 			currentIndex = index
 		}
-
 		//
 		title := serialTitles[2]                         //标题
 		title = regTrimSpace.ReplaceAllString(title, "") //清除前后空格
@@ -428,6 +428,7 @@ func appendWarpStop(text string) string {
 	}
 	return text
 }
+
 //分段
 func DivideSegmentHtml(txt string) []*util.Segment {
 	//先分段
@@ -448,6 +449,7 @@ func DivideSegmentHtml(txt string) []*util.Segment {
 	}
 	return segs
 }
+
 //分段
 func DivideSegment(txt string) []*util.Segment {
 	//先分段
@@ -463,7 +465,7 @@ func DivideSegment(txt string) []*util.Segment {
 				tmpstr += fmt.Sprint(r)
 				return false
 			} else if tmpstr == fmt.Sprint(r) {
-				if r == 46 || r == 12289{
+				if r == 46 || r == 12289 {
 					tmpstr = ""
 				}
 				return false
@@ -473,7 +475,7 @@ func DivideSegment(txt string) []*util.Segment {
 				return true
 			}
 		}
-		tmpstr= ""
+		tmpstr = ""
 		return r == 10 || r == 13
 	})
 	//再去除空行
@@ -588,7 +590,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			return false, ""
 		}
 		//
-		is := regexp.MustCompile(v[0] + "[::]*").FindAllStringIndex(con, -1)
+		is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
 		for _, sv := range is {
 			appendWarpIndex = append(appendWarpIndex, sv[0])
 		}
@@ -628,13 +630,13 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			indexPkgMap[sv[0]] = v[0]
 		}
 		//key在包前面,并且在一行的开头
-		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)" + pgflag).FindAllStringSubmatchIndex(con, -1)
+		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
 		if len(keys) == 0 {
 			//key在包前面,并且key以冒号结尾
-			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		if len(keys) == 0 {
-			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		for _, key := range keys {
 			startEndMap[key[5]] = key[4]
@@ -688,7 +690,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			}
 			index := util.PackageNumberConvert(bk)
 			//去掉前缀,空格必须要加,分kv的时候要用
-			text = regexp.MustCompile(bv[0] + "[::]*").ReplaceAllString(text, "")
+			text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
 			headKey := ""
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {

+ 7 - 2
src/res/fieldscore.json

@@ -64,6 +64,11 @@
                 "describe": "以*结尾",
                 "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件|交易中心)$",
                 "score": -5
+            },
+            {
+                "describe": "包含词",
+                "regstr": "(万元|本项目)",
+                "score": -10
             }
         ],
         "length": [
@@ -103,14 +108,14 @@
                 "describe": "[gt,lte,score]",
                 "range": [
                     35,
-                    45,
+                    60,
                     1
                 ]
             },
             {
                 "describe": "[gt,∞,score]",
                 "range": [
-                    45,
+                    60,
                     -1,
                     -2
                 ]