wcj 6 ani în urmă
părinte
comite
33aa9995d3

+ 9 - 16
src/jy/pretreated/analystep.go

@@ -22,7 +22,7 @@ func AnalyStart(job *util.Job) {
 	tabs, ration := ComputeConRatio(con, 1)
 	if len(tabs) > 0 {
 		newcon, newtabs, newration := FindBigText(con, ration, tabs)
-		if newcon != "" && newration == 0 {
+		if newcon != "" {
 			con = newcon
 			tabs = newtabs
 			ration = newration
@@ -34,16 +34,7 @@ func AnalyStart(job *util.Job) {
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title)
 		for _, bl := range blockArrays {
 			if len([]rune(bl.Text)) > 80 {
-				ba1, _ := DivideBlock(bl.Text, 1, job.RuleBlock)
-				if len(ba1) > 0 {
-					t := ""
-					for _, t1 := range ba1 {
-						t += t1.Text
-					}
-					bl.Text = t
-					bl.ColonKV = GetKVAll(t, bl.Title, 1)
-					bl.SpaceKV = SspacekvEntity.Entrance(t, bl.Title)
-				}
+				bl.Block, _ = DivideBlock(bl.Text, 1, job.RuleBlock)
 			}
 			//块中再查找表格(块,处理完把值赋到块)
 			t1, _ := ComputeConRatio(bl.Text, 2)
@@ -77,8 +68,8 @@ func AnalyStart(job *util.Job) {
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 		}
 		//调用kv解析
-		bl.ColonKV = GetKVAll(newCon, "", 1)
-		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "")
+		bl.ColonKV = GetKVAll(newCon, "", nil, 1)
+		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
 		job.Block = append(job.Block, bl)
 	}
 }
@@ -268,7 +259,7 @@ func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) str
 			content = regEndWrap.ReplaceAllString(content, "")
 			doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
 			doc.Find("table").Eq(0).ReplaceWithHtml(content)
-			con, _ = doc.Html()
+			con, _ = doc.Find("body").Html()
 		}
 	}
 	return con
@@ -284,9 +275,11 @@ func FindBigText(con string, r float32, t []*goquery.Selection) (content string,
 		if content != "" {
 			tabs, ration = ComputeConRatio(content, 1)
 			if len(tabs) > 0 {
-				content = tableDivideBlock(content, ration, tabs)
-				if content == "" {
+				con := tableDivideBlock(content, ration, tabs)
+				if con == "" {
 					return
+				} else {
+					content = con
 				}
 			} else {
 				doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))

+ 4 - 4
src/jy/pretreated/analytable.go

@@ -1251,7 +1251,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
 				tr.TDs[0].BH = false
 				tr.TDs[0].KVDirect = 0
 				sv := FindKv(tr.TDs[0].Val, "", 2)
-				_, resm := colonkvEntity.entrance(tr.TDs[0].Val, "", 2)
+				_, resm := colonkvEntity.entrance(tr.TDs[0].Val, "", nil, 2)
 				for k, v := range resm {
 					sv.AddKey(k, v)
 				}
@@ -1376,7 +1376,7 @@ func (table *Table) FindKV() {
 			for n, r := range r1 {
 				if len([]rune(r)) < 60 { // 长度小于60才去分
 					//res1, _ := GetKVAll(r, "", nil)
-					res1, _ := colonkvEntity.entrance(r, "", 2)
+					res1, _ := colonkvEntity.entrance(r, "", nil, 2)
 					if res1 != nil {
 						nmap[n] = res1
 						nmapkeys = append(nmapkeys, n)
@@ -2022,8 +2022,8 @@ func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
 					//if !(len(k1tags) > 0 && k1tags[0].Value == "采购单位") {
 					//	tn.SortKV.RemoveKey(k1)
 					//}
-					for _,vcgdw:=range k1tags{
-						if vcgdw.Value =="采购单位"{
+					for _, vcgdw := range k1tags {
+						if vcgdw.Value == "采购单位" {
 							tn.SortKV.RemoveKey(k1)
 						}
 					}

+ 7 - 6
src/jy/pretreated/colonkv.go

@@ -63,8 +63,11 @@ func (ce *ColonkvEntity) divisionMoreKV(con string) string {
 }
 
 //获取冒号kv入口
-func (ce *ColonkvEntity) entrance(con, title string, from int) ([]*Kv, map[string]string) {
+func (ce *ColonkvEntity) entrance(con, title string, contactFormat *ContactFormat, from int) ([]*Kv, map[string]string) {
 	kvs := ce.GetKvs(con, title, from)
+	if from == 1 {
+		FormatContactKv(&kvs, title, nil, contactFormat)
+	}
 	kv := map[string]string{}
 	for _, v := range kvs {
 		if strings.TrimSpace(v.Value) == "" {
@@ -238,8 +241,7 @@ func IsContactKvHandle(value string, m map[string]bool) bool {
 
 //kv关于联系人信息的处理
 //采购人>集中采购机构
-/*
-func FormatContactKv(kvs *[]*Kv, title string, buyers []string) {
+func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *ContactFormat) {
 	////////////////////////////
 	//处理联系人信息
 	var indexMap map[int]string
@@ -565,7 +567,6 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string) {
 	//	}
 	//Debug("totalIndexMap", len(totalIndexMap))
 }
-*/
 func ContactTypeTitleMatch(title string) string {
 	matchType := ""
 	if title != "" && len([]rune(title)) < 15 {
@@ -614,9 +615,9 @@ func HasOrderContactType(text string) []string {
 
 //两种冒号kv结合到一起
 //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包
-func GetKVAll(content, title string, from int) *JobKv {
+func GetKVAll(content, title string, contactFormat *ContactFormat, from int) *JobKv {
 	content = formatText(content, "kv")
-	m1Kvs, _ := colonkvEntity.entrance(content, title, from)
+	m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from)
 	m1, m1Weight := KvTagsToKV(m1Kvs, title, nil, from)
 	if m1 == nil {
 		m1 = map[string]string{}

+ 90 - 53
src/jy/pretreated/division.go

@@ -48,12 +48,16 @@ var (
 	regDivision        = regexp.MustCompile("[::]")
 	regSpliteSegment   = regexp.MustCompile("[\r\n]")
 	regFilterNumber    = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
-	regSplit           = regexp.MustCompile("和|以?及|与|、")
+	regSplit           = regexp.MustCompile("或|和|以?及|与|、|或")
 	regStartWrap       = regexp.MustCompile("^[\r\n]")
 	regEndWrap         = regexp.MustCompile("[\r\n]$")
 	regMoreWrap        = regexp.MustCompile("[\r\n]{2,}")
 	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
 	moreColonReg       = regexp.MustCompile("[::]+")
+	regFilter          = regexp.MustCompile("等$")
+	confusion          = map[string]string{
+		"参与": "canyu",
+	}
 	//查找分包之前,先对内容进行预处理
 	/*
 		第一包:采购设备清单
@@ -154,7 +158,6 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 		}
 		//获取块中除了序号和标题的内容
 		blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
-		var titles = []string{}
 		if title != "" {
 			blockTextTemp := regReplAllSpace.ReplaceAllString(blockText, "")
 			//特殊情况处理
@@ -173,6 +176,7 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 						十二、开标时间:2017年3月20日9时30分
 					*/
 					blockText = title
+					title = ""
 				}
 			} else if blockTextTemp != "" && regDivision.MatchString(title) {
 				/*
@@ -185,34 +189,16 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 				divisionIndexs := regDivision.FindStringIndex(title)
 				titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
 				titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
+				blockText = title + "\n" + blockText
 				if titleAfter != "" {
-					titles = append(titles, titleBefore)
-					//分段 去每一个冒号前面的key
-					segments := regSpliteSegment.Split(blockText, -1)
-					for _, sv := range segments {
-						divisionIndexs = regDivision.FindStringIndex(sv)
-						if len(divisionIndexs) == 0 {
-							continue
-						}
-						titleTemp := regReplAllSpace.ReplaceAllString(sv[:divisionIndexs[0]], "")
-						if titleTemp == "" {
-							continue
-						}
-						titles = append(titles, titleTemp)
-					}
-					blockText = title + "\n" + blockText
 					title = ""
 				} else {
-					blockText = title + "\n" + blockText
 					title = titleBefore
 				}
 			} else {
 				blockText = title + "\n" + blockText
 			}
 		}
-		if len(titles) == 0 {
-			titles = append(titles, title)
-		}
 		//没有内容的块,不打标签,不分段
 		if blockText == "" {
 			continue
@@ -222,29 +208,29 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 			continue
 		}
 		blockText = hasMergeKV(title, blockText)
-		block := &util.Block{
-			Index: index,     //序号
-			Text:  blockText, //内容
-			Title: title,     //标题
-			Start: start,
-			End:   nextStart,
-		}
 		//
 		titleIsExists := map[string]bool{} //去重
-		for _, tv := range titles {
-			tv = filterTitle(tv)
-			//分割标题 [和及]。。。
-			splitTitles := regSplit.Split(tv, -1)
-			for _, sv := range splitTitles {
-				if sv == "" || titleIsExists[sv] {
-					continue
-				}
-				titleIsExists[sv] = true
-				//标题过短过长不打标签
-				if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
-					//打标签
-					block.Tags = append(block.Tags, util.GetBlockTags(sv))
-				}
+		title = filterTitle(title)
+		//分割标题 [和及]。。。 参与
+		splitTitles := ProcTitle(title)
+		block := &util.Block{
+			Index:  index,     //序号
+			Text:   blockText, //内容
+			Title:  title,     //标题
+			Titles: splitTitles,
+			Start:  start,
+			End:    nextStart,
+		}
+
+		for _, sv := range splitTitles {
+			if sv == "" || titleIsExists[sv] {
+				continue
+			}
+			titleIsExists[sv] = true
+			//标题过短过长不打标签
+			if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
+				//打标签
+				block.Tags = append(block.Tags, util.GetBlockTags(sv))
 			}
 		}
 		tagsToBlocks(blocks, block)
@@ -268,18 +254,62 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 			returnValue = 1
 		}
 	}
-
+	contactFormat := &util.ContactFormat{
+		IndexMap: map[int]string{},
+		MatchMap: map[string]map[string]bool{},
+	}
 	for _, bl := range returnBlocks {
 		//解析kv
 		newText := TextAfterRemoveTable(bl.Text)
-		bl.ColonKV = GetKVAll(newText, bl.Title, from)
-		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title)
+		bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from)
+		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat)
 		//正则抽取的时候有时需要匹配换行或者句号,这里在解析完kv之后,在块结尾添加换行和句号
 		bl.Text = appendWarpStop(bl.Text)
 	}
 	return returnBlocks, returnValue
 }
 
+//ProcTitle splits a block title on the regSplit connectives and returns the sub-titles. Words listed in confusion are shielded from the split and restored afterwards; a two-rune piece is completed from a neighbouring long piece.
+func ProcTitle(title string) []string {
+	if title == "" {
+		return []string{}
+	}
+	for k, v := range confusion {
+		title = strings.Replace(title, k, v, -1)
+	}
+	ara := regSplit.Split(title, -1)
+	//Restore the shielded words in every piece up front, so the early break below cannot leave a placeholder in the result.
+	for kk := range ara {
+		for k, v := range confusion {
+			ara[kk] = strings.Replace(ara[kk], v, k, -1)
+		}
+	}
+	direct, prev := 1, ""
+	for kk, vv := range ara {
+		runes := []rune(vv)
+		if len(runes) == 2 {
+			if kk == 0 {
+				direct = -1
+			} else if p := []rune(prev); len(p) > 3 {
+				//Prepend the previous long piece minus its last two runes.
+				ara[kk] = string(p[:len(p)-2]) + vv
+			}
+		}
+		if len(runes) > 3 {
+			if direct == -1 {
+				//The title opened with a two-rune piece: append this piece's last two runes to every earlier piece, then stop.
+				end := string(runes[len(runes)-2:])
+				for i := 0; i < kk; i++ {
+					ara[i] = ara[i] + end
+				}
+				break
+			}
+			prev = vv
+		}
+	}
+	return ara
+}
+
 //有合并kv的 例如项目名称及编号
 func hasMergeKV(title, text string) string {
 	title = regDivision.ReplaceAllString(title, "")
@@ -413,6 +443,12 @@ func tagsToBlocks(blocks []*util.Block, block *util.Block) {
 }
 
 func filterTitle(title string) string {
+	if strings.Contains(title, ",") && strings.Contains(title, "。") {
+		return ""
+	}
+	if len([]rune(title)) > 30 {
+		return ""
+	}
 	//清理空格
 	title = regReplAllSpace.ReplaceAllString(title, "")
 	//清理成对出现的符号中的内容
@@ -421,6 +457,7 @@ func filterTitle(title string) string {
 	title = regReplAllSymbol.ReplaceAllString(title, "")
 	//清理序号
 	title = regFilterNumber.ReplaceAllString(title, "")
+	title = regFilter.ReplaceAllString(title, "")
 	return title
 }
 
@@ -438,8 +475,8 @@ func FindPackageFromBlocks(blocks *[]*util.Block, title string) (blockPackage ma
 		//把分包内容摘除掉有问题 有的项目名称中包含二标段
 		if ok && false {
 			v.Text = surplusText
-			v.ColonKV = GetKVAll(surplusText, v.Title, 1)
-			v.SpaceKV = SspacekvEntity.Entrance(surplusText, v.Title)
+			v.ColonKV = GetKVAll(surplusText, v.Title, nil, 1)
+			v.SpaceKV = SspacekvEntity.Entrance(surplusText, v.Title, nil)
 		}
 	}
 	return
@@ -588,9 +625,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 				//合并文本
 				(*blockPackage)[index].Text += "\n" + text
 				//合并冒号kv
-				colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", 1)
+				colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1)
 				if headKey != "" {
-					kvAgain := GetKVAll(text, "", 4)
+					kvAgain := GetKVAll(text, "", nil, 4)
 					for kv_k, kv_v := range kvAgain.Kv {
 						if colonJobKv.Kv[kv_k] == "" {
 							colonJobKv.Kv[kv_k] = kv_v
@@ -608,7 +645,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 					(*blockPackage)[index].ColonKV.Kv[kv_k] = kv_v
 				}
 				//合并空格kv
-				spaceJobKv := SspacekvEntity.Entrance(text, "")
+				spaceJobKv := SspacekvEntity.Entrance(text, "", nil)
 				for kv_k, kv_v := range spaceJobKv.Kv {
 					if kv_v == "" {
 						continue
@@ -626,9 +663,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 					Type:     bv[1],
 					Accuracy: accuracy,
 				}
-				finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", 4)
+				finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4)
 				if headKey != "" {
-					kvAgain := GetKVAll(text, "", 4)
+					kvAgain := GetKVAll(text, "", nil, 4)
 					for kv_k, kv_v := range kvAgain.Kv {
 						if finalKv.Kv[kv_k] == "" {
 							finalKv.Kv[kv_k] = kv_v
@@ -637,7 +674,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 					}
 				}
 				newBpkg.ColonKV = finalKv
-				newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "")
+				newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil)
 				(*blockPackage)[index] = newBpkg
 			}
 		}

+ 2 - 2
src/jy/pretreated/spacekv.go

@@ -16,7 +16,7 @@ var (
 	excludeSpaceKey = regexp.MustCompile("[.、�\\[【{{〔<《\\]】}}〕>》]")
 )
 
-func (se *SpacekvEntity) Entrance(text, title string) *util.JobKv {
+func (se *SpacekvEntity) Entrance(text, title string, contactFormat *util.ContactFormat) *util.JobKv {
 	lines := se.getLines(text)
 	kvMaps := []*util.Kv{}
 	for _, line := range lines {
@@ -26,7 +26,7 @@ func (se *SpacekvEntity) Entrance(text, title string) *util.JobKv {
 		}
 		kvMaps = append(kvMaps, kvMap...)
 	}
-	//FormatContactKv(&kvMaps, title, nil, contactFormat)
+	FormatContactKv(&kvMaps, title, nil, contactFormat)
 	kv, tagKv := KvTagsToKV(kvMaps, title, nil, 1)
 	return &util.JobKv{
 		Kvs:   kvMaps,

+ 3 - 3
src/jy/pretreated/tablev2.go

@@ -214,7 +214,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	ub := []*u.Block{}
 	if lentxt > 50 { //看是否划块
 		//u.Debug(txt)
-		ub, _ = DivideBlock(txt, 2, nil)
+		ub, _ = DivideBlock(txt, 2, table.TableResult.RuleBlock)
 		if len(ub) > 0 {
 			colonKvWeight := map[string]int{}
 			spaceKvWeight := map[string]int{}
@@ -294,7 +294,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		if len(td.TR.TDs) > 0 {
 			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
 		}
-		_, resm := colonkvEntity.entrance(text, kvTitle, 2)
+		_, resm := colonkvEntity.entrance(text, kvTitle, nil, 2)
 		for k, v := range resm {
 			td.SortKV.AddKey(k, v)
 		}
@@ -339,7 +339,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		if len(td.TR.TDs) > 0 {
 			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
 		}
-		_, resm := colonkvEntity.entrance(text, kvTitle, 2)
+		_, resm := colonkvEntity.entrance(text, kvTitle, nil, 2)
 		for k, v := range resm {
 			td.SortKV.AddKey(k, v)
 		}

+ 26 - 24
src/jy/util/article.go

@@ -6,30 +6,30 @@ import (
 
 //
 type Job struct { //one article being processed together with its extraction state, per the field comments below
-	SourceMid    string                            //MongoId of the data source
-	Category     string                            //category
-	CategorySecond string							//second-level category
-	Content      string                            //body text
-	Title        string                            //title
-	SpiderCode   string                            //spider (crawler) code
-	Domain       string                            //website domain
-	Href         string                            //link to the original article
-	City         string                            //city
-	Province     string                            //province
-	Data         *map[string]interface{}           //source record from the database
-	Block        []*Block                          //blocks
-	Result       map[string][]*ExtField            //results
-	BuyerAddr    string                            //address of the purchasing unit
-	BlockPackage map[string]*BlockPackage          //packages found inside blocks
-	Winnerorder  []map[string]interface{}          //ranking of winning-bid candidates
-	PackageInfo  map[string]map[string]interface{} //package information
-	RuleBlock    *RuleBlock                        //block-division rules
-	BrandData    [][]map[string]string             //
-	HasTable     int                               //has a table
-	HasKey       int                               //whether a title inside a table was matched
-	HasBrand     int                               //has brand
-	HasGoods     int                               //has goods
-	IsFile       bool                              //has attachments
+	SourceMid      string                            //MongoId of the data source
+	Category       string                            //category
+	CategorySecond string                            //second-level category
+	Content        string                            //body text
+	Title          string                            //title
+	SpiderCode     string                            //spider (crawler) code
+	Domain         string                            //website domain
+	Href           string                            //link to the original article
+	City           string                            //city
+	Province       string                            //province
+	Data           *map[string]interface{}           //source record from the database
+	Block          []*Block                          //blocks
+	Result         map[string][]*ExtField            //results
+	BuyerAddr      string                            //address of the purchasing unit
+	BlockPackage   map[string]*BlockPackage          //packages found inside blocks
+	Winnerorder    []map[string]interface{}          //ranking of winning-bid candidates
+	PackageInfo    map[string]map[string]interface{} //package information
+	RuleBlock      *RuleBlock                        //block-division rules
+	BrandData      [][]map[string]string             //
+	HasTable       int                               //has a table
+	HasKey         int                               //whether a title inside a table was matched
+	HasBrand       int                               //has brand
+	HasGoods       int                               //has goods
+	IsFile         bool                              //has attachments
 }
 
 type ExtField struct {
@@ -53,6 +53,7 @@ type RuleBlock struct {
 type Block struct {
 	Tags     []Tags          //对块做的标签,可以作为数据抽取的依据
 	Title    string          //块标题
+	Titles   []string        //拆分以后多个块标题
 	Index    int             //块索引
 	Text     string          //块内容
 	Start    int             //开始索引
@@ -63,6 +64,7 @@ type Block struct {
 	BPackage *BlockPackage   //分包信息
 	Tag      map[string]bool //块标签
 	Block    []*Block        //子块
+	Category string          //块分类
 }
 
 //段落

+ 5 - 5
src/res/formattext.json

@@ -20,11 +20,6 @@
             "separator": " ",
             "desc": "替换掉无效的kv"
         },
-        {
-            "reg": "[^\\n::]{2,18}[::]\\s*详见[^,。,.::\\s]{2,18}",
-            "separator": "",
-            "desc": "替换掉无效的kv"
-        },
         {
             "reg": "(\\d+[,,.]+)+\\d+((百|千)?元|(百|千)?(万|亿)元?)",
             "separator": "[,,]__",
@@ -182,6 +177,11 @@
             "reg": "\n[\\d.\u3000\u2003\u00a0\\s]*(联系人)及(电话)[::](.+?)[\u3000\u2003\u00a0\\s]+(.+)",
             "separator": "\n$1:$3\n$2:$4",
             "desc": ""
+        },
+        {
+            "reg": "[^\\n::]{2,18}[::]\\s*详见[^,。,.::\\s]{2,18}",
+            "separator": "",
+            "desc": "替换掉无效的kv"
         }
     ]
 }