|
@@ -48,12 +48,16 @@ var (
|
|
|
regDivision = regexp.MustCompile("[::]")
|
|
|
regSpliteSegment = regexp.MustCompile("[\r\n]")
|
|
|
regFilterNumber = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
|
|
|
- regSplit = regexp.MustCompile("和|以?及|与|、")
|
|
|
+ regSplit = regexp.MustCompile("或|和|以?及|与|、|或")
|
|
|
regStartWrap = regexp.MustCompile("^[\r\n]")
|
|
|
regEndWrap = regexp.MustCompile("[\r\n]$")
|
|
|
regMoreWrap = regexp.MustCompile("[\r\n]{2,}")
|
|
|
replSerial = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
|
|
|
moreColonReg = regexp.MustCompile("[::]+")
|
|
|
+ regFilter = regexp.MustCompile("等$")
|
|
|
+ confusion = map[string]string{
|
|
|
+ "参与": "canyu",
|
|
|
+ }
|
|
|
//查找分包之前,先对内容进行预处理
|
|
|
/*
|
|
|
第一包:采购设备清单
|
|
@@ -154,7 +158,6 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
|
|
|
}
|
|
|
//获取块中除了序号和标题的内容
|
|
|
blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
|
|
|
- var titles = []string{}
|
|
|
if title != "" {
|
|
|
blockTextTemp := regReplAllSpace.ReplaceAllString(blockText, "")
|
|
|
//特殊情况处理
|
|
@@ -173,6 +176,7 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
|
|
|
十二、开标时间:2017年3月20日9时30分
|
|
|
*/
|
|
|
blockText = title
|
|
|
+ title = ""
|
|
|
}
|
|
|
} else if blockTextTemp != "" && regDivision.MatchString(title) {
|
|
|
/*
|
|
@@ -185,34 +189,16 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
|
|
|
divisionIndexs := regDivision.FindStringIndex(title)
|
|
|
titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
|
|
|
titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
|
|
|
+ blockText = title + "\n" + blockText
|
|
|
if titleAfter != "" {
|
|
|
- titles = append(titles, titleBefore)
|
|
|
- //分段 去每一个冒号前面的key
|
|
|
- segments := regSpliteSegment.Split(blockText, -1)
|
|
|
- for _, sv := range segments {
|
|
|
- divisionIndexs = regDivision.FindStringIndex(sv)
|
|
|
- if len(divisionIndexs) == 0 {
|
|
|
- continue
|
|
|
- }
|
|
|
- titleTemp := regReplAllSpace.ReplaceAllString(sv[:divisionIndexs[0]], "")
|
|
|
- if titleTemp == "" {
|
|
|
- continue
|
|
|
- }
|
|
|
- titles = append(titles, titleTemp)
|
|
|
- }
|
|
|
- blockText = title + "\n" + blockText
|
|
|
title = ""
|
|
|
} else {
|
|
|
- blockText = title + "\n" + blockText
|
|
|
title = titleBefore
|
|
|
}
|
|
|
} else {
|
|
|
blockText = title + "\n" + blockText
|
|
|
}
|
|
|
}
|
|
|
- if len(titles) == 0 {
|
|
|
- titles = append(titles, title)
|
|
|
- }
|
|
|
//没有内容的块,不打标签,不分段
|
|
|
if blockText == "" {
|
|
|
continue
|
|
@@ -222,29 +208,29 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
|
|
|
continue
|
|
|
}
|
|
|
blockText = hasMergeKV(title, blockText)
|
|
|
- block := &util.Block{
|
|
|
- Index: index, //序号
|
|
|
- Text: blockText, //内容
|
|
|
- Title: title, //标题
|
|
|
- Start: start,
|
|
|
- End: nextStart,
|
|
|
- }
|
|
|
//
|
|
|
titleIsExists := map[string]bool{} //去重
|
|
|
- for _, tv := range titles {
|
|
|
- tv = filterTitle(tv)
|
|
|
- //分割标题 [和及]。。。
|
|
|
- splitTitles := regSplit.Split(tv, -1)
|
|
|
- for _, sv := range splitTitles {
|
|
|
- if sv == "" || titleIsExists[sv] {
|
|
|
- continue
|
|
|
- }
|
|
|
- titleIsExists[sv] = true
|
|
|
- //标题过短过长不打标签
|
|
|
- if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
|
|
|
- //打标签
|
|
|
- block.Tags = append(block.Tags, util.GetBlockTags(sv))
|
|
|
- }
|
|
|
+ title = filterTitle(title)
|
|
|
+ //分割标题 [和及]。。。 参与
|
|
|
+ splitTitles := ProcTitle(title)
|
|
|
+ block := &util.Block{
|
|
|
+ Index: index, //序号
|
|
|
+ Text: blockText, //内容
|
|
|
+ Title: title, //标题
|
|
|
+ Titles: splitTitles,
|
|
|
+ Start: start,
|
|
|
+ End: nextStart,
|
|
|
+ }
|
|
|
+
|
|
|
+ for _, sv := range splitTitles {
|
|
|
+ if sv == "" || titleIsExists[sv] {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ titleIsExists[sv] = true
|
|
|
+ //标题过短过长不打标签
|
|
|
+ if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
|
|
|
+ //打标签
|
|
|
+ block.Tags = append(block.Tags, util.GetBlockTags(sv))
|
|
|
}
|
|
|
}
|
|
|
tagsToBlocks(blocks, block)
|
|
@@ -268,18 +254,62 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
|
|
|
returnValue = 1
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+ contactFormat := &util.ContactFormat{
|
|
|
+ IndexMap: map[int]string{},
|
|
|
+ MatchMap: map[string]map[string]bool{},
|
|
|
+ }
|
|
|
for _, bl := range returnBlocks {
|
|
|
//解析kv
|
|
|
newText := TextAfterRemoveTable(bl.Text)
|
|
|
- bl.ColonKV = GetKVAll(newText, bl.Title, from)
|
|
|
- bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title)
|
|
|
+ bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from)
|
|
|
+ bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat)
|
|
|
//正则抽取的时候有时需要匹配换行或者句号,这里在解析完kv之后,在块结尾添加换行和句号
|
|
|
bl.Text = appendWarpStop(bl.Text)
|
|
|
}
|
|
|
return returnBlocks, returnValue
|
|
|
}
|
|
|
|
|
|
+//块标题处理
|
|
|
+func ProcTitle(title string) []string {
|
|
|
+ if title == "" {
|
|
|
+ return []string{}
|
|
|
+ }
|
|
|
+ for k, v := range confusion {
|
|
|
+ title = strings.Replace(title, k, v, -1)
|
|
|
+ }
|
|
|
+ direct := 1
|
|
|
+ prev := ""
|
|
|
+ ara := regSplit.Split(title, -1)
|
|
|
+ for kk, vv := range ara {
|
|
|
+ for kkk, vvv := range confusion {
|
|
|
+ vv = strings.Replace(vv, vvv, kkk, -1)
|
|
|
+ }
|
|
|
+ ara[kk] = vv
|
|
|
+ if len([]rune(vv)) == 2 {
|
|
|
+ if kk == 0 {
|
|
|
+ direct = -1
|
|
|
+ } else {
|
|
|
+ start := ""
|
|
|
+ if len([]rune(prev)) > 3 {
|
|
|
+ start = string([]rune(prev)[:len([]rune(prev))-2])
|
|
|
+ }
|
|
|
+ ara[kk] = start + vv
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if len([]rune(vv)) > 3 {
|
|
|
+ if direct == -1 {
|
|
|
+ end := string([]rune(vv)[len([]rune(vv))-2:])
|
|
|
+ for i := 0; i < kk; i++ {
|
|
|
+ ara[i] = ara[i] + end
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ prev = vv
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return ara
|
|
|
+}
|
|
|
+
|
|
|
//有合并kv的 例如项目名称及编号
|
|
|
func hasMergeKV(title, text string) string {
|
|
|
title = regDivision.ReplaceAllString(title, "")
|
|
@@ -413,6 +443,12 @@ func tagsToBlocks(blocks []*util.Block, block *util.Block) {
|
|
|
}
|
|
|
|
|
|
func filterTitle(title string) string {
|
|
|
+ if strings.Contains(title, ",") && strings.Contains(title, "。") {
|
|
|
+ return ""
|
|
|
+ }
|
|
|
+ if len([]rune(title)) > 30 {
|
|
|
+ return ""
|
|
|
+ }
|
|
|
//清理空格
|
|
|
title = regReplAllSpace.ReplaceAllString(title, "")
|
|
|
//清理成对出现的符号中的内容
|
|
@@ -421,6 +457,7 @@ func filterTitle(title string) string {
|
|
|
title = regReplAllSymbol.ReplaceAllString(title, "")
|
|
|
//清理序号
|
|
|
title = regFilterNumber.ReplaceAllString(title, "")
|
|
|
+ title = regFilter.ReplaceAllString(title, "")
|
|
|
return title
|
|
|
}
|
|
|
|
|
@@ -438,8 +475,8 @@ func FindPackageFromBlocks(blocks *[]*util.Block, title string) (blockPackage ma
|
|
|
//把分包内容摘除掉有问题 有的项目名称中包含二标段
|
|
|
if ok && false {
|
|
|
v.Text = surplusText
|
|
|
- v.ColonKV = GetKVAll(surplusText, v.Title, 1)
|
|
|
- v.SpaceKV = SspacekvEntity.Entrance(surplusText, v.Title)
|
|
|
+ v.ColonKV = GetKVAll(surplusText, v.Title, nil, 1)
|
|
|
+ v.SpaceKV = SspacekvEntity.Entrance(surplusText, v.Title, nil)
|
|
|
}
|
|
|
}
|
|
|
return
|
|
@@ -588,9 +625,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
|
|
|
//合并文本
|
|
|
(*blockPackage)[index].Text += "\n" + text
|
|
|
//合并冒号kv
|
|
|
- colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", 1)
|
|
|
+ colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1)
|
|
|
if headKey != "" {
|
|
|
- kvAgain := GetKVAll(text, "", 4)
|
|
|
+ kvAgain := GetKVAll(text, "", nil, 4)
|
|
|
for kv_k, kv_v := range kvAgain.Kv {
|
|
|
if colonJobKv.Kv[kv_k] == "" {
|
|
|
colonJobKv.Kv[kv_k] = kv_v
|
|
@@ -608,7 +645,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
|
|
|
(*blockPackage)[index].ColonKV.Kv[kv_k] = kv_v
|
|
|
}
|
|
|
//合并空格kv
|
|
|
- spaceJobKv := SspacekvEntity.Entrance(text, "")
|
|
|
+ spaceJobKv := SspacekvEntity.Entrance(text, "", nil)
|
|
|
for kv_k, kv_v := range spaceJobKv.Kv {
|
|
|
if kv_v == "" {
|
|
|
continue
|
|
@@ -626,9 +663,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
|
|
|
Type: bv[1],
|
|
|
Accuracy: accuracy,
|
|
|
}
|
|
|
- finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", 4)
|
|
|
+ finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4)
|
|
|
if headKey != "" {
|
|
|
- kvAgain := GetKVAll(text, "", 4)
|
|
|
+ kvAgain := GetKVAll(text, "", nil, 4)
|
|
|
for kv_k, kv_v := range kvAgain.Kv {
|
|
|
if finalKv.Kv[kv_k] == "" {
|
|
|
finalKv.Kv[kv_k] = kv_v
|
|
@@ -637,7 +674,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
|
|
|
}
|
|
|
}
|
|
|
newBpkg.ColonKV = finalKv
|
|
|
- newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "")
|
|
|
+ newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil)
|
|
|
(*blockPackage)[index] = newBpkg
|
|
|
}
|
|
|
}
|