|
@@ -1,6 +1,7 @@
|
|
|
package pretreated
|
|
|
|
|
|
import (
|
|
|
+ "fmt"
|
|
|
"jy/util"
|
|
|
qutil "qfw/util"
|
|
|
"regexp"
|
|
@@ -65,6 +66,24 @@ var (
|
|
|
*/
|
|
|
regPackageFilter = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[ \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
|
|
|
filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
|
|
|
+ xuhao = map[string]bool{
|
|
|
+ "19968_12289": true,
|
|
|
+ "19968_46": true,
|
|
|
+ "20108_12289": true,
|
|
|
+ "20108_46": true,
|
|
|
+ "19977_12289": true,
|
|
|
+ "19977_46": true,
|
|
|
+ "22235_12289": true,
|
|
|
+ "22235_46": true,
|
|
|
+ "20116_12289": true,
|
|
|
+ "20116_46": true,
|
|
|
+ "20845_12289": true,
|
|
|
+ "20845_46": true,
|
|
|
+ "19971_12289": true,
|
|
|
+ "19971_46": true,
|
|
|
+ "20843_12289": true,
|
|
|
+ "20061_46": true,
|
|
|
+ }
|
|
|
)
|
|
|
|
|
|
//分块
|
|
@@ -409,11 +428,52 @@ func appendWarpStop(text string) string {
|
|
|
}
|
|
|
return text
|
|
|
}
|
|
|
-
|
|
|
+//分段
|
|
|
+func DivideSegmentHtml(txt string) []*util.Segment {
|
|
|
+ //先分段
|
|
|
+ _segs := strings.FieldsFunc(txt, func(r rune) bool {
|
|
|
+ return r == 10 || r == 13
|
|
|
+ })
|
|
|
+ //再去除空行
|
|
|
+ segs := make([]*util.Segment, 0)
|
|
|
+ _index := 0
|
|
|
+ for _, seg := range _segs {
|
|
|
+ if seg != " " && len(seg) > 1 {
|
|
|
+ _seg := util.Segment{}
|
|
|
+ _index = _index + 1
|
|
|
+ _seg.Index = _index
|
|
|
+ _seg.Text = seg
|
|
|
+ segs = append(segs, &_seg)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return segs
|
|
|
+}
|
|
|
//分段
|
|
|
func DivideSegment(txt string) []*util.Segment {
|
|
|
//先分段
|
|
|
+ tmpstr := ""
|
|
|
_segs := strings.FieldsFunc(txt, func(r rune) bool {
|
|
|
+ if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
|
|
|
+ r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
|
|
|
+ if tmpstr == "" {
|
|
|
+ tmpstr += fmt.Sprint(r)
|
|
|
+ return false
|
|
|
+ } else if strings.Contains(tmpstr, "_") {
|
|
|
+ tmpstr = ""
|
|
|
+ tmpstr += fmt.Sprint(r)
|
|
|
+ return false
|
|
|
+ } else if tmpstr == fmt.Sprint(r) {
|
|
|
+ if r == 46 || r == 12289{
|
|
|
+ tmpstr = ""
|
|
|
+ }
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ tmpstr += "_" + fmt.Sprint(r)
|
|
|
+ if xuhao[tmpstr] {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ tmpstr= ""
|
|
|
return r == 10 || r == 13
|
|
|
})
|
|
|
//再去除空行
|
|
@@ -528,7 +588,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
|
|
|
return false, ""
|
|
|
}
|
|
|
//
|
|
|
- is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
|
|
|
+ is := regexp.MustCompile(v[0] + "[::]*").FindAllStringIndex(con, -1)
|
|
|
for _, sv := range is {
|
|
|
appendWarpIndex = append(appendWarpIndex, sv[0])
|
|
|
}
|
|
@@ -568,13 +628,13 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
|
|
|
indexPkgMap[sv[0]] = v[0]
|
|
|
}
|
|
|
//key在包前面,并且在一行的开头
|
|
|
- keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
|
|
|
+ keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)" + pgflag).FindAllStringSubmatchIndex(con, -1)
|
|
|
if len(keys) == 0 {
|
|
|
//key在包前面,并且key以冒号结尾
|
|
|
- keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
|
|
|
+ keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
|
|
|
}
|
|
|
if len(keys) == 0 {
|
|
|
- keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
|
|
|
+ keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
|
|
|
}
|
|
|
for _, key := range keys {
|
|
|
startEndMap[key[5]] = key[4]
|
|
@@ -628,7 +688,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
|
|
|
}
|
|
|
index := util.PackageNumberConvert(bk)
|
|
|
//去掉前缀,空格必须要加,分kv的时候要用
|
|
|
- text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
|
|
|
+ text = regexp.MustCompile(bv[0] + "[::]*").ReplaceAllString(text, "")
|
|
|
headKey := ""
|
|
|
if indexKeyStringMap[iv] != "" {
|
|
|
//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
|