|
@@ -3,11 +3,13 @@ package pretreated
|
|
|
import (
|
|
|
"fmt"
|
|
|
"jy/util"
|
|
|
+
|
|
|
qutil "qfw/util"
|
|
|
"regexp"
|
|
|
"sort"
|
|
|
"strconv"
|
|
|
"strings"
|
|
|
+ "unicode/utf8"
|
|
|
)
|
|
|
|
|
|
//分块、分段功能
|
|
@@ -53,6 +55,7 @@ var (
|
|
|
regStartWrap = regexp.MustCompile("^[\r\n]")
|
|
|
regEndWrap = regexp.MustCompile("[\r\n]$")
|
|
|
regMoreWrap = regexp.MustCompile("[\r\n]{2,}")
|
|
|
+ regStrWrap = regexp.MustCompile("分包名称[::]")
|
|
|
replSerial = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
|
|
|
moreColonReg = regexp.MustCompile("[::]+")
|
|
|
regFilter = regexp.MustCompile("等$")
|
|
@@ -578,6 +581,7 @@ func FindPackageFromText(title string, content string,isSite bool,codeSite strin
|
|
|
//分块之后分包
|
|
|
func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool,isSite bool,codeSite string) (bool, string) {
|
|
|
//查找是否有分包
|
|
|
+ content = regStrWrap.ReplaceAllString(content, "\n")
|
|
|
content = regMoreWrap.ReplaceAllString(content, "\n")
|
|
|
content = regEndWrap.ReplaceAllString(content, "")
|
|
|
con, pkg, flag := CheckMultiPackage(content, title)
|
|
@@ -781,12 +785,28 @@ func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[str
|
|
|
} else {
|
|
|
text = con[iv:]
|
|
|
}
|
|
|
+ tmptext := text
|
|
|
if strings.Contains(text, "、") {
|
|
|
text = strings.Split(text, "、")[0]
|
|
|
} else if strings.Contains(text, "\n") {
|
|
|
- text = strings.Split(text, "\n")[0]
|
|
|
+ texts := strings.Split(text, "\n")
|
|
|
+ text2 :=""
|
|
|
+ if ik+1 < len(indexs)-1 {
|
|
|
+ if startEndMap[indexs[ik+1+1]] != 0 {
|
|
|
+ text2 = con[startEndMap[indexs[ik+1]]:startEndMap[indexs[ik+1+1]]]
|
|
|
+ } else {
|
|
|
+ text2 = con[indexs[ik+1]:indexs[ik+1+1]]
|
|
|
+ }
|
|
|
+ if texts[len(texts)-1] == text2{
|
|
|
+ text = texts[0]
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if utf8.RuneCountInString(text)<5{
|
|
|
+ indexTextMap[iv] = tmptext
|
|
|
+ }else {
|
|
|
+ indexTextMap[iv] = text
|
|
|
}
|
|
|
- indexTextMap[iv] = text
|
|
|
warpCount := len(regSpliteSegment.FindAllStringIndex(text, -1))
|
|
|
if warpCount > maxWarpCount {
|
|
|
maxWarpCount = warpCount
|