|
@@ -11,15 +11,15 @@ import (
|
|
|
|
|
|
//分块、分段功能
|
|
//分块、分段功能
|
|
var (
|
|
var (
|
|
- regSerialTitles = []string{
|
|
|
|
|
|
+ /*regSerialTitles = []string{
|
|
"([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)",
|
|
"([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)",
|
|
"[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)",
|
|
"[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)",
|
|
"(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)",
|
|
"(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)",
|
|
"(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)",
|
|
"(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)",
|
|
"(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
|
|
"(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
|
|
"1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)",
|
|
"1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)",
|
|
- }
|
|
|
|
- regSerialTitles_1 = []*regexp.Regexp{
|
|
|
|
|
|
+ }*/
|
|
|
|
+ /*regSerialTitles_1 = []*regexp.Regexp{
|
|
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)"),
|
|
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)"),
|
|
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)"),
|
|
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)"),
|
|
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
|
|
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
|
|
@@ -36,7 +36,7 @@ var (
|
|
regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
|
|
regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
|
|
regexp.MustCompile("^1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)$"),
|
|
regexp.MustCompile("^1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)$"),
|
|
regexp.MustCompile("^[(](\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)$"),
|
|
regexp.MustCompile("^[(](\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)$"),
|
|
- }
|
|
|
|
|
|
+ }*/
|
|
regReplAllTd = regexp.MustCompile("(?smi)<td.*?>.+?</td>")
|
|
regReplAllTd = regexp.MustCompile("(?smi)<td.*?>.+?</td>")
|
|
regIsNumber = regexp.MustCompile("^\\d+$")
|
|
regIsNumber = regexp.MustCompile("^\\d+$")
|
|
regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
|
|
regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
|
|
@@ -64,7 +64,7 @@ var (
|
|
)
|
|
)
|
|
|
|
|
|
//分块
|
|
//分块
|
|
-func DivideBlock(content string, from int) ([]*util.Block, int) {
|
|
|
|
|
|
+func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.Block, int) {
|
|
defer qutil.Catch()
|
|
defer qutil.Catch()
|
|
returnValue := 0
|
|
returnValue := 0
|
|
var blocks []*util.Block
|
|
var blocks []*util.Block
|
|
@@ -75,7 +75,7 @@ func DivideBlock(content string, from int) ([]*util.Block, int) {
|
|
//contentTemp := regReplAllTd.ReplaceAllString(content, "")
|
|
//contentTemp := regReplAllTd.ReplaceAllString(content, "")
|
|
contentTemp := TextAfterRemoveTable(content)
|
|
contentTemp := TextAfterRemoveTable(content)
|
|
tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
|
|
tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
|
|
- regContenSerialTitle, regSerialTitleIndex := getSerialType(contentTemp)
|
|
|
|
|
|
+ regContenSerialTitle, regSerialTitleIndex := getSerialType(contentTemp, ruleBlock.BlockRegs)
|
|
//没有分块
|
|
//没有分块
|
|
if regSerialTitleIndex == -1 {
|
|
if regSerialTitleIndex == -1 {
|
|
if len(contentTemp) == len(content) {
|
|
if len(contentTemp) == len(content) {
|
|
@@ -86,7 +86,7 @@ func DivideBlock(content string, from int) ([]*util.Block, int) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//匹配序号和标题
|
|
//匹配序号和标题
|
|
- regSerialTitle := regSerialTitles_2[regSerialTitleIndex]
|
|
|
|
|
|
+ regSerialTitle := ruleBlock.TitleRegs[regSerialTitleIndex]
|
|
indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
|
|
indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
|
|
indexs = filterSerial(content, indexs, tdIndexs)
|
|
indexs = filterSerial(content, indexs, tdIndexs)
|
|
//头块
|
|
//头块
|
|
@@ -330,11 +330,11 @@ func filterSerial(content string, indexs, tdIndexs [][]int) [][]int {
|
|
}
|
|
}
|
|
|
|
|
|
//获取正文所用的序号类型
|
|
//获取正文所用的序号类型
|
|
-func getSerialType(content string) (*regexp.Regexp, int) {
|
|
|
|
|
|
+func getSerialType(content string, blockRegs []*regexp.Regexp) (*regexp.Regexp, int) {
|
|
var regContenSerialTitle *regexp.Regexp
|
|
var regContenSerialTitle *regexp.Regexp
|
|
//先判断文章最外层使用的是哪种序号
|
|
//先判断文章最外层使用的是哪种序号
|
|
contentStartIndex, regSerialTitleIndex := -1, -1
|
|
contentStartIndex, regSerialTitleIndex := -1, -1
|
|
- for k, v := range regSerialTitles_1 {
|
|
|
|
|
|
+ for k, v := range blockRegs {
|
|
indexs := v.FindStringIndex(content)
|
|
indexs := v.FindStringIndex(content)
|
|
//只用最外层的序号,里面的过滤掉
|
|
//只用最外层的序号,里面的过滤掉
|
|
if len(indexs) == 2 && !regSpliteSegment.MatchString(strings.TrimSpace(content[indexs[0]:indexs[1]])) && (contentStartIndex == -1 || indexs[0] < contentStartIndex) {
|
|
if len(indexs) == 2 && !regSpliteSegment.MatchString(strings.TrimSpace(content[indexs[0]:indexs[1]])) && (contentStartIndex == -1 || indexs[0] < contentStartIndex) {
|
|
@@ -753,8 +753,8 @@ func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[str
|
|
}
|
|
}
|
|
|
|
|
|
//分块之后的kv
|
|
//分块之后的kv
|
|
-func kvAfterDivideBlock(text string, from int) []*util.Kv {
|
|
|
|
- blocks, _ := DivideBlock(text, from)
|
|
|
|
|
|
+func kvAfterDivideBlock(text string, from int, ruleBlock *util.RuleBlock) []*util.Kv {
|
|
|
|
+ blocks, _ := DivideBlock(text, from, ruleBlock)
|
|
kvs := []*util.Kv{}
|
|
kvs := []*util.Kv{}
|
|
for _, v := range blocks {
|
|
for _, v := range blocks {
|
|
//util.Debug(v.Text)
|
|
//util.Debug(v.Text)
|