// data_processing / jy-data-extract

							package pretreated

import (
	"jy/util"
	qutil "qfw/util"
	"regexp"
	"sort"
	"strconv"
	"strings"
)

// Block and segment splitting: shared regular expressions and lookup tables.
var (
	// Earlier form of the serial-title patterns, kept commented out for reference.
	/*regSerialTitles = []string{
		"([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、．.:：，](.*)",
		"[（(]([一二三四五六七八九十]+)[)）][\u3000\u2003\u00a0\\s]*[、．.:：]?(.*)",
		"(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)",
		"(\\d+)[\u3000\u2003\u00a0\\s]*[.．]([^\\d][^\r\n]+)",
		"(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
		"1[.．](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d.．][^\r\n]+)",
	}*/
	// regSerialTitles_1 locates "serial number + title" headings inside a whole
	// document: each pattern must start at the beginning of the text or right
	// after a line break. DivideBlock uses it (via getSerialType) when the rule
	// block supplies no BlockRegs.
	regSerialTitles_1 = []*regexp.Regexp{
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、．.:：，](.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[（(]([一二三四五六七八九十]+)[)）][\u3000\u2003\u00a0\\s]*[、．.:：]?(.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*[.．]([^\\d][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)1[.．](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d.．][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s（]*|^[\u3000\u2003\u00a0\\s（]*)(\\d+)[\u3000\u2003\u00a0\\s）]+([^\r\n]+)"),
	}
	// regSerialTitles_2 holds the anchored counterparts of regSerialTitles_1, in
	// the same order. DivideBlock indexes it with the position chosen from
	// regSerialTitles_1 and reads group 1 as the serial and group 2 as the title.
	regSerialTitles_2 = []*regexp.Regexp{
		regexp.MustCompile("^([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、．.:：，](.*)$"),
		regexp.MustCompile("^[（(]([一二三四五六七八九十]+)[)）][\u3000\u2003\u00a0\\s]*[、．.:：]?(.*)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*[.．]([^\\d][^\r\n]+)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
		regexp.MustCompile("^1[.．](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d.．][^\r\n]+)$"),
		regexp.MustCompile("^[（](\\d+)[\u3000\u2003\u00a0\\s）]+([^\r\n]+)$"),
	}
	// General-purpose patterns. "Space" here means ASCII whitespace plus
	// U+3000, U+2003 and U+00A0.
	regReplAllTd       = regexp.MustCompile("(?smi)<td.*?>.+?</td>")
	regIsNumber        = regexp.MustCompile("^\\d+$")
	regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
	regReplAllSpace    = regexp.MustCompile("[\u3000\u2003\u00a0\\s]+")
	regTrimSpace       = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
	regReplWrapSpace   = regexp.MustCompile("^[\r\n][\u3000\u2003\u00a0\\s]*|[\r\n][\u3000\u2003\u00a0\\s]*$")
	regReplAllSymbol   = regexp.MustCompile("[（\\(<《【\\[{｛〔）\\)>》】\\]}｝〕,，;；:：'\"“”。.\\?？/+=\\-_——*&……\\^%$￥@#!！`~·]")
	regFilterTitle     = regexp.MustCompile("[（\\(<《【\\[{｛〔].+?[）\\)>》】\\]}｝〕]")
	regDivision        = regexp.MustCompile("[:：]")
	regSpliteSegment   = regexp.MustCompile("[\r\n]")
	regFilterNumber    = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
	// Conjunctions and list separators used to split a combined title.
	// The alternative "或" is listed twice; the duplicate has no effect.
	regSplit           = regexp.MustCompile("或|和|以?及|与|、|或")
	regStartWrap       = regexp.MustCompile("^[\r\n]")
	regEndWrap         = regexp.MustCompile("[\r\n]$")
	regMoreWrap        = regexp.MustCompile("[\r\n]{2,}")
	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、．.:：，])+\\d")
	moreColonReg       = regexp.MustCompile("[:：]+")
	regFilter          = regexp.MustCompile("等$")
	// confusion maps words that contain a regSplit separator character to a
	// placeholder, so ProcTitle can mask them before splitting and restore
	// them afterwards.
	confusion          = map[string]string{
		"参与": "canyu",
	}
	// Pre-processing applied before looking for packages: a package heading
	// that is directly followed by a table, for example
	/*
		第一包：采购设备清单
		<table></table>
	*/
	// is matched by regPackageFilter (FindPackageFromBlocks replaces the match
	// with a bare "<table>").
	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[ 　\u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
	// Only referenced from commented-out code in the part of the file visible here.
	filterPkgTitleKey = regexp.MustCompile("结果[:：]?$")
)

// DivideBlock splits content into blocks, one per top-level "serial number +
// title" heading, and parses key/value pairs for every block it returns.
//
// Parameters:
//   - content: the text to split; it may contain HTML tables.
//   - from: caller-defined mode. The value 3 keeps processing after a break in
//     the serial sequence; it is also forwarded to GetKVAll.
//   - ruleBlock: optional custom patterns. BlockRegs replaces regSerialTitles_1
//     and TitleRegs replaces regSerialTitles_2 when non-empty.
//
// Returns the blocks (head block with Index -1, numbered blocks, tail block
// with Index -2) and a status code:
//
//	-1  content is blank, or no serial heading was found and no table was removed
//	-2  no serial heading was found but the content contains a table
//	 0  headings were found but no block was produced
//	 1  blocks were produced and the numbering was consistent
//	 2  a later serial number broke the sequence
//	 3  the first serial number was not 1 (may be overwritten by 2)
//
// NOTE(review): the title pattern is picked by the index of the matching block
// pattern, so TitleRegs (or regSerialTitles_2) must be at least as long as the
// block pattern list; otherwise the index expression panics. Presumably
// qutil.Catch recovers that panic - confirm.
func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.Block, int) {
	defer qutil.Catch()
	returnValue := 0
	var blocks []*util.Block
	if strings.TrimSpace(content) == "" {
		return blocks, -1
	}
	// Table content is not considered when detecting the serial style, so the
	// tables are removed first.
	contentTemp := TextAfterRemoveTable(content)
	tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
	var regContenSerialTitle *regexp.Regexp
	var regSerialTitleIndex int
	if ruleBlock != nil && len(ruleBlock.BlockRegs) > 0 {
		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, ruleBlock.BlockRegs)
	} else {
		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, regSerialTitles_1)
	}
	// No serial heading at all.
	if regSerialTitleIndex == -1 {
		if len(contentTemp) == len(content) {
			// Nothing was removed: plain text without blocks.
			return blocks, -1
		} else { // a table was removed
			return blocks, -2
		}
	}
	// Pattern that separates the serial number from the title.
	var regSerialTitle *regexp.Regexp
	if ruleBlock != nil && len(ruleBlock.TitleRegs) > 0 {
		regSerialTitle = ruleBlock.TitleRegs[regSerialTitleIndex]
	} else {
		regSerialTitle = regSerialTitles_2[regSerialTitleIndex]
	}
	indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
	indexs = filterSerial(content, indexs, tdIndexs)
	// Titles matching this pattern are skipped. Compiled once here instead of
	// on every loop iteration.
	regSkipTitle := regexp.MustCompile("投标文件格式|业绩")
	// Head and tail blocks.
	var headBlock, endBlock *util.Block
	currentIndex := 0
	for k, v := range indexs {
		start, end := v[0], v[1]
		// Text in front of the first heading becomes the head block.
		if k == 0 {
			if headTemp := content[:start]; regReplAllSpace.ReplaceAllString(headTemp, "") != "" {
				headBlock = &util.Block{
					Index: -1,       // serial
					Text:  headTemp, // content
					Title: "",       // title
					Start: 0,
					End:   start,
				}
			}
		}
		// Split the heading into serial and title.
		blockSerialTitle := regTrimSpace.ReplaceAllString(content[start:end], "")
		serialTitles := regSerialTitle.FindStringSubmatch(blockSerialTitle)
		if len(serialTitles) < 3 {
			continue
		}
		indexSting := regReplAllSpace.ReplaceAllString(serialTitles[1], "") // serial
		index := 0
		// Convert the serial to a number.
		if regIsNumber.MatchString(indexSting) {
			index, _ = strconv.Atoi(indexSting)
		} else if regIsChineseNumber.MatchString(indexSting) {
			index = util.ChineseNumberToInt(indexSting)
		}
		// The serial does not match its position in the list.
		if k+1 != index {
			if k == 0 {
				// The very first serial is already off.
				returnValue = 3
			} else {
				if currentIndex+1 != index {
					// The serials are not consecutive: stop here.
					returnValue = 2
					// Everything from this heading on becomes the tail block.
					if from != 3 {
						endBlock = &util.Block{
							Index: -2,              // serial
							Text:  content[start:], // content
							Title: "",              // title
							Start: start,
							End:   len(content),
						}
						break
					}
				}
			}
			currentIndex = index
		}

		title := serialTitles[2]                         // title
		title = regTrimSpace.ReplaceAllString(title, "") // trim surrounding space
		// The block body runs up to the next heading (or the end of content).
		nextStart := len(content)
		if k < len(indexs)-1 {
			nextStart = indexs[k+1][0]
		}
		// Body text without the serial and title.
		blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
		if title != "" {
			blockTextTemp := regReplAllSpace.ReplaceAllString(blockText, "")
			// Special cases.
			if blockTextTemp == "" {
				if regDivision.MatchString(title) {
					// The heading line is itself a "key: value" pair, e.g.
					//   一、项目编号：HMEC170223
					//   二、项目名称：执法记录仪采购
					// Keep the whole line as text and the part before the
					// colon as title.
					blockText = title
					divisionIndexs := regDivision.FindStringIndex(title)
					title = title[:divisionIndexs[0]]
				} else {
					// The heading line is a plain sentence, e.g.
					//   十一、投标代表须持本人身份证原件亲自递交投标文件，...
					// It becomes the text and the block has no title.
					blockText = title
					title = ""
				}
			} else if blockTextTemp != "" && regDivision.MatchString(title) {
				// The heading is a "key: value" line followed by more such
				// lines, e.g.
				//   2、采购单位名称：福建省汀州医院
				//   采购单位地址: 龙岩市长汀县
				//   联系人：胡科长
				// The heading is put back in front of the body. The title is
				// kept only when nothing follows the colon.
				divisionIndexs := regDivision.FindStringIndex(title)
				titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
				titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
				blockText = title + "\n" + blockText
				if titleAfter != "" {
					title = ""
				} else {
					title = titleBefore
				}
			} else {
				blockText = title + "\n" + blockText
			}
		}
		// Blocks without content get no tags and are not emitted.
		if blockText == "" {
			continue
		}
		// Filter out blocks whose title matches the skip list.
		if regSkipTitle.MatchString(title) {
			continue
		}
		blockText = hasMergeKV(title, blockText)
		titleIsExists := map[string]bool{} // de-duplication of sub-titles
		title = filterTitle(title)
		// Split combined titles on conjunctions.
		splitTitles := ProcTitle(title)
		block := &util.Block{
			Index:  index,     // serial
			Text:   blockText, // content
			Title:  title,     // title
			Titles: splitTitles,
			Start:  start,
			End:    nextStart,
		}

		for _, sv := range splitTitles {
			if sv == "" || titleIsExists[sv] {
				continue
			}
			titleIsExists[sv] = true
			// Only titles of 2 to 10 runes are tagged.
			if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
				block.Tags = append(block.Tags, util.GetBlockTags(sv))
			}
		}
		tagsToBlocks(blocks, block)
		blocks = append(blocks, block)
	}
	var returnBlocks []*util.Block
	if len(blocks) > 0 {
		// Head.
		if headBlock != nil {
			returnBlocks = append(returnBlocks, headBlock)
		}
		// Numbered blocks.
		returnBlocks = append(returnBlocks, blocks...)
		// Tail.
		if endBlock != nil {
			returnBlocks = append(returnBlocks, endBlock)
		}
		if returnValue == 0 {
			returnValue = 1
		}
	}
	contactFormat := &util.ContactFormat{
		IndexMap: map[int]string{},
		MatchMap: map[string]map[string]bool{},
	}
	for _, bl := range returnBlocks {
		// Parse key/value pairs from the table-free text.
		newText := TextAfterRemoveTable(bl.Text)
		bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from)
		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat)
		// Later regex extraction sometimes needs a trailing full stop or line
		// break, so both are appended once the kv parsing is done.
		bl.Text = appendWarpStop(bl.Text)
	}
	return returnBlocks, returnValue
}

// ProcTitle splits a block title on the separators in regSplit and completes
// abbreviated parts from their neighbours.
//
// Words listed in confusion are masked before the split and restored
// afterwards, so a separator character inside such a word does not split it.
// A 2-rune part that follows a part longer than 3 runes is prefixed with that
// earlier part minus its last two runes. When the first part is 2 runes long,
// the last two runes of the first part longer than 3 runes are appended to
// every part before it, and processing stops there.
//
// An empty title yields an empty, non-nil slice.
func ProcTitle(title string) []string {
	if title == "" {
		return []string{}
	}
	for word, mask := range confusion {
		title = strings.Replace(title, word, mask, -1)
	}
	parts := regSplit.Split(title, -1)
	// shortFirst records that the first part is a 2-rune fragment waiting for
	// a suffix taken from a later, longer part.
	shortFirst := false
	// longPrev is the most recent part that is longer than 3 runes.
	longPrev := ""
	for pos := range parts {
		part := parts[pos]
		for word, mask := range confusion {
			part = strings.Replace(part, mask, word, -1)
		}
		parts[pos] = part
		runes := []rune(part)
		switch n := len(runes); {
		case n == 2:
			if pos == 0 {
				shortFirst = true
				continue
			}
			prefix := ""
			if prevRunes := []rune(longPrev); len(prevRunes) > 3 {
				prefix = string(prevRunes[:len(prevRunes)-2])
			}
			parts[pos] = prefix + part
		case n > 3:
			if shortFirst {
				suffix := string(runes[n-2:])
				for i := 0; i < pos; i++ {
					parts[i] += suffix
				}
				return parts
			}
			longPrev = part
		}
	}
	return parts
}

// hasMergeKV rewrites the text of a block whose title merges two keys (for
// example "项目名称及编号") into two explicit "key：value" pairs.
//
// The rewrite happens only when all of the following hold: the title, with
// colons removed, splits into at least two parts; it contains "项目" and the
// second part is exactly 2 runes long; the text has exactly one "\n"; the
// second line contains no colon; and the second line contains a "，". In every
// other case text is returned unchanged.
func hasMergeKV(title, text string) string {
	title = regDivision.ReplaceAllString(title, "")
	parts := regSplit.Split(title, -1)
	if len(parts) < 2 {
		return text
	}
	firstKey, secondKey := parts[0], parts[1]
	if !strings.Contains(title, "项目") || len([]rune(secondKey)) != 2 {
		return text
	}
	secondKey = "项目" + secondKey
	if strings.Count(text, "\n") != 1 {
		return text
	}
	lines := strings.Split(text, "\n")
	headLine, valueLine := lines[0], lines[1]
	if regDivision.MatchString(valueLine) {
		return text
	}
	values := strings.SplitN(valueLine, "，", 2)
	if len(values) != 2 {
		return text
	}
	return headLine + "\n" + firstKey + "：" + values[0] + "，" + secondKey + "：" + values[1]
}

// filterSerial drops every serial match that starts strictly inside a table
// cell span.
//
// indexs and tdIndexs hold [start, end] pairs; only the first two entries of
// each pair are read. A serial is discarded when its start lies strictly
// between the start and end of any cell. content is not used. The result is a
// new, non-nil slice of copied [start, end] pairs.
func filterSerial(content string, indexs, tdIndexs [][]int) [][]int {
	kept := [][]int{}
nextSerial:
	for _, span := range indexs {
		pos := span[0]
		for _, cell := range tdIndexs {
			if cell[0] < pos && pos < cell[1] {
				// The serial sits inside a cell, so it is not a heading.
				continue nextSerial
			}
		}
		kept = append(kept, []int{span[0], span[1]})
	}
	return kept
}

// getSerialType picks the heading pattern used by the outermost numbering of
// content.
//
// Each pattern in blockRegs is tried once. A pattern is a candidate when its
// first match, after trimming, does not span a line break. Among candidates
// the one whose first match starts earliest wins; on a tie the pattern listed
// first is kept. The function returns the winning pattern and its index in
// blockRegs, or (nil, -1) when no pattern qualifies.
func getSerialType(content string, blockRegs []*regexp.Regexp) (*regexp.Regexp, int) {
	var chosen *regexp.Regexp
	chosenIdx, earliest := -1, -1
	for i, re := range blockRegs {
		loc := re.FindStringIndex(content)
		if len(loc) != 2 {
			continue
		}
		// Only the outermost numbering is wanted: a match that runs across
		// lines is rejected.
		if regSpliteSegment.MatchString(strings.TrimSpace(content[loc[0]:loc[1]])) {
			continue
		}
		if earliest != -1 && loc[0] >= earliest {
			continue
		}
		chosen, chosenIdx, earliest = re, i, loc[0]
	}
	return chosen, chosenIdx
}

// appendWarpStop trims surrounding whitespace from text, then makes sure it
// ends with the full stop "。" followed by a line break.
func appendWarpStop(text string) string {
	trimmed := regTrimSpace.ReplaceAllString(text, "")
	// Terminate the text with a full stop.
	if !strings.HasSuffix(trimmed, "。") {
		trimmed += "。"
	}
	// Terminate the text with a line break unless one is already there.
	if regEndWrap.MatchString(trimmed) {
		return trimmed
	}
	return trimmed + "\n"
}

// DivideSegment splits txt into segments at line feeds and carriage returns.
//
// Lines that are at most one byte long are skipped. The remaining lines are
// numbered from 1 in order of appearance. The result is never nil.
func DivideSegment(txt string) []*util.Segment {
	isLineBreak := func(r rune) bool {
		return r == '\n' || r == '\r'
	}
	segments := make([]*util.Segment, 0)
	for _, line := range strings.FieldsFunc(txt, isLineBreak) {
		// Skip lines that carry no usable text.
		if line == " " || len(line) <= 1 {
			continue
		}
		segments = append(segments, &util.Segment{
			Index: len(segments) + 1,
			Text:  line,
		})
	}
	return segments
}

// tagsToBlocks assigns the active tag set of block and withdraws the same tag
// from earlier blocks that hold it with a lower weight.
//
// Every tag value found in block.Tags becomes active on block. For each
// earlier block in blocks that has one of those values active, the value is
// switched off when that block carries it with a weight lower than the weight
// recorded for block. A block without tags is left untouched.
func tagsToBlocks(blocks []*util.Block, block *util.Block) {
	if len(block.Tags) == 0 {
		return
	}
	active := map[string]bool{}
	weights := map[string]int{}
	for _, group := range block.Tags {
		for _, t := range group {
			active[t.Value] = true
			weights[t.Value] = t.Weight
		}
	}
	for _, prev := range blocks {
		for _, group := range prev.Tags {
			for _, t := range group {
				// The earlier block loses the tag only if it currently holds
				// it and its weight is lower than the new block's.
				if active[t.Value] && prev.Tag[t.Value] && t.Weight < weights[t.Value] {
					prev.Tag[t.Value] = false
				}
			}
		}
	}
	block.Tag = active
}

// filterTitle normalises a block title for tagging.
//
// It returns "" when the title contains both "，" and "。", or when it is
// longer than 30 runes. Otherwise it removes, in this order: whitespace,
// bracketed passages, punctuation and symbols, a leading serial number, and a
// trailing "等".
func filterTitle(title string) string {
	looksLikeSentence := strings.Contains(title, "，") && strings.Contains(title, "。")
	if looksLikeSentence || len([]rune(title)) > 30 {
		return ""
	}
	cleaned := regReplAllSpace.ReplaceAllString(title, "")    // whitespace
	cleaned = regFilterTitle.ReplaceAllString(cleaned, "")    // text inside paired brackets
	cleaned = regReplAllSymbol.ReplaceAllString(cleaned, "")  // punctuation and symbols
	cleaned = regFilterNumber.ReplaceAllString(cleaned, "")   // leading serial number
	return regFilter.ReplaceAllString(cleaned, "")            // trailing "等"
}

// FindPackageFromBlocks looks for packages (bid sections) in the text of every
// block and returns them keyed by package index.
//
// For each block, a package heading directly followed by a table is reduced to
// the bare table marker, tables are removed, and the remaining text is handed
// to divisionPackageChild. The block's "中标单位" tag is passed on as the
// accuracy flag.
func FindPackageFromBlocks(blocks *[]*util.Block, title string) (blockPackage map[string]*util.BlockPackage) {
	// Removing the package text from the block is switched off on purpose:
	// some project names themselves contain a package label.
	const stripPackageText = false

	blockPackage = make(map[string]*util.BlockPackage)
	for i := range *blocks {
		blk := (*blocks)[i]
		plain := TextAfterRemoveTable(regPackageFilter.ReplaceAllString(blk.Text, "<table>"))
		if plain == "" {
			continue
		}
		found, rest := divisionPackageChild(&blockPackage, plain, title, true, blk.Tag["中标单位"])
		if !found || !stripPackageText {
			continue
		}
		blk.Text = rest
		blk.ColonKV = GetKVAll(rest, blk.Title, nil, 1)
		blk.SpaceKV = SspacekvEntity.Entrance(rest, blk.Title, nil)
	}
	return blockPackage
}

// FindPackageFromText looks for packages directly in the full content and
// returns them keyed by package index. The result is never nil.
func FindPackageFromText(title string, content string) (blockPackage map[string]*util.BlockPackage) {
	found := make(map[string]*util.BlockPackage)
	// Winner ordering is requested; the accuracy flag is off for plain text.
	divisionPackageChild(&found, content, title, true, false)
	return found
}

// divisionPackageChild finds packages (bid sections) in content and stores one
// util.BlockPackage per package in *blockPackage.
//
// Parameters:
//   - blockPackage: result map, keyed by the value util.PackageNumberConvert
//     returns for the package; entries are added or merged in place.
//   - content, title: passed to CheckMultiPackage, which returns the text to
//     work on, the package labels and whether packages exist.
//   - isFindWinnerOrder: when true, WinnerOrder is computed for every package
//     in the map before returning.
//   - accuracy: copied into the Accuracy field of newly created packages.
//
// Returns false and "" when no packages are found; otherwise true and the
// surplus text that was not assigned to any package.
//
// NOTE(review): package labels (v[0]) are concatenated into regular
// expressions without regexp.QuoteMeta, while the same labels are compared
// literally with HasSuffix/HasPrefix. A label containing a regex
// metacharacter would change the match or make MustCompile panic - confirm
// what CheckMultiPackage can return.
func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool) (bool, string) {
	// Check whether there are packages at all.
	content = regMoreWrap.ReplaceAllString(content, "\n")
	content = regEndWrap.ReplaceAllString(content, "")
	con, pkg, flag := CheckMultiPackage(content, title)
	if !flag {
		return false, ""
	}
	//	util.Debug(con)
	//	util.Debug(pkg)
	// Collect the positions of all package labels so that a line break can be
	// inserted in front of each one.
	appendWarpIndex := []int{}
	for _, v := range pkg {
		// If the text ends with the only recognised package label, it is not
		// a package.
		if len(pkg) == 1 && strings.HasSuffix(con, v[0]) {
			return false, ""
		}
		// Every occurrence of the label, with any trailing colons.
		is := regexp.MustCompile(v[0]+"[:：]*").FindAllStringIndex(con, -1)
		for _, sv := range is {
			appendWarpIndex = append(appendWarpIndex, sv[0])
		}
	}
	appendWarpIndex = getPkgIndex(appendWarpIndex)
	// Rebuild the text with line breaks around each label position.
	// NOTE(review): if getPkgIndex returns an empty slice the loop below does
	// not run and con is replaced by the empty string.
	conTemp := ""
	for k, v := range appendWarpIndex {
		if k == 0 {
			conTemp += con[:v] + "\n"
		} else {
			conTemp += "\n" + con[appendWarpIndex[k-1]:v]
		}
		if k == len(appendWarpIndex)-1 {
			conTemp += "\n" + con[v:]
		}
	}
	con = conTemp
	con = replSerial.ReplaceAllString(con, "\n")
	con = regMoreWrap.ReplaceAllString(con, "\n")
	//util.Debug(con)
	// Locate every package label in the rebuilt text.
	indexMap := map[int]int{}             // label start -> label end
	indexKeyStringMap := map[int]string{} // label start -> head key placed before the label
	indexKeyIntMap := map[int]int{}       // label start -> end of the "key + label" match
	indexs := []int{}                     // all label starts
	startEndMap := map[int]int{}          // label start -> start of the key placed before it
	pkgIndexMap := map[string][]int{}     // label -> its start positions
	indexPkgMap := map[int]string{}       // label start -> label
	// Walk the packages; a key that stands in front of a package label is
	// recorded so it can be moved behind the label later.
	for _, v := range pkg {
		pgflag := v[0] + "[:：]*"
		is := regexp.MustCompile(pgflag).FindAllStringIndex(con, -1)
		for _, sv := range is {
			indexMap[sv[0]] = sv[1]
			indexs = append(indexs, sv[0])
			pkgIndexMap[v[0]] = append(pkgIndexMap[v[0]], sv[0])
			indexPkgMap[sv[0]] = v[0]
		}
		// Key in front of the label, starting at the beginning of a line.
		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([（(].{1,8}?[)）])?[:：\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
		if len(keys) == 0 {
			// Key in front of the label, ending with a colon and a line break.
			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([（(].{1,8}?[)）])?[:：]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
		}
		if len(keys) == 0 {
			// Key introduced by "注:" on the line before the label.
			keys = regexp.MustCompile("()注[:：]([\u4e00-\u9fa5]{2,8}?([（(].{1,8}?[)）])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
		}
		for _, key := range keys {
			// key[4]:key[5] is capture group 2 (the key text); it ends where
			// the label starts. key[1] is the end of the whole match.
			startEndMap[key[5]] = key[4]
			// Normalise the key: no spaces, exactly one trailing full-width colon.
			headkey := con[key[4]:key[5]]
			headkey = regReplAllSpace.ReplaceAllString(headkey, "")
			if !regDivision.MatchString(headkey) {
				headkey += "："
			}
			headkey = moreColonReg.ReplaceAllString(headkey, "：")
			colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
			if len(colonIndexs) > 1 {
				// Several colons: keep only the part between the last two,
				// including the final colon.
				headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
			}
			indexKeyStringMap[key[5]] = headkey
			indexKeyIntMap[key[5]] = key[1]
		}
	}
	indexs = getPkgIndex(indexs)
	// A label without its own head key inherits the key of the previous label.
	for ik, iv := range indexs {
		if indexKeyStringMap[iv] != "" {
			continue
		}
		if indexKeyIntMap[iv] == indexMap[iv] {
			continue
		}
		if ik > 0 {
			indexKeyStringMap[iv] = indexKeyStringMap[indexs[ik-1]]
		}
	}
	// Cut the text into one segment per label position.
	surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
	// Walk the segments, assign each to its package and parse key/value pairs.
	for _, iv := range indexs {
		text := indexTextMap[iv]
		// Limit the segment to the allowed number of lines; the rest goes to
		// the surplus text.
		warpIndex := regSpliteSegment.FindAllStringIndex(text, -1)
		if len(indexWarpMap) > 0 {
			maxWarpCount = indexWarpMap[iv]
		}
		if maxWarpCount > 0 && len(warpIndex) >= 5 && len(warpIndex) > maxWarpCount {
			textTemp := text
			text = textTemp[:warpIndex[maxWarpCount-1][1]]
			surplusText += textTemp[warpIndex[maxWarpCount-1][0]:]
		}
		for bk, bv := range pkg {
			// A segment belongs to the package whose label it starts with.
			if !strings.HasPrefix(text, bv[0]) {
				continue
			}
			index := util.PackageNumberConvert(bk)
			// Strip the label. The two spaces added after the head key below
			// are required: the key/value splitting relies on them.
			text = regexp.MustCompile(bv[0]+"[:：]*").ReplaceAllString(text, "")
			headKey := ""
			if indexKeyStringMap[iv] != "" {
				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
				headKey = indexKeyStringMap[iv]
				text = indexKeyStringMap[iv] + "  " + text
				//}
				// The head key is consumed for every occurrence of this label.
				for _, pkgIndexMap_v := range pkgIndexMap[bv[0]] {
					delete(indexKeyStringMap, pkgIndexMap_v)
				}
			}
			// NOTE(review): strings.TrimLeft treats headKey as a set of
			// characters, not as a prefix - confirm TrimPrefix was not meant.
			// Several occurrences of the same package are merged into one entry.
			if (*blockPackage)[index] != nil {
				// Merge the text.
				(*blockPackage)[index].Text += "\n" + text
				// Merge the colon-separated key/value pairs; existing values win.
				colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1)
				if headKey != "" {
					// Parse again with the head key in place and fill the gaps.
					kvAgain := GetKVAll(text, "", nil, 4)
					for kv_k, kv_v := range kvAgain.Kv {
						if colonJobKv.Kv[kv_k] == "" {
							colonJobKv.Kv[kv_k] = kv_v
							colonJobKv.KvTag[kv_k] = kvAgain.KvTag[kv_k]
						}
					}
				}
				for kv_k, kv_v := range colonJobKv.Kv {
					if kv_v == "" {
						continue
					}
					if (*blockPackage)[index].ColonKV.Kv[kv_k] != "" {
						continue
					}
					(*blockPackage)[index].ColonKV.Kv[kv_k] = kv_v
				}
				// Merge the space-separated key/value pairs; existing values win.
				spaceJobKv := SspacekvEntity.Entrance(text, "", nil)
				for kv_k, kv_v := range spaceJobKv.Kv {
					if kv_v == "" {
						continue
					}
					if (*blockPackage)[index].SpaceKV.Kv[kv_k] != "" {
						continue
					}
					(*blockPackage)[index].SpaceKV.Kv[kv_k] = kv_v
				}
			} else {
				// First occurrence of this package: create the entry.
				newBpkg := &util.BlockPackage{
					Origin:   bk,
					Text:     text,
					Index:    index,
					Type:     bv[1],
					Accuracy: accuracy,
				}
				finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4)
				if headKey != "" {
					// Parse again with the head key in place and fill the gaps.
					kvAgain := GetKVAll(text, "", nil, 4)
					for kv_k, kv_v := range kvAgain.Kv {
						if finalKv.Kv[kv_k] == "" {
							finalKv.Kv[kv_k] = kv_v
							finalKv.KvTag[kv_k] = kvAgain.KvTag[kv_k]
						}
					}
				}
				newBpkg.ColonKV = finalKv
				newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil)
				(*blockPackage)[index] = newBpkg
			}
		}
	}
	// Order of the winning bidders.
	if isFindWinnerOrder && blockPackage != nil && len(*blockPackage) > 0 {
		for _, v := range *blockPackage {
			v.WinnerOrder = winnerOrderEntity.Find(v.Text, true, 2)
		}
	}
	return true, surplusText
}
// getPkgIndex sorts indexs in place and returns the positions that are more
// than 10 apart from their sorted predecessor.
//
// The first position is always kept. When at least one position was dropped
// and only the first one would remain, an empty slice is returned instead.
// The result is never nil.
func getPkgIndex(indexs []int) []int {
	sort.Ints(indexs)
	kept := []int{}
	dropped := 0
	for pos := range indexs {
		// Compare with the previous sorted position, whether or not it was kept.
		if pos > 0 && indexs[pos]-indexs[pos-1] <= 10 {
			dropped++
			continue
		}
		kept = append(kept, indexs[pos])
	}
	if dropped > 0 && dropped == len(indexs)-1 {
		return []int{}
	}
	return kept
}

// interceptText cuts con into one text segment per package label position.
//
// Parameters:
//   - indexs: start positions of the package labels in con; segments are cut
//     between consecutive entries, so the slice is expected to be ascending.
//   - indexPkgMap: label start -> label text.
//   - pkgIndexMap: label text -> its start positions; only its length is used.
//   - startEndMap: label start -> start of a key that precedes the label. When
//     present (non-zero) for the next label, the current segment ends there
//     instead of at the next label.
//   - con: the text to cut.
//
// Returns, in order: the text in front of the first label; the largest number
// of line breaks found in any segment; the segment text per label start; and
// a per-label line-break limit. The limit map is empty unless the labels were
// found to follow a repeating pattern.
func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[string][]int, startEndMap map[int]int, con string) (string, int, map[int]string, map[int]int) {
	//util.Debug(con)
	surplusText := ""
	indexTextMap := map[int]string{}
	indexWarpMap := map[int]int{}
	maxWarpCount := 0
	for ik, iv := range indexs {
		text := ""
		if ik < len(indexs)-1 {
			// End at the key in front of the next label if there is one,
			// otherwise at the next label itself.
			if startEndMap[indexs[ik+1]] != 0 {
				text = con[iv:startEndMap[indexs[ik+1]]]
			} else {
				text = con[iv:indexs[ik+1]]
			}
		} else {
			// The last segment runs to the end of the text.
			text = con[iv:]
		}
		indexTextMap[iv] = text
		// Count the line breaks of the segment and track the maximum.
		warpCount := len(regSpliteSegment.FindAllStringIndex(text, -1))
		if warpCount > maxWarpCount {
			maxWarpCount = warpCount
		}
		indexWarpMap[iv] = warpCount
		if ik == 0 {
			// Everything in front of the first label is surplus.
			surplusText += con[:iv]
		}
	}
	pkgLaw := ""
	if len(pkgIndexMap) > 1 {
		// Labels appear in a regular pattern: AB or ABAB.
		// NOTE(review): pkgLaw is always "" at this point, so the check below
		// is always true.
		if pkgLaw == "" {
			prevVal := ""
			// notRepeatCount counts consecutive labels that differ from their
			// predecessor within the current cycle; currentIndex marks where
			// the pattern broke; onceMax is the largest line-break count seen
			// in the current cycle.
			// NOTE(review): allMax is overwritten with each new onceMax rather
			// than kept as a running maximum - confirm this is intended.
			notRepeatCount, currentIndex, onceMax, allMax := 0, -1, 0, 0
			indexMaxMap := map[int]int{}
			for ik, iv := range indexs {
				// A full cycle was completed on the previous step: restart.
				if notRepeatCount == len(pkgIndexMap) {
					notRepeatCount = 0
				}
				if prevVal != indexPkgMap[iv] {
					notRepeatCount++
				} else {
					// The same label twice in a row breaks the pattern.
					notRepeatCount = -1
					currentIndex = ik
					break
				}
				prevVal = indexPkgMap[iv]
				if notRepeatCount == len(pkgIndexMap) {
					// End of a cycle: remember the cycle maximum seen so far
					// (the current segment is not included yet).
					indexMaxMap[iv] = onceMax
					onceMax = 0
				}
				if indexWarpMap[iv] > onceMax {
					onceMax = indexWarpMap[iv]
					allMax = onceMax
				}
				if ik == len(indexs)-1 && notRepeatCount != len(pkgIndexMap) {
					// The text ended in the middle of a cycle.
					notRepeatCount = -2
					currentIndex = ik
				}
			}
			//util.Debug(allMax, currentIndex, indexWarpMap, indexMaxMap)
			if len(indexMaxMap) > 0 {
				pkgLaw = "AB"
				thisMax := 0
				// Walk backwards so every label in a cycle gets the limit
				// recorded at the end of that cycle.
				for ik := len(indexs) - 1; ik >= 0; ik-- {
					iv := indexs[ik]
					// Labels at or after the break point get allMax.
					if currentIndex != -1 && ik >= currentIndex {
						indexWarpMap[iv] = allMax
						continue
					}
					if indexMaxMap[iv] > 0 {
						thisMax = indexMaxMap[iv]
					}
					indexWarpMap[iv] = thisMax
				}
			}
		}
	}
	// Without a detected pattern no per-label limits are returned.
	if pkgLaw == "" {
		indexWarpMap = map[int]int{}
	}
	//util.Debug(pkgLaw, maxWarpCount, indexTextMap, indexWarpMap)
	return surplusText, maxWarpCount, indexTextMap, indexWarpMap
}

// kvAfterDivideBlock divides text into blocks with DivideBlock and returns the
// colon-separated key/value entries of all blocks, in block order. The status
// code of DivideBlock is ignored. The result is never nil.
func kvAfterDivideBlock(text string, from int, ruleBlock *util.RuleBlock) []*util.Kv {
	collected := []*util.Kv{}
	blocks, _ := DivideBlock(text, from, ruleBlock)
	for i := range blocks {
		collected = append(collected, blocks[i].ColonKV.Kvs...)
	}
	return collected
}