// Source path: data_processing/data_extract

							package pretreated

import (
	"fmt"
	"jy/clear"
	"jy/util"
	qutil "qfw/util"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"unicode/utf8"
)

//分块、分段功能
// Shared patterns and lookup tables used to split announcement text into
// numbered blocks, segments and packages (lots / bid sections).
//
// The character class [\u3000\u2003\u00a0\s] that recurs below means "any
// blank": ideographic space, em space, no-break space and ASCII whitespace.
var (
	/*regSerialTitles = []string{
		"([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、．.:：，](.*)",
		"[（(]([一二三四五六七八九十]+)[)）][\u3000\u2003\u00a0\\s]*[、．.:：]?(.*)",
		"(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)",
		"(\\d+)[\u3000\u2003\u00a0\\s]*[.．]([^\\d][^\r\n]+)",
		"(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
		"1[.．](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d.．][^\r\n]+)",
	}*/

	// regSerialTitles_1 finds a "serial number + title" heading inside a whole
	// document (at the very start or right after a line break). The position in
	// this slice identifies the numbering style and is used as an index into
	// regSerialTitles_2.
	regSerialTitles_1 = []*regexp.Regexp{
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、．.:：，](.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[（(]([一二三四五六七八九十]+)[)）][\u3000\u2003\u00a0\\s]*[、．.:：]?(.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*[.．]([^\\d][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)1[.．](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d.．][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s（]*|^[\u3000\u2003\u00a0\\s（]*)(\\d+)[\u3000\u2003\u00a0\\s）]+([^\r\n]+)"),
	}
	// regSerialTitles_2 parses one already-isolated heading line into
	// (1) the serial number and (2) the title. Entries line up with
	// regSerialTitles_1 by index.
	regSerialTitles_2 = []*regexp.Regexp{
		regexp.MustCompile("^([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、．.:：，](.*)$"),
		regexp.MustCompile("^[（(]([一二三四五六七八九十]+)[)）][\u3000\u2003\u00a0\\s]*[、．.:：]?(.*)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*[.．]([^\\d][^\r\n]+)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
		regexp.MustCompile("^1[.．](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d.．][^\r\n]+)$"),
		regexp.MustCompile("^[（](\\d+)[\u3000\u2003\u00a0\\s）]+([^\r\n]+)$"),
	}
	regReplAllTd       = regexp.MustCompile("(?smi)<td.*?>.+?</td>") // one <td>…</td> cell
	regIsNumber        = regexp.MustCompile("^\\d+$")
	regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
	regReplAllSpace    = regexp.MustCompile("[\u3000\u2003\u00a0\\s]+")
	regTrimSpace       = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
	regReplWrapSpace   = regexp.MustCompile("^[\r\n][\u3000\u2003\u00a0\\s]*|[\r\n][\u3000\u2003\u00a0\\s]*$")
	regReplAllSymbol   = regexp.MustCompile("[（\\(<《【\\[{｛〔）\\)>》】\\]}｝〕,，;；:：'\"“”。.\\?？/+=\\-_——*&……\\^%$￥@#!！`~·]")
	regFilterTitle     = regexp.MustCompile("[（\\(<《【\\[{｛〔].+?[）\\)>》】\\]}｝〕]") // bracketed text inside a title
	regDivision        = regexp.MustCompile("[:：]")
	regSpliteSegment   = regexp.MustCompile("[\r\n]")
	regFilterNumber    = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
	// regSplit splits a compound title on conjunctions ("or", "and", "and/as
	// well as", "with") and the enumeration comma. The original pattern listed
	// "或" twice; the duplicate alternative was redundant and has been removed.
	regSplit     = regexp.MustCompile("或|和|以?及|与|、")
	regStartWrap = regexp.MustCompile("^[\r\n]")
	regEndWrap   = regexp.MustCompile("[\r\n]$")
	regMoreWrap  = regexp.MustCompile("[\r\n]{2,}")
	regStrWrap   = regexp.MustCompile("分包名称[：:]")
	// The next three patterns blank out phrases that contain the characters for
	// "package"/"section" but are not package markers (deposits, attachment file
	// names, product units, ...).
	// NOTE(review): classes such as [袋|箱] and [未|不] also match a literal '|',
	// and the '.' before the file extension in regFJWarap is unescaped. Left
	// unchanged because the intent cannot be confirmed from this file alone.
	regBZJWarap  = regexp.MustCompile("(每标段|保证金.*|标示|标[\\d一二三四五六七八九十]+室|型号[:：]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]+标室|[\\d一二三四五六七八九十]包密封|(^一包|商务|资格|价格标（每包内含相应文件正副本）)|[未|不]+划分标段)")
	regFJWarap   = regexp.MustCompile("[a-zA-Z0-9](包|标段).*.(pdf|PDF|docx|doc|DOCX|DOC|swf|SWF)")
	regAZWarap   = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张\\/包|纸[\\d]*包|\\*[\\d]+包|相机包)")
	replSerial   = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、．.:：，])+\\d")
	moreColonReg = regexp.MustCompile("[:：]+")
	regFilter    = regexp.MustCompile("等$") // trailing "etc."
	pkgFilter    = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?[标|包]+[段|号]+")
	// indexTile / indexTile2 recognise short sub-headings ("1.1 项目名称：").
	// Fix: the patterns were raw strings containing `\\u4e00-\\u9fa5`, which RE2
	// reads as a literal backslash, the letters u/e/f/a, some digits and the
	// range '0'-'\' - the CJK range was never matched. RE2 has no \u escape, so
	// the range is now written with \x{...}.
	indexTile        = regexp.MustCompile(`[0-9.]{2,3}[^包标段][\s\x{4e00}-\x{9fa5}]{2,8}[:：]+`)
	indexTile2       = regexp.MustCompile(`[\s\x{4e00}-\x{9fa5}]{2,8}[：:]\n`)
	regReplAllSpace2 = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.:：、\\(\\)]+")
	// confusion maps words that contain a regSplit separator character to a
	// placeholder so they survive title splitting (see ProcTitle).
	confusion = map[string]string{
		"参与": "canyu",
	}
	// Pre-processing applied before looking for packages: a package heading line
	// that is immediately followed by a table, e.g.
	/*
		第一包：采购设备清单
		<table></table>
	*/
	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[ 　\u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
	filterPkgTitleKey = regexp.MustCompile("结果[:：]?$")
	// xuhao lists "<numeral code point>_<separator code point>" pairs that
	// DivideSegment treats as a split point: the numerals 一..九 followed by
	// "、" (12289) or "." (46).
	// Fix: "八." (20843_46) and "九、" (20061_12289) were missing although every
	// other numeral had both separators.
	xuhao = map[string]bool{
		"19968_12289": true,
		"19968_46":    true,
		"20108_12289": true,
		"20108_46":    true,
		"19977_12289": true,
		"19977_46":    true,
		"22235_12289": true,
		"22235_46":    true,
		"20116_12289": true,
		"20116_46":    true,
		"20845_12289": true,
		"20845_46":    true,
		"19971_12289": true,
		"19971_46":    true,
		"20843_12289": true,
		"20843_46":    true,
		"20061_12289": true,
		"20061_46":    true,
	}

	// Values that must not be taken as a package winner.
	unPackageWinnerReg  = regexp.MustCompile("(重新招标|方案包)")
	conformWinnerKVReg  = regexp.MustCompile("^(中标人|中标银行|第一名)[:：](.{4,20}(分行|公司))")
	conformWinnerKVReg1 = regexp.MustCompile("^[-].{4,15}公司$")
	conformWinnerKVReg2 = regexp.MustCompile("(.*)?确定(.*公司)为中标人(.*)?")

	// Free-text supplier section, e.g.
	/*
		拟定供应商信息:
		名称：郑州人民广播电台
		地址：郑州市金水区内环路17号A座。
	*/
	conformWinnerTextReg3 = regexp.MustCompile("拟定供应商信息[:：\\s]+名称[:：](.*)[\\s]+地址")
	// NOTE(review): identical to conformWinnerTextReg3; kept because it may be
	// referenced outside the part of the file visible here.
	conformWinnerTextReg4 = regexp.MustCompile("拟定供应商信息[:：\\s]+名称[:：](.*)[\\s]+地址")

	// Targeted rewrites applied before package detection.
	packageReg1 = regexp.MustCompile("(包件[一二三四五1-9][:：].*)\n1[、.\\s]+名称[:：](.*)\n2[、.\\s]+")
	// Matches headings such as: 标段(包)[001]巴盟神舟30MW:
	packageReg2 = regexp.MustCompile("标段[(（]包[）)][\\[][O0]+([1-9一二三四五六七八九])[\\]]")
)

// Title filters used by DivideBlock, compiled once at package scope instead of
// on every loop iteration.
var (
	// Blocks whose title mentions "bid document format" or "track record" are skipped...
	regSkipBlockTitle = regexp.MustCompile("投标文件格式|业绩")
	// ...unless the title also names the proposed sole supplier.
	regKeepBlockTitle = regexp.MustCompile("拟定的唯一供应商名称")
)

// DivideBlock splits content into blocks delimited by numbered headings
// ("一、…", "1、…", "(1) …", ...) and extracts colon/space key-value pairs for
// every block.
//
// Parameters:
//   - tp: announcement type; "招标" marks the head block as "bidcondition" and
//     the value is passed on to ruleBlock.Classify.GetClassify.
//   - content: the text to split (may contain HTML tables).
//   - from: caller-defined mode; when it is 3 a break in the numbering does not
//     stop the scan. It is also forwarded to GetKVAll.
//   - ruleBlock: optional site-specific heading/title regexes and classifier;
//     the built-in regSerialTitles_1 / regSerialTitles_2 are used when absent.
//   - isSite, codeSite: forwarded to the KV extractors; codeSite
//     "a_zgyc_ztbxx" disables block splitting altogether.
//
// Returns the blocks (optional head block, numbered blocks, optional tail
// block) and a status code:
//
//	-1  empty content, excluded site, or no numbering found and no table text removed
//	-2  no numbering found, but TextAfterRemoveTable shortened the content
//	 0  numbering found but no block was produced
//	 1  blocks produced, numbering consistent
//	 2  the numbering stopped being consecutive
//	 3  the first heading does not carry number 1
//
// NOTE(review): ruleBlock.TitleRegs is indexed with the position found in
// BlockRegs (or in regSerialTitles_1 when BlockRegs is empty); the two slices
// are assumed to line up - confirm against the rule loader.
func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) ([]*util.Block, int) {
	defer qutil.Catch()
	returnValue := 0
	var blocks []*util.Block
	if strings.TrimSpace(content) == "" || codeSite == "a_zgyc_ztbxx" {
		return blocks, -1
	}
	// Table content is ignored when detecting the numbering style.
	contentTemp := TextAfterRemoveTable(content)
	tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
	var regContenSerialTitle *regexp.Regexp
	var regSerialTitleIndex int
	if ruleBlock != nil && len(ruleBlock.BlockRegs) > 0 {
		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, ruleBlock.BlockRegs)
	} else {
		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, regSerialTitles_1)
	}
	// No numbering style detected: nothing to split.
	if regSerialTitleIndex == -1 {
		if len(contentTemp) == len(content) {
			return blocks, -1
		}
		// The content contained table text that was stripped.
		return blocks, -2
	}
	// Regex that separates the serial number from the title of one heading.
	var regSerialTitle *regexp.Regexp
	if ruleBlock != nil && len(ruleBlock.TitleRegs) > 0 {
		regSerialTitle = ruleBlock.TitleRegs[regSerialTitleIndex]
	} else {
		regSerialTitle = regSerialTitles_2[regSerialTitleIndex]
	}
	indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
	// Headings that start inside a table cell do not count.
	indexs = filterSerial(content, indexs, tdIndexs)

	var headBlock, endBlock *util.Block
	currentIndex := 0
	for k, v := range indexs {
		start, end := v[0], v[1]
		// Everything before the first heading becomes the head block.
		if k == 0 {
			if headTemp := content[:start]; regReplAllSpace.ReplaceAllString(headTemp, "") != "" {
				headBlock = &util.Block{
					Index: -1,
					Text:  headTemp,
					Title: "",
					Start: 0,
					End:   start,
				}
			}
		}
		// Split the heading into serial number and title.
		blockSerialTitle := regTrimSpace.ReplaceAllString(content[start:end], "")
		serialTitles := regSerialTitle.FindStringSubmatch(blockSerialTitle)
		if len(serialTitles) < 3 {
			continue
		}
		indexSting := regReplAllSpace.ReplaceAllString(serialTitles[1], "")
		index := 0
		// Convert the serial to an integer (Arabic or Chinese numerals).
		if regIsNumber.MatchString(indexSting) {
			// The regex guarantees digits only; an overflow leaves index at 0,
			// which is then handled as a numbering error below.
			index, _ = strconv.Atoi(indexSting)
		} else if regIsChineseNumber.MatchString(indexSting) {
			index = util.ChineseNumberToInt(indexSting)
		}
		// The serial does not equal its 1-based position in the heading list.
		if k+1 != index {
			if k == 0 {
				// The very first heading is already wrong.
				returnValue = 3
				break
			}
			if currentIndex+1 != index {
				// Numbering is no longer consecutive.
				returnValue = 2
				if from != 3 {
					// Keep the remainder as a tail block and stop.
					endBlock = &util.Block{
						Index: -2,
						Text:  content[start:],
						Title: "",
						Start: start,
						End:   len(content),
					}
					break
				}
			}
			currentIndex = index
		}

		title := regTrimSpace.ReplaceAllString(serialTitles[2], "")
		// The block body runs up to the next heading (or the end of content).
		nextStart := len(content)
		if k < len(indexs)-1 {
			nextStart = indexs[k+1][0]
		}
		blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
		if title != "" {
			bodyIsBlank := regReplAllSpace.ReplaceAllString(blockText, "") == ""
			titleHasColon := regDivision.MatchString(title)
			switch {
			case bodyIsBlank && titleHasColon:
				/*
					The heading line is itself a key:value pair, e.g.
					一、项目编号：HMEC170223
					二、项目名称：执法记录仪采购
				*/
				blockText = title
				divisionIndexs := regDivision.FindStringIndex(title)
				title = title[:divisionIndexs[0]]
			case bodyIsBlank:
				/*
					The heading line is a plain sentence, e.g.
					十一、投标代表须持本人身份证原件亲自递交投标文件，代理机构项目经理审核通过后，办理签收手续，否则投标文件被拒收。
					十二、开标时间：2017年3月20日9时30分
				*/
				blockText = title
				title = ""
			case titleHasColon:
				/*
					The heading is the first of several key:value lines, e.g.
					2、采购单位名称：福建省汀州医院
					采购单位地址: 龙岩市长汀县
					联系人：胡科长
					联系方式：0597-6826353
				*/
				divisionIndexs := regDivision.FindStringIndex(title)
				titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
				titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
				blockText = title + "\n" + blockText
				if titleAfter != "" {
					title = ""
				} else {
					title = titleBefore
				}
			default:
				blockText = title + "\n" + blockText
			}
		}
		// Blocks without any content get neither tags nor segments.
		if blockText == "" {
			continue
		}
		// Skip irrelevant sections (see regSkipBlockTitle / regKeepBlockTitle).
		if regSkipBlockTitle.MatchString(title) && !regKeepBlockTitle.MatchString(title) {
			continue
		}
		blockText = hasMergeKV(title, blockText)
		title = filterTitle(title)
		// Split compound titles ("A and B") into individual titles.
		splitTitles := ProcTitle(title)
		blockText = mergetext(splitTitles, blockText)
		block := &util.Block{
			Index:  index,
			Text:   blockText,
			Title:  title,
			Titles: splitTitles,
			Start:  start,
			End:    nextStart,
		}

		titleIsExists := map[string]bool{} // de-duplicates the split titles
		titles := []string{}
		for _, sv := range splitTitles {
			if sv == "" || titleIsExists[sv] {
				continue
			}
			titleIsExists[sv] = true
			// Only titles of 2..10 runes are tagged.
			if n := len([]rune(sv)); n >= 2 && n <= 10 {
				block.Tags = append(block.Tags, util.GetBlockTags(sv))
				titles = append(titles, sv)
			}
		}
		block.Title = title
		block.Titles = titles
		if ruleBlock != nil {
			block.Classify, block.NotClassifyTitles = ruleBlock.Classify.GetClassify(tp, titles)
		}
		tagsToBlocks(blocks, block)
		blocks = append(blocks, block)
	}

	var returnBlocks []*util.Block
	if len(blocks) > 0 {
		if headBlock != nil {
			if tp == "招标" {
				headBlock.Classify = map[string]bool{"bidcondition": true}
			}
			returnBlocks = append(returnBlocks, headBlock)
		}
		returnBlocks = append(returnBlocks, blocks...)
		if endBlock != nil {
			returnBlocks = append(returnBlocks, endBlock)
		}
		if returnValue == 0 {
			returnValue = 1
		}
	}
	contactFormat := &util.ContactFormat{
		IndexMap: map[int]string{},
		MatchMap: map[string]map[string]bool{},
	}
	for _, bl := range returnBlocks {
		// Key-value extraction works on the text with tables removed.
		newText := TextAfterRemoveTable(bl.Text)

		bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from, isSite, codeSite)
		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat, isSite, codeSite)
		// Later regex extraction sometimes anchors on a full stop or a line
		// break, so terminate every block with both once the KVs are parsed.
		bl.Text = appendWarpStop(bl.Text)
	}
	return returnBlocks, returnValue
}

// mergetext rewrites a short block whose body lines line up one-to-one with
// the split titles, replacing each "key:value" line with "<title>:value".
//
// text is returned unchanged when there are no titles, when text is longer
// than 150 runes, when the number of lines after the first differs from
// len(titles), or when no line could be rewritten. The first line of text is
// not part of the rewritten result.
//
// A line is rewritten when splitting it on a colon yields exactly two parts
// and its key is contained in the title at the same position. Lines that do
// not split into two parts are kept verbatim only if they contain "中标人 ".
//
// The original code had a second branch repeating the same
// strings.Contains(titles[i], key) test; it could never run and was removed.
func mergetext(titles []string, text string) string {
	if len(titles) == 0 || utf8.RuneCountInString(text) > 150 {
		return text
	}
	lines := strings.Split(text, "\n")
	if len(lines) == 1 || len(titles) != len(lines)-1 {
		return text
	}
	var merged strings.Builder
	for i, line := range lines[1:] {
		parts := regDivision.Split(line, -1)
		switch {
		case len(parts) == 2:
			if strings.Contains(titles[i], parts[0]) {
				merged.WriteString(titles[i] + ":" + parts[1] + "\n")
			}
		case strings.Contains(line, "中标人 "):
			// Special case: a space-separated winner line is kept as is.
			merged.WriteString(line + "\n")
		}
	}
	if merged.Len() == 0 {
		return text
	}
	return merged.String()
}

// ProcTitle splits a compound block title on the regSplit separators and
// expands abbreviated parts so that each returned element reads as a full
// title.
//
// Words listed in confusion are replaced by their placeholder before the split
// and restored afterwards, so they are not cut apart.
//
// Expansion rules, applied to the parts from left to right:
//   - a two-rune part that is not the first one is prefixed with the most
//     recent part longer than three runes, minus that part's last two runes;
//   - if the first part has two runes, the first later part longer than three
//     runes donates its last two runes as a suffix to every part before it,
//     and processing stops there;
//   - "联系人" / "联系方式" are prefixed with "代理机构", "中标单位" or
//     "采购单位" when the most recent long part contains "代理", "中标" or
//     "采购" respectively.
//
// An empty title yields an empty, non-nil slice.
func ProcTitle(title string) []string {
	if title == "" {
		return []string{}
	}
	for word, placeholder := range confusion {
		title = strings.Replace(title, word, placeholder, -1)
	}
	parts := regSplit.Split(title, -1)
	firstIsShort := false // the first part has exactly two runes
	lastLong := ""        // most recent part longer than three runes
	for i := range parts {
		part := parts[i]
		for word, placeholder := range confusion {
			part = strings.Replace(part, placeholder, word, -1)
		}
		parts[i] = part
		runes := []rune(part)

		switch {
		case len(runes) == 2 && i == 0:
			firstIsShort = true
		case len(runes) == 2:
			prefix := ""
			if longRunes := []rune(lastLong); len(longRunes) > 3 {
				prefix = string(longRunes[:len(longRunes)-2])
			}
			parts[i] = prefix + part
		case part == "联系人" || part == "联系方式":
			switch {
			case strings.Contains(lastLong, "代理"):
				parts[i] = "代理机构" + part
			case strings.Contains(lastLong, "中标"):
				parts[i] = "中标单位" + part
			case strings.Contains(lastLong, "采购"):
				parts[i] = "采购单位" + part
			}
		}

		if len(runes) <= 3 {
			continue
		}
		if firstIsShort {
			// Back-fill the earlier parts with this part's two-rune suffix.
			suffix := string(runes[len(runes)-2:])
			for j := 0; j < i; j++ {
				parts[j] += suffix
			}
			break
		}
		lastLong = part
	}
	return parts
}

// hasMergeKV handles blocks whose title merges two keys, such as
// "项目名称及编号" (project name and number), where the body's second line
// carries both values separated by a full-width comma.
//
// The text is rewritten only when all of the following hold:
//   - the title (colons removed) splits into at least two parts on regSplit;
//   - the title contains "项目" and the second part has exactly two runes;
//   - text consists of exactly two lines;
//   - the second line contains no colon but does contain "，".
//
// In that case the second line becomes
// "<first key>：<value1>，项目<second key>：<value2>". Otherwise text is
// returned unchanged.
func hasMergeKV(title, text string) string {
	title = regDivision.ReplaceAllString(title, "")
	keys := regSplit.Split(title, -1)
	if len(keys) <= 1 {
		return text
	}
	firstKey, secondKey := keys[0], keys[1]
	if !strings.Contains(title, "项目") || utf8.RuneCountInString(secondKey) != 2 {
		return text
	}
	secondKey = "项目" + secondKey

	if strings.Count(text, "\n") != 1 {
		return text
	}
	nl := strings.Index(text, "\n")
	lineOne, lineTwo := text[:nl], text[nl+1:]
	if regDivision.MatchString(lineTwo) {
		return text
	}
	comma := strings.Index(lineTwo, "，")
	if comma < 0 {
		return text
	}
	valueOne := lineTwo[:comma]
	valueTwo := lineTwo[comma+len("，"):]
	return lineOne + "\n" + firstKey + "：" + valueOne + "，" + secondKey + "：" + valueTwo
}

// filterSerial drops heading matches that start inside a table cell.
//
// indexs holds [start, end] pairs of heading matches and tdIndexs holds
// [start, end, ...] pairs of <td> cells. A heading is discarded when its start
// lies strictly between the start and end of any cell. The survivors are
// returned as fresh [start, end] pairs in their original order; the result is
// never nil. content is not used.
//
// The original inner loop used `continue` after finding a containing cell,
// which kept scanning the remaining cells for nothing; it now breaks.
func filterSerial(content string, indexs, tdIndexs [][]int) [][]int {
	returnIndexs := [][]int{}
	for _, v := range indexs {
		inCell := false
		for _, tv := range tdIndexs {
			if v[0] > tv[0] && v[0] < tv[1] {
				inCell = true
				break
			}
		}
		if inCell {
			continue
		}
		returnIndexs = append(returnIndexs, []int{v[0], v[1]})
	}
	return returnIndexs
}

// getSerialType determines which numbering style the document uses for its
// outermost headings.
//
// Every regex in blockRegs is tried against content; the winner is the one
// whose first match starts earliest (ties go to the lower index). A candidate
// is ignored when its first match, after trimming surrounding whitespace,
// still contains a line break.
//
// It returns the winning regex and its index in blockRegs, or (nil, -1) when
// nothing qualifies. Documents containing "中标候选人排序" (ranking of
// candidate winners) are never split and always yield (nil, -1).
//
// Changes from the original: the loop-invariant "中标候选人排序" check is now
// done once up front, and the line-break test uses strings.ContainsAny instead
// of a regex.
func getSerialType(content string, blockRegs []*regexp.Regexp) (*regexp.Regexp, int) {
	if strings.Contains(content, "中标候选人排序") {
		return nil, -1
	}
	var best *regexp.Regexp
	bestStart, bestIndex := -1, -1
	for k, re := range blockRegs {
		loc := re.FindStringIndex(content)
		if len(loc) != 2 {
			continue
		}
		// A heading must sit on a single line.
		if strings.ContainsAny(strings.TrimSpace(content[loc[0]:loc[1]]), "\r\n") {
			continue
		}
		// Only the outermost (earliest) numbering style is used.
		if bestStart == -1 || loc[0] < bestStart {
			best, bestStart, bestIndex = re, loc[0], k
		}
	}
	return best, bestIndex
}

// appendWarpStop normalises the end of a block: surrounding blanks are
// trimmed, a Chinese full stop "。" is appended unless the text already ends
// with one, and a single "\n" is appended.
//
// The trim set mirrors regTrimSpace: ideographic space, em space, no-break
// space and the ASCII whitespace matched by \s (space, \t, \n, \f, \r).
//
// The original checked regEndWrap before adding the line break, but at that
// point the text always ends with "。", so the check was always true and has
// been dropped.
func appendWarpStop(text string) string {
	text = strings.Trim(text, "\u3000\u2003\u00a0 \t\n\f\r")
	if !strings.HasSuffix(text, "。") {
		text += "。"
	}
	return text + "\n"
}

// DivideSegmentHtml splits txt into segments at line breaks (\n or \r).
// Lines of at most one byte are discarded; the remaining lines are numbered
// consecutively starting at 1.
func DivideSegmentHtml(txt string) []*util.Segment {
	lines := strings.FieldsFunc(txt, func(r rune) bool {
		return r == '\n' || r == '\r'
	})
	segs := make([]*util.Segment, 0)
	for _, line := range lines {
		// A single-byte line (including a lone space) carries no content.
		if len(line) <= 1 {
			continue
		}
		segs = append(segs, &util.Segment{
			Index: len(segs) + 1,
			Text:  line,
		})
	}
	return segs
}

// DivideSegment splits txt into segments at line breaks and, in addition, at
// the separator of a Chinese serial number: a "、" (U+3001, 12289) or "."
// (46) that directly follows one of the numerals 一二三四五六七八九, provided
// the "<numeral>_<separator>" pair is listed in xuhao. The separator rune
// itself is consumed as the split point; the numeral stays at the end of the
// preceding field.
//
// Fields of at most one byte are discarded; the rest are numbered from 1.
//
// NOTE(review): the predicate keeps state (tmpstr) between calls, while the
// strings.FieldsFunc documentation makes no guarantee about call order and
// assumes f returns the same value for a given rune. This relies on the
// current left-to-right, once-per-rune implementation - confirm before
// upgrading Go or reusing the pattern.
func DivideSegment(txt string) []*util.Segment {
	// Split into raw fields first.
	// tmpstr remembers the decimal code point of the previous candidate rune,
	// or "<prev>_<cur>" once a pair has been formed.
	tmpstr := ""
	_segs := strings.FieldsFunc(txt, func(r rune) bool {
		// Candidate runes: 一 二 三 、 . 四 五 六 七 八 九
		if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
			r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
			if tmpstr == "" {
				// First candidate of a potential pair: remember it.
				tmpstr += fmt.Sprint(r)
				return false
			} else if strings.Contains(tmpstr, "_") {
				// A pair was completed on the previous candidate; start over
				// with the current rune.
				tmpstr = ""
				tmpstr += fmt.Sprint(r)
				return false
			} else if tmpstr == fmt.Sprint(r) {
				// Same rune twice in a row; a doubled separator resets the state.
				if r == 46 || r == 12289 {
					tmpstr = ""
				}
				return false
			}
			// Form "<prev>_<cur>" and split here if it is a known serial pair.
			tmpstr += "_" + fmt.Sprint(r)
			if xuhao[tmpstr] {
				return true
			}
		}
		// Any other rune (or an unknown pair) resets the state; only line
		// breaks split.
		tmpstr = ""
		return r == 10 || r == 13
	})
	// Then drop empty / one-byte fields and number the rest.
	segs := make([]*util.Segment, 0)
	_index := 0
	for _, seg := range _segs {
		if seg != " " && len(seg) > 1 {
			_seg := util.Segment{}
			_index = _index + 1
			_seg.Index = _index
			_seg.Text = seg
			segs = append(segs, &_seg)
		}

	}
	return segs
}

// tagsToBlocks assigns the tag set of a new block and resolves conflicts with
// the blocks collected so far.
//
// Every tag value found in block.Tags becomes active on block (block.Tag). For
// each earlier block in blocks that has the same tag active, the tag is
// switched off there if that block carries it with a lower weight than the new
// block does. Blocks without tags are left untouched.
func tagsToBlocks(blocks []*util.Block, block *util.Block) {
	if len(block.Tags) == 0 {
		return
	}
	active := map[string]bool{}
	weightOf := map[string]int{}
	for _, group := range block.Tags {
		for _, candidate := range group {
			active[candidate.Value] = true
			weightOf[candidate.Value] = candidate.Weight
		}
	}
	for _, earlier := range blocks {
		for name := range active {
			if !earlier.Tag[name] {
				continue
			}
			// The earlier block loses the tag when its weight is lower.
			for _, group := range earlier.Tags {
				for _, candidate := range group {
					if candidate.Value == name && candidate.Weight < weightOf[name] {
						earlier.Tag[name] = false
					}
				}
			}
		}
	}
	block.Tag = active
}

// filterTitle reduces a heading to a bare title suitable for tagging.
//
// A title that contains both "，" and "。" (i.e. reads like a sentence) or is
// longer than 30 runes is rejected and "" is returned. Otherwise the following
// are stripped, in this order: all blanks, bracketed passages, punctuation and
// symbols, a leading serial number, and a trailing "等" (etc.).
func filterTitle(title string) string {
	looksLikeSentence := strings.Contains(title, "，") && strings.Contains(title, "。")
	if looksLikeSentence || utf8.RuneCountInString(title) > 30 {
		return ""
	}
	cleaners := []*regexp.Regexp{
		regReplAllSpace,  // blanks
		regFilterTitle,   // text inside paired brackets
		regReplAllSymbol, // punctuation and symbols
		regFilterNumber,  // leading serial number
		regFilter,        // trailing "等"
	}
	for _, re := range cleaners {
		title = re.ReplaceAllString(title, "")
	}
	return title
}

// FindPackageFromBlocks looks for packages (lots) inside every block and then
// fills each package's budget, bid amount and winner from its key-value pairs.
//
// For each block, a package heading directly followed by a table is collapsed
// to "<table>" (regPackageFilter), tables are removed, and the remaining text
// is handed to divisionPackageChild; blocks that end up empty are skipped. The
// block's "中标单位" (winner) tag is passed on as the accuracy flag.
//
// The result maps a package key to its data and is never nil.
func FindPackageFromBlocks(blocks *[]*util.Block, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
	blockPackage = make(map[string]*util.BlockPackage)
	for _, blk := range *blocks {
		plain := TextAfterRemoveTable(regPackageFilter.ReplaceAllString(blk.Text, "<table>"))
		if plain == "" {
			continue
		}
		// Extract per package: amounts, winner, contacts, package name.
		divisionPackageChild(&blockPackage, plain, blk.Title, true, blk.Tag["中标单位"], isSite, codeSite)
	}
	for key, pkg := range blockPackage {
		// Derive the structured fields from the package's KV pairs.
		findWinnerBugetBidmountByKv(pkg, blockPackage, key)
	}
	return blockPackage
}

// moneyFromKvValue converts a raw key-value string into an amount via
// clear.ObjToMoney. ok is false when no amount could be derived. isTrue is the
// trailing boolean flag of the ObjToMoney result and is false when that
// element is not a bool (the original code panicked in that case).
func moneyFromKvValue(value string) (amount float64, isTrue bool, ok bool) {
	moneys := clear.ObjToMoney([]interface{}{value, ""})
	if len(moneys) == 0 {
		return 0, false, false
	}
	switch n := moneys[0].(type) {
	case float64:
		amount = n
	case int:
		amount = float64(n)
	default:
		return 0, false, false
	}
	isTrue, _ = moneys[len(moneys)-1].(bool)
	return amount, isTrue, true
}

// specialCaseWinner tries the fallback winner patterns on a KV value (and, as
// a last resort, on the whole package text). found reports whether a winner
// was recognised.
func specialCaseWinner(key, value, text string) (winner string, found bool) {
	// "中标人：XXX公司" style values.
	if m := conformWinnerKVReg.FindStringSubmatch(value); len(m) > 0 && m[2] != "" {
		return m[2], true
	}
	// "-XXX公司" under the key "中标信息".
	if key == "中标信息" && conformWinnerKVReg1.MatchString(value) {
		return value, true
	}
	// "…确定XXX公司为中标人…"
	if conformWinnerKVReg2.MatchString(value) {
		return conformWinnerKVReg2.ReplaceAllString(value, "${2}"), true
	}
	// Search the full package text for a "proposed supplier" section.
	if m := conformWinnerTextReg3.FindStringSubmatch(text); len(m) > 0 && m[1] != "" {
		return m[1], true
	}
	return "", false
}

// findWinnerBugetBidmountByKv fills Budget, Bidamount and Winner of package k
// from the package's colon-separated and space-separated key-value tags.
//
// v is the package being inspected and blockPackage[k] is the entry that gets
// written (callers in this file pass the same object for both). A field is
// only taken from the standard keys ("预算", "中标金额", "中标单位", ...) when
// it is still unset on v. Colon KVs that match none of the standard cases go
// through specialCaseWinner, which may set Winner regardless of its current
// value, as in the original code.
//
// Changes from the original: the four copies of the money conversion are
// shared through moneyFromKvValue, the unchecked `.(bool)` assertion can no
// longer panic, and empty tag lists are skipped instead of panicking on cv[0].
//
// NOTE(review): KvTags are Go maps, so when several keys can supply the same
// field the outcome depends on iteration order - confirm whether that matters.
func findWinnerBugetBidmountByKv(v *util.BlockPackage, blockPackage map[string]*util.BlockPackage, k string) {
	if v.ColonKV != nil && v.ColonKV.KvTags != nil {
		for kc, cv := range v.ColonKV.KvTags {
			if len(cv) == 0 {
				continue
			}
			first := cv[0].Value
			switch {
			case kc == "预算" && v.Budget <= 0:
				if amount, isTrue, ok := moneyFromKvValue(first); ok {
					blockPackage[k].Budget = amount
					blockPackage[k].IsTrueBudget = isTrue
				}
			case (kc == "中标金额" || kc == "各包中标/成交候选供应商及报价") && v.Bidamount <= 0:
				if amount, isTrue, ok := moneyFromKvValue(first); ok {
					blockPackage[k].Bidamount = amount
					blockPackage[k].IsTrueBidamount = isTrue
				}
			case (kc == "中标单位" || kc == "第1    名" || kc == "各包中标/成交候选供应商及报价") && v.Winner == "":
				if unPackageWinnerReg.MatchString(first) {
					// Not a real winner (re-tender, scheme package, ...).
					continue
				}
				winner := first
				if len(cv) > 1 {
					// Prefer the first non-empty value whose own key is exactly "中标单位".
					for _, tag := range cv {
						if tag.Key == "中标单位" && tag.Value != "" {
							winner = tag.Value
							break
						}
					}
				}
				blockPackage[k].Winner = winner
			default:
				// Special cases handled by dedicated patterns.
				if winner, found := specialCaseWinner(kc, first, v.Text); found {
					blockPackage[k].Winner = winner
				}
			}
		}
	}
	if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
		for kc, cv := range v.SpaceKV.KvTags {
			if len(cv) == 0 {
				continue
			}
			first := cv[0].Value
			switch {
			case kc == "预算" && v.Budget <= 0:
				if amount, isTrue, ok := moneyFromKvValue(first); ok {
					blockPackage[k].Budget = amount
					blockPackage[k].IsTrueBudget = isTrue
				}
			case kc == "中标金额" && v.Bidamount <= 0:
				if amount, isTrue, ok := moneyFromKvValue(first); ok {
					blockPackage[k].Bidamount = amount
					blockPackage[k].IsTrueBidamount = isTrue
				}
			case kc == "中标单位" && v.Winner == "":
				blockPackage[k].Winner = first
			}
		}
	}
}

// FindPackageFromText looks for packages (lots) directly in the full text of
// an announcement rather than block by block, then fills each package's
// budget, bid amount and winner from its key-value pairs. The accuracy flag
// passed to divisionPackageChild is always false here. The result is never nil.
func FindPackageFromText(title string, content string, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
	blockPackage = make(map[string]*util.BlockPackage)
	divisionPackageChild(&blockPackage, content, title, true, false, isSite, codeSite)
	for key, pkg := range blockPackage {
		findWinnerBugetBidmountByKv(pkg, blockPackage, key)
	}
	return blockPackage
}

// divisionPackageChild detects packages (lots / bid sections) in content and
// stores one util.BlockPackage per package in *blockPackage, merging into an
// existing entry when the same package index shows up again.
//
// Steps:
//  1. Blank out phrases that look like package markers but are not
//     (attachment file names, deposits, units, ...) and apply two targeted
//     rewrites (packageReg1, packageReg2).
//  2. Ask CheckMultiPackage for the normalised text and the package names;
//     return (false, "") when it reports no packages.
//  3. Put every package marker on its own line.
//  4. Record where each marker starts, and detect a key that precedes a marker
//     ("key: <package>") so it can be attached to that package.
//  5. Cut the text into one slice per marker (interceptText) and extract
//     colon and space key-value pairs for each package.
//
// It returns true plus the text that was not attributed to any package.
//
// isFindWinnerOrder is only referenced by commented-out code in this function.
//
// NOTE(review): package names from CheckMultiPackage are concatenated into
// regular expressions without regexp.QuoteMeta, and strings.TrimLeft is called
// with a prefix although it takes a cutset (it strips any leading runes that
// occur in the second argument). Both look unintended - confirm against
// CheckMultiPackage's output before changing.
func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool, isSite bool, codeSite string) (bool, string) {
	// Remove look-alike phrases before checking for packages.
	content = regFJWarap.ReplaceAllString(content, "\n")
	content = regAZWarap.ReplaceAllString(content, "\n")
	content = regStrWrap.ReplaceAllString(content, "\n")
	content = regMoreWrap.ReplaceAllString(content, "\n")
	content = regEndWrap.ReplaceAllString(content, "")
	content = regBZJWarap.ReplaceAllString(content, "")


	// Targeted rewrites: turn "name" under a package heading into a winner KV
	// and normalise "标段(包)[001]" headings.
	content = packageReg1.ReplaceAllString(content,"${1}\n中标单位:${2}\n")
	content = packageReg2.ReplaceAllString(content,"\n标段${1}:")


	con, pkg, flag := CheckMultiPackage(content, title) // find the package names
	if !flag {
		return false, ""
	}
	//	util.Debug(con)
	//	util.Debug(pkg)
	// Insert a line break in front of every package marker.
	appendWarpIndex := []int{} // byte offsets in con where a package marker starts
	for _, v := range pkg {
		// A single package name sitting at the very end of the text is not a package.
		if len(pkg) == 1 && strings.HasSuffix(con, v[0]) {
			return false, ""
		}
		is := regexp.MustCompile(v[0]+"[:：]*").FindAllStringIndex(con, -1)
		for _, sv := range is {
			appendWarpIndex = append(appendWarpIndex, sv[0])
		}
	}
	appendWarpIndex = getPkgIndex(appendWarpIndex)
	conTemp := ""
	for k, v := range appendWarpIndex {
		if k == 0 {
			conTemp += con[:v] + "\n"
		} else {
			conTemp += "\n" + con[appendWarpIndex[k-1]:v]
		}
		if k == len(appendWarpIndex)-1 {
			conTemp += "\n" + con[v:]
		}
	}
	// NOTE(review): when appendWarpIndex is empty, conTemp stays "" and con is
	// replaced by the empty string - confirm this is intended.
	con = conTemp
	con = replSerial.ReplaceAllString(con, "\n")
	con = regMoreWrap.ReplaceAllString(con, "\n")
	// Index bookkeeping, all keyed by byte offsets in con:
	indexMap := map[int]int{}             // marker start -> marker end
	indexKeyStringMap := map[int]string{} // marker start -> key text found in front of it
	indexKeyIntMap := map[int]int{}       // marker start -> end of the whole "key + marker" match
	indexs := []int{}                     // all marker starts
	startEndMap := map[int]int{}          // marker start -> start of the key in front of it
	pkgIndexMap := map[string][]int{}     // package name -> marker starts
	indexPkgMap := map[int]string{}       // marker start -> package name
	
	// Sub-headings
	titleindexs := indexTile.FindAllStringIndex(con, -1)
	if len(titleindexs) == 0 {
		titleindexs = indexTile2.FindAllStringIndex(con, -1)
	}
	// Walk the packages; a key that sits in front of a package marker is moved behind it.
	for _, v := range pkg {
		pgflag := v[0] + "[:：]*"
		is := regexp.MustCompile(pgflag).FindAllStringIndex(con, -1)
		for _, sv := range is {
			indexMap[sv[0]] = sv[1]
			indexs = append(indexs, sv[0])
			pkgIndexMap[v[0]] = append(pkgIndexMap[v[0]], sv[0])
			indexPkgMap[sv[0]] = v[0]
		}
		// Key in front of the package marker, at the start of a line.
		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([（(].{1,8}?[)）])?[:：\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
		if len(keys) == 0 {
			// Key in front of the package marker, ending with a colon on its own line.
			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([（(].{1,8}?[)）])?[:：]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
		}
		if len(keys) == 0 {
			// "注:" (note) followed by a short key on its own line.
			keys = regexp.MustCompile("()注[:：]([\u4e00-\u9fa5]{2,8}?([（(].{1,8}?[)）])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
		}
		for _, key := range keys {
			// key[4]:key[5] is capture group 2 (the key text); key[5] is also
			// where the package marker starts.
			startEndMap[key[5]] = key[4]
			//
			headkey := con[key[4]:key[5]]
			headkey = regReplAllSpace.ReplaceAllString(headkey, "")
			if !regDivision.MatchString(headkey) {
				headkey += ":"
			}
			headkey = moreColonReg.ReplaceAllString(headkey, ":")
			colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
			if len(colonIndexs) > 1 {
				// Several colons: keep only the last "key:" segment.
				headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
			}
			indexKeyStringMap[key[5]] = headkey
			indexKeyIntMap[key[5]] = key[1]
		}
	}
	indexs = getPkgIndex(indexs)
	// A marker without its own key inherits the key of the previous marker.
	for ik, iv := range indexs {
		if indexKeyStringMap[iv] != "" {
			continue
		}
		if indexKeyIntMap[iv] == indexMap[iv] {
			continue
		}
		if ik > 0 {
			indexKeyStringMap[iv] = indexKeyStringMap[indexs[ik-1]]
		}
	}
	// Cut the text into one slice per marker.
	surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
	// Extract the content and the KVs of every package.
	for _, iv := range indexs {
		text := indexTextMap[iv]
		tmptext := text
		//
		warpIndex := regSpliteSegment.FindAllStringIndex(text, -1)
		if len(indexWarpMap) > 0 {
			maxWarpCount = indexWarpMap[iv]
		}
		// Text beyond the allowed number of lines goes to the surplus text.
		if maxWarpCount > 0 && len(warpIndex) >= 5 && len(warpIndex) > maxWarpCount {
			textTemp := text
			text = textTemp[:warpIndex[maxWarpCount-1][1]]
			surplusText += textTemp[warpIndex[maxWarpCount-1][0]:]
		}
		for bk, bv := range pkg {
			// The slice belongs to the package whose name it starts with.
			if !strings.HasPrefix(text, bv[0]) {
				continue
			}
			index := util.PackageNumberConvert(bk)
			// Strip the package-name prefix (and any colons after it).
			text = regexp.MustCompile(bv[0]+"[:：]*").ReplaceAllString(text, "")
			// NOTE(review): TrimLeft treats its second argument as a set of
			// runes, not as a prefix.
			if strings.TrimLeft(tmptext, bv[0]) == text || strings.TrimLeft(tmptext, bv[0]+":") == text || strings.TrimLeft(tmptext, bv[0]+"：") == text {
				var tagtitle string

				// Use the sub-heading preceding the first sub-heading that
				// starts after this marker.
				for i, v := range titleindexs {
					if i == 0 {
						continue
					}
					if v[0] > iv {
						tagtitle = con[titleindexs[i-1][0]:titleindexs[i-1][1]]
						break
					}
				}
				tagtitle = regReplAllSpace2.ReplaceAllString(tagtitle, "")
				if tagtitle == "" {
					tagtitle = title
				} else if strings.Contains(tagtitle, bv[0]) && title != "" {
					tagtitle = title
				}
				text = tagtitle + ":" + text
			}
			headKey := ""
			if indexKeyStringMap[iv] != "" {
				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
				headKey = indexKeyStringMap[iv]
				//}
				// Only the first recorded position of this package is removed.
				for _, pkgIndexMap_v := range pkgIndexMap[bv[0]] {
					delete(indexKeyStringMap, pkgIndexMap_v)
					break
				}
			}
			// The same package occurring several times is merged into one entry.
			if (*blockPackage)[index] != nil {
				// Merge the text.
				(*blockPackage)[index].Text += "\n" + text
				// Merge the colon KVs.
				// NOTE(review): this call passes 1 to GetKVAll where the branch
				// below passes 4 - confirm the difference is intended.
				colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1, isSite, codeSite)
				if headKey != "" {
					kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
					MergeKvTags(colonJobKv.KvTags, kvAgain.KvTags)
				}
				MergeKvTags((*blockPackage)[index].ColonKV.KvTags, colonJobKv.KvTags)
				// Merge the space KVs.
				spaceJobKv := SspacekvEntity.Entrance(text, headKey, nil, isSite, codeSite)
				MergeKvTags((*blockPackage)[index].SpaceKV.KvTags, spaceJobKv.KvTags)
			} else {
				newBpkg := &util.BlockPackage{
					Origin:   bk,
					Text:     text,
					Index:    index,
					Name:     bv[0],
					Type:     bv[1],
					Accuracy: accuracy,
				}
				//fmt.Println(text)
				finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4, isSite, codeSite)
				if headKey != "" {
					kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
					MergeKvTags(finalKv.KvTags, kvAgain.KvTags)
				}
				// Attach the extracted KVs to the new package.
				newBpkg.ColonKV = finalKv
				newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil, isSite, codeSite)
				(*blockPackage)[index] = newBpkg
			}
		}
	}
	// Winner ranking (disabled)
	//if isFindWinnerOrder && blockPackage != nil && len(*blockPackage) > 0 {
	//	for _, v := range *blockPackage {
	//		v.WinnerOrder = winnerOrderEntity.Find(v.Text, true, 2, isSite, codeSite)
	//	}
	//}
	return true, surplusText
}
// getPkgIndex thins out a list of positions so that clustered hits collapse
// into a single representative.
//
// The input slice is sorted ascending in place. A position is dropped when it
// lies within 10 of the element directly before it in the sorted input (the
// comparison is against the previous input element, whether or not that one
// was itself kept). The first position is always a candidate to keep.
//
// If every position after the first was dropped (i.e. the whole input forms
// one chain of close neighbours), an empty slice is returned instead of the
// single survivor. The result is never nil.
func getPkgIndex(indexs []int) []int {
	sort.Ints(indexs)
	kept := []int{}
	dropped := 0
	for pos := range indexs {
		if pos == 0 || indexs[pos]-indexs[pos-1] > 10 {
			kept = append(kept, indexs[pos])
		} else {
			dropped++
		}
	}
	// Everything except the first element was a near-duplicate: report nothing.
	if dropped > 0 && dropped == len(indexs)-1 {
		return []int{}
	}
	return kept
}

// interceptText cuts con into one text segment per entry of indexs and
// derives per-segment split counts.
// (Original note: the end position of each package always falls on the end of
// a whole line.)
//
// Parameters:
//   - indexs: byte offsets into con; each one starts a segment. The code walks
//     them in the given order and slices con between neighbours, so they are
//     presumably sorted ascending — verify against caller.
//   - indexPkgMap: offset -> package name, used to detect the repeating pattern.
//   - pkgIndexMap: package name -> offsets; only its length is read here.
//   - startEndMap: offset -> alternative cut position; a zero/missing entry
//     means "cut at the offset itself".
//   - con: the full text being split.
//
// Returns, in order:
//   - the text of con that precedes the first offset,
//   - the largest number of regSpliteSegment matches found in any segment,
//   - offset -> segment text,
//   - offset -> split count normalised per repeating group; this map is empty
//     when no repeating pattern was detected.
//
// NOTE(review): the slice expressions below are not bounds-checked; they
// panic if an offset or a startEndMap value lies outside con or is out of
// order.
func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[string][]int, startEndMap map[int]int, con string) (string, int, map[int]string, map[int]int) {
	//util.Debug(con)
	surplusText := ""
	indexTextMap := map[int]string{}
	indexWarpMap := map[int]int{}
	maxWarpCount := 0
	for ik, iv := range indexs {
		// Segment runs from this offset up to the next offset (or to the next
		// offset's startEndMap position when one is recorded); the last segment
		// runs to the end of con.
		text := ""
		if ik < len(indexs)-1 {
			if startEndMap[indexs[ik+1]] != 0 {
				text = con[iv:startEndMap[indexs[ik+1]]]
			} else {
				text = con[iv:indexs[ik+1]]
			}
		} else {
			text = con[iv:]
		}
		//fmt.Println(text)
		// Keep the untrimmed segment so it can be restored if trimming leaves
		// too little text.
		tmptext := text
		//if strings.Contains(text, "、") {
		//	text = strings.Split(text, "、")[0]
		//} else
		if strings.Contains(text, "\n") {
			texts := strings.Split(text, "\n")
			text2 := ""
			// Only when at least two more offsets follow: text2 is the span
			// between the next two cut positions. If this segment's last line
			// equals that span, keep only this segment's first line.
			if ik+1 < len(indexs)-1 {
				if startEndMap[indexs[ik+1+1]] != 0 {
					text2 = con[startEndMap[indexs[ik+1]]:startEndMap[indexs[ik+1+1]]]
				} else {
					text2 = con[indexs[ik+1]:indexs[ik+1+1]]
				}
				if texts[len(texts)-1] == text2 {
					text = texts[0]
				}
			}
		}
		// A trimmed segment shorter than 5 runes is discarded in favour of the
		// untrimmed one.
		if utf8.RuneCountInString(text) < 5 {
			indexTextMap[iv] = tmptext
		} else {
			indexTextMap[iv] = text
		}
		// The split count is taken from text (the possibly trimmed segment),
		// even when the untrimmed tmptext is what was stored above.
		warpCount := len(regSpliteSegment.FindAllStringIndex(text, -1))
		if warpCount > maxWarpCount {
			maxWarpCount = warpCount
		}
		indexWarpMap[iv] = warpCount
		// Everything before the first offset is the surplus text.
		if ik == 0 {
			surplusText += con[:iv]
		}
	}
	pkgLaw := ""
	if len(pkgIndexMap) > 1 {
		// Packages appear in a regular pattern: AB or ABAB.
		if pkgLaw == "" {
			prevVal := ""
			// notRepeatCount: position inside the current group of
			//   len(pkgIndexMap) consecutive, non-repeating package names.
			// currentIndex: position in indexs where the pattern broke
			//   (-1 while it has not).
			// onceMax: largest split count seen so far in the current group.
			// allMax: value given to offsets at/after currentIndex.
			notRepeatCount, currentIndex, onceMax, allMax := 0, -1, 0, 0
			// Offset that completes a group -> onceMax at that moment.
			indexMaxMap := map[int]int{}
			for ik, iv := range indexs {
				// Previous iteration completed a group; start counting a new one.
				if notRepeatCount == len(pkgIndexMap) {
					notRepeatCount = 0
				}
				if prevVal != indexPkgMap[iv] {
					notRepeatCount++
				} else {
					// Same package name twice in a row: the pattern is broken here.
					notRepeatCount = -1
					currentIndex = ik
					break
				}
				prevVal = indexPkgMap[iv]
				// Group completed: record its maximum. This happens before the
				// current offset's own count is considered, so that count is
				// folded into the next group's onceMax.
				if notRepeatCount == len(pkgIndexMap) {
					indexMaxMap[iv] = onceMax
					onceMax = 0
				}
				if indexWarpMap[iv] > onceMax {
					onceMax = indexWarpMap[iv]
					// NOTE(review): allMax is overwritten with the latest onceMax
					// rather than kept as a running maximum; because onceMax is
					// reset per group, allMax can decrease. Confirm this is intended.
					allMax = onceMax
				}
				// The list ended in the middle of a group: mark the last offset
				// as the break point.
				if ik == len(indexs)-1 && notRepeatCount != len(pkgIndexMap) {
					notRepeatCount = -2
					currentIndex = ik
				}
			}
			//util.Debug(allMax, currentIndex, indexWarpMap, indexMaxMap)
			// At least one full group was seen: treat the layout as patterned
			// and rewrite the per-offset counts group by group.
			if len(indexMaxMap) > 0 {
				pkgLaw = "AB"
				thisMax := 0
				// Walk backwards so each offset picks up the maximum recorded at
				// the nearest group-completing offset at or after it.
				for ik := len(indexs) - 1; ik >= 0; ik-- {
					iv := indexs[ik]
					// Offsets at or after the break point all receive allMax.
					if currentIndex != -1 && ik >= currentIndex {
						indexWarpMap[iv] = allMax
						continue
					}
					if indexMaxMap[iv] > 0 {
						thisMax = indexMaxMap[iv]
					}
					indexWarpMap[iv] = thisMax
				}
			}
		}
	}
	// No pattern detected: callers receive an empty per-offset count map.
	if pkgLaw == "" {
		indexWarpMap = map[int]int{}
	}
	//util.Debug(pkgLaw, maxWarpCount, indexTextMap, indexWarpMap)
	return surplusText, maxWarpCount, indexTextMap, indexWarpMap
}

// kvAfterDivideBlock returns the key-value pairs obtained after block
// division: it passes all arguments straight to DivideBlock and concatenates
// the ColonKV.Kvs of every returned block, in the order the blocks are
// yielded. DivideBlock's second return value is discarded. The result is a
// non-nil slice even when no block produces any pair.
func kvAfterDivideBlock(tp, text string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) []*util.Kv {
	collected := []*util.Kv{}
	dividedBlocks, _ := DivideBlock(tp, text, from, ruleBlock, isSite, codeSite)
	for _, blk := range dividedBlocks {
		collected = append(collected, blk.ColonKV.Kvs...)
	}
	return collected
}