12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146 |
- package pretreated
- import (
- "fmt"
- "jy/clear"
- "jy/util"
- qutil "qfw/util"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "unicode/utf8"
- )
- //分块、分段功能
// Package-level patterns and lookup tables used by block splitting, segment
// splitting and package (lot) detection.
var (
	/* Earlier string form of the serial/title patterns, kept for reference:
	regSerialTitles = []string{
		"([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)",
		"[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)",
		"(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)",
		"(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)",
		"(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
		"1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)",
	}*/

	// regSerialTitles_1 locates "serial + title" headings inside a whole
	// document (each pattern is anchored to a line break or the text start).
	// Entry i pairs with regSerialTitles_2[i].
	regSerialTitles_1 = []*regexp.Regexp{
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s(]*|^[\u3000\u2003\u00a0\\s(]*)(\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)"),
	}
	// regSerialTitles_2 splits one already-trimmed heading into
	// (1) the serial and (2) the title.
	regSerialTitles_2 = []*regexp.Regexp{
		regexp.MustCompile("^([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)$"),
		regexp.MustCompile("^[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
		regexp.MustCompile("^1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)$"),
		regexp.MustCompile("^[(](\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)$"),
	}

	regReplAllTd = regexp.MustCompile("(?smi)<td.*?>.+?</td>")
	regIsNumber = regexp.MustCompile("^\\d+$")
	regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
	regReplAllSpace = regexp.MustCompile("[\u3000\u2003\u00a0\\s]+")
	regTrimSpace = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
	regReplWrapSpace = regexp.MustCompile("^[\r\n][\u3000\u2003\u00a0\\s]*|[\r\n][\u3000\u2003\u00a0\\s]*$")
	regReplAllSymbol = regexp.MustCompile("[(\\(<《【\\[{{〔)\\)>》】\\]}}〕,,;;::'\"“”。.\\??/+=\\-_——*&……\\^%$¥@#!!`~·]")
	regFilterTitle = regexp.MustCompile("[(\\(<《【\\[{{〔].+?[)\\)>》】\\]}}〕]")
	regDivision = regexp.MustCompile("[::]")
	regSpliteSegment = regexp.MustCompile("[\r\n]")
	regFilterNumber = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
	regSplit = regexp.MustCompile("或|和|以?及|与|、|或")
	regStartWrap = regexp.MustCompile("^[\r\n]")
	regEndWrap = regexp.MustCompile("[\r\n]$")
	regMoreWrap = regexp.MustCompile("[\r\n]{2,}")
	regStrWrap = regexp.MustCompile("分包名称[::]")
	regBZJWarap = regexp.MustCompile("(每标段|保证金.*|标示|标[\\d一二三四五六七八九十]+室|型号[::]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]+标室|[\\d一二三四五六七八九十]包密封|(^一包|商务|资格|价格标(每包内含相应文件正副本))|[未|不]+划分标段)")
	regFJWarap = regexp.MustCompile("[a-zA-Z0-9](包|标段).*.(pdf|PDF|docx|doc|DOCX|DOC|swf|SWF)")
	regAZWarap = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张\\/包|纸[\\d]*包|\\*[\\d]+包|相机包)")
	replSerial = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
	moreColonReg = regexp.MustCompile("[::]+")
	regFilter = regexp.MustCompile("等$")
	pkgFilter = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?[标|包]+[段|号]+")

	// Small headings such as "1.1 xxxx:". These are raw strings, so a CJK
	// range has to be spelled with \x{...}; the previous `\\u4e00-\\u9fa5`
	// was read by the regexp engine as a literal backslash plus ASCII
	// letters/digits and therefore never matched Chinese text.
	indexTile = regexp.MustCompile(`[0-9.]{2,3}[^包标段][\s\x{4e00}-\x{9fa5}]{2,8}[::]+`)
	indexTile2 = regexp.MustCompile(`[\s\x{4e00}-\x{9fa5}]{2,8}[::]\n`)
	regReplAllSpace2 = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")

	// confusion maps words that contain a title separator character to a
	// placeholder, so ProcTitle can split a title without cutting them.
	confusion = map[string]string{
		"参与": "canyu",
	}

	// Pre-processing applied before looking for packages: a package heading
	// directly followed by a table is collapsed to the table, e.g.
	//   第一包:采购设备清单
	//   <table></table>
	regPackageFilter = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[ \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
	filterPkgTitleKey = regexp.MustCompile("结果[::]?$")

	// xuhao lists the "<numeral code point>_<delimiter code point>" pairs
	// that DivideSegment treats as a serial marker. Numerals are 一..九,
	// delimiters are U+3001 (12289) and '.' (46). The pairs "20843_46" and
	// "20061_12289" were missing although every other numeral had both
	// delimiters; they are added here for consistency.
	xuhao = map[string]bool{
		"19968_12289": true,
		"19968_46":    true,
		"20108_12289": true,
		"20108_46":    true,
		"19977_12289": true,
		"19977_46":    true,
		"22235_12289": true,
		"22235_46":    true,
		"20116_12289": true,
		"20116_46":    true,
		"20845_12289": true,
		"20845_46":    true,
		"19971_12289": true,
		"19971_46":    true,
		"20843_12289": true,
		"20843_46":    true,
		"20061_12289": true,
		"20061_46":    true,
	}

	// Values that must not be accepted as a package winner.
	unPackageWinnerReg = regexp.MustCompile("(重新招标|方案包)")
	conformWinnerKVReg = regexp.MustCompile("^(中标人|中标银行|第一名)[::](.{4,20}(分行|公司))")
	conformWinnerKVReg1 = regexp.MustCompile("^[-].{4,15}公司$")
	conformWinnerKVReg2 = regexp.MustCompile("(.*)?确定(.*公司)为中标人(.*)?")
	// Matches free text such as:
	//   拟定供应商信息:
	//   名称:郑州人民广播电台
	//   地址:郑州市金水区内环路17号A座。
	conformWinnerTextReg3 = regexp.MustCompile("拟定供应商信息[::\\s]+名称[::](.*)[\\s]+地址")

	// Targeted rewrites applied before package detection (winner related).
	packageReg1 = regexp.MustCompile("(包件[一二三四五1-9][::].*)\n1[、.\\s]+名称[::](.*)\n2[、.\\s]+")
	packageReg2 = regexp.MustCompile("标段[((]包[))][\\[][O0]+([1-9一二三四五六七八九])[\\]]")
	packageReg3 = regexp.MustCompile("(中标价格)[::]")
	packageReg4 = regexp.MustCompile("([1-9](标段)[::])拟定供应商名称[::](.*公司)\n")
	packageReg5 = regexp.MustCompile("(第[1-9一二三四五](标段))(中标人)[::](.*)\n")
	packageReg6 = regexp.MustCompile("供应商名称[::](.{4,20}公司)[((]([0]?1包)[))][、,,](.{4,20}公司)[((]([0]?2包)[))]")
	// Budget related rewrites.
	packageReg20 = regexp.MustCompile("(最高投标限价为|投资预算约[为]?)([0-9.万元人民币]+)")
	packageReg21 = regexp.MustCompile("(预算金额|项目预算)[::](包[\\s]?1|1[\\s]?包)[::]?([0-9.万元人民币]+)[,,](包[\\s]?2|2[\\s]?包)[::]?([0-9.万元人民币]+)")
	untitleReg = regexp.MustCompile("(技术评分明细表)")
	unpriceReg = regexp.MustCompile("(^([Xx]\\+[1-9\\.]+元/每)|分析)")
	// Sensitive phrase that disturbs package detection: insert a line break
	// between the two parts.
	replaceSenstiveReg1 = regexp.MustCompile("([一二三四五六七八九十1-9][、]项目名称[::].*采购项目)([一二三四五六七八九十1-9][、]采购结果)")
	// A price and its unit separated by a line break: joined back together.
	packageReg50 = regexp.MustCompile("(投标报价[::][0-9.]+)\n(万元)")
)
- //分块
- func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) ([]*util.Block, int) {
- defer qutil.Catch()
- returnValue := 0
- var blocks []*util.Block
- if strings.TrimSpace(content) == "" || codeSite == "a_zgyc_ztbxx" || codeSite=="a_gyzbgfyxgs_zbjg" {
- return blocks, -1
- }
- //table里面的内容不考虑,先把table清理掉
- //contentTemp := regReplAllTd.ReplaceAllString(content, "")
- contentTemp := TextAfterRemoveTable(content)
- tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
- var regContenSerialTitle *regexp.Regexp
- var regSerialTitleIndex int
- if ruleBlock != nil && len(ruleBlock.BlockRegs) > 0 {
- regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, ruleBlock.BlockRegs)
- } else {
- regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, regSerialTitles_1)
- }
- //没有分块
- if regSerialTitleIndex == -1 {
- if len(contentTemp) == len(content) {
- //没有分块
- return blocks, -1
- } else { //有table
- return blocks, -2
- }
- }
- //匹配序号和标题
- var regSerialTitle *regexp.Regexp
- if ruleBlock != nil && len(ruleBlock.TitleRegs) > 0 {
- regSerialTitle = ruleBlock.TitleRegs[regSerialTitleIndex]
- } else {
- regSerialTitle = regSerialTitles_2[regSerialTitleIndex]
- }
- indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
- indexs = filterSerial(content, indexs, tdIndexs)
- //头块
- var headBlock, endBlock *util.Block
- currentIndex := 0
- for k, v := range indexs {
- start, end := v[0], v[1]
- //添加开头部分
- if k == 0 {
- if headTemp := content[:start]; regReplAllSpace.ReplaceAllString(headTemp, "") != "" {
- headBlock = &util.Block{
- Index: -1, //序号
- Text: headTemp, //内容
- Title: "", //标题
- Start: 0,
- End: start,
- }
- }
- }
- //分块
- blockSerialTitle := regTrimSpace.ReplaceAllString(content[start:end], "")
- serialTitles := regSerialTitle.FindStringSubmatch(blockSerialTitle) //序号和标题
- if len(serialTitles) < 3 {
- continue
- }
- indexSting := regReplAllSpace.ReplaceAllString(serialTitles[1], "") //序号
- index := 0
- //转成数字序号
- if regIsNumber.MatchString(indexSting) {
- index, _ = strconv.Atoi(indexSting)
- } else if regIsChineseNumber.MatchString(indexSting) {
- index = util.ChineseNumberToInt(indexSting)
- }
- //序号开始就是错误的
- if k+1 != index {
- if k == 0 {
- returnValue = 3
- break
- } else {
- if currentIndex+1 != index {
- //如果序号不是连续的,不往下走
- returnValue = 2
- //添加结尾部分
- if from != 3 {
- endBlock = &util.Block{
- Index: -2, //序号
- Text: content[start:], //内容
- Title: "", //标题
- Start: start,
- End: len(content),
- }
- break
- }
- }
- }
- currentIndex = index
- }
- //
- title := serialTitles[2] //标题
- title = regTrimSpace.ReplaceAllString(title, "") //清除前后空格
- //分块后的块文
- nextStart := len(content)
- if k < len(indexs)-1 {
- nextStart = indexs[k+1][0]
- }
- //获取块中除了序号和标题的内容
- blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
- if title != "" {
- blockTextTemp := regReplAllSpace.ReplaceAllString(blockText, "")
- //特殊情况处理
- if blockTextTemp == "" {
- if regDivision.MatchString(title) {
- /*
- 一、项目编号:HMEC170223
- 二、项目名称:执法记录仪采购
- */
- blockText = title
- divisionIndexs := regDivision.FindStringIndex(title)
- title = title[:divisionIndexs[0]]
- } else {
- /*
- 十一、投标代表须持本人身份证原件亲自递交投标文件,代理机构项目经理审核通过后,办理签收手续,否则投标文件被拒收。
- 十二、开标时间:2017年3月20日9时30分
- */
- blockText = title
- title = ""
- }
- } else if blockTextTemp != "" && regDivision.MatchString(title) {
- /*
- 2、采购单位名称:福建省汀州医院
- 采购单位地址: 龙岩市长汀县
- 联系人:胡科长
- 联系方式:0597-6826353
- */
- //多个标题
- divisionIndexs := regDivision.FindStringIndex(title)
- titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
- titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
- blockText = title + "\n" + blockText
- if titleAfter != "" {
- title = ""
- } else {
- title = titleBefore
- }
- } else {
- blockText = title + "\n" + blockText
- }
- }
- //没有内容的块,不打标签,不分段
- if blockText == "" {
- continue
- }
- //过滤
- if regexp.MustCompile("投标文件格式|业绩").MatchString(title) &&
- !regexp.MustCompile("拟定的唯一供应商名称").MatchString(title){
- continue
- }
- blockText = hasMergeKV(title, blockText)
- //
- titleIsExists := map[string]bool{} //去重
- title = filterTitle(title)
- //分割标题 [和及]。。。 参与
- splitTitles := ProcTitle(title)
- blockText = mergetext(splitTitles, blockText)
- block := &util.Block{
- Index: index, //序号
- Text: blockText, //内容
- Title: title, //标题
- Titles: splitTitles,
- Start: start,
- End: nextStart,
- }
- titles := []string{}
- for _, sv := range splitTitles {
- if sv == "" || titleIsExists[sv] {
- continue
- }
- titleIsExists[sv] = true
- //标题过短过长不打标签
- if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
- //打标签
- block.Tags = append(block.Tags, util.GetBlockTags(sv))
- titles = append(titles, sv)
- }
- }
- block.Title = title
- block.Titles = titles
- if ruleBlock != nil {
- block.Classify, block.NotClassifyTitles = ruleBlock.Classify.GetClassify(tp, titles)
- }
- tagsToBlocks(blocks, block)
- //log.Println(index, sv, splitTitles)
- //log.Println(blockText)
- blocks = append(blocks, block)
- }
- var returnBlocks []*util.Block
- if len(blocks) > 0 {
- //头
- if headBlock != nil {
- if tp == "招标" {
- headBlock.Classify = map[string]bool{"bidcondition": true}
- }
- returnBlocks = append(returnBlocks, headBlock)
- }
- //中间块
- returnBlocks = append(returnBlocks, blocks...)
- //尾
- if endBlock != nil {
- returnBlocks = append(returnBlocks, endBlock)
- }
- if returnValue == 0 {
- returnValue = 1
- }
- }
- contactFormat := &util.ContactFormat{
- IndexMap: map[int]string{},
- MatchMap: map[string]map[string]bool{},
- }
- for _, bl := range returnBlocks {
- //解析kv
- newText := TextAfterRemoveTable(bl.Text) //取出纯文本
- bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from, isSite, codeSite)
- bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat, isSite, codeSite)
- //正则抽取的时候有时需要匹配换行或者句号,这里在解析完kv之后,在块结尾添加换行和句号
- bl.Text = appendWarpStop(bl.Text)
- }
- return returnBlocks, returnValue
- }
- func mergetext(titles []string, text string) string {
- if len(titles) == 0 || utf8.RuneCountInString(text) > 150 {
- return text
- }
- splitLenstrs := strings.Split(text, "\n")
- if len(splitLenstrs) == 1 || len(titles) != len(splitLenstrs)-1 {
- return text
- }
- tt := ""
- for i, v := range splitLenstrs[1:] {
- lentexts := regDivision.Split(v, -1)
- if len(lentexts) == 2 {
- if strings.Contains(titles[i], lentexts[0]) {
- tt += titles[i] + ":" + lentexts[1] + "\n"
- }else if strings.Contains(strings.ReplaceAll(titles[i],"的",""), strings.ReplaceAll(lentexts[0],"的","")){
- tt += titles[i] + ":" + lentexts[1] + "\n"
- }else if strings.Contains(strings.ReplaceAll(titles[i],"联系地址","地址"), strings.ReplaceAll(lentexts[0],"联系地址","地址")){
- tt += titles[i] + ":" + lentexts[1] + "\n"
- }
- }else {
- //特殊处理
- if strings.Contains(v,"中标人 ") {
- tt +=v+"\n"
- }
- }
- }
- if len(tt) == 0 {
- return text
- } else {
- return tt
- }
- }
- //块标题处理
- func ProcTitle(title string) []string {
- if title == "" {
- return []string{}
- }
- for k, v := range confusion {
- title = strings.Replace(title, k, v, -1)
- }
- direct := 1
- prev := ""
- ara := regSplit.Split(title, -1)
- for kk, vv := range ara {
- for kkk, vvv := range confusion {
- vv = strings.Replace(vv, vvv, kkk, -1)
- }
- ara[kk] = vv
- if len([]rune(vv)) == 2 {
- if kk == 0 {
- direct = -1
- } else {
- start := ""
- if len([]rune(prev)) > 3 {
- start = string([]rune(prev)[:len([]rune(prev))-2])
- }
- ara[kk] = start + vv
- }
- } else if vv == "联系人" || vv == "联系方式" {
- if strings.Contains(prev, "代理") {
- ara[kk] = "代理机构" + vv
- } else if strings.Contains(prev, "中标") {
- ara[kk] = "中标单位" + vv
- } else if strings.Contains(prev, "采购") {
- ara[kk] = "采购单位" + vv
- }
- }
- if len([]rune(vv)) > 3 {
- if direct == -1 {
- end := string([]rune(vv)[len([]rune(vv))-2:])
- for i := 0; i < kk; i++ {
- ara[i] = ara[i] + end
- }
- break
- }
- prev = vv
- }
- }
- return ara
- }
- //有合并kv的 例如项目名称及编号
- func hasMergeKV(title, text string) string {
- title = regDivision.ReplaceAllString(title, "")
- titles := regSplit.Split(title, -1)
- if len(titles) <= 1 {
- return text
- }
- before := titles[0]
- after := titles[1]
- if strings.Contains(title, "项目") && len([]rune(after)) == 2 {
- after = "项目" + after
- } else {
- return text
- }
- if strings.Count(text, "\n") != 1 {
- return text
- }
- texts := strings.Split(text, "\n")
- textOneLine := texts[0]
- textTwoLine := texts[1]
- if regDivision.MatchString(textTwoLine) {
- return text
- }
- if textTwoLine := strings.SplitN(textTwoLine, ",", 2); len(textTwoLine) == 2 {
- text = textOneLine + "\n" + before + ":" + textTwoLine[0] + "," + after + ":" + textTwoLine[1]
- }
- return text
- }
// filterSerial drops every serial match whose start offset lies strictly
// inside one of the <td> spans, since a serial inside a table cell is not a
// block heading.
//
// indexs and tdIndexs hold [start, end] offset pairs. The result is a new,
// non-nil slice of copied pairs in their original order. content is not used;
// the parameter is kept for the existing callers.
func filterSerial(content string, indexs, tdIndexs [][]int) [][]int {
	returnIndexs := make([][]int, 0, len(indexs))
	for _, v := range indexs {
		inTd := false
		for _, tv := range tdIndexs {
			if v[0] > tv[0] && v[0] < tv[1] {
				inTd = true
				// One enclosing cell is enough; no need to scan the rest.
				break
			}
		}
		if inTd {
			continue
		}
		returnIndexs = append(returnIndexs, []int{v[0], v[1]})
	}
	return returnIndexs
}
- //获取正文所用的序号类型
- func getSerialType(content string, blockRegs []*regexp.Regexp) (*regexp.Regexp, int) {
- var regContenSerialTitle *regexp.Regexp
- //先判断文章最外层使用的是哪种序号
- contentStartIndex, regSerialTitleIndex := -1, -1
- for k, v := range blockRegs {
- indexs := v.FindStringIndex(content)
- //只用最外层的序号,里面的过滤掉
- if len(indexs) == 2 && !strings.Contains(content,"中标候选人排序") && !regSpliteSegment.MatchString(strings.TrimSpace(content[indexs[0]:indexs[1]])) && (contentStartIndex == -1 || indexs[0] < contentStartIndex) {
- regSerialTitleIndex = k
- contentStartIndex = indexs[0]
- regContenSerialTitle = v
- }
- }
- return regContenSerialTitle, regSerialTitleIndex
- }
- //添加换行和句号
- func appendWarpStop(text string) string {
- //清理前后空格
- text = regTrimSpace.ReplaceAllString(text, "")
- //添加句号
- if !strings.HasSuffix(text, "。") {
- text += "。"
- }
- //添加换行
- if !regEndWrap.MatchString(text) {
- text += "\n"
- }
- return text
- }
- //分段
- func DivideSegmentHtml(txt string) []*util.Segment {
- //先分段
- _segs := strings.FieldsFunc(txt, func(r rune) bool {
- return r == 10 || r == 13
- })
- //再去除空行
- segs := make([]*util.Segment, 0)
- _index := 0
- for _, seg := range _segs {
- if seg != " " && len(seg) > 1 {
- _seg := util.Segment{}
- _index = _index + 1
- _seg.Index = _index
- _seg.Text = seg
- segs = append(segs, &_seg)
- }
- }
- return segs
- }
- //分段
- func DivideSegment(txt string) []*util.Segment {
- //先分段
- tmpstr := ""
- _segs := strings.FieldsFunc(txt, func(r rune) bool {
- if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
- r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
- if tmpstr == "" {
- tmpstr += fmt.Sprint(r)
- return false
- } else if strings.Contains(tmpstr, "_") {
- tmpstr = ""
- tmpstr += fmt.Sprint(r)
- return false
- } else if tmpstr == fmt.Sprint(r) {
- if r == 46 || r == 12289 {
- tmpstr = ""
- }
- return false
- }
- tmpstr += "_" + fmt.Sprint(r)
- if xuhao[tmpstr] {
- return true
- }
- }
- tmpstr = ""
- return r == 10 || r == 13
- })
- //再去除空行
- segs := make([]*util.Segment, 0)
- _index := 0
- for _, seg := range _segs {
- if seg != " " && len(seg) > 1 {
- _seg := util.Segment{}
- _index = _index + 1
- _seg.Index = _index
- _seg.Text = seg
- segs = append(segs, &_seg)
- }
- }
- return segs
- }
- /** 给块打标签 **/
- func tagsToBlocks(blocks []*util.Block, block *util.Block) {
- if len(block.Tags) == 0 {
- return
- }
- tag := map[string]bool{}
- tagWeight := map[string]int{}
- for _, v := range block.Tags {
- for _, ts := range v {
- tag[ts.Value] = true
- tagWeight[ts.Value] = ts.Weight
- }
- }
- for v, _ := range tag {
- for _, block := range blocks {
- if block.Tag[v] {
- for _, blockTags := range block.Tags {
- for _, ts := range blockTags {
- if ts.Value == v && ts.Weight < tagWeight[v] {
- block.Tag[v] = false
- }
- }
- }
- }
- }
- }
- block.Tag = tag
- }
- func filterTitle(title string) string {
- if strings.Contains(title, ",") && strings.Contains(title, "。") {
- return ""
- }
- if len([]rune(title)) > 30 {
- return ""
- }
- //清理空格
- title = regReplAllSpace.ReplaceAllString(title, "")
- //清理成对出现的符号中的内容
- title = regFilterTitle.ReplaceAllString(title, "")
- //清理特殊符号
- title = regReplAllSymbol.ReplaceAllString(title, "")
- //清理序号
- title = regFilterNumber.ReplaceAllString(title, "")
- title = regFilter.ReplaceAllString(title, "")
- return title
- }
- //从块里面找分包
- func FindPackageFromBlocks(blocks *[]*util.Block, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
- blockPackage = map[string]*util.BlockPackage{}
- //块分包
- for _, v := range *blocks {
- text := regPackageFilter.ReplaceAllString(v.Text, "<table>")
- text = TextAfterRemoveTable(text)
- if text == "" {
- continue
- }
- //var ok bool
- //var surplusText string
- //分析分包-金额,中标单位,人电话,包名,中标后选人
- divisionPackageChild(&blockPackage, text, v.Title, true, v.Tag["中标单位"], isSite, codeSite)
- }
- //orderwinner := winnerOrderEntity.Find(content, true, 2, isSite, codeSite)
- for k, v := range blockPackage {
- findWinnerBugetBidmountByKv(v, blockPackage, k) //根据kv-find字段
- }
- return
- }
- func findWinnerBugetBidmountByKv(v *util.BlockPackage, blockPackage map[string]*util.BlockPackage, k string) {
- if v.ColonKV != nil && v.ColonKV.KvTags != nil {
- for kc, cv := range v.ColonKV.KvTags {
- if kc == "预算" && v.Budget <= 0 {
- moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
- if len(moneys) > 0 {
- if vf, ok := moneys[0].(float64); ok {
- blockPackage[k].Budget = vf
- blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
- } else if vi, ok := moneys[0].(int); ok {
- blockPackage[k].Budget = float64(vi)
- blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
- }
- }
- } else if (kc == "中标金额"||kc=="各包中标/成交候选供应商及报价") && v.Bidamount <= 0 {
- //特殊金额类可避免
- if unpriceReg.MatchString(cv[0].Value) {
- continue
- }
- moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
- if len(moneys) > 0 {
- if vf, ok := moneys[0].(float64); ok {
- blockPackage[k].Bidamount = vf
- blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
- } else if vi, ok := moneys[0].(int); ok {
- blockPackage[k].Bidamount = float64(vi)
- blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
- }
- }
- } else if (kc == "中标单位"||kc=="第1 名"||kc=="各包中标/成交候选供应商及报价") && v.Winner == "" {
- if !unPackageWinnerReg.MatchString(cv[0].Value) {
- isW:=false
- if len(cv)>1 {
- for _,v_cv :=range cv{
- if v_cv.Key=="中标单位" && v_cv.Value!="" {
- isW = true
- blockPackage[k].Winner = v_cv.Value
- break
- }
- }
- }
- if !isW {
- blockPackage[k].Winner = cv[0].Value
- }
- }
- }else { //特殊情况-特殊处理
- res := conformWinnerKVReg.FindAllStringSubmatch(cv[0].Value, -1)
- if len(res) > 0 {
- text := res[0][2]
- if text!="" {
- blockPackage[k].Winner = text
- continue
- }
- }
- if kc=="中标信息" && conformWinnerKVReg1.MatchString(cv[0].Value){
- blockPackage[k].Winner = cv[0].Value
- continue
- }
- if conformWinnerKVReg2.MatchString(cv[0].Value) {
- blockPackage[k].Winner = conformWinnerKVReg2.ReplaceAllString(cv[0].Value,"${2}")
- continue
- }
- //全文找
- res = conformWinnerTextReg3.FindAllStringSubmatch(v.Text, -1)
- if len(res) > 0 {
- text := res[0][1]
- if text!="" {
- blockPackage[k].Winner = text
- continue
- }
- }
- }
- }
- }
- if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
- for kc, cv := range v.SpaceKV.KvTags {
- if kc == "预算" && v.Budget <= 0 {
- moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
- if len(moneys) > 0 {
- if vf, ok := moneys[0].(float64); ok {
- blockPackage[k].Budget = vf
- blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
- } else if vi, ok := moneys[0].(int); ok {
- blockPackage[k].Budget = float64(vi)
- blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
- }
- }
- } else if kc == "中标金额" && v.Bidamount <= 0 {
- moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
- if len(moneys) > 0 {
- if vf, ok := moneys[0].(float64); ok {
- blockPackage[k].Bidamount = vf
- blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
- } else if vi, ok := moneys[0].(int); ok {
- blockPackage[k].Bidamount = float64(vi)
- blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
- }
- }
- } else if kc == "中标单位" && v.Winner == "" {
- blockPackage[k].Winner = cv[0].Value
- }
- }
- }
- }
- //从正文里面找分包
- func FindPackageFromText(title string, content string, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
- blockPackage = map[string]*util.BlockPackage{}
- //从正文里面找分包
- divisionPackageChild(&blockPackage, content, title, true, false, isSite, codeSite)
- for k, v := range blockPackage {
- findWinnerBugetBidmountByKv(v, blockPackage, k)
- }
- //winnerOrderEntity.Find(content, true, 2, isSite, codeSite)
- return
- }
- //分块之后分包
- func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool, isSite bool, codeSite string) (bool, string) {
- //查找知否有分包
- content = replaceSenstiveReg1.ReplaceAllString(content,"$1\n$2")
- content = regFJWarap.ReplaceAllString(content, "\n")
- content = regAZWarap.ReplaceAllString(content, "\n")
- content = regStrWrap.ReplaceAllString(content, "\n")
- content = regMoreWrap.ReplaceAllString(content, "\n")
- content = regEndWrap.ReplaceAllString(content, "")
- content = regBZJWarap.ReplaceAllString(content, "")
- //替换敏感词
- content = packageReg1.ReplaceAllString(content,"${1}\n中标单位:${2}\n")
- content = packageReg2.ReplaceAllString(content,"\n标段${1}:")
- content = packageReg3.ReplaceAllString(content,"\n${1}:")
- content = packageReg4.ReplaceAllString(content,"\n${1}\n中标单位:${3}\n")
- content = packageReg5.ReplaceAllString(content,"\n${1}\n中标单位:${4}\n")
- content = packageReg6.ReplaceAllString(content,"\n$2\n中标单位:$1\n$4\n中标单位:$3")
- //替换换行金额
- content = packageReg50.ReplaceAllString(content,"$1$2")
- content = packageReg20.ReplaceAllString(content,"\n预算金额:${2}\n")
- content = packageReg21.ReplaceAllString(content,"\n${2}\n预算金额:${3}\n${4}\n预算金额:${5}")
- //6、项目预算:1包3689028.00元,2包700000.00元。
- if untitleReg.MatchString(title){
- return false, ""
- }
- con, pkg, flag := CheckMultiPackage(content) //找pkg分包包名
- if !flag {
- return false, ""
- }
- // util.Debug(con)
- // util.Debug(pkg)
- //分包前面添加换行
- appendWarpIndex := []int{} //分包名,正文下标位置: 1000长 300下标
- for _, v := range pkg {
- //如果文本内容以识别出来的分包标识结尾,不是分包
- if len(pkg) == 1 && strings.HasSuffix(con, v[0]) {
- return false, ""
- }
- is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
- for _, sv := range is {
- appendWarpIndex = append(appendWarpIndex, sv[0])
- }
- }
- appendWarpIndex = getPkgIndex(appendWarpIndex)
- conTemp := ""
- for k, v := range appendWarpIndex {
- if k == 0 {
- conTemp += con[:v] + "\n"
- } else {
- conTemp += "\n" + con[appendWarpIndex[k-1]:v]
- }
- if k == len(appendWarpIndex)-1 {
- conTemp += "\n" + con[v:]
- }
- }
- con = conTemp
- con = replSerial.ReplaceAllString(con, "\n")
- con = regMoreWrap.ReplaceAllString(con, "\n")
- //根据分包,找索引位置
- indexMap := map[int]int{}
- indexKeyStringMap := map[int]string{}
- indexKeyIntMap := map[int]int{}
- indexs := []int{}
- startEndMap := map[int]int{}
- pkgIndexMap := map[string][]int{}
- indexPkgMap := map[int]string{}
-
- //小标题
- titleindexs := indexTile.FindAllStringIndex(con, -1)
- if len(titleindexs) == 0 {
- titleindexs = indexTile2.FindAllStringIndex(con, -1)
- }
- //遍历分包,把kv在包前面的移动到包后面
- for _, v := range pkg {
- pgflag := v[0] + "[::]*"
- is := regexp.MustCompile(pgflag).FindAllStringIndex(con, -1)
- for _, sv := range is {
- indexMap[sv[0]] = sv[1]
- indexs = append(indexs, sv[0])
- pkgIndexMap[v[0]] = append(pkgIndexMap[v[0]], sv[0])
- indexPkgMap[sv[0]] = v[0]
- }
- //key在包前面,并且在一行的开头
- keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
- if len(keys) == 0 {
- //key在包前面,并且key以冒号结尾
- keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
- }
- if len(keys) == 0 {
- keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
- }
- for _, key := range keys {
- startEndMap[key[5]] = key[4]
- //
- headkey := con[key[4]:key[5]]
- headkey = regReplAllSpace.ReplaceAllString(headkey, "")
- if !regDivision.MatchString(headkey) {
- headkey += ":"
- }
- headkey = moreColonReg.ReplaceAllString(headkey, ":")
- colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
- if len(colonIndexs) > 1 {
- headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
- }
- indexKeyStringMap[key[5]] = headkey
- indexKeyIntMap[key[5]] = key[1]
- }
- }
- indexs = getPkgIndex(indexs)
- for ik, iv := range indexs {
- if indexKeyStringMap[iv] != "" {
- continue
- }
- if indexKeyIntMap[iv] == indexMap[iv] {
- continue
- }
- if ik > 0 {
- indexKeyStringMap[iv] = indexKeyStringMap[indexs[ik-1]]
- }
- }
- //获取截取标识
- surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
- //查找分包内容,分kv
- for _, iv := range indexs {
- text := indexTextMap[iv]
- tmptext := text
- //
- warpIndex := regSpliteSegment.FindAllStringIndex(text, -1)
- if len(indexWarpMap) > 0 {
- maxWarpCount = indexWarpMap[iv]
- }
- if maxWarpCount > 0 && len(warpIndex) >= 5 && len(warpIndex) > maxWarpCount {
- textTemp := text
- text = textTemp[:warpIndex[maxWarpCount-1][1]]
- surplusText += textTemp[warpIndex[maxWarpCount-1][0]:]
- }
- for bk, bv := range pkg {
- //判断分包如果在这段文字里面,该段文字就属于该包的
- if !strings.HasPrefix(text, bv[0]) {
- continue
- }
- index := util.PackageNumberConvert(bk)
- //去掉前缀,空格必须要加,分kv的时候要用
- text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
- if strings.TrimLeft(tmptext, bv[0]) == text || strings.TrimLeft(tmptext, bv[0]+":") == text || strings.TrimLeft(tmptext, bv[0]+":") == text {
- var tagtitle string
- for i, v := range titleindexs {
- if i == 0 {
- continue
- }
- if v[0] > iv {
- tagtitle = con[titleindexs[i-1][0]:titleindexs[i-1][1]]
- break
- }
- }
- tagtitle = regReplAllSpace2.ReplaceAllString(tagtitle, "")
- if tagtitle == "" {
- tagtitle = title
- } else if strings.Contains(tagtitle, bv[0]) && title != "" {
- tagtitle = title
- }
- text = tagtitle + ":" + text
- }
- headKey := ""
- if indexKeyStringMap[iv] != "" {
- //if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
- headKey = indexKeyStringMap[iv]
- //}
- for _, pkgIndexMap_v := range pkgIndexMap[bv[0]] {
- delete(indexKeyStringMap, pkgIndexMap_v)
- break
- }
- }
- //如果一块中有多个相同的包,合并到一个
- if (*blockPackage)[index] != nil {
- //合并文本
- (*blockPackage)[index].Text += "\n" + text
- //合并冒号kv
- colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1, isSite, codeSite)
- if headKey != "" {
- kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
- MergeKvTags(colonJobKv.KvTags, kvAgain.KvTags)
- }
- MergeKvTags((*blockPackage)[index].ColonKV.KvTags, colonJobKv.KvTags)
- //合并空格kv
- spaceJobKv := SspacekvEntity.Entrance(text, headKey, nil, isSite, codeSite)
- MergeKvTags((*blockPackage)[index].SpaceKV.KvTags, spaceJobKv.KvTags)
- } else {
- newBpkg := &util.BlockPackage{
- Origin: bk,
- Text: text,
- Index: index,
- Name: bv[0],
- Type: bv[1],
- Accuracy: accuracy,
- }
- //fmt.Println(text)
- finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4, isSite, codeSite)
- if headKey != "" {
- kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
- MergeKvTags(finalKv.KvTags, kvAgain.KvTags)
- }
- //kv-字段-
- newBpkg.ColonKV = finalKv
- newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil, isSite, codeSite)
- (*blockPackage)[index] = newBpkg
- }
- }
- }
- //中标人排序
- //if isFindWinnerOrder && blockPackage != nil && len(*blockPackage) > 0 {
- // for _, v := range *blockPackage {
- // v.WinnerOrder = winnerOrderEntity.Find(v.Text, true, 2, isSite, codeSite)
- // }
- //}
- return true, surplusText
- }
// getPkgIndex sorts indexs in ascending order (in place, so the caller's
// slice is reordered) and returns the offsets that open a new cluster: the
// first offset, plus every offset lying more than 10 past the element
// immediately before it in the sorted slice.
//
// An offset is compared with its direct predecessor in the sorted input, not
// with the last offset that was kept. When every offset except the first has
// been dropped, an empty slice is returned instead of the single survivor.
// The result is never nil.
func getPkgIndex(indexs []int) []int {
	sort.Ints(indexs)

	kept := []int{}
	dropped := 0
	for pos := range indexs {
		if pos == 0 || indexs[pos]-indexs[pos-1] > 10 {
			kept = append(kept, indexs[pos])
			continue
		}
		dropped++
	}

	// Everything collapsed onto the first offset: report no offsets at all.
	if dropped > 0 && dropped == len(indexs)-1 {
		return []int{}
	}
	return kept
}
- //每个包对应的结束位置,都是整行结束
- func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[string][]int, startEndMap map[int]int, con string) (string, int, map[int]string, map[int]int) {
- //util.Debug(con)
- surplusText := ""
- indexTextMap := map[int]string{}
- indexWarpMap := map[int]int{}
- maxWarpCount := 0
- for ik, iv := range indexs {
- text := ""
- if ik < len(indexs)-1 {
- if startEndMap[indexs[ik+1]] != 0 {
- text = con[iv:startEndMap[indexs[ik+1]]]
- } else {
- text = con[iv:indexs[ik+1]]
- }
- } else {
- text = con[iv:]
- }
- //fmt.Println(text)
- tmptext := text
- //if strings.Contains(text, "、") {
- // text = strings.Split(text, "、")[0]
- //} else
- if strings.Contains(text, "\n") {
- texts := strings.Split(text, "\n")
- text2 := ""
- if ik+1 < len(indexs)-1 {
- if startEndMap[indexs[ik+1+1]] != 0 {
- text2 = con[startEndMap[indexs[ik+1]]:startEndMap[indexs[ik+1+1]]]
- } else {
- text2 = con[indexs[ik+1]:indexs[ik+1+1]]
- }
- if texts[len(texts)-1] == text2 {
- text = texts[0]
- }
- }
- }
- if utf8.RuneCountInString(text) < 5 {
- indexTextMap[iv] = tmptext
- } else {
- indexTextMap[iv] = text
- }
- warpCount := len(regSpliteSegment.FindAllStringIndex(text, -1))
- if warpCount > maxWarpCount {
- maxWarpCount = warpCount
- }
- indexWarpMap[iv] = warpCount
- if ik == 0 {
- surplusText += con[:iv]
- }
- }
- pkgLaw := ""
- if len(pkgIndexMap) > 1 {
- //有规律的出现 AB or ABAB
- if pkgLaw == "" {
- prevVal := ""
- notRepeatCount, currentIndex, onceMax, allMax := 0, -1, 0, 0
- indexMaxMap := map[int]int{}
- for ik, iv := range indexs {
- if notRepeatCount == len(pkgIndexMap) {
- notRepeatCount = 0
- }
- if prevVal != indexPkgMap[iv] {
- notRepeatCount++
- } else {
- notRepeatCount = -1
- currentIndex = ik
- break
- }
- prevVal = indexPkgMap[iv]
- if notRepeatCount == len(pkgIndexMap) {
- indexMaxMap[iv] = onceMax
- onceMax = 0
- }
- if indexWarpMap[iv] > onceMax {
- onceMax = indexWarpMap[iv]
- allMax = onceMax
- }
- if ik == len(indexs)-1 && notRepeatCount != len(pkgIndexMap) {
- notRepeatCount = -2
- currentIndex = ik
- }
- }
- //util.Debug(allMax, currentIndex, indexWarpMap, indexMaxMap)
- if len(indexMaxMap) > 0 {
- pkgLaw = "AB"
- thisMax := 0
- for ik := len(indexs) - 1; ik >= 0; ik-- {
- iv := indexs[ik]
- if currentIndex != -1 && ik >= currentIndex {
- indexWarpMap[iv] = allMax
- continue
- }
- if indexMaxMap[iv] > 0 {
- thisMax = indexMaxMap[iv]
- }
- indexWarpMap[iv] = thisMax
- }
- }
- }
- }
- if pkgLaw == "" {
- indexWarpMap = map[int]int{}
- }
- //util.Debug(pkgLaw, maxWarpCount, indexTextMap, indexWarpMap)
- return surplusText, maxWarpCount, indexTextMap, indexWarpMap
- }
- //分块之后的kv
- func kvAfterDivideBlock(tp, text string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) []*util.Kv {
- blocks, _ := DivideBlock(tp, text, from, ruleBlock, isSite, codeSite)
- kvs := []*util.Kv{}
- for _, v := range blocks {
- //util.Debug(v.Text)
- // for _, vvv := range v.ColonKV.Kvs {
- // util.Debug(vvv.Key, vvv.Value, vvv.Title)
- // }
- kvs = append(kvs, v.ColonKV.Kvs...)
- }
- return kvs
- }
|