package pretreated

import (
	"fmt"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"unicode/utf8"

	"jy/clear"
	"jy/util"
	qutil "qfw/util"
)

// Block splitting, segment splitting and package ("分包") extraction.
var (
	// regSerialTitles_1 locates "serial number + title" lines inside a whole
	// document. Each pattern is anchored on a line break or the start of the
	// text. The slice index identifies the numbering style and is shared with
	// regSerialTitles_2.
	regSerialTitles_1 = []*regexp.Regexp{
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)"),
		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s(]*|^[\u3000\u2003\u00a0\\s(]*)(\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)"),
	}
	// regSerialTitles_2 splits one already-trimmed serial line into
	// (1) the serial number and (2) the title. Index-aligned with
	// regSerialTitles_1.
	regSerialTitles_2 = []*regexp.Regexp{
		regexp.MustCompile("^([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)$"),
		regexp.MustCompile("^[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)$"),
		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
		regexp.MustCompile("^1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)$"),
		regexp.MustCompile("^[(](\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)$"),
	}
	// NOTE(review): as written this pattern matches any single character, so
	// the "is this serial inside a table cell" check in filterSerial can never
	// fire. Judging by the name it was presumably meant to match a whole
	// <td>…</td> element — confirm against the upstream file.
	regReplAllTd       = regexp.MustCompile("(?smi).+?")
	regIsNumber        = regexp.MustCompile("^\\d+$")
	regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
	regReplAllSpace    = regexp.MustCompile("[\u3000\u2003\u00a0\\s]+")
	regTrimSpace       = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
	regReplWrapSpace   = regexp.MustCompile("^[\r\n][\u3000\u2003\u00a0\\s]*|[\r\n][\u3000\u2003\u00a0\\s]*$")
	regReplAllSymbol   = regexp.MustCompile("[(\\(<《【\\[{{〔)\\)>》】\\]}}〕,,;;::'\"“”。.\\??/+=\\-_——*&……\\^%$¥@#!!`~·]")
	regFilterTitle     = regexp.MustCompile("[(\\(<《【\\[{{〔].+?[)\\)>》】\\]}}〕]")
	regDivision        = regexp.MustCompile("[::]")
	regSpliteSegment   = regexp.MustCompile("[\r\n]")
	regFilterNumber    = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
	regSplit           = regexp.MustCompile("或|和|以?及|与|、|或")
	regStartWrap       = regexp.MustCompile("^[\r\n]")
	regEndWrap         = regexp.MustCompile("[\r\n]$")
	regMoreWrap        = regexp.MustCompile("[\r\n]{2,}")
	regStrWrap         = regexp.MustCompile("分包名称[::]")
	regBZJWarap        = regexp.MustCompile("(每标段|保证金.*|标示|标[\\d一二三四五六七八九十]+室|型号[::]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]+标室|[\\d一二三四五六七八九十]包密封|(^一包|商务|资格|价格标(每包内含相应文件正副本))|[未|不]+划分标段)")
	regFJWarap         = regexp.MustCompile("[a-zA-Z0-9](包|标段).*.(pdf|PDF|docx|doc|DOCX|DOC|swf|SWF)")
	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张\\/包|纸[\\d]*包|\\*[\\d]+包|相机包)")
	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
	moreColonReg       = regexp.MustCompile("[::]+")
	regFilter          = regexp.MustCompile("等$")
	pkgFilter          = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?[标|包]+[段|号]+")
	// Sub-title detection.
	// NOTE(review): inside a raw string `\\u4e00-\\u9fa5` is a literal
	// backslash plus ASCII letters/digits, not a CJK range, so these two
	// patterns do not match Chinese sub-titles. Left byte-identical because
	// correcting it would activate a currently dormant code path — confirm
	// the intent before changing.
	indexTile        = regexp.MustCompile(`[0-9.]{2,3}[^包标段][\s\\u4e00-\\u9fa5]{2,8}[::]+`)
	indexTile2       = regexp.MustCompile(`[\s\\u4e00-\\u9fa5]{2,8}[::]\n`)
	regReplAllSpace2 = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
	// confusion maps words that contain one of the regSplit separators to a
	// placeholder, so that ProcTitle does not split inside them.
	confusion = map[string]string{
		"参与": "canyu",
	}
	// Pre-processing applied to block text before looking for packages,
	// e.g. a line such as "第一包:采购设备清单".
	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?")
	filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
	// xuhao lists "<numeral code point>_<punctuation code point>" pairs that
	// DivideSegment treats as a segment boundary. Numerals are 一..九,
	// punctuation is "、" (12289) or "." (46).
	xuhao = map[string]bool{
		"19968_12289": true,
		"19968_46":    true,
		"20108_12289": true,
		"20108_46":    true,
		"19977_12289": true,
		"19977_46":    true,
		"22235_12289": true,
		"22235_46":    true,
		"20116_12289": true,
		"20116_46":    true,
		"20845_12289": true,
		"20845_46":    true,
		"19971_12289": true,
		"19971_46":    true,
		"20843_12289": true,
		"20843_46":    true, // was missing: 八 followed by "."
		"20061_12289": true, // was missing: 九 followed by "、"
		"20061_46":    true,
	}
	// Values that must not be accepted as a package winner.
	unPackageWinnerReg  = regexp.MustCompile("(重新招标|方案包)")
	conformWinnerKVReg  = regexp.MustCompile("^(中标人|中标银行|第一名)[::](.{4,20}(分行|公司))")
	conformWinnerKVReg1 = regexp.MustCompile("^[-].{4,15}公司$")
	conformWinnerKVReg2 = regexp.MustCompile("(.*)?确定(.*公司)为中标人(.*)?")
	// Example text these two patterns target:
	//   拟定供应商信息: 名称:郑州人民广播电台 地址:郑州市金水区内环路17号A座。
	conformWinnerTextReg3 = regexp.MustCompile("拟定供应商信息[::\\s]+名称[::](.*)[\\s]+地址")
	conformWinnerTextReg4 = regexp.MustCompile("拟定供应商信息[::\\s]+名称[::](.*)[\\s]+地址")
	// Targeted rewrites of wording that confuses package detection.
	packageReg1 = regexp.MustCompile("(包件[一二三四五1-9][::].*)\n1[、.\\s]+名称[::](.*)\n2[、.\\s]+")
	// Example: 标段(包)[001]巴盟神舟30MW:
	packageReg2 = regexp.MustCompile("标段[((]包[))][\\[][O0]+([1-9一二三四五六七八九])[\\]]")

	// Block titles matching regBlockTitleSkip are dropped by DivideBlock
	// unless they also match regBlockTitleKeep. Compiled once here instead of
	// on every loop iteration.
	regBlockTitleSkip = regexp.MustCompile("投标文件格式|业绩")
	regBlockTitleKeep = regexp.MustCompile("拟定的唯一供应商名称")
)

// DivideBlock splits content into numbered blocks ("一、…", "1、…", …),
// tags each block by its title and extracts colon/space key-values per block.
//
// tp is the notice type passed to the classifier, from is forwarded to
// GetKVAll (and from == 3 disables the early stop on non-consecutive
// numbering), ruleBlock optionally overrides the built-in serial patterns and
// supplies the classifier, isSite/codeSite are forwarded to the KV extractors.
//
// The int result is a status code:
//
//	-1  empty content, skipped site, or no serial numbering found
//	-2  no serial numbering found and table removal changed the text length
//	 0  serial lines were found but no block was produced
//	 1  blocks produced
//	 2  numbering stopped being consecutive
//	 3  the first serial number is not 1
//
// A panic inside the function is handed to qutil.Catch.
func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) ([]*util.Block, int) {
	defer qutil.Catch()
	returnValue := 0
	var blocks []*util.Block
	if strings.TrimSpace(content) == "" || codeSite == "a_zgyc_ztbxx" {
		return blocks, -1
	}
	// Table content is ignored when detecting the numbering style.
	contentTemp := TextAfterRemoveTable(content)
	tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
	var regContenSerialTitle *regexp.Regexp
	var regSerialTitleIndex int
	if ruleBlock != nil && len(ruleBlock.BlockRegs) > 0 {
		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, ruleBlock.BlockRegs)
	} else {
		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, regSerialTitles_1)
	}
	// No numbering style detected.
	if regSerialTitleIndex == -1 {
		if len(contentTemp) == len(content) {
			return blocks, -1
		}
		// Table removal changed the text.
		return blocks, -2
	}
	// Pattern that splits one serial line into number and title.
	var regSerialTitle *regexp.Regexp
	if ruleBlock != nil && len(ruleBlock.TitleRegs) > 0 {
		regSerialTitle = ruleBlock.TitleRegs[regSerialTitleIndex]
	} else {
		regSerialTitle = regSerialTitles_2[regSerialTitleIndex]
	}
	indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
	indexs = filterSerial(content, indexs, tdIndexs)

	var headBlock, endBlock *util.Block
	currentIndex := 0
	for k, v := range indexs {
		start, end := v[0], v[1]
		// Everything before the first serial line becomes the head block.
		if k == 0 {
			if headTemp := content[:start]; regReplAllSpace.ReplaceAllString(headTemp, "") != "" {
				headBlock = &util.Block{
					Index: -1,
					Text:  headTemp,
					Title: "",
					Start: 0,
					End:   start,
				}
			}
		}
		blockSerialTitle := regTrimSpace.ReplaceAllString(content[start:end], "")
		serialTitles := regSerialTitle.FindStringSubmatch(blockSerialTitle) // [whole, serial, title]
		if len(serialTitles) < 3 {
			continue
		}
		indexSting := regReplAllSpace.ReplaceAllString(serialTitles[1], "")
		index := 0
		if regIsNumber.MatchString(indexSting) {
			// Atoi can only fail on overflow here; index then stays 0.
			index, _ = strconv.Atoi(indexSting)
		} else if regIsChineseNumber.MatchString(indexSting) {
			index = util.ChineseNumberToInt(indexSting)
		}
		if k+1 != index {
			if k == 0 {
				// The numbering does not start at 1.
				returnValue = 3
				break
			}
			if currentIndex+1 != index {
				// Numbering is not consecutive: stop and keep the rest as a tail block.
				returnValue = 2
				if from != 3 {
					endBlock = &util.Block{
						Index: -2,
						Text:  content[start:],
						Title: "",
						Start: start,
						End:   len(content),
					}
					break
				}
			}
			currentIndex = index
		}

		title := regTrimSpace.ReplaceAllString(serialTitles[2], "")
		nextStart := len(content)
		if k < len(indexs)-1 {
			nextStart = indexs[k+1][0]
		}
		// Block body without the serial number and title.
		blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
		title, blockText = splitBlockTitleAndText(title, blockText)
		// Blocks without content are neither tagged nor segmented.
		if blockText == "" {
			continue
		}
		if regBlockTitleSkip.MatchString(title) && !regBlockTitleKeep.MatchString(title) {
			continue
		}
		blockText = hasMergeKV(title, blockText)
		titleIsExists := map[string]bool{} // de-duplicates split titles
		title = filterTitle(title)
		// Split compound titles ("A和B", "A及B", …).
		splitTitles := ProcTitle(title)
		blockText = mergetext(splitTitles, blockText)
		block := &util.Block{
			Index:  index,
			Text:   blockText,
			Title:  title,
			Titles: splitTitles,
			Start:  start,
			End:    nextStart,
		}
		titles := []string{}
		for _, sv := range splitTitles {
			if sv == "" || titleIsExists[sv] {
				continue
			}
			titleIsExists[sv] = true
			// Only titles of 2..10 runes are tagged.
			if n := utf8.RuneCountInString(sv); n >= 2 && n <= 10 {
				block.Tags = append(block.Tags, util.GetBlockTags(sv))
				titles = append(titles, sv)
			}
		}
		block.Title = title
		block.Titles = titles
		if ruleBlock != nil {
			block.Classify, block.NotClassifyTitles = ruleBlock.Classify.GetClassify(tp, titles)
		}
		tagsToBlocks(blocks, block)
		blocks = append(blocks, block)
	}

	var returnBlocks []*util.Block
	if len(blocks) > 0 {
		if headBlock != nil {
			if tp == "招标" {
				headBlock.Classify = map[string]bool{"bidcondition": true}
			}
			returnBlocks = append(returnBlocks, headBlock)
		}
		returnBlocks = append(returnBlocks, blocks...)
		if endBlock != nil {
			returnBlocks = append(returnBlocks, endBlock)
		}
		if returnValue == 0 {
			returnValue = 1
		}
	}
	contactFormat := &util.ContactFormat{
		IndexMap: map[int]string{},
		MatchMap: map[string]map[string]bool{},
	}
	for _, bl := range returnBlocks {
		newText := TextAfterRemoveTable(bl.Text) // plain text only
		bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from, isSite, codeSite)
		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat, isSite, codeSite)
		// Later regex extraction relies on a trailing full stop and line break.
		bl.Text = appendWarpStop(bl.Text)
	}
	return returnBlocks, returnValue
}

// splitBlockTitleAndText decides what part of a serial line is the block
// title and what belongs to the block text. An empty title is returned
// unchanged together with the given text.
func splitBlockTitleAndText(title, blockText string) (string, string) {
	if title == "" {
		return title, blockText
	}
	hasBody := regReplAllSpace.ReplaceAllString(blockText, "") != ""
	colon := regDivision.FindStringIndex(title)
	switch {
	case !hasBody && colon != nil:
		// "一、项目编号:HMEC170223": the line is itself a key-value.
		return title[:colon[0]], title
	case !hasBody:
		// A long sentence following a serial number: text only, no title.
		return "", title
	case colon != nil:
		// "2、采购单位名称:福建省汀州医院" followed by more key-value lines.
		before := regReplAllSpace.ReplaceAllString(title[:colon[0]], "")
		after := regReplAllSpace.ReplaceAllString(title[colon[1]:], "")
		text := title + "\n" + blockText
		if after != "" {
			return "", text
		}
		return before, text
	default:
		return title, title + "\n" + blockText
	}
}

// mergetext rewrites a short block (at most 150 runes) whose lines after the
// first correspond one-to-one with titles: each "key:value" line whose key is
// contained in the matching title is re-emitted as "<title>:<value>". Lines
// containing "中标人 " are kept verbatim. The original text is returned when
// nothing was rewritten.
func mergetext(titles []string, text string) string {
	if len(titles) == 0 || utf8.RuneCountInString(text) > 150 {
		return text
	}
	lines := strings.Split(text, "\n")
	if len(lines) == 1 || len(titles) != len(lines)-1 {
		return text
	}
	var merged strings.Builder
	for i, line := range lines[1:] {
		parts := regDivision.Split(line, -1)
		if len(parts) == 2 {
			if strings.Contains(titles[i], parts[0]) {
				merged.WriteString(titles[i] + ":" + parts[1] + "\n")
			}
			continue
		}
		// Special case.
		if strings.Contains(line, "中标人 ") {
			merged.WriteString(line + "\n")
		}
	}
	if merged.Len() == 0 {
		return text
	}
	return merged.String()
}

// ProcTitle splits a compound block title on regSplit ("或", "和", "及", "与",
// "、", …) and completes abbreviated parts from their neighbours, e.g. a
// two-rune part borrows the leading runes of the previous long part.
// An empty title yields an empty, non-nil slice.
func ProcTitle(title string) []string {
	if title == "" {
		return []string{}
	}
	// Protect words that contain a separator.
	for word, placeholder := range confusion {
		title = strings.Replace(title, word, placeholder, -1)
	}
	direct := 1
	prev := ""
	ara := regSplit.Split(title, -1)
	for kk, vv := range ara {
		for word, placeholder := range confusion {
			vv = strings.Replace(vv, placeholder, word, -1)
		}
		ara[kk] = vv
		runes := []rune(vv)
		if len(runes) == 2 {
			if kk == 0 {
				// The short part comes first: later long parts lend their suffix.
				direct = -1
			} else {
				start := ""
				if prevRunes := []rune(prev); len(prevRunes) > 3 {
					start = string(prevRunes[:len(prevRunes)-2])
				}
				ara[kk] = start + vv
			}
		} else if vv == "联系人" || vv == "联系方式" {
			switch {
			case strings.Contains(prev, "代理"):
				ara[kk] = "代理机构" + vv
			case strings.Contains(prev, "中标"):
				ara[kk] = "中标单位" + vv
			case strings.Contains(prev, "采购"):
				ara[kk] = "采购单位" + vv
			}
		}
		if len(runes) > 3 {
			if direct == -1 {
				end := string(runes[len(runes)-2:])
				for i := 0; i < kk; i++ {
					ara[i] = ara[i] + end
				}
				break
			}
			prev = vv
		}
	}
	return ara
}

// hasMergeKV handles merged keys such as "项目名称及编号": when the title
// contains "项目", its second part is two runes long and the text is exactly
// two lines whose second line has no colon but a comma, the second line is
// rewritten as "<first key>:<a>,项目<second key>:<b>". Otherwise text is
// returned unchanged.
func hasMergeKV(title, text string) string {
	title = regDivision.ReplaceAllString(title, "")
	titles := regSplit.Split(title, -1)
	if len(titles) <= 1 {
		return text
	}
	before, after := titles[0], titles[1]
	if !strings.Contains(title, "项目") || utf8.RuneCountInString(after) != 2 {
		return text
	}
	after = "项目" + after
	if strings.Count(text, "\n") != 1 {
		return text
	}
	lines := strings.Split(text, "\n")
	firstLine, secondLine := lines[0], lines[1]
	if regDivision.MatchString(secondLine) {
		return text
	}
	if values := strings.SplitN(secondLine, ",", 2); len(values) == 2 {
		text = firstLine + "\n" + before + ":" + values[0] + "," + after + ":" + values[1]
	}
	return text
}

// filterSerial drops serial matches whose start offset lies strictly inside
// one of the tdIndexs ranges and returns copies of the remaining [start, end]
// pairs. content is unused and kept for signature compatibility.
func filterSerial(content string, indexs, tdIndexs [][]int) [][]int {
	returnIndexs := [][]int{}
	for _, v := range indexs {
		insideTd := false
		for _, tv := range tdIndexs {
			if v[0] > tv[0] && v[0] < tv[1] {
				insideTd = true
				break
			}
		}
		if insideTd {
			continue
		}
		returnIndexs = append(returnIndexs, []int{v[0], v[1]})
	}
	return returnIndexs
}

// getSerialType picks the numbering style used at the outermost level of
// content: among blockRegs it returns the pattern (and its index) whose first
// match starts earliest and does not span a line break. It returns (nil, -1)
// when nothing qualifies or when content contains "中标候选人排序".
func getSerialType(content string, blockRegs []*regexp.Regexp) (*regexp.Regexp, int) {
	if strings.Contains(content, "中标候选人排序") {
		return nil, -1
	}
	var regContenSerialTitle *regexp.Regexp
	contentStartIndex, regSerialTitleIndex := -1, -1
	for k, v := range blockRegs {
		loc := v.FindStringIndex(content)
		if len(loc) != 2 {
			continue
		}
		if regSpliteSegment.MatchString(strings.TrimSpace(content[loc[0]:loc[1]])) {
			continue
		}
		// Keep only the outermost (earliest) numbering.
		if contentStartIndex == -1 || loc[0] < contentStartIndex {
			regSerialTitleIndex = k
			contentStartIndex = loc[0]
			regContenSerialTitle = v
		}
	}
	return regContenSerialTitle, regSerialTitleIndex
}

// appendWarpStop trims surrounding whitespace and makes sure the text ends
// with "。" followed by a line break.
func appendWarpStop(text string) string {
	text = regTrimSpace.ReplaceAllString(text, "")
	if !strings.HasSuffix(text, "。") {
		text += "。"
	}
	if !regEndWrap.MatchString(text) {
		text += "\n"
	}
	return text
}

// numberedSegments turns raw fields into segments numbered from 1, skipping
// fields of at most one byte.
func numberedSegments(fields []string) []*util.Segment {
	segs := make([]*util.Segment, 0)
	n := 0
	for _, field := range fields {
		if len(field) > 1 {
			n++
			segs = append(segs, &util.Segment{Index: n, Text: field})
		}
	}
	return segs
}

// DivideSegmentHtml splits txt on CR/LF and returns the non-trivial lines as
// numbered segments.
func DivideSegmentHtml(txt string) []*util.Segment {
	return numberedSegments(strings.FieldsFunc(txt, func(r rune) bool {
		return r == 10 || r == 13
	}))
}

// DivideSegment splits txt on CR/LF and, in addition, on the punctuation of
// a "numeral + 、/." pair listed in xuhao.
//
// NOTE(review): the callback is stateful (it remembers the previous rune in
// tmpstr), while strings.FieldsFunc documents that it makes no guarantee
// about call order and assumes a pure function. This relies on the current
// left-to-right implementation.
func DivideSegment(txt string) []*util.Segment {
	tmpstr := ""
	fields := strings.FieldsFunc(txt, func(r rune) bool {
		if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 || r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
			code := fmt.Sprint(r)
			switch {
			case tmpstr == "":
				tmpstr = code
				return false
			case strings.Contains(tmpstr, "_"):
				// A pair was just completed: start a new one.
				tmpstr = code
				return false
			case tmpstr == code:
				if r == 46 || r == 12289 {
					tmpstr = ""
				}
				return false
			}
			tmpstr += "_" + code
			if xuhao[tmpstr] {
				return true
			}
		}
		tmpstr = ""
		return r == 10 || r == 13
	})
	return numberedSegments(fields)
}

// tagsToBlocks sets block.Tag from block.Tags and, for every tag of the new
// block, clears that tag on earlier blocks that carry it with a lower weight.
func tagsToBlocks(blocks []*util.Block, block *util.Block) {
	if len(block.Tags) == 0 {
		return
	}
	tag := map[string]bool{}
	tagWeight := map[string]int{}
	for _, tags := range block.Tags {
		for _, ts := range tags {
			tag[ts.Value] = true
			tagWeight[ts.Value] = ts.Weight
		}
	}
	for name := range tag {
		for _, earlier := range blocks {
			if !earlier.Tag[name] {
				continue
			}
			for _, earlierTags := range earlier.Tags {
				for _, ts := range earlierTags {
					if ts.Value == name && ts.Weight < tagWeight[name] {
						earlier.Tag[name] = false
					}
				}
			}
		}
	}
	block.Tag = tag
}

// filterTitle normalizes a block title: sentence-like titles (containing
// both "," and "。") and titles longer than 30 runes become "", otherwise
// whitespace, bracketed content, symbols, a leading serial number and a
// trailing "等" are removed.
func filterTitle(title string) string {
	if strings.Contains(title, ",") && strings.Contains(title, "。") {
		return ""
	}
	if utf8.RuneCountInString(title) > 30 {
		return ""
	}
	title = regReplAllSpace.ReplaceAllString(title, "")
	title = regFilterTitle.ReplaceAllString(title, "")
	title = regReplAllSymbol.ReplaceAllString(title, "")
	title = regFilterNumber.ReplaceAllString(title, "")
	title = regFilter.ReplaceAllString(title, "")
	return title
}

// FindPackageFromBlocks looks for packages in every block and then fills
// winner, budget and bid amount of each package from its key-values.
// A nil blocks pointer yields an empty map.
func FindPackageFromBlocks(blocks *[]*util.Block, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
	blockPackage = map[string]*util.BlockPackage{}
	if blocks == nil {
		return blockPackage
	}
	for _, v := range *blocks {
		// NOTE(review): the replacement literal reached this file as a raw
		// line break, which Go does not allow in a string; "\n" is its literal
		// equivalent. It may have been an HTML tag upstream — confirm.
		text := regPackageFilter.ReplaceAllString(v.Text, "\n")
		text = TextAfterRemoveTable(text)
		if text == "" {
			continue
		}
		divisionPackageChild(&blockPackage, text, v.Title, true, v.Tag["中标单位"], isSite, codeSite)
	}
	for k, v := range blockPackage {
		findWinnerBugetBidmountByKv(v, blockPackage, k)
	}
	return blockPackage
}

// pkgMoneyFromKv converts a raw key-value string with clear.ObjToMoney.
// ok is false when no amount was produced or the first element is neither a
// float64 nor an int. isTrue is the trailing bool of the result (false when
// that element is not a bool).
func pkgMoneyFromKv(value string) (amount float64, isTrue, ok bool) {
	moneys := clear.ObjToMoney([]interface{}{value, ""})
	if len(moneys) == 0 {
		return 0, false, false
	}
	switch m := moneys[0].(type) {
	case float64:
		amount = m
	case int:
		amount = float64(m)
	default:
		return 0, false, false
	}
	isTrue, _ = moneys[len(moneys)-1].(bool)
	return amount, isTrue, true
}

// pkgWinnerFromSpecialKv handles winner values that hide in unrelated keys or
// in the package text. ok reports whether one of the special patterns hit.
func pkgWinnerFromSpecialKv(key, value, fullText string) (winner string, ok bool) {
	if res := conformWinnerKVReg.FindAllStringSubmatch(value, -1); len(res) > 0 && res[0][2] != "" {
		return res[0][2], true
	}
	if key == "中标信息" && conformWinnerKVReg1.MatchString(value) {
		return value, true
	}
	if conformWinnerKVReg2.MatchString(value) {
		return conformWinnerKVReg2.ReplaceAllString(value, "${2}"), true
	}
	// Fall back to the whole package text.
	if res := conformWinnerTextReg3.FindAllStringSubmatch(fullText, -1); len(res) > 0 && res[0][1] != "" {
		return res[0][1], true
	}
	return "", false
}

// findWinnerBugetBidmountByKv fills Budget, Bidamount and Winner of
// blockPackage[k] from the colon and space key-values of v. Fields are only
// filled while v's corresponding field is still empty, except for the
// special-case winner patterns, which always overwrite. Entries with an empty
// value list are skipped.
func findWinnerBugetBidmountByKv(v *util.BlockPackage, blockPackage map[string]*util.BlockPackage, k string) {
	target := blockPackage[k]
	if v.ColonKV != nil && v.ColonKV.KvTags != nil {
		for kc, cv := range v.ColonKV.KvTags {
			if len(cv) == 0 {
				continue
			}
			first := cv[0].Value
			switch {
			case kc == "预算" && v.Budget <= 0:
				if amount, isTrue, ok := pkgMoneyFromKv(first); ok {
					target.Budget = amount
					target.IsTrueBudget = isTrue
				}
			case (kc == "中标金额" || kc == "各包中标/成交候选供应商及报价") && v.Bidamount <= 0:
				if amount, isTrue, ok := pkgMoneyFromKv(first); ok {
					target.Bidamount = amount
					target.IsTrueBidamount = isTrue
				}
			case (kc == "中标单位" || kc == "第1 名" || kc == "各包中标/成交候选供应商及报价") && v.Winner == "":
				if unPackageWinnerReg.MatchString(first) {
					break
				}
				winner := first
				if len(cv) > 1 {
					// Prefer a value that really came from the "中标单位" key.
					for _, tagged := range cv {
						if tagged.Key == "中标单位" && tagged.Value != "" {
							winner = tagged.Value
							break
						}
					}
				}
				target.Winner = winner
			default:
				if winner, ok := pkgWinnerFromSpecialKv(kc, first, v.Text); ok {
					target.Winner = winner
				}
			}
		}
	}
	if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
		for kc, cv := range v.SpaceKV.KvTags {
			if len(cv) == 0 {
				continue
			}
			first := cv[0].Value
			switch {
			case kc == "预算" && v.Budget <= 0:
				if amount, isTrue, ok := pkgMoneyFromKv(first); ok {
					target.Budget = amount
					target.IsTrueBudget = isTrue
				}
			case kc == "中标金额" && v.Bidamount <= 0:
				if amount, isTrue, ok := pkgMoneyFromKv(first); ok {
					target.Bidamount = amount
					target.IsTrueBidamount = isTrue
				}
			case kc == "中标单位" && v.Winner == "":
				target.Winner = first
			}
		}
	}
}

// FindPackageFromText looks for packages in the full text and fills winner,
// budget and bid amount of each package from its key-values.
func FindPackageFromText(title string, content string, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
	blockPackage = map[string]*util.BlockPackage{}
	divisionPackageChild(&blockPackage, content, title, true, false, isSite, codeSite)
	for k, v := range blockPackage {
		findWinnerBugetBidmountByKv(v, blockPackage, k)
	}
	return blockPackage
}

// divisionPackageChild detects packages in content (via CheckMultiPackage),
// cuts the text into one piece per package occurrence, extracts key-values
// for each piece and stores or merges the result in *blockPackage.
//
// It returns false when no package was detected; otherwise true and the text
// that was not attributed to any package. isFindWinnerOrder is currently
// unused.
func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool, isSite bool, codeSite string) (bool, string) {
	// Remove wording that looks like a package but is not.
	content = regFJWarap.ReplaceAllString(content, "\n")
	content = regAZWarap.ReplaceAllString(content, "\n")
	content = regStrWrap.ReplaceAllString(content, "\n")
	content = regMoreWrap.ReplaceAllString(content, "\n")
	content = regEndWrap.ReplaceAllString(content, "")
	content = regBZJWarap.ReplaceAllString(content, "")
	// Targeted rewrites.
	content = packageReg1.ReplaceAllString(content, "${1}\n中标单位:${2}\n")
	content = packageReg2.ReplaceAllString(content, "\n标段${1}:")
	con, pkg, flag := CheckMultiPackage(content, title) // package names found in the text
	if !flag {
		return false, ""
	}

	// Package names are literal text, so they are quoted before being
	// embedded in a pattern: an unquoted "(" would panic in MustCompile and
	// any metacharacter could shift the capture-group indexes used below.
	// Each pattern is compiled once per package name.
	pkgFlagRegs := map[string]*regexp.Regexp{}
	pkgFlag := func(name string) (string, *regexp.Regexp) {
		pattern := regexp.QuoteMeta(name) + "[::]*"
		re, ok := pkgFlagRegs[name]
		if !ok {
			re = regexp.MustCompile(pattern)
			pkgFlagRegs[name] = re
		}
		return pattern, re
	}

	// Put every package occurrence on its own line.
	appendWarpIndex := []int{}
	for _, v := range pkg {
		// A text that merely ends with the only package marker is not a package.
		if len(pkg) == 1 && strings.HasSuffix(con, v[0]) {
			return false, ""
		}
		_, re := pkgFlag(v[0])
		for _, sv := range re.FindAllStringIndex(con, -1) {
			appendWarpIndex = append(appendWarpIndex, sv[0])
		}
	}
	appendWarpIndex = getPkgIndex(appendWarpIndex)
	var conTemp strings.Builder
	for k, v := range appendWarpIndex {
		if k == 0 {
			conTemp.WriteString(con[:v] + "\n")
		} else {
			conTemp.WriteString("\n" + con[appendWarpIndex[k-1]:v])
		}
		if k == len(appendWarpIndex)-1 {
			conTemp.WriteString("\n" + con[v:])
		}
	}
	con = conTemp.String()
	con = replSerial.ReplaceAllString(con, "\n")
	con = regMoreWrap.ReplaceAllString(con, "\n")

	// Offsets of every package occurrence.
	indexMap := map[int]int{}             // occurrence start -> occurrence end
	indexKeyStringMap := map[int]string{} // occurrence start -> key found in front of it
	indexKeyIntMap := map[int]int{}       // occurrence start -> end of the key match
	indexs := []int{}
	startEndMap := map[int]int{} // occurrence start -> start of the key in front of it
	pkgIndexMap := map[string][]int{}
	indexPkgMap := map[int]string{}
	// Sub-titles.
	titleindexs := indexTile.FindAllStringIndex(con, -1)
	if len(titleindexs) == 0 {
		titleindexs = indexTile2.FindAllStringIndex(con, -1)
	}
	// A key written in front of a package belongs to the text after the package.
	for _, v := range pkg {
		pgflag, re := pkgFlag(v[0])
		for _, sv := range re.FindAllStringIndex(con, -1) {
			indexMap[sv[0]] = sv[1]
			indexs = append(indexs, sv[0])
			pkgIndexMap[v[0]] = append(pkgIndexMap[v[0]], sv[0])
			indexPkgMap[sv[0]] = v[0]
		}
		// Key in front of the package, at the start of a line.
		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
		if len(keys) == 0 {
			// Key in front of the package, ending with a colon and a line break.
			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
		}
		if len(keys) == 0 {
			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
		}
		for _, key := range keys {
			// key[4]:key[5] is capture group 2 (the key text); key[5] is
			// where the package marker starts.
			startEndMap[key[5]] = key[4]
			headkey := con[key[4]:key[5]]
			headkey = regReplAllSpace.ReplaceAllString(headkey, "")
			if !regDivision.MatchString(headkey) {
				headkey += ":"
			}
			headkey = moreColonReg.ReplaceAllString(headkey, ":")
			colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
			if len(colonIndexs) > 1 {
				// Keep only the last "key:" of a chain of keys.
				headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
			}
			indexKeyStringMap[key[5]] = headkey
			indexKeyIntMap[key[5]] = key[1]
		}
	}
	indexs = getPkgIndex(indexs)
	// An occurrence without its own key inherits the key of the previous one.
	for ik, iv := range indexs {
		if indexKeyStringMap[iv] != "" {
			continue
		}
		if indexKeyIntMap[iv] == indexMap[iv] {
			continue
		}
		if ik > 0 {
			indexKeyStringMap[iv] = indexKeyStringMap[indexs[ik-1]]
		}
	}
	// Text piece and line limit of every occurrence.
	surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
	for _, iv := range indexs {
		text := indexTextMap[iv]
		tmptext := text
		warpIndex := regSpliteSegment.FindAllStringIndex(text, -1)
		if len(indexWarpMap) > 0 {
			maxWarpCount = indexWarpMap[iv]
		}
		if maxWarpCount > 0 && len(warpIndex) >= 5 && len(warpIndex) > maxWarpCount {
			// Lines beyond the limit do not belong to this package.
			textTemp := text
			text = textTemp[:warpIndex[maxWarpCount-1][1]]
			surplusText += textTemp[warpIndex[maxWarpCount-1][0]:]
		}
		for bk, bv := range pkg {
			// The piece belongs to the package it starts with.
			if !strings.HasPrefix(text, bv[0]) {
				continue
			}
			index := util.PackageNumberConvert(bk)
			// Strip the package marker.
			_, re := pkgFlag(bv[0])
			text = re.ReplaceAllString(text, "")
			// NOTE(review): strings.TrimLeft takes a cutset, not a prefix, so
			// these comparisons (and the TrimLeft(text, headKey) calls below)
			// can strip more than the package name/key. strings.TrimPrefix
			// looks like the intent; left unchanged because it alters
			// extraction results — confirm with sample data.
			if strings.TrimLeft(tmptext, bv[0]) == text || strings.TrimLeft(tmptext, bv[0]+":") == text || strings.TrimLeft(tmptext, bv[0]+":") == text {
				var tagtitle string
				for i, v := range titleindexs {
					if i == 0 {
						continue
					}
					if v[0] > iv {
						tagtitle = con[titleindexs[i-1][0]:titleindexs[i-1][1]]
						break
					}
				}
				tagtitle = regReplAllSpace2.ReplaceAllString(tagtitle, "")
				if tagtitle == "" {
					tagtitle = title
				} else if strings.Contains(tagtitle, bv[0]) && title != "" {
					tagtitle = title
				}
				text = tagtitle + ":" + text
			}
			headKey := ""
			if indexKeyStringMap[iv] != "" {
				headKey = indexKeyStringMap[iv]
				// The key is consumed: forget it for the first occurrence of this package.
				if occurrences := pkgIndexMap[bv[0]]; len(occurrences) > 0 {
					delete(indexKeyStringMap, occurrences[0])
				}
			}
			if existing := (*blockPackage)[index]; existing != nil {
				// The same package appears more than once: merge text and key-values.
				existing.Text += "\n" + text
				colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1, isSite, codeSite)
				if headKey != "" {
					kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
					MergeKvTags(colonJobKv.KvTags, kvAgain.KvTags)
				}
				MergeKvTags(existing.ColonKV.KvTags, colonJobKv.KvTags)
				spaceJobKv := SspacekvEntity.Entrance(text, headKey, nil, isSite, codeSite)
				MergeKvTags(existing.SpaceKV.KvTags, spaceJobKv.KvTags)
				continue
			}
			newBpkg := &util.BlockPackage{
				Origin:   bk,
				Text:     text,
				Index:    index,
				Name:     bv[0],
				Type:     bv[1],
				Accuracy: accuracy,
			}
			finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4, isSite, codeSite)
			if headKey != "" {
				kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
				MergeKvTags(finalKv.KvTags, kvAgain.KvTags)
			}
			newBpkg.ColonKV = finalKv
			newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil, isSite, codeSite)
			(*blockPackage)[index] = newBpkg
		}
	}
	return true, surplusText
}

// getPkgIndex sorts indexs in place and drops every offset that lies within
// 10 bytes of its predecessor. When all offsets but the first were dropped
// this way the result is empty.
func getPkgIndex(indexs []int) []int {
	sort.Ints(indexs)
	indexsNew := []int{}
	count := 0
	for k, v := range indexs {
		if k > 0 && v-indexs[k-1] <= 10 {
			count++
			continue
		}
		indexsNew = append(indexsNew, v)
	}
	if count > 0 && count == len(indexs)-1 {
		return []int{}
	}
	return indexsNew
}

// interceptText cuts con into one text piece per package occurrence.
//
// It returns the text in front of the first occurrence, the largest number of
// line breaks found in any piece, the piece per occurrence offset, and — only
// when the packages appear in a regular rotation (AB, ABAB, …) — a line-break
// limit per occurrence offset (an empty map otherwise).
func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[string][]int, startEndMap map[int]int, con string) (string, int, map[int]string, map[int]int) {
	surplusText := ""
	indexTextMap := map[int]string{}
	indexWarpMap := map[int]int{}
	maxWarpCount := 0
	for ik, iv := range indexs {
		// A piece ends where the next occurrence (or the key in front of it) starts.
		text := ""
		if ik < len(indexs)-1 {
			if startEndMap[indexs[ik+1]] != 0 {
				text = con[iv:startEndMap[indexs[ik+1]]]
			} else {
				text = con[iv:indexs[ik+1]]
			}
		} else {
			text = con[iv:]
		}
		tmptext := text
		if strings.Contains(text, "\n") {
			texts := strings.Split(text, "\n")
			text2 := ""
			if ik+1 < len(indexs)-1 {
				if startEndMap[indexs[ik+1+1]] != 0 {
					text2 = con[startEndMap[indexs[ik+1]]:startEndMap[indexs[ik+1+1]]]
				} else {
					text2 = con[indexs[ik+1]:indexs[ik+1+1]]
				}
				// The last line equals the next piece: keep the first line only.
				if texts[len(texts)-1] == text2 {
					text = texts[0]
				}
			}
		}
		if utf8.RuneCountInString(text) < 5 {
			indexTextMap[iv] = tmptext
		} else {
			indexTextMap[iv] = text
		}
		warpCount := len(regSpliteSegment.FindAllStringIndex(text, -1))
		if warpCount > maxWarpCount {
			maxWarpCount = warpCount
		}
		indexWarpMap[iv] = warpCount
		if ik == 0 {
			surplusText += con[:iv]
		}
	}

	regular := false
	if len(pkgIndexMap) > 1 {
		// Look for a regular rotation of the packages: AB or ABAB.
		prevVal := ""
		notRepeatCount, currentIndex, onceMax, allMax := 0, -1, 0, 0
		indexMaxMap := map[int]int{}
		for ik, iv := range indexs {
			if notRepeatCount == len(pkgIndexMap) {
				notRepeatCount = 0
			}
			if prevVal == indexPkgMap[iv] {
				// The same package twice in a row breaks the rotation.
				currentIndex = ik
				break
			}
			notRepeatCount++
			prevVal = indexPkgMap[iv]
			if notRepeatCount == len(pkgIndexMap) {
				indexMaxMap[iv] = onceMax
				onceMax = 0
			}
			if indexWarpMap[iv] > onceMax {
				onceMax = indexWarpMap[iv]
				allMax = onceMax
			}
			if ik == len(indexs)-1 && notRepeatCount != len(pkgIndexMap) {
				// The last round is incomplete.
				currentIndex = ik
			}
		}
		if len(indexMaxMap) > 0 {
			regular = true
			thisMax := 0
			for ik := len(indexs) - 1; ik >= 0; ik-- {
				iv := indexs[ik]
				if currentIndex != -1 && ik >= currentIndex {
					indexWarpMap[iv] = allMax
					continue
				}
				if indexMaxMap[iv] > 0 {
					thisMax = indexMaxMap[iv]
				}
				indexWarpMap[iv] = thisMax
			}
		}
	}
	if !regular {
		indexWarpMap = map[int]int{}
	}
	return surplusText, maxWarpCount, indexTextMap, indexWarpMap
}

// kvAfterDivideBlock splits text into blocks and returns the colon key-values
// of all blocks in block order. Blocks without colon key-values are skipped.
func kvAfterDivideBlock(tp, text string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) []*util.Kv {
	blocks, _ := DivideBlock(tp, text, from, ruleBlock, isSite, codeSite)
	kvs := []*util.Kv{}
	for _, v := range blocks {
		if v.ColonKV == nil {
			continue
		}
		kvs = append(kvs, v.ColonKV.Kvs...)
	}
	return kvs
}