package pretreated
import (
"fmt"
"jy/clear"
"jy/util"
qutil "qfw/util"
"regexp"
"sort"
"strconv"
"strings"
"unicode/utf8"
)
// Block splitting and segment splitting: shared patterns and lookup tables.
var (
// Legacy serial-title patterns, kept only for reference; the compiled lists
// regSerialTitles_1 / regSerialTitles_2 below are what the code uses.
/*regSerialTitles = []string{
"([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)",
"[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)",
"(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)",
"(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)",
"(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
"1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)",
}*/
// Serial-title patterns applied to the whole text (anchored at a line break or
// at the start of the text). getSerialType picks one of them; DivideBlock then
// uses the pattern with the same index in regSerialTitles_2.
regSerialTitles_1 = []*regexp.Regexp{
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)"),
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)"),
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)"),
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)"),
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)"),
regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s(]*|^[\u3000\u2003\u00a0\\s(]*)(\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)"),
}
// Fully anchored counterparts of regSerialTitles_1, applied to one trimmed
// match; group 1 is the serial number and group 2 the title.
regSerialTitles_2 = []*regexp.Regexp{
regexp.MustCompile("^([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)$"),
regexp.MustCompile("^[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)$"),
regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)$"),
regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]*[..]([^\\d][^\r\n]+)$"),
regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
regexp.MustCompile("^1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)$"),
regexp.MustCompile("^[(](\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)$"),
}
// NOTE(review): the literal below is split across two lines, which is not valid
// inside an interpreted Go string, and the pattern looks truncated (markup may
// have been stripped from it). Judging by its name and its use in DivideBlock it
// is presumably meant to match whole table cells — restore from the original file.
regReplAllTd = regexp.MustCompile("(?smi)
.+?")
// Whole string is ASCII digits / Chinese numerals.
regIsNumber = regexp.MustCompile("^\\d+$")
regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
// Whitespace handling (ASCII whitespace plus U+3000, U+2003, U+00A0).
regReplAllSpace = regexp.MustCompile("[\u3000\u2003\u00a0\\s]+")
regTrimSpace = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
regReplWrapSpace = regexp.MustCompile("^[\r\n][\u3000\u2003\u00a0\\s]*|[\r\n][\u3000\u2003\u00a0\\s]*$")
// Title clean-up patterns used by filterTitle: punctuation, bracketed text,
// leading serial numbers.
regReplAllSymbol = regexp.MustCompile("[(\\(<《【\\[{{〔)\\)>》】\\]}}〕,,;;::'\"“”。.\\??/+=\\-_——*&……\\^%$¥@#!!`~·]")
regFilterTitle = regexp.MustCompile("[(\\(<《【\\[{{〔].+?[)\\)>》】\\]}}〕]")
// Key/value separator (colon).
regDivision = regexp.MustCompile("[::]")
// A single line break character.
regSpliteSegment = regexp.MustCompile("[\r\n]")
regFilterNumber = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
// Conjunctions that separate several titles merged into one heading.
// NOTE(review): the alternative "或" is listed twice; the second one is redundant.
regSplit = regexp.MustCompile("或|和|以?及|与|、|或")
regStartWrap = regexp.MustCompile("^[\r\n]")
regEndWrap = regexp.MustCompile("[\r\n]$")
regMoreWrap = regexp.MustCompile("[\r\n]{2,}")
// Phrases that are replaced or removed by divisionPackageChild before package
// names are searched, because they contain package-like words (包 / 标段 / 标).
regStrWrap = regexp.MustCompile("分包名称[::]")
regBZJWarap = regexp.MustCompile("(每标段|保证金.*|标示|标[\\d一二三四五六七八九十]+室|型号[::]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]+标室|[\\d一二三四五六七八九十]包密封|(^一包|商务|资格|价格标(每包内含相应文件正副本))|[未|不]+划分标段)")
regFJWarap = regexp.MustCompile("[a-zA-Z0-9](包|标段).*.(pdf|PDF|docx|doc|DOCX|DOC|swf|SWF)")
regAZWarap = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张\\/包|纸[\\d]*包|\\*[\\d]+包|相机包)")
// Nested serial numbers such as "1.2" at the start of a line.
replSerial = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
moreColonReg = regexp.MustCompile("[::]+")
regFilter = regexp.MustCompile("等$")
pkgFilter = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?[标|包]+[段|号]+")
// Sub-headings.
// NOTE(review): these are raw strings, so `\\u4e00-\\u9fa5` is not a Unicode
// escape (it reads as a literal backslash followed by ordinary characters);
// presumably a CJK range was intended — confirm.
indexTile = regexp.MustCompile(`[0-9.]{2,3}[^包标段][\s\\u4e00-\\u9fa5]{2,8}[::]+`) // sub-heading
indexTile2 = regexp.MustCompile(`[\s\\u4e00-\\u9fa5]{2,8}[::]\n`)
regReplAllSpace2 = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
// Words that contain one of the regSplit conjunctions; ProcTitle swaps them for
// the placeholder before splitting a title and swaps them back afterwards.
confusion = map[string]string{
"参与": "canyu",
}
// Applied to block text before packages are searched (see FindPackageFromBlocks).
/*
Sample line: 第一包:采购设备清单 ("Package 1: equipment procurement list")
*/
regPackageFilter = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[ \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?")
filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
// Pairs "<rune>_<rune>" (decimal code points) that DivideSegment treats as a
// serial marker: a Chinese numeral (一=19968 二=20108 三=19977 四=22235 五=20116
// 六=20845 七=19971 八=20843 九=20061) followed by 、(12289) or . (46).
// NOTE(review): "20843_46" and "20061_12289" are absent while every other
// numeral has both pairs — possibly an omission; confirm.
xuhao = map[string]bool{
"19968_12289": true,
"19968_46": true,
"20108_12289": true,
"20108_46": true,
"19977_12289": true,
"19977_46": true,
"22235_12289": true,
"22235_46": true,
"20116_12289": true,
"20116_46": true,
"20845_12289": true,
"20845_46": true,
"19971_12289": true,
"19971_46": true,
"20843_12289": true,
"20061_46": true,
}
// Values that must not be taken as a package winner.
unPackageWinnerReg = regexp.MustCompile("(重新招标|方案包)")
conformWinnerKVReg = regexp.MustCompile("^(中标人|中标银行|第一名)[::](.{4,20}(分行|公司))")
conformWinnerKVReg1 = regexp.MustCompile("^[-].{4,15}公司$")
conformWinnerKVReg2 = regexp.MustCompile("(.*)?确定(.*公司)为中标人(.*)?")
// NOTE(review): the next two patterns are identical, and conformWinnerTextReg4
// is not referenced in the code visible here.
conformWinnerTextReg3 = regexp.MustCompile("拟定供应商信息[::\\s]+名称[::](.*)[\\s]+地址")
conformWinnerTextReg4 = regexp.MustCompile("拟定供应商信息[::\\s]+名称[::](.*)[\\s]+地址")
/*
Sample text for the two patterns above:
拟定供应商信息:
名称:郑州人民广播电台
地址:郑州市金水区内环路17号A座。
*/
// Targeted rewrites applied by divisionPackageChild before package detection.
packageReg1 = regexp.MustCompile("(包件[一二三四五1-9][::].*)\n1[、.\\s]+名称[::](.*)\n2[、.\\s]+")
packageReg2 = regexp.MustCompile("标段[((]包[))][\\[][O0]+([1-9一二三四五六七八九])[\\]]")
// Sample for packageReg2: 标段(包)[001]巴盟神舟30MW:
)
// Title filters used by DivideBlock. They are compiled once here rather than on
// every loop iteration.
var (
	// Titles containing one of these phrases mark sections that are skipped.
	regBlockTitleSkip = regexp.MustCompile("投标文件格式|业绩")
	// ...unless the title also contains this phrase.
	regBlockTitleKeep = regexp.MustCompile("拟定的唯一供应商名称")
)

// DivideBlock splits content into numbered blocks ("一、…", "1、…", "(1) …").
//
// The serial style is detected on the text with tables removed; serial numbers
// that start inside a table cell are ignored. Text before the first serial
// becomes a head block (Index -1); when the numbering stops being consecutive
// and from != 3, the remaining text becomes a tail block (Index -2).
// Every returned block gets its colon- and space-separated key/values parsed.
//
// The second result is a status code:
//
//	-1  content is blank, codeSite is excluded, or no serial style was found
//	    and removing tables did not change the text length
//	-2  no serial style was found and removing tables changed the text length
//	 0  a serial style was found but no block was produced
//	 1  blocks were produced without anomalies
//	 2  a non-consecutive serial number was encountered
//	 3  the first serial number is not 1
func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) ([]*util.Block, int) {
	defer qutil.Catch()
	returnValue := 0
	var blocks []*util.Block
	if strings.TrimSpace(content) == "" || codeSite == "a_zgyc_ztbxx" {
		return blocks, -1
	}
	// Table content is not considered when detecting the serial style.
	contentTemp := TextAfterRemoveTable(content)
	tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
	var regContenSerialTitle *regexp.Regexp
	var regSerialTitleIndex int
	if ruleBlock != nil && len(ruleBlock.BlockRegs) > 0 {
		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, ruleBlock.BlockRegs)
	} else {
		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, regSerialTitles_1)
	}
	// No serial style found: nothing to split.
	if regSerialTitleIndex == -1 {
		if len(contentTemp) == len(content) {
			return blocks, -1
		}
		// The text contained a table.
		return blocks, -2
	}
	// Pattern that extracts serial number and title from one match.
	// NOTE(review): TitleRegs is indexed with an index that may come from the
	// default regSerialTitles_1 list when BlockRegs is empty — confirm both
	// lists always have matching lengths.
	var regSerialTitle *regexp.Regexp
	if ruleBlock != nil && len(ruleBlock.TitleRegs) > 0 {
		regSerialTitle = ruleBlock.TitleRegs[regSerialTitleIndex]
	} else {
		regSerialTitle = regSerialTitles_2[regSerialTitleIndex]
	}
	indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
	indexs = filterSerial(content, indexs, tdIndexs)

	var headBlock, endBlock *util.Block
	currentIndex := 0
	for k, v := range indexs {
		start, end := v[0], v[1]
		// Text before the first serial number becomes the head block.
		if k == 0 {
			if headTemp := content[:start]; regReplAllSpace.ReplaceAllString(headTemp, "") != "" {
				headBlock = &util.Block{
					Index: -1,
					Text:  headTemp,
					Title: "",
					Start: 0,
					End:   start,
				}
			}
		}
		blockSerialTitle := regTrimSpace.ReplaceAllString(content[start:end], "")
		serialTitles := regSerialTitle.FindStringSubmatch(blockSerialTitle) // serial number and title
		if len(serialTitles) < 3 {
			continue
		}
		indexSting := regReplAllSpace.ReplaceAllString(serialTitles[1], "")
		index := 0
		// Convert the serial number to an int.
		if regIsNumber.MatchString(indexSting) {
			index, _ = strconv.Atoi(indexSting) // digits only; an overflow leaves the clamped value
		} else if regIsChineseNumber.MatchString(indexSting) {
			index = util.ChineseNumberToInt(indexSting)
		}
		if k+1 != index {
			if k == 0 {
				// The very first serial number is already wrong.
				returnValue = 3
				break
			}
			if currentIndex+1 != index {
				// Numbering is not consecutive: stop here and keep the rest
				// of the text as the tail block (unless from == 3).
				returnValue = 2
				if from != 3 {
					endBlock = &util.Block{
						Index: -2,
						Text:  content[start:],
						Title: "",
						Start: start,
						End:   len(content),
					}
					break
				}
			}
			currentIndex = index
		}

		title := regTrimSpace.ReplaceAllString(serialTitles[2], "")
		// The block runs until the next serial number (or the end of content).
		nextStart := len(content)
		if k < len(indexs)-1 {
			nextStart = indexs[k+1][0]
		}
		// Block text without the serial number and title.
		blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
		if title != "" {
			blockTextTemp := regReplAllSpace.ReplaceAllString(blockText, "")
			if blockTextTemp == "" {
				if regDivision.MatchString(title) {
					/*
						Title line is itself a key/value, e.g.
						一、项目编号:HMEC170223
						二、项目名称:执法记录仪采购
					*/
					blockText = title
					divisionIndexs := regDivision.FindStringIndex(title)
					title = title[:divisionIndexs[0]]
				} else {
					/*
						Title line is a plain sentence, e.g.
						十一、投标代表须持本人身份证原件亲自递交投标文件,代理机构项目经理审核通过后,办理签收手续,否则投标文件被拒收。
						十二、开标时间:2017年3月20日9时30分
					*/
					blockText = title
					title = ""
				}
			} else if regDivision.MatchString(title) {
				/*
					Title line is the first of several key/values, e.g.
					2、采购单位名称:福建省汀州医院
					采购单位地址: 龙岩市长汀县
					联系人:胡科长
					联系方式:0597-6826353
				*/
				divisionIndexs := regDivision.FindStringIndex(title)
				titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
				titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
				blockText = title + "\n" + blockText
				if titleAfter != "" {
					title = ""
				} else {
					title = titleBefore
				}
			} else {
				blockText = title + "\n" + blockText
			}
		}
		// Blocks without content get no tags and are not kept.
		if blockText == "" {
			continue
		}
		if regBlockTitleSkip.MatchString(title) && !regBlockTitleKeep.MatchString(title) {
			continue
		}
		blockText = hasMergeKV(title, blockText)

		titleIsExists := map[string]bool{} // de-duplicates split titles
		title = filterTitle(title)
		// Split merged titles joined by conjunctions.
		splitTitles := ProcTitle(title)
		blockText = mergetext(splitTitles, blockText)
		block := &util.Block{
			Index:  index,
			Text:   blockText,
			Title:  title,
			Titles: splitTitles,
			Start:  start,
			End:    nextStart,
		}
		titles := []string{}
		for _, sv := range splitTitles {
			if sv == "" || titleIsExists[sv] {
				continue
			}
			titleIsExists[sv] = true
			// Titles shorter than 2 or longer than 10 runes are not tagged.
			if n := len([]rune(sv)); n >= 2 && n <= 10 {
				block.Tags = append(block.Tags, util.GetBlockTags(sv))
				titles = append(titles, sv)
			}
		}
		block.Title = title
		block.Titles = titles
		if ruleBlock != nil {
			block.Classify, block.NotClassifyTitles = ruleBlock.Classify.GetClassify(tp, titles)
		}
		tagsToBlocks(blocks, block)
		blocks = append(blocks, block)
	}

	var returnBlocks []*util.Block
	if len(blocks) > 0 {
		if headBlock != nil {
			if tp == "招标" {
				headBlock.Classify = map[string]bool{"bidcondition": true}
			}
			returnBlocks = append(returnBlocks, headBlock)
		}
		returnBlocks = append(returnBlocks, blocks...)
		if endBlock != nil {
			returnBlocks = append(returnBlocks, endBlock)
		}
		if returnValue == 0 {
			returnValue = 1
		}
	}
	contactFormat := &util.ContactFormat{
		IndexMap: map[int]string{},
		MatchMap: map[string]map[string]bool{},
	}
	for _, bl := range returnBlocks {
		// Parse key/values from the table-free text of the block.
		newText := TextAfterRemoveTable(bl.Text)
		bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from, isSite, codeSite)
		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat, isSite, codeSite)
		// Later regex extraction sometimes needs a trailing full stop or line
		// break, so they are appended only after the key/values are parsed.
		bl.Text = appendWarpStop(bl.Text)
	}
	return returnBlocks, returnValue
}
// mergetext rewrites short block text so that each line after the first uses
// the corresponding split title as its key.
//
// It only acts when text has at most 150 runes and the number of lines after
// the first equals len(titles). For line i (after the first):
//   - if it splits into exactly two parts on a colon and titles[i] contains the
//     first part, "titles[i]:<second part>" is emitted;
//   - otherwise, if the line contains "中标人 ", it is emitted unchanged.
//
// The first line is not carried over. When nothing is emitted, text is
// returned unchanged.
func mergetext(titles []string, text string) string {
	if len(titles) == 0 || utf8.RuneCountInString(text) > 150 {
		return text
	}
	lines := strings.Split(text, "\n")
	if len(lines) == 1 || len(titles) != len(lines)-1 {
		return text
	}
	var merged strings.Builder
	for i, line := range lines[1:] {
		parts := regDivision.Split(line, -1)
		if len(parts) == 2 {
			if strings.Contains(titles[i], parts[0]) {
				merged.WriteString(titles[i] + ":" + parts[1] + "\n")
			}
			continue
		}
		// Special case: keep winner lines that use a space as separator.
		if strings.Contains(line, "中标人 ") {
			merged.WriteString(line + "\n")
		}
	}
	if merged.Len() == 0 {
		return text
	}
	return merged.String()
}
// ProcTitle splits a block title that merges several headings (joined by the
// conjunctions in regSplit) into separate titles and completes abbreviated
// parts.
//
//   - Words listed in confusion are masked before splitting and restored after.
//   - A two-rune part that is not first is prefixed with the previous long
//     (> 3 runes) part minus its last two runes.
//   - "联系人" / "联系方式" are prefixed according to the previous long part.
//   - When the first part has two runes, the last two runes of the first long
//     part are appended to every part before it and processing stops.
//
// An empty title yields an empty, non-nil slice.
func ProcTitle(title string) []string {
	if title == "" {
		return []string{}
	}
	for word, mask := range confusion {
		title = strings.Replace(title, word, mask, -1)
	}
	parts := regSplit.Split(title, -1)
	suffixMode := false // set when the first part is a two-rune abbreviation
	lastLong := ""      // most recent part longer than three runes
	for i := range parts {
		piece := parts[i]
		for word, mask := range confusion {
			piece = strings.Replace(piece, mask, word, -1)
		}
		parts[i] = piece
		pieceRunes := []rune(piece)

		switch {
		case len(pieceRunes) == 2:
			if i == 0 {
				suffixMode = true
			} else {
				stem := ""
				if lastRunes := []rune(lastLong); len(lastRunes) > 3 {
					stem = string(lastRunes[:len(lastRunes)-2])
				}
				parts[i] = stem + piece
			}
		case piece == "联系人" || piece == "联系方式":
			switch {
			case strings.Contains(lastLong, "代理"):
				parts[i] = "代理机构" + piece
			case strings.Contains(lastLong, "中标"):
				parts[i] = "中标单位" + piece
			case strings.Contains(lastLong, "采购"):
				parts[i] = "采购单位" + piece
			}
		}

		if len(pieceRunes) <= 3 {
			continue
		}
		if suffixMode {
			tail := string(pieceRunes[len(pieceRunes)-2:])
			for j := 0; j < i; j++ {
				parts[j] += tail
			}
			break
		}
		lastLong = piece
	}
	return parts
}
// hasMergeKV handles a title that merges two keys, e.g. 项目名称及编号
// ("project name and number"), whose values share one comma-separated line.
//
// It only acts when the title (colons removed) splits into at least two parts,
// contains "项目", and the second part has exactly two runes; the text must have
// exactly two lines and the second line must contain no colon. The second line
// is then rewritten as "<first key>:<value 1>,项目<second key>:<value 2>".
// In every other case text is returned unchanged.
func hasMergeKV(title, text string) string {
	title = regDivision.ReplaceAllString(title, "")
	keys := regSplit.Split(title, -1)
	if len(keys) <= 1 {
		return text
	}
	firstKey, secondKey := keys[0], keys[1]
	if !strings.Contains(title, "项目") || len([]rune(secondKey)) != 2 {
		return text
	}
	secondKey = "项目" + secondKey
	if strings.Count(text, "\n") != 1 {
		return text
	}
	lines := strings.Split(text, "\n")
	headLine, valueLine := lines[0], lines[1]
	if regDivision.MatchString(valueLine) {
		return text
	}
	values := strings.SplitN(valueLine, ",", 2)
	if len(values) != 2 {
		return text
	}
	return headLine + "\n" + firstKey + ":" + values[0] + "," + secondKey + ":" + values[1]
}
// filterSerial drops serial-number matches that start inside a table cell.
//
// indexs and tdIndexs hold [start, end, ...] byte offsets. A match is dropped
// when its start lies strictly between the start and end of any entry of
// tdIndexs. The result is a fresh, non-nil slice of [start, end] pairs in the
// original order. content is not used; it is kept for the callers' signature.
func filterSerial(content string, indexs, tdIndexs [][]int) [][]int {
	returnIndexs := [][]int{}
	for _, v := range indexs {
		inTd := false
		for _, tv := range tdIndexs {
			if v[0] > tv[0] && v[0] < tv[1] {
				inTd = true
				break // one enclosing cell is enough; no need to scan the rest
			}
		}
		if inTd {
			continue
		}
		returnIndexs = append(returnIndexs, []int{v[0], v[1]})
	}
	return returnIndexs
}
// getSerialType picks the serial-number style used at the outermost level of
// content: among blockRegs, the pattern whose first match starts earliest wins
// (ties go to the pattern listed first). A pattern is skipped when its first
// match, once trimmed, still contains a line break.
//
// It returns the chosen pattern and its index in blockRegs, or (nil, -1) when
// no pattern qualifies or when content contains "中标候选人排序".
func getSerialType(content string, blockRegs []*regexp.Regexp) (*regexp.Regexp, int) {
	// This check does not depend on the pattern, so do it once up front
	// instead of once per pattern.
	if strings.Contains(content, "中标候选人排序") {
		return nil, -1
	}
	var chosen *regexp.Regexp
	chosenIndex, chosenStart := -1, -1
	for k, re := range blockRegs {
		loc := re.FindStringIndex(content)
		if len(loc) != 2 {
			continue
		}
		// A match spanning several lines is not a single heading.
		if regSpliteSegment.MatchString(strings.TrimSpace(content[loc[0]:loc[1]])) {
			continue
		}
		// Only the outermost (earliest) serial style is used.
		if chosenStart == -1 || loc[0] < chosenStart {
			chosen, chosenIndex, chosenStart = re, k, loc[0]
		}
	}
	return chosen, chosenIndex
}
// appendWarpStop trims surrounding whitespace from text and makes sure it ends
// with a Chinese full stop followed by a line break.
func appendWarpStop(text string) string {
	trimmed := regTrimSpace.ReplaceAllString(text, "")
	if !strings.HasSuffix(trimmed, "。") {
		trimmed += "。"
	}
	if regEndWrap.MatchString(trimmed) {
		return trimmed
	}
	return trimmed + "\n"
}
// DivideSegmentHtml splits txt into segments at CR / LF characters. Pieces of
// at most one byte are dropped; the remaining segments are numbered from 1 in
// order of appearance.
func DivideSegmentHtml(txt string) []*util.Segment {
	isLineBreak := func(r rune) bool {
		return r == '\n' || r == '\r'
	}
	segs := make([]*util.Segment, 0)
	for _, line := range strings.FieldsFunc(txt, isLineBreak) {
		// Skip blank or single-byte lines.
		if line == " " || len(line) <= 1 {
			continue
		}
		segs = append(segs, &util.Segment{
			Index: len(segs) + 1,
			Text:  line,
		})
	}
	return segs
}
// DivideSegment splits txt into segments at CR / LF characters and, in
// addition, at the punctuation of a Chinese serial marker: when a Chinese
// numeral (一 … 九) is immediately followed by 、 or . and that pair is listed in
// xuhao, the punctuation rune is treated as a separator.
// Pieces of at most one byte are dropped; the remaining segments are numbered
// from 1 in order of appearance.
func DivideSegment(txt string) []*util.Segment {
// Split first. tmpstr is state shared across callback invocations: it holds the
// decimal code point of the previous candidate rune, or "<prev>_<cur>" once a
// pair has been formed.
tmpstr := ""
_segs := strings.FieldsFunc(txt, func(r rune) bool {
// Candidate runes: the numerals 一二三 (19968, 20108, 19977), 、 (12289), . (46)
// and the numerals 四五六七八九 (22235, 20116, 20845, 19971, 20843, 20061).
if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
if tmpstr == "" {
// First candidate rune: remember it.
tmpstr += fmt.Sprint(r)
return false
} else if strings.Contains(tmpstr, "_") {
// A pair was already formed earlier: start over with this rune.
tmpstr = ""
tmpstr += fmt.Sprint(r)
return false
} else if tmpstr == fmt.Sprint(r) {
// Same rune twice in a row; repeated punctuation resets the state.
if r == 46 || r == 12289 {
tmpstr = ""
}
return false
}
// Form the pair "<prev>_<cur>" and split here if it is a known serial marker.
tmpstr += "_" + fmt.Sprint(r)
if xuhao[tmpstr] {
return true
}
}
// Reached for non-candidate runes and for pairs not listed in xuhao: reset the
// state and split only on LF (10) / CR (13).
tmpstr = ""
return r == 10 || r == 13
})
// Then drop blank or single-byte pieces.
segs := make([]*util.Segment, 0)
_index := 0
for _, seg := range _segs {
if seg != " " && len(seg) > 1 {
_seg := util.Segment{}
_index = _index + 1
_seg.Index = _index
_seg.Text = seg
segs = append(segs, &_seg)
}
}
return segs
}
// tagsToBlocks assigns block.Tag from block.Tags and resolves tag conflicts
// with the blocks collected so far: for every tag value of block, any earlier
// block that currently holds the same tag with a lower weight has that tag
// switched off. Blocks without tags are left untouched.
func tagsToBlocks(blocks []*util.Block, block *util.Block) {
	if len(block.Tags) == 0 {
		return
	}
	ownTags := map[string]bool{}
	ownWeights := map[string]int{}
	for _, group := range block.Tags {
		for _, t := range group {
			ownTags[t.Value] = true
			ownWeights[t.Value] = t.Weight
		}
	}
	for name := range ownTags {
		for _, earlier := range blocks {
			if !earlier.Tag[name] {
				continue
			}
			for _, group := range earlier.Tags {
				for _, t := range group {
					if t.Value == name && t.Weight < ownWeights[name] {
						earlier.Tag[name] = false
					}
				}
			}
		}
	}
	block.Tag = ownTags
}
// filterTitle normalises a block title. Titles that look like a sentence
// (contain both a comma and a Chinese full stop) or exceed 30 runes yield "".
// Otherwise whitespace, bracketed text, punctuation, a leading serial number
// and a trailing "等" are removed, in that order.
func filterTitle(title string) string {
	looksLikeSentence := strings.Contains(title, ",") && strings.Contains(title, "。")
	if looksLikeSentence || len([]rune(title)) > 30 {
		return ""
	}
	cleaners := []*regexp.Regexp{
		regReplAllSpace,  // whitespace
		regFilterTitle,   // content inside paired brackets
		regReplAllSymbol, // special symbols
		regFilterNumber,  // leading serial number
		regFilter,        // trailing "等"
	}
	for _, re := range cleaners {
		title = re.ReplaceAllString(title, "")
	}
	return title
}
// FindPackageFromBlocks looks for bid packages inside already divided blocks.
// For each block, text matching regPackageFilter and tables are removed first;
// blocks with no remaining text are skipped. The packages found are then
// completed from their key/values (budget, bid amount, winner).
func FindPackageFromBlocks(blocks *[]*util.Block, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
	blockPackage = make(map[string]*util.BlockPackage)
	for _, blk := range *blocks {
		plain := TextAfterRemoveTable(regPackageFilter.ReplaceAllString(blk.Text, ""))
		if plain == "" {
			continue
		}
		// Accuracy is taken from whether the block is tagged "中标单位".
		divisionPackageChild(&blockPackage, plain, blk.Title, true, blk.Tag["中标单位"], isSite, codeSite)
	}
	for name, pkg := range blockPackage {
		findWinnerBugetBidmountByKv(pkg, blockPackage, name)
	}
	return blockPackage
}
// kvMoneyAmount converts a raw key/value string into an amount through
// clear.ObjToMoney. ok is false when no amount (float64 or int) was produced.
// isTrue is the trailing flag of the conversion result, or false when that
// element is not a bool.
func kvMoneyAmount(value string) (amount float64, isTrue bool, ok bool) {
	moneys := clear.ObjToMoney([]interface{}{value, ""})
	if len(moneys) == 0 {
		return 0, false, false
	}
	switch n := moneys[0].(type) {
	case float64:
		amount = n
	case int:
		amount = float64(n)
	default:
		return 0, false, false
	}
	// Checked assertion: an unexpected element type must not panic.
	isTrue, _ = moneys[len(moneys)-1].(bool)
	return amount, isTrue, true
}

// findWinnerBugetBidmountByKv fills Budget, Bidamount and Winner of package k
// from the package's tagged key/values, colon key/values first and then space
// key/values. A field is only filled while it is still unset on v. For colon
// key/values that match none of the known tags, a few fallback patterns try to
// recover the winner from the value or from the package text.
// Tags with an empty value list are skipped.
func findWinnerBugetBidmountByKv(v *util.BlockPackage, blockPackage map[string]*util.BlockPackage, k string) {
	if v.ColonKV != nil && v.ColonKV.KvTags != nil {
		for kc, cv := range v.ColonKV.KvTags {
			if len(cv) == 0 {
				continue
			}
			first := cv[0].Value
			if kc == "预算" && v.Budget <= 0 {
				if amount, isTrue, ok := kvMoneyAmount(first); ok {
					blockPackage[k].Budget = amount
					blockPackage[k].IsTrueBudget = isTrue
				}
			} else if (kc == "中标金额" || kc == "各包中标/成交候选供应商及报价") && v.Bidamount <= 0 {
				if amount, isTrue, ok := kvMoneyAmount(first); ok {
					blockPackage[k].Bidamount = amount
					blockPackage[k].IsTrueBidamount = isTrue
				}
			} else if (kc == "中标单位" || kc == "第1 名" || kc == "各包中标/成交候选供应商及报价") && v.Winner == "" {
				if !unPackageWinnerReg.MatchString(first) {
					// Prefer a value whose own key is exactly "中标单位".
					isW := false
					if len(cv) > 1 {
						for _, vcv := range cv {
							if vcv.Key == "中标单位" && vcv.Value != "" {
								isW = true
								blockPackage[k].Winner = vcv.Value
								break
							}
						}
					}
					if !isW {
						blockPackage[k].Winner = first
					}
				}
			} else {
				// Special cases: recover the winner from free-form values.
				res := conformWinnerKVReg.FindAllStringSubmatch(first, -1)
				if len(res) > 0 {
					if text := res[0][2]; text != "" {
						blockPackage[k].Winner = text
						continue
					}
				}
				if kc == "中标信息" && conformWinnerKVReg1.MatchString(first) {
					blockPackage[k].Winner = first
					continue
				}
				if conformWinnerKVReg2.MatchString(first) {
					blockPackage[k].Winner = conformWinnerKVReg2.ReplaceAllString(first, "${2}")
					continue
				}
				// Last resort: search the whole package text.
				res = conformWinnerTextReg3.FindAllStringSubmatch(v.Text, -1)
				if len(res) > 0 {
					if text := res[0][1]; text != "" {
						blockPackage[k].Winner = text
						continue
					}
				}
			}
		}
	}
	if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
		for kc, cv := range v.SpaceKV.KvTags {
			if len(cv) == 0 {
				continue
			}
			first := cv[0].Value
			if kc == "预算" && v.Budget <= 0 {
				if amount, isTrue, ok := kvMoneyAmount(first); ok {
					blockPackage[k].Budget = amount
					blockPackage[k].IsTrueBudget = isTrue
				}
			} else if kc == "中标金额" && v.Bidamount <= 0 {
				if amount, isTrue, ok := kvMoneyAmount(first); ok {
					blockPackage[k].Bidamount = amount
					blockPackage[k].IsTrueBidamount = isTrue
				}
			} else if kc == "中标单位" && v.Winner == "" {
				blockPackage[k].Winner = first
			}
		}
	}
}
// FindPackageFromText looks for bid packages directly in the full text of an
// announcement (no block information, so accuracy is false) and completes the
// packages found from their key/values.
func FindPackageFromText(title string, content string, isSite bool, codeSite string) (blockPackage map[string]*util.BlockPackage) {
	blockPackage = make(map[string]*util.BlockPackage)
	divisionPackageChild(&blockPackage, content, title, true, false, isSite, codeSite)
	for name, pkg := range blockPackage {
		findWinnerBugetBidmountByKv(pkg, blockPackage, name)
	}
	return blockPackage
}
// divisionPackageChild detects bid packages in content and adds them to
// *blockPackage, keyed by the converted package number.
//
// Steps: (1) neutralise phrases that merely look like package names,
// (2) let CheckMultiPackage find the package names, (3) put every package name
// on its own line, (4) attach a key that precedes a package name to that
// package, (5) cut the text belonging to each package via interceptText,
// (6) parse colon / space key/values per package, merging repeated packages.
//
// It returns (false, "") when no package is found, and otherwise true together
// with the text that was not assigned to any package.
// isFindWinnerOrder is only referenced by the commented-out code at the end.
func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool, isSite bool, codeSite string) (bool, string) {
// Remove or break up phrases that contain package-like words before checking
// whether the text has packages at all.
content = regFJWarap.ReplaceAllString(content, "\n")
content = regAZWarap.ReplaceAllString(content, "\n")
content = regStrWrap.ReplaceAllString(content, "\n")
content = regMoreWrap.ReplaceAllString(content, "\n")
content = regEndWrap.ReplaceAllString(content, "")
content = regBZJWarap.ReplaceAllString(content, "")
// Targeted rewrites of known layouts.
content = packageReg1.ReplaceAllString(content,"${1}\n中标单位:${2}\n")
content = packageReg2.ReplaceAllString(content,"\n标段${1}:")
con, pkg, flag := CheckMultiPackage(content, title) // find the package names
if !flag {
return false, ""
}
// util.Debug(con)
// util.Debug(pkg)
// Insert a line break before every package name.
appendWarpIndex := []int{} // byte offsets of package names inside con
for _, v := range pkg {
// A single package name at the very end of the text is not a package.
if len(pkg) == 1 && strings.HasSuffix(con, v[0]) {
return false, ""
}
// NOTE(review): v[0] is compiled as a regex without regexp.QuoteMeta; a name
// containing metacharacters would change the pattern or panic — confirm.
is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
for _, sv := range is {
appendWarpIndex = append(appendWarpIndex, sv[0])
}
}
appendWarpIndex = getPkgIndex(appendWarpIndex)
conTemp := ""
for k, v := range appendWarpIndex {
if k == 0 {
conTemp += con[:v] + "\n"
} else {
conTemp += "\n" + con[appendWarpIndex[k-1]:v]
}
if k == len(appendWarpIndex)-1 {
conTemp += "\n" + con[v:]
}
}
// When appendWarpIndex is empty, con becomes "" here.
con = conTemp
con = replSerial.ReplaceAllString(con, "\n")
con = regMoreWrap.ReplaceAllString(con, "\n")
// Locate every package name in the re-flowed text.
indexMap := map[int]int{} // match start -> match end
indexKeyStringMap := map[int]string{} // match start -> key found in front of it
indexKeyIntMap := map[int]int{} // match start -> end of the key match
indexs := []int{} // all match starts
startEndMap := map[int]int{} // match start -> start of the key in front of it
pkgIndexMap := map[string][]int{} // package name -> match starts
indexPkgMap := map[int]string{} // match start -> package name
// Sub-headings.
titleindexs := indexTile.FindAllStringIndex(con, -1)
if len(titleindexs) == 0 {
titleindexs = indexTile2.FindAllStringIndex(con, -1)
}
// Walk the packages; a key written in front of a package name is recorded so
// that it can be applied to the text after the name.
for _, v := range pkg {
pgflag := v[0] + "[::]*"
is := regexp.MustCompile(pgflag).FindAllStringIndex(con, -1)
for _, sv := range is {
indexMap[sv[0]] = sv[1]
indexs = append(indexs, sv[0])
pkgIndexMap[v[0]] = append(pkgIndexMap[v[0]], sv[0])
indexPkgMap[sv[0]] = v[0]
}
// Key in front of the package name, at the start of a line.
keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
if len(keys) == 0 {
// Key in front of the package name, ending with a colon and a line break.
keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
}
if len(keys) == 0 {
keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
}
for _, key := range keys {
// key[4]:key[5] is capture group 2 (the key text); key[5] is where the
// package name starts, key[1] is the end of the whole match.
startEndMap[key[5]] = key[4]
//
headkey := con[key[4]:key[5]]
headkey = regReplAllSpace.ReplaceAllString(headkey, "")
if !regDivision.MatchString(headkey) {
headkey += ":"
}
headkey = moreColonReg.ReplaceAllString(headkey, ":")
// With several colons, keep only the part between the last two.
colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
if len(colonIndexs) > 1 {
headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
}
indexKeyStringMap[key[5]] = headkey
indexKeyIntMap[key[5]] = key[1]
}
}
indexs = getPkgIndex(indexs)
// A package without a key of its own inherits the key of the previous package.
for ik, iv := range indexs {
if indexKeyStringMap[iv] != "" {
continue
}
if indexKeyIntMap[iv] == indexMap[iv] {
continue
}
if ik > 0 {
indexKeyStringMap[iv] = indexKeyStringMap[indexs[ik-1]]
}
}
// Cut the text that belongs to each package.
surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
// Parse the content of each package into key/values.
for _, iv := range indexs {
text := indexTextMap[iv]
tmptext := text
// Limit the package text to the allowed number of lines; the rest is surplus.
warpIndex := regSpliteSegment.FindAllStringIndex(text, -1)
if len(indexWarpMap) > 0 {
maxWarpCount = indexWarpMap[iv]
}
if maxWarpCount > 0 && len(warpIndex) >= 5 && len(warpIndex) > maxWarpCount {
textTemp := text
text = textTemp[:warpIndex[maxWarpCount-1][1]]
surplusText += textTemp[warpIndex[maxWarpCount-1][0]:]
}
for bk, bv := range pkg {
// The text belongs to the package whose name it starts with.
if !strings.HasPrefix(text, bv[0]) {
continue
}
index := util.PackageNumberConvert(bk)
// Strip the package name (and trailing colons) from the text.
text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
// NOTE(review): strings.TrimLeft takes a cutset, not a prefix, and the second
// and third comparisons are identical as written — strings.TrimPrefix was
// presumably intended; confirm before changing.
if strings.TrimLeft(tmptext, bv[0]) == text || strings.TrimLeft(tmptext, bv[0]+":") == text || strings.TrimLeft(tmptext, bv[0]+":") == text {
// Use the sub-heading that precedes this package as the key of its text,
// falling back to the block title.
var tagtitle string
for i, v := range titleindexs {
if i == 0 {
continue
}
if v[0] > iv {
tagtitle = con[titleindexs[i-1][0]:titleindexs[i-1][1]]
break
}
}
tagtitle = regReplAllSpace2.ReplaceAllString(tagtitle, "")
if tagtitle == "" {
tagtitle = title
} else if strings.Contains(tagtitle, bv[0]) && title != "" {
tagtitle = title
}
text = tagtitle + ":" + text
}
headKey := ""
if indexKeyStringMap[iv] != "" {
//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
headKey = indexKeyStringMap[iv]
//}
// Only the first recorded position of this package name is cleared.
for _, pkgIndexMap_v := range pkgIndexMap[bv[0]] {
delete(indexKeyStringMap, pkgIndexMap_v)
break
}
}
// The same package can appear several times in one block: merge them.
if (*blockPackage)[index] != nil {
// Merge the text.
(*blockPackage)[index].Text += "\n" + text
// Merge the colon key/values.
// NOTE(review): TrimLeft(text, headKey) uses headKey as a cutset — see above.
colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1, isSite, codeSite)
if headKey != "" {
kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
MergeKvTags(colonJobKv.KvTags, kvAgain.KvTags)
}
MergeKvTags((*blockPackage)[index].ColonKV.KvTags, colonJobKv.KvTags)
// Merge the space key/values.
spaceJobKv := SspacekvEntity.Entrance(text, headKey, nil, isSite, codeSite)
MergeKvTags((*blockPackage)[index].SpaceKV.KvTags, spaceJobKv.KvTags)
} else {
newBpkg := &util.BlockPackage{
Origin: bk,
Text: text,
Index: index,
Name: bv[0],
Type: bv[1],
Accuracy: accuracy,
}
//fmt.Println(text)
finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4, isSite, codeSite)
if headKey != "" {
kvAgain := GetKVAll(text, "", nil, 4, isSite, codeSite)
MergeKvTags(finalKv.KvTags, kvAgain.KvTags)
}
// Store the parsed key/values on the new package.
newBpkg.ColonKV = finalKv
newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil, isSite, codeSite)
(*blockPackage)[index] = newBpkg
}
}
}
// Winner ranking (disabled).
//if isFindWinnerOrder && blockPackage != nil && len(*blockPackage) > 0 {
// for _, v := range *blockPackage {
// v.WinnerOrder = winnerOrderEntity.Find(v.Text, true, 2, isSite, codeSite)
// }
//}
return true, surplusText
}
// getPkgIndex sorts indexs in place and returns the offsets that are more than
// 10 apart from their predecessor in the sorted list (the first offset is
// always kept). When every offset after the first is within 10 of its
// predecessor, the offsets are treated as one cluster and an empty slice is
// returned. The result is never nil.
func getPkgIndex(indexs []int) []int {
	sort.Ints(indexs)
	kept := []int{}
	dropped := 0
	for i, cur := range indexs {
		tooClose := i > 0 && cur-indexs[i-1] <= 10
		if tooClose {
			dropped++
		} else {
			kept = append(kept, cur)
		}
	}
	allClustered := dropped > 0 && dropped == len(indexs)-1
	if allClustered {
		return []int{}
	}
	return kept
}
// interceptText cuts out the text that belongs to each package occurrence.
//
// indexs holds the sorted start offsets of package names inside con,
// indexPkgMap maps an offset to its package name, pkgIndexMap maps a package
// name to its offsets, and startEndMap maps an offset to the start of a key that
// was found in front of that package name.
//
// It returns, in order: the text in front of the first package, the largest
// line-break count of any package text, the text per offset, and a per-offset
// line-break limit. The limit map is empty unless the package names were found
// to repeat in a regular cycle (pkgLaw "AB").
func interceptText(indexs []int, indexPkgMap map[int]string, pkgIndexMap map[string][]int, startEndMap map[int]int, con string) (string, int, map[int]string, map[int]int) {
//util.Debug(con)
surplusText := ""
indexTextMap := map[int]string{}
indexWarpMap := map[int]int{}
maxWarpCount := 0
for ik, iv := range indexs {
// The package text runs up to the next package name, or up to the key that
// precedes the next package name when one was recorded.
text := ""
if ik < len(indexs)-1 {
if startEndMap[indexs[ik+1]] != 0 {
text = con[iv:startEndMap[indexs[ik+1]]]
} else {
text = con[iv:indexs[ik+1]]
}
} else {
text = con[iv:]
}
//fmt.Println(text)
tmptext := text
//if strings.Contains(text, "、") {
// text = strings.Split(text, "、")[0]
//} else
if strings.Contains(text, "\n") {
texts := strings.Split(text, "\n")
text2 := ""
if ik+1 < len(indexs)-1 {
// text2 is the text of the following package.
// NOTE(review): in the first branch startEndMap[indexs[ik+1]] may be 0
// (no key recorded), which would slice from the start of con — confirm.
if startEndMap[indexs[ik+1+1]] != 0 {
text2 = con[startEndMap[indexs[ik+1]]:startEndMap[indexs[ik+1+1]]]
} else {
text2 = con[indexs[ik+1]:indexs[ik+1+1]]
}
// If the last line of this package equals the following package's text,
// keep only the first line.
if texts[len(texts)-1] == text2 {
text = texts[0]
}
}
}
// A result shorter than 5 runes falls back to the untrimmed text.
if utf8.RuneCountInString(text) < 5 {
indexTextMap[iv] = tmptext
} else {
indexTextMap[iv] = text
}
warpCount := len(regSpliteSegment.FindAllStringIndex(text, -1))
if warpCount > maxWarpCount {
maxWarpCount = warpCount
}
indexWarpMap[iv] = warpCount
// Everything in front of the first package is surplus text.
if ik == 0 {
surplusText += con[:iv]
}
}
pkgLaw := ""
if len(pkgIndexMap) > 1 {
// Package names appearing in a regular cycle: AB or ABAB.
if pkgLaw == "" {
prevVal := ""
notRepeatCount, currentIndex, onceMax, allMax := 0, -1, 0, 0
indexMaxMap := map[int]int{}
for ik, iv := range indexs {
// A full cycle was completed on the previous iteration: start a new one.
if notRepeatCount == len(pkgIndexMap) {
notRepeatCount = 0
}
if prevVal != indexPkgMap[iv] {
notRepeatCount++
} else {
// The same package name twice in a row breaks the cycle.
notRepeatCount = -1
currentIndex = ik
break
}
prevVal = indexPkgMap[iv]
// At the end of a cycle, record the largest line count seen so far in it.
if notRepeatCount == len(pkgIndexMap) {
indexMaxMap[iv] = onceMax
onceMax = 0
}
if indexWarpMap[iv] > onceMax {
onceMax = indexWarpMap[iv]
allMax = onceMax
}
// The list ended in the middle of a cycle.
if ik == len(indexs)-1 && notRepeatCount != len(pkgIndexMap) {
notRepeatCount = -2
currentIndex = ik
}
}
//util.Debug(allMax, currentIndex, indexWarpMap, indexMaxMap)
if len(indexMaxMap) > 0 {
pkgLaw = "AB"
thisMax := 0
// Walk backwards so each offset gets the limit recorded at the end of its
// cycle; offsets at or after the break position get allMax.
for ik := len(indexs) - 1; ik >= 0; ik-- {
iv := indexs[ik]
if currentIndex != -1 && ik >= currentIndex {
indexWarpMap[iv] = allMax
continue
}
if indexMaxMap[iv] > 0 {
thisMax = indexMaxMap[iv]
}
indexWarpMap[iv] = thisMax
}
}
}
}
// Without a detected cycle the per-offset limits are not used.
if pkgLaw == "" {
indexWarpMap = map[int]int{}
}
//util.Debug(pkgLaw, maxWarpCount, indexTextMap, indexWarpMap)
return surplusText, maxWarpCount, indexTextMap, indexWarpMap
}
// kvAfterDivideBlock divides text into blocks and returns the colon key/values
// of all blocks, concatenated in block order. The result is never nil.
func kvAfterDivideBlock(tp, text string, from int, ruleBlock *util.RuleBlock, isSite bool, codeSite string) []*util.Kv {
	kvs := []*util.Kv{}
	blocks, _ := DivideBlock(tp, text, from, ruleBlock, isSite, codeSite)
	for i := range blocks {
		kvs = append(kvs, blocks[i].ColonKV.Kvs...)
	}
	return kvs
}