/** 信息预处理入口 **/ package pretreated import ( "encoding/json" "fmt" "jy/clear" "jy/util" "regexp" "strings" "github.com/PuerkitoBio/goquery" ) var yjReg *regexp.Regexp = regexp.MustCompile("(打分表|负责人|单位|个人|投标人|项目|企业)业绩|主要人员相关资料|投标文件格式|唱标记录|否决投标的?情况说明") var hisReg = regexp.MustCompile("(开标记录|类似业绩|历史业绩|填报项目业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?()") var hisReg2 = regexp.MustCompile("(开标记录|业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?(||)") var formattext = regexp.MustCompile("(投标总价)([0-9,.万元]*)") var formattext2 = regexp.MustCompile("中标单价.*(中标总价.*)") var formattext3 = regexp.MustCompile("(同类项目业绩、|[1-9].[0-9]包段划分)") func AnalyStart(job *util.Job, isSite bool, codeSite string) { con := job.Content //全文的需要修复表格 con = RepairCon(con) //格式化正文 -断点 con = formattext3.ReplaceAllString(con,"") con = hisReg.ReplaceAllString(con, "${2}") con = hisReg2.ReplaceAllString(con, "${2}") con = formattext.ReplaceAllString(con, "${1}:${2}") con = formattext2.ReplaceAllString(con, "${1}") con = formatText(con, "all") job.Content = con //计算表格占比,返回表格数组、占比 tabs, _ := ComputeConRatio(con, 1) /*if len(tabs) > 0 { newcon, newtabs, newration := FindBigText(con, ration, tabs) if newcon != "" { con = newcon con = formatText(con, "all") tabs = newtabs ration = newration } }*/ job.BlockPackage = map[string]*util.BlockPackage{} //分块+处理每块kv blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite) if len(blockArrays) > 0 { //有分块 //从块里面找分包-文本 if !job.IsFile { job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) //从块里面找分包 } for _, bl := range blockArrays { //log.Println(bl.Text) if len([]rune(bl.Text)) > 80 { bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock, isSite, codeSite) for _, bl_bl := range bl.Block { processTableInBlock(bl_bl, job, isSite, codeSite) } } FindProjectCode(bl.Text, job) //匹配项目编号 processTableInBlock(bl, job, isSite, codeSite) //处理表格 //新加 未分块table中未能解析到中标候选人,从正文中解析 if job.Winnerorder == nil || len(job.Winnerorder) == 0 { bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite) job.Winnerorder = bl.Winnerorder } job.Block = append(job.Block, bl) } } else { //未分块,创建分块 //log.Println(con) bl := &util.Block{} newCon := con //log.Println(con) if len(tabs) > 0 { //解析表格逻辑 job.HasTable = 1 //添加标识:文本中有table newCon = TextAfterRemoveTable(con) //log.Println(newCon) if newCon != "" { job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite) } for i := 0; i < len(tabs); i++ { blockTag := "" if len(tabs[i].Nodes) > 0 { if tabs[i].Nodes[0].PrevSibling != nil { blockTag = tabs[i].Nodes[0].PrevSibling.Data } } //添加标识:文本中有table //blockTag - 块标签 //处理表格 tabres := AnalyTableV2(tabs[i], job.Category, blockTag, con, 1, job.SourceMid, job.RuleBlock, isSite, codeSite) //解析表格入口 返回:汇总表格对象 processTableResult(tabres, bl, job, isSite, codeSite) } } else { //从正文里面找分包 job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite) } bl.Text = HtmlToText(con) //log.Println(bl.Text) FindProjectCode(bl.Text, job) //匹配项目编号 if yjReg.MatchString(bl.Text) { if strings.Index(bl.Text, "业绩") > 1 { bl.Text = bl.Text[:strings.Index(bl.Text, "业绩")] } } //调用kv解析库-处理detail bl.Text = formatText(bl.Text, "all") //处理 : bl.ColonKV = GetKVAll(bl.Text, "", nil, 1, isSite, codeSite) //处理空格 bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil, isSite, codeSite) //新加 未分块table中未能解析到中标候选人,从正文中解析 if job.Winnerorder == nil || len(job.Winnerorder) == 0 { bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite) job.Winnerorder = bl.Winnerorder } job.Block = append(job.Block, bl) } } func processTableInBlock(bl *util.Block, job *util.Job, isSite bool, codeSite string) { //块中再查找表格(块,处理完把值赋到块) //bl.Text = formatText(bl.Text, "biangeng") tabs, _ := ComputeConRatio(bl.Text, 2) for i, tab := range tabs { job.HasTable = 1 tmptag := "" if i == 0 && bl.Title != "" && len(bl.Title) < 20 { tmptag = bl.Title } else if tab.Nodes[0] != nil && tab.Nodes[0].PrevSibling != nil { tmptag = strings.TrimSpace(tab.Nodes[0].PrevSibling.Data) } //添加标识:文本中有table tabres := AnalyTableV2(tab, job.Category, tmptag, tab.Text(), 2, job.SourceMid, job.RuleBlock, isSite, codeSite) //解析表格入口 返回:汇总表格对象 //if packageFlag { // tabres.PackageMap = nil // tabres.IsMultiPackage = false //} processTableResult(tabres, bl, job, isSite, codeSite) //分析table解析结果 if bl.Title == "" && tabres.BlockTag != "" { bl.Title = tabres.BlockTag } } } //匹配项目编号 func FindProjectCode(newCon string, job *util.Job) { newCon = HtmlToText(newCon) if strings.TrimSpace(newCon) == "" { return } var proCode string blCode := &util.Block{} /* if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的 //5d424bdfa5cb26b9b7ac7a85 //5d425a48a5cb26b9b7df5fec //5d425506a5cb26b9b7cd2c3c splitStr := strings.Split(newConTMP, " ") if len(splitStr) >= 2 { if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 { newCon = "项目编号:" + splitStr[len(splitStr)-1] } else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" { //5d4253f3a5cb26b9b7ca2662 newCon = "项目编号:" + tmpstr } } else if len(splitStr) == 1 { if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" { newCon = "项目编号:" + tmpstr } else if strings.Contains(newConTMP, "、") { tmpstrs := strings.Split(newCon, "、") newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1] } } } proCode = projectcodeReg.FindString(newCon) if proCode != "" { ckv := GetKVAll(proCode, job.Title, nil, 1) blCode.ColonKV = ckv blCode.Text = proCode job.Block = append(job.Block, blCode) } else if proCode = projectcodeReg2.FindString(newCon); proCode != "" { ckv := GetKVAll(proCode, job.Title, nil, 1) blCode.ColonKV = ckv blCode.Text = proCode job.Block = append(job.Block, blCode) } else if proCode = projectcodeReg3.FindString(newCon); proCode != "" { ckv := GetKVAll(proCode, job.Title, nil, 1) blCode.Text = proCode blCode.ColonKV = ckv job.Block = append(job.Block, blCode) }*/ if proCode = jsonReg.FindString(newCon); proCode != "" { jsonMap := make(map[string]string) json.Unmarshal([]byte(proCode), &jsonMap) jobKv := util.NewJobKv() kvTags := map[string][]*util.Tag{} for k, v := range jsonMap { kvTags[k] = append(kvTags[k], &util.Tag{Key: k, Value: v}) tmpkv := new(util.Kv) tmpkv.Line = k + v tmpkv.Key = k tmpkv.Value = v jobKv.Kvs = append(jobKv.Kvs, tmpkv) } jobKv.KvTags = kvTags blCode.ColonKV = jobKv job.Block = append(job.Block, blCode) } } //分析table解析结果 func processTableResult(tabres *TableResult, block *util.Block, job *util.Job, isSite bool, codeSite string) { //解析结果中的kv if block.TableKV == nil { block.TableKV = util.NewJobKv() } MergeKvTags(block.TableKV.KvTags, tabres.KvTags) isorderwiner := true //分包 tablePackage := map[string]*util.BlockPackage{} if tabres.IsMultiPackage && !job.IsFile { //分包中的map for _, v := range tabres.PackageMap.Keys { blockPackage, ok := tabres.PackageMap.Map[v].(*util.BlockPackage) if !ok { continue } //解析kv //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配 labelKVs := []*util.Kv{} if blockPackage.TableKV != nil { for tk, tv := range blockPackage.TableKV.KvTags { for _, tvv := range tv { if regReplKey.MatchString(tk) || regSplit.MatchString(tk) { labelKVs = append(labelKVs, &util.Kv{ Key: tk, Value: tvv.Value, }) } } } } else { blockPackage.TableKV = util.NewJobKv() } MergeKvTags(blockPackage.TableKV.KvTags, GetKvTags(labelKVs, "", nil, isSite, codeSite)) if blockPackage.WinnerOrder != nil && len(blockPackage.WinnerOrder) > 0 { for i, v := range blockPackage.WinnerOrder { if entName, ok := v["entname"].(string); ok { v["entname"] = winnerOrderEntity.clear("中标单位", entName) if i == 0 && blockPackage.Winner == "" { blockPackage.Winner = fmt.Sprint(v["entname"]) } if price, ok := v["price"].(string); ok && len(price) < 30 && len(price) > 0 && !clearnum.MatchString(price) { v["price"] = winnerOrderEntity.clear("中标金额", price) if !blockPackage.IsTrueBidamount { moneys := clear.ObjToMoney([]interface{}{v["price"], ""}, job.SpiderCode, job.IsClearnMoney) if len(moneys) > 0 { if vf, ok := moneys[0].(float64); ok { blockPackage.Bidamount = vf blockPackage.IsTrueBidamount = moneys[len(moneys)-1].(bool) } else if vi, ok := moneys[0].(int); ok { blockPackage.Bidamount = float64(vi) blockPackage.IsTrueBidamount = moneys[len(moneys)-1].(bool) } } } } v["type"] = tabres.Toptype + "_" + tabres.BlockTag + "_" + blockPackage.Origin job.Winnerorder = append(job.Winnerorder, v) } } isorderwiner = false } tablePackage[v] = blockPackage } } //处理中标人排序 if isorderwiner { tmpWins := make(map[string]int) for _, v := range job.Winnerorder { if v["entname"] != nil && v["entname"] != "" { tmpWins[v["entname"].(string)] = v["sort"].(int) } } wror := []map[string]interface{}{} if len(tmpWins) == 0 && len(tabres.WinnerOrder) > 0 { for _, v := range tabres.WinnerOrder { if entName, ok := v["entname"].(string); ok { v["entname"] = winnerOrderEntity.clear("中标单位", entName) if price, ok := v["price"].(string); ok { v["price"] = winnerOrderEntity.clear("中标金额", price) } v["type"] = tabres.Toptype + "_" + tabres.BlockTag wror = append(wror, v) } } } else { for _, v := range tabres.WinnerOrder { if entName, ok := v["entname"].(string); ok { v["entname"] = winnerOrderEntity.clear("中标单位", entName) if v["entname"] == "" { continue } if price, ok := v["price"].(string); ok { v["price"] = winnerOrderEntity.clear("中标金额", price) } v["type"] = tabres.Toptype + "_" + tabres.BlockTag if tmpWins[v["entname"].(string)] == v["sort"].(int) && v["price"] == nil { continue } else if tmpWins[v["entname"].(string)] != v["sort"].(int) && v["type"] != tabres.BlockTag { wror = append(wror, v) continue } else if tmpWins[v["entname"].(string)] > 0 && tmpWins[v["entname"].(string)] == v["sort"].(int) && v["price"] != nil { if tmpWins[v["entname"].(string)]-1 >= 0 && len(job.Winnerorder) > 0 { if len(job.Winnerorder) > (tmpWins[v["entname"].(string)] - 1) { job.Winnerorder[tmpWins[v["entname"].(string)]-1] = v } continue } } } } } if len(wror) > 0 { job.Winnerorder = append(job.Winnerorder, wror...) block.Winnerorder = job.Winnerorder } } //分包 if len(tablePackage) > 0 && !job.IsFile { pkgMap := map[string]*util.BlockPackage{} for tk, tv := range tablePackage { bv := job.BlockPackage[tk] if bv == nil { pkgMap[tk] = tv continue } bv.Text += "\n" + tv.Text /************table中的分包替换块里面找到的****************/ // if tv.ColonKV != nil { if bv.ColonKV == nil { bv.ColonKV = util.NewJobKv() } MergeKvTags(bv.ColonKV.KvTags, tv.ColonKV.KvTags) } // if tv.TableKV != nil { if bv.TableKV == nil { bv.TableKV = util.NewJobKv() } MergeKvTags(bv.TableKV.KvTags, tv.TableKV.KvTags) } // if tv.Origin != "" { bv.Origin = tv.Origin } // if tv.Index != "" { bv.Index = tv.Index } // if tv.Type != "" { bv.Type = tv.Type } // if tv.BidStatus != "" { bv.BidStatus = tv.BidStatus } // if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 { bv.WinnerOrder = tv.WinnerOrder } if tv.Bidamount >= 0 && tv.IsTrueBidamount { bv.Bidamount = tv.Bidamount bv.IsTrueBidamount = tv.IsTrueBidamount } if tv.Budget >= 0 && tv.IsTrueBudget { bv.Budget = tv.Budget bv.IsTrueBudget = tv.IsTrueBudget } } for k, v := range pkgMap { job.BlockPackage[k] = v } } //增加brand if tabres.HasKey != 0 { job.HasKey = tabres.HasKey } if tabres.HasBrand != 0 { job.HasBrand = tabres.HasBrand } if tabres.HasGoods != 0 { job.HasGoods = tabres.HasGoods } job.HasGoods = tabres.HasGoods if len(tabres.BrandData) > 0 { //分块table合并 for _, v := range tabres.BrandData { job.BrandData = append(job.BrandData, v) //加入job } } //加入job if len(tabres.PriceNumberData) > 0 { for _, tabledata := range tabres.PriceNumberData { //校验重复的table对象 job.PriceNumberData = append(job.PriceNumberData, tabledata) } } } //一行多列 一列多行,按照分块逻辑处理 //ration==1 遍历所有tabs,ration!=1 tabs只有一个 func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string { if len(tabs) != 1 { return "" //5c2aca5ea5cb26b9b7a8229b } for _, tab := range tabs { content := "" tbody := tab.ChildrenFiltered("tbody,thead") var tr *goquery.Selection if tbody.Length() == 1 { tr = tbody.ChildrenFiltered("tr") } else { tr = tab.ChildrenFiltered("tr") } if tr.Length() == 1 { tds := tr.ChildrenFiltered("td") tds.Each(func(index int, sn *goquery.Selection) { ret, _ := sn.Html() if strings.TrimSpace(ret) != "" { content += ret + "\n" } }) } else { flag := true tr.EachWithBreak(func(index int, sn *goquery.Selection) bool { th := sn.ChildrenFiltered("th") td := sn.ChildrenFiltered("td") if th.Length() > 0 || td.Length() > 1 { flag = false return false } else if td.Length() == 1 { ret, _ := td.Html() if strings.TrimSpace(ret) != "" { content += ret + "\n" } } return true }) if !flag { return "" } } if content != "" { content = regMoreWrap.ReplaceAllString(content, "\n") content = regEndWrap.ReplaceAllString(content, "") doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) doc.Find("table").Eq(0).ReplaceWithHtml(content) con, _ = doc.Find("body").Html() } } return con } //查找大文本,5次 func FindBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) { content = tableDivideBlock(con, r, t) if content == "" { return } for i := 0; i < 4; i++ { if content != "" { tabs, ration = ComputeConRatio(content, 1) if len(tabs) > 0 { con := tableDivideBlock(content, ration, tabs) if con == "" { return } else { content = con } } else { doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) content = doc.Text() return } } else { return } } return }