/** 信息预处理入口 **/ package pretreated import ( "encoding/json" "jy/util" "strings" "github.com/PuerkitoBio/goquery" ) func AnalyStart(job *util.Job) { con := job.Content //全文的需要修复表格 con = RepairCon(con) //格式化正文 con = formatText(con, "all") job.Content = con //计算表格占比,返回表格数组、占比 tabs, ration := ComputeConRatio(con, 1) if len(tabs) > 0 { newcon, newtabs, newration := FindBigText(con, ration, tabs) if newcon != "" { con = newcon tabs = newtabs ration = newration } } blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块 if len(blockArrays) > 0 { //有分块 //从块里面找分包 job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包 for _, bl := range blockArrays { if len([]rune(bl.Text)) > 80 { bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock) for _, bl_bl := range bl.Block { processTableInBlock(bl_bl, job) } } processTableInBlock(bl, job) //新加 未分块table中未能解析到中标候选人,从正文中解析 if job.Winnerorder == nil || len(job.Winnerorder) == 0 { bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1) } job.Block = append(job.Block, bl) } } else { //未分块,创建分块 bl := &util.Block{} newCon := con if len(tabs) > 0 { //解析表格逻辑 job.HasTable = 1 //添加标识:文本中有table newCon = TextAfterRemoveTable(con) job.BlockPackage = FindPackageFromText(job.Title, newCon) for i := 0; i < len(tabs); i++ { //添加标识:文本中有table tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象 processTableResult(tabres, bl, job) //分析table解析结果 } // for k, v := range bl.TableKV.Kv { // log.Println("bl.TableKV.Kv", k, v) // } } else { //从正文里面找分包 job.BlockPackage = FindPackageFromText(job.Title, newCon) } FindProjectCode(newCon, job) //匹配项目编号 bl.Text = newCon //调用kv解析 bl.ColonKV = GetKVAll(newCon, "", nil, 1) bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil) //新加 未分块table中未能解析到中标候选人,从正文中解析 if job.Winnerorder == nil || len(job.Winnerorder) == 0 { bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1) } job.Block = append(job.Block, bl) } } func processTableInBlock(bl *util.Block, job *util.Job) { //块中再查找表格(块,处理完把值赋到块) tabs, _ := ComputeConRatio(bl.Text, 2) for _, tab := range tabs { job.HasTable = 1 //添加标识:文本中有table tabres := AnalyTableV2(tab, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象 processTableResult(tabres, bl, job) //分析table解析结果 if bl.Title == "" && tabres.BlockTag != "" { bl.Title = tabres.BlockTag } } } //匹配项目编号 func FindProjectCode(newCon string, job *util.Job) { newCon = TextAfterRemoveTable(newCon) if strings.TrimSpace(newCon) == "" { return } var proCode string proCode = projectcodeReg.FindString(newCon) blCode := &util.Block{} if proCode != "" { ckv := GetKVAll(proCode, job.Title, nil, 1) blCode.ColonKV = ckv blCode.Text = proCode job.Block = append(job.Block, blCode) } else if proCode = projectcodeReg2.FindString(newCon); proCode != "" { ckv := GetKVAll(proCode, job.Title, nil, 1) blCode.ColonKV = ckv blCode.Text = proCode job.Block = append(job.Block, blCode) } else if proCode = projectcodeReg3.FindString(newCon); proCode != "" { ckv := GetKVAll(proCode, job.Title, nil, 1) blCode.Text = proCode blCode.ColonKV = ckv job.Block = append(job.Block, blCode) } if proCode = jsonReg.FindString(newCon); proCode != "" { jsonMap := make(map[string]string) json.Unmarshal([]byte(proCode), &jsonMap) jobKv := util.NewJobKv() kvTags := map[string][]*util.Tag{} for k, v := range jsonMap { kvTags[k] = append(kvTags[k], &util.Tag{Key: k, Value: v}) tmpkv := new(util.Kv) tmpkv.Line = k + v tmpkv.Key = k tmpkv.Value = v jobKv.Kvs = append(jobKv.Kvs, tmpkv) } jobKv.KvTags = kvTags blCode.ColonKV = jobKv job.Block = append(job.Block, blCode) } } //分析table解析结果 func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) { //解析结果中的kv block.TableKV = &util.JobKv{KvTags: tabres.KvTags} //分包 tablePackage := map[string]*util.BlockPackage{} if tabres.IsMultiPackage { //分包中的map for k, v := range tabres.PackageMap.Map { blockPackage, ok := v.(*util.BlockPackage) if !ok { continue } //解析kv //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配 labelKVs := []*util.Kv{} if blockPackage.TableKV != nil && len(blockPackage.TableKV.KvTags) > 0 { for tk, tv := range blockPackage.TableKV.KvTags { for _, tvv := range tv { if regReplKey.MatchString(tk) || regSplit.MatchString(tk) { labelKVs = append(labelKVs, &util.Kv{ Key: tk, Value: tvv.Value, }) } } } } blockPackage.TableKV.KvTags = GetKvTags(labelKVs, "", nil) tablePackage[k] = blockPackage } } //处理中标人排序 wror := []map[string]interface{}{} for _, v := range tabres.WinnerOrder { entName, _ := v["entname"].(string) v["entname"] = winnerOrderEntity.clear("中标单位", entName) if price, ok := v["price"].(string); ok { v["price"] = winnerOrderEntity.clear("中标金额", price) } v["type"] = 2 wror = append(wror, v) } if len(wror) > 0 { job.Winnerorder = wror } //分包 if len(tablePackage) > 0 { pkgMap := map[string]*util.BlockPackage{} for tk, tv := range tablePackage { bv := job.BlockPackage[tk] if bv == nil { pkgMap[tk] = tv continue } bv.Text += "\n" + tv.Text /************table中的分包替换块里面找到的****************/ // if tv.ColonKV != nil { if bv.ColonKV == nil { bv.ColonKV = util.NewJobKv() } MergeKvTags(bv.ColonKV.KvTags, tv.ColonKV.KvTags) } // if tv.TableKV != nil { if bv.TableKV == nil { bv.TableKV = util.NewJobKv() } MergeKvTags(bv.TableKV.KvTags, tv.TableKV.KvTags) } // if tv.Origin != "" { bv.Origin = tv.Origin } // if tv.Index != "" { bv.Index = tv.Index } // if tv.Type != "" { bv.Type = tv.Type } // if tv.BidStatus != "" { bv.BidStatus = tv.BidStatus } // if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 { bv.WinnerOrder = tv.WinnerOrder } } for k, v := range pkgMap { job.BlockPackage[k] = v } } //增加brand if tabres.HasKey != 0 { job.HasKey = tabres.HasKey } if tabres.HasBrand != 0 { job.HasBrand = tabres.HasBrand } if tabres.HasGoods != 0 { job.HasGoods = tabres.HasGoods } job.HasGoods = tabres.HasGoods if len(tabres.BrandData) > 0 { //分块table合并 for _, v := range tabres.BrandData { job.BrandData = append(job.BrandData, v) //加入job } } } //一行多列 一列多行,按照分块逻辑处理 //ration==1 遍历所有tabs,ration!=1 tabs只有一个 func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string { if len(tabs) != 1 { return "" //5c2aca5ea5cb26b9b7a8229b } for _, tab := range tabs { content := "" tbody := tab.ChildrenFiltered("tbody,thead") var tr *goquery.Selection if tbody.Length() == 1 { tr = tbody.ChildrenFiltered("tr") } else { tr = tab.ChildrenFiltered("tr") } if tr.Length() == 1 { tds := tr.ChildrenFiltered("td") tds.Each(func(index int, sn *goquery.Selection) { ret, _ := sn.Html() if strings.TrimSpace(ret) != "" { content += ret + "\n" } }) } else { flag := true tr.EachWithBreak(func(index int, sn *goquery.Selection) bool { th := sn.ChildrenFiltered("th") td := sn.ChildrenFiltered("td") if th.Length() > 0 || td.Length() > 1 { flag = false return false } else if td.Length() == 1 { ret, _ := td.Html() if strings.TrimSpace(ret) != "" { content += ret + "\n" } } return true }) if !flag { return "" } } if content != "" { content = regMoreWrap.ReplaceAllString(content, "\n") content = regEndWrap.ReplaceAllString(content, "") doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) doc.Find("table").Eq(0).ReplaceWithHtml(content) con, _ = doc.Find("body").Html() } } return con } //查找大文本,5次 func FindBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) { content = tableDivideBlock(con, r, t) if content == "" { return } for i := 0; i < 4; i++ { if content != "" { tabs, ration = ComputeConRatio(content, 1) if len(tabs) > 0 { con := tableDivideBlock(content, ration, tabs) if con == "" { return } else { content = con } } else { doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) content = doc.Text() return } } else { return } } return }