123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495 |
// Package pretreated is the entry point for information preprocessing.
- package pretreated
- import (
- "encoding/json"
- "fmt"
- "jy/clear"
- "jy/util"
- "regexp"
- "strings"
- "github.com/PuerkitoBio/goquery"
- )
// Text-cleanup patterns applied to the announcement body before it is split
// into blocks (see AnalyStart).
var (
	// yjReg reports whether the text contains a past-performance ("业绩")
	// or bid-document boilerplate heading.
	yjReg = regexp.MustCompile("(打分表|负责人|单位|个人|投标人|项目|企业)业绩|主要人员相关资料|投标文件格式|唱标记录|否决投标的?情况说明")

	// hisReg matches a bid-opening / past-performance / scoring heading and
	// everything after it up to the closing </td>; group 2 is that tag.
	hisReg = regexp.MustCompile("(开标记录|类似业绩|历史业绩|填报项目业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?(</td>)")

	// hisReg2 is the broader variant of hisReg: it also triggers on a bare
	// "业绩" and stops at </tr>, </table> or </td> (group 2).
	hisReg2 = regexp.MustCompile("(开标记录|业绩|[得评]+[审打]{0,2}分情况|无效标)[::\n]*.*?[\n]?(</tr>|</table>|</td>)")

	// formattext captures "投标总价" (group 1) and the amount glued directly
	// after it (group 2) so a separator can be inserted between them.
	formattext = regexp.MustCompile("(投标总价)([0-9,.万元]*)")

	// formattext2 matches a "中标单价…中标总价…" run; group 1 is the part
	// starting at "中标总价".
	formattext2 = regexp.MustCompile("中标单价.*(中标总价.*)")

	// formattext3 matches fragments that are removed outright from the body.
	formattext3 = regexp.MustCompile("(同类项目业绩、|[1-9].[0-9]包段划分)")
)
- func AnalyStart(job *util.Job, isSite bool, codeSite string) {
- con := job.Content
- //全文的需要修复表格
- con = RepairCon(con)
- //格式化正文 -断点
- con = formattext3.ReplaceAllString(con,"")
- con = hisReg.ReplaceAllString(con, "${2}")
- con = hisReg2.ReplaceAllString(con, "${2}")
- con = formattext.ReplaceAllString(con, "${1}:${2}")
- con = formattext2.ReplaceAllString(con, "${1}")
- con = formatText(con, "all")
- job.Content = con
- //计算表格占比,返回表格数组、占比
- tabs, _ := ComputeConRatio(con, 1)
- /*if len(tabs) > 0 {
- newcon, newtabs, newration := FindBigText(con, ration, tabs)
- if newcon != "" {
- con = newcon
- con = formatText(con, "all")
- tabs = newtabs
- ration = newration
- }
- }*/
- job.BlockPackage = map[string]*util.BlockPackage{}
- //分块+处理每块kv
- blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite)
- if len(blockArrays) > 0 { //有分块
- //从块里面找分包-文本
- if !job.IsFile {
- job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) //从块里面找分包
- }
- for _, bl := range blockArrays {
- //log.Println(bl.Text)
- if len([]rune(bl.Text)) > 80 {
- bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock, isSite, codeSite)
- for _, bl_bl := range bl.Block {
- processTableInBlock(bl_bl, job, isSite, codeSite)
- }
- }
- FindProjectCode(bl.Text, job) //匹配项目编号
- processTableInBlock(bl, job, isSite, codeSite) //处理表格
- //新加 未分块table中未能解析到中标候选人,从正文中解析
- if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
- bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
- job.Winnerorder = bl.Winnerorder
- }
- job.Block = append(job.Block, bl)
- }
- } else { //未分块,创建分块
- //log.Println(con)
- bl := &util.Block{}
- newCon := con
- //log.Println(con)
- if len(tabs) > 0 { //解析表格逻辑
- job.HasTable = 1 //添加标识:文本中有table
- newCon = TextAfterRemoveTable(con)
- //log.Println(newCon)
- if newCon != "" {
- job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite)
- }
- for i := 0; i < len(tabs); i++ {
- blockTag := ""
- if len(tabs[i].Nodes) > 0 {
- if tabs[i].Nodes[0].PrevSibling != nil {
- blockTag = tabs[i].Nodes[0].PrevSibling.Data
- }
- }
- //添加标识:文本中有table
- //blockTag - 块标签
- //处理表格
- tabres := AnalyTableV2(tabs[i], job.Category, blockTag, con, 1, job.SourceMid, job.RuleBlock, isSite, codeSite) //解析表格入口 返回:汇总表格对象
- processTableResult(tabres, bl, job, isSite, codeSite)
- }
- } else {
- //从正文里面找分包
- job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite)
- }
- bl.Text = HtmlToText(con)
- //log.Println(bl.Text)
- FindProjectCode(bl.Text, job) //匹配项目编号
- if yjReg.MatchString(bl.Text) {
- if strings.Index(bl.Text, "业绩") > 1 {
- bl.Text = bl.Text[:strings.Index(bl.Text, "业绩")]
- }
- }
- //调用kv解析库-处理detail
- bl.Text = formatText(bl.Text, "all")
- //处理 :
- bl.ColonKV = GetKVAll(bl.Text, "", nil, 1, isSite, codeSite)
- //处理空格
- bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil, isSite, codeSite)
- //新加 未分块table中未能解析到中标候选人,从正文中解析
- if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
- bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
- job.Winnerorder = bl.Winnerorder
- }
- job.Block = append(job.Block, bl)
- }
- }
- func processTableInBlock(bl *util.Block, job *util.Job, isSite bool, codeSite string) {
- //块中再查找表格(块,处理完把值赋到块)
- //bl.Text = formatText(bl.Text, "biangeng")
- tabs, _ := ComputeConRatio(bl.Text, 2)
- for i, tab := range tabs {
- job.HasTable = 1
- tmptag := ""
- if i == 0 && bl.Title != "" && len(bl.Title) < 20 {
- tmptag = bl.Title
- } else if tab.Nodes[0] != nil && tab.Nodes[0].PrevSibling != nil {
- tmptag = strings.TrimSpace(tab.Nodes[0].PrevSibling.Data)
- }
- //添加标识:文本中有table
- tabres := AnalyTableV2(tab, job.Category, tmptag, tab.Text(), 2, job.SourceMid, job.RuleBlock, isSite, codeSite) //解析表格入口 返回:汇总表格对象
- //if packageFlag {
- // tabres.PackageMap = nil
- // tabres.IsMultiPackage = false
- //}
- processTableResult(tabres, bl, job, isSite, codeSite) //分析table解析结果
- if bl.Title == "" && tabres.BlockTag != "" {
- bl.Title = tabres.BlockTag
- }
- }
- }
- //匹配项目编号
- func FindProjectCode(newCon string, job *util.Job) {
- newCon = HtmlToText(newCon)
- if strings.TrimSpace(newCon) == "" {
- return
- }
- var proCode string
- blCode := &util.Block{}
- /* if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
- //5d424bdfa5cb26b9b7ac7a85
- //5d425a48a5cb26b9b7df5fec
- //5d425506a5cb26b9b7cd2c3c
- splitStr := strings.Split(newConTMP, " ")
- if len(splitStr) >= 2 {
- if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
- newCon = "项目编号:" + splitStr[len(splitStr)-1]
- } else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
- //5d4253f3a5cb26b9b7ca2662
- newCon = "项目编号:" + tmpstr
- }
- } else if len(splitStr) == 1 {
- if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
- newCon = "项目编号:" + tmpstr
- } else if strings.Contains(newConTMP, "、") {
- tmpstrs := strings.Split(newCon, "、")
- newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
- }
- }
- }
- proCode = projectcodeReg.FindString(newCon)
- if proCode != "" {
- ckv := GetKVAll(proCode, job.Title, nil, 1)
- blCode.ColonKV = ckv
- blCode.Text = proCode
- job.Block = append(job.Block, blCode)
- } else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
- ckv := GetKVAll(proCode, job.Title, nil, 1)
- blCode.ColonKV = ckv
- blCode.Text = proCode
- job.Block = append(job.Block, blCode)
- } else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
- ckv := GetKVAll(proCode, job.Title, nil, 1)
- blCode.Text = proCode
- blCode.ColonKV = ckv
- job.Block = append(job.Block, blCode)
- }*/
- if proCode = jsonReg.FindString(newCon); proCode != "" {
- jsonMap := make(map[string]string)
- json.Unmarshal([]byte(proCode), &jsonMap)
- jobKv := util.NewJobKv()
- kvTags := map[string][]*util.Tag{}
- for k, v := range jsonMap {
- kvTags[k] = append(kvTags[k], &util.Tag{Key: k, Value: v})
- tmpkv := new(util.Kv)
- tmpkv.Line = k + v
- tmpkv.Key = k
- tmpkv.Value = v
- jobKv.Kvs = append(jobKv.Kvs, tmpkv)
- }
- jobKv.KvTags = kvTags
- blCode.ColonKV = jobKv
- job.Block = append(job.Block, blCode)
- }
- }
- //分析table解析结果
- func processTableResult(tabres *TableResult, block *util.Block, job *util.Job, isSite bool, codeSite string) {
- //解析结果中的kv
- if block.TableKV == nil {
- block.TableKV = util.NewJobKv()
- }
- MergeKvTags(block.TableKV.KvTags, tabres.KvTags)
- isorderwiner := true
- //分包
- tablePackage := map[string]*util.BlockPackage{}
- if tabres.IsMultiPackage && !job.IsFile {
- //分包中的map
- for _, v := range tabres.PackageMap.Keys {
- blockPackage, ok := tabres.PackageMap.Map[v].(*util.BlockPackage)
- if !ok {
- continue
- }
- //解析kv
- //找到key是“包1中标单位”这种的key,过滤掉包1,再次到标签库中匹配
- labelKVs := []*util.Kv{}
- if blockPackage.TableKV != nil {
- for tk, tv := range blockPackage.TableKV.KvTags {
- for _, tvv := range tv {
- if regReplKey.MatchString(tk) || regSplit.MatchString(tk) {
- labelKVs = append(labelKVs, &util.Kv{
- Key: tk,
- Value: tvv.Value,
- })
- }
- }
- }
- } else {
- blockPackage.TableKV = util.NewJobKv()
- }
- MergeKvTags(blockPackage.TableKV.KvTags, GetKvTags(labelKVs, "", nil, isSite, codeSite))
- if blockPackage.WinnerOrder != nil && len(blockPackage.WinnerOrder) > 0 {
- for i, v := range blockPackage.WinnerOrder {
- if entName, ok := v["entname"].(string); ok {
- v["entname"] = winnerOrderEntity.clear("中标单位", entName)
- if i == 0 && blockPackage.Winner == "" {
- blockPackage.Winner = fmt.Sprint(v["entname"])
- }
- if price, ok := v["price"].(string); ok && len(price) < 30 && len(price) > 0 && !clearnum.MatchString(price) {
- v["price"] = winnerOrderEntity.clear("中标金额", price)
- if !blockPackage.IsTrueBidamount {
- moneys := clear.ObjToMoney([]interface{}{v["price"], ""}, job.SpiderCode, job.IsClearnMoney)
- if len(moneys) > 0 {
- if vf, ok := moneys[0].(float64); ok {
- blockPackage.Bidamount = vf
- blockPackage.IsTrueBidamount = moneys[len(moneys)-1].(bool)
- } else if vi, ok := moneys[0].(int); ok {
- blockPackage.Bidamount = float64(vi)
- blockPackage.IsTrueBidamount = moneys[len(moneys)-1].(bool)
- }
- }
- }
- }
- v["type"] = tabres.Toptype + "_" + tabres.BlockTag + "_" + blockPackage.Origin
- job.Winnerorder = append(job.Winnerorder, v)
- }
- }
- isorderwiner = false
- }
- tablePackage[v] = blockPackage
- }
- }
- //处理中标人排序
- if isorderwiner {
- tmpWins := make(map[string]int)
- for _, v := range job.Winnerorder {
- if v["entname"] != nil && v["entname"] != "" {
- tmpWins[v["entname"].(string)] = v["sort"].(int)
- }
- }
- wror := []map[string]interface{}{}
- if len(tmpWins) == 0 && len(tabres.WinnerOrder) > 0 {
- for _, v := range tabres.WinnerOrder {
- if entName, ok := v["entname"].(string); ok {
- v["entname"] = winnerOrderEntity.clear("中标单位", entName)
- if price, ok := v["price"].(string); ok {
- v["price"] = winnerOrderEntity.clear("中标金额", price)
- }
- v["type"] = tabres.Toptype + "_" + tabres.BlockTag
- wror = append(wror, v)
- }
- }
- } else {
- for _, v := range tabres.WinnerOrder {
- if entName, ok := v["entname"].(string); ok {
- v["entname"] = winnerOrderEntity.clear("中标单位", entName)
- if v["entname"] == "" {
- continue
- }
- if price, ok := v["price"].(string); ok {
- v["price"] = winnerOrderEntity.clear("中标金额", price)
- }
- v["type"] = tabres.Toptype + "_" + tabres.BlockTag
- if tmpWins[v["entname"].(string)] == v["sort"].(int) && v["price"] == nil {
- continue
- } else if tmpWins[v["entname"].(string)] != v["sort"].(int) && v["type"] != tabres.BlockTag {
- wror = append(wror, v)
- continue
- } else if tmpWins[v["entname"].(string)] > 0 && tmpWins[v["entname"].(string)] == v["sort"].(int) && v["price"] != nil {
- if tmpWins[v["entname"].(string)]-1 >= 0 && len(job.Winnerorder) > 0 {
- if len(job.Winnerorder) > (tmpWins[v["entname"].(string)] - 1) {
- job.Winnerorder[tmpWins[v["entname"].(string)]-1] = v
- }
- continue
- }
- }
- }
- }
- }
- if len(wror) > 0 {
- job.Winnerorder = append(job.Winnerorder, wror...)
- block.Winnerorder = job.Winnerorder
- }
- }
- //分包
- if len(tablePackage) > 0 && !job.IsFile {
- pkgMap := map[string]*util.BlockPackage{}
- for tk, tv := range tablePackage {
- bv := job.BlockPackage[tk]
- if bv == nil {
- pkgMap[tk] = tv
- continue
- }
- bv.Text += "\n" + tv.Text
- /************table中的分包替换块里面找到的****************/
- //
- if tv.ColonKV != nil {
- if bv.ColonKV == nil {
- bv.ColonKV = util.NewJobKv()
- }
- MergeKvTags(bv.ColonKV.KvTags, tv.ColonKV.KvTags)
- }
- //
- if tv.TableKV != nil {
- if bv.TableKV == nil {
- bv.TableKV = util.NewJobKv()
- }
- MergeKvTags(bv.TableKV.KvTags, tv.TableKV.KvTags)
- }
- //
- if tv.Origin != "" {
- bv.Origin = tv.Origin
- }
- //
- if tv.Index != "" {
- bv.Index = tv.Index
- }
- //
- if tv.Type != "" {
- bv.Type = tv.Type
- }
- //
- if tv.BidStatus != "" {
- bv.BidStatus = tv.BidStatus
- }
- //
- if tv.WinnerOrder != nil && len(tv.WinnerOrder) > 0 {
- bv.WinnerOrder = tv.WinnerOrder
- }
- if tv.Bidamount >= 0 && tv.IsTrueBidamount {
- bv.Bidamount = tv.Bidamount
- bv.IsTrueBidamount = tv.IsTrueBidamount
- }
- if tv.Budget >= 0 && tv.IsTrueBudget {
- bv.Budget = tv.Budget
- bv.IsTrueBudget = tv.IsTrueBudget
- }
- }
- for k, v := range pkgMap {
- job.BlockPackage[k] = v
- }
- }
- //增加brand
- if tabres.HasKey != 0 {
- job.HasKey = tabres.HasKey
- }
- if tabres.HasBrand != 0 {
- job.HasBrand = tabres.HasBrand
- }
- if tabres.HasGoods != 0 {
- job.HasGoods = tabres.HasGoods
- }
- job.HasGoods = tabres.HasGoods
- if len(tabres.BrandData) > 0 { //分块table合并
- for _, v := range tabres.BrandData {
- job.BrandData = append(job.BrandData, v) //加入job
- }
- }
- //加入job
- if len(tabres.PriceNumberData) > 0 {
- for _, tabledata := range tabres.PriceNumberData { //校验重复的table对象
- job.PriceNumberData = append(job.PriceNumberData, tabledata)
- }
- }
- }
- //一行多列 一列多行,按照分块逻辑处理
- //ration==1 遍历所有tabs,ration!=1 tabs只有一个
- func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
- if len(tabs) != 1 {
- return "" //5c2aca5ea5cb26b9b7a8229b
- }
- for _, tab := range tabs {
- content := ""
- tbody := tab.ChildrenFiltered("tbody,thead")
- var tr *goquery.Selection
- if tbody.Length() == 1 {
- tr = tbody.ChildrenFiltered("tr")
- } else {
- tr = tab.ChildrenFiltered("tr")
- }
- if tr.Length() == 1 {
- tds := tr.ChildrenFiltered("td")
- tds.Each(func(index int, sn *goquery.Selection) {
- ret, _ := sn.Html()
- if strings.TrimSpace(ret) != "" {
- content += ret + "\n"
- }
- })
- } else {
- flag := true
- tr.EachWithBreak(func(index int, sn *goquery.Selection) bool {
- th := sn.ChildrenFiltered("th")
- td := sn.ChildrenFiltered("td")
- if th.Length() > 0 || td.Length() > 1 {
- flag = false
- return false
- } else if td.Length() == 1 {
- ret, _ := td.Html()
- if strings.TrimSpace(ret) != "" {
- content += ret + "\n"
- }
- }
- return true
- })
- if !flag {
- return ""
- }
- }
- if content != "" {
- content = regMoreWrap.ReplaceAllString(content, "\n")
- content = regEndWrap.ReplaceAllString(content, "")
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
- doc.Find("table").Eq(0).ReplaceWithHtml(content)
- con, _ = doc.Find("body").Html()
- }
- }
- return con
- }
- //查找大文本,5次
- func FindBigText(con string, r float32, t []*goquery.Selection) (content string, tabs []*goquery.Selection, ration float32) {
- content = tableDivideBlock(con, r, t)
- if content == "" {
- return
- }
- for i := 0; i < 4; i++ {
- if content != "" {
- tabs, ration = ComputeConRatio(content, 1)
- if len(tabs) > 0 {
- con := tableDivideBlock(content, ration, tabs)
- if con == "" {
- return
- } else {
- content = con
- }
- } else {
- doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
- content = doc.Text()
- return
- }
- } else {
- return
- }
- }
- return
- }
|