package pretreated import ( "fmt" u "jy/util" qutil "qfw/util" "regexp" "strings" "github.com/PuerkitoBio/goquery" ) /** 全局变量,主要是一堆判断正则 **/ var ( //清理品目中数字 numclear = regexp.MustCompile("^[\\d一二三四五六七八九十.]+") //清理表格title中的不需要的内容 tabletitleclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/((人民币万元件个公斤))]") //清理表格中是key中包含的空格或数字等 tablekeyclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/]+|^[\\d一二三四五六七八九十]+[、.]*|[((【\\[].*?[))】\\]]") //清理表格td中的符号 tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、,。、_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕¥$]*") //判断key是金额,对万元的处理 moneyreg = regexp.MustCompile("(预算|费|价|额|规模|投资)") //根据表格的内容判断是不是表头,如果含有金额则不是表头 MoneyReg = regexp.MustCompile("^[\\s ::0-9.万元()()人民币¥$]+$") //判断分包时 moneyNum = regexp.MustCompile("[元整¥万]") //对隐藏表格的判断 display = regexp.MustCompile("(?i).*?display\\s?[:]\\s?none.*") //--------------- //求是分包的概率 //根据表格的标签对分包进行打分 TableMultiPackageReg_4 = regexp.MustCompile("(标段|分包|包段|划分|子包|标包|合同段)") TableMultiPackageReg_2 = regexp.MustCompile("(概况|范围|情况|内容|详细|结果|信息)") //在判断分包打分前过虑表格key FilterKey_2 = regexp.MustCompile("招标|投标|项目") //根据表格的key进行分包打分 FindKey_2 = regexp.MustCompile("([分子][包标](号)?|标[号项段包](划分)?|包件?[号段名数])") //对值进行分包判断 FindVal_1 = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[ \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)") FindVal2_1 = regexp.MustCompile("([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)|^(设计|施工|监理|验收)[分子]?[标包]?[段号]?$") //判断分包前排除 excludeKey = regexp.MustCompile("(涉及包号|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号)") //编号|划分 //------------- cut = u.NewCut() //清理表格标签正则 ClearTagReg = regexp.MustCompile("<[^>]*?>|[\\s\\n\\r]*$") //查找表格标签正则 ttagreg = regexp.MustCompile("(?s)([^\\n::。,;\\s\u3000\u2003\u00a0]{2,30})[::]?[^::。;!\\n]{0,35}[\\s\\n]*$") //判断表格是表头的概率 checkval = float32(0.6) //tdval_reg = regexp.MustCompile(`([\p{Han}][\p{Han}\s、()\\(\\)]{1,9})[::]([^::\\n。]{5,60})(?:[;;,,.。\\n\\t\\s])?`) //空格替换 repSpace = regexp.MustCompile("[\\s\u3000\u2003\u00a0::]+|\\\\t+") //对表格kv的处理 //对不能标准化的key做批识 filter_tag_zb = regexp.MustCompile("(中标|成交|投标)[\\p{Han}]{0,6}(情况|结果|信息|明细)") //中标金额 //包含以下字眼做标准化处理 filter_zbje_k = regexp.MustCompile("(中标|成交|总|拦标|合同|供[应货]商|报)[\\p{Han}、]{0,6}(价|额|[大小]写|[万亿]?元).{0,4}$") //简单判断金额 filter_zbje_jd = regexp.MustCompile("^[^售]{0,4}(价|额).{0,4}$") //且排队以下字眼的key filter_zbje_kn = regexp.MustCompile("得分|打分|时间|业绩|须知|分$") //且值包含以下字眼 filter_zbje_v = regexp.MustCompile("[¥$$0-9一二三四五六七八九十,,〇零点..壹贰叁肆伍陆柒捌玖拾百佰千仟万亿億元圆角分整正()::大小写]{2,16}") //中标单位的处理 //包含以下字眼的Key标准化 filter_zbdw_ky = regexp.MustCompile("(中标|成交|拦标|合同|选中|投标|拟|预|最终)[\\p{Han}、]{0,6}(供[应货]商|企业|单位|人|机构)(名称)?.{0,4}$") //简单判断 filter_zbdw_jd = regexp.MustCompile("(投标|成交|中标|合同)(供应商|单位|人|名称).{0,4}$") //且不包含以下字眼 filter_zbdw_kn = regexp.MustCompile("第[2二3三4四5五]|得分|地址") //且值包含以下字眼 filter_zbdw_v = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$|([^购]中心|办公|用品)") //且值包含以下字眼 filter_zbdw_v2 = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$") //Tg = map[string]interface{}{} //一些表格没有表头,是空的,对值是排序的做处理对应 NullTxBid NullTdReg = regexp.MustCompile("(首选|第[一二三四五1-5])(中标|成交)?(名(称)?|(候选|排序)?(人|单位|供应商))") NullTxtBid = "成交供应商排名" projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$") MhSpilt = regexp.MustCompile("[::]") //识别采购单位联系人、联系电话、代理机构联系人、联系电话 ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|(征求意见|报名审核购买)?((联系人?|办公)?((电话([//]传真)?|手机)(号码)?|邮箱(地址)?|(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|(项目)?经办)人)|采购方代表") ContactInfoMustReg = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$") ContactType = map[string]*regexp.Regexp{ "采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|发布人?|甲|招标(服务)?|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方)|(项目|建(库|设))单位|招标人信息|采购中心地址|业主|收料人"), "代理机构": regexp.MustCompile("(代理|受托).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"), } ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$") MultipleValueSplitReg = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]") BuyerContacts = []string{"采购单位联系人", "采购单位联系电话", "采购单位联系地址"} FilterSerial = regexp.MustCompile(".+[、..::,]") filterTableWror = regexp.MustCompile("班子成员") underline = regexp.MustCompile("_+$") iswinnertabletag = regexp.MustCompile("(中标|候选人|成交|结果)") nswinnertabletag = regexp.MustCompile("[评得分估]+") ) //在解析时,判断表格元素是否隐藏 func IsHide(g *goquery.Selection) (b bool) { style, exists := g.Attr("style") if exists { b = display.MatchString(style) } return } //对表格的key进行标准化处理,多个k相同时,出现覆盖问题 //待扩展,暂不支持正则标签库 func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1 []string, weight []int, v1, returntag string, b bool) { k1 = []string{} weight = []int{} tk := k if sv, sok := v.(string); sok { //取KV v1 = sv } else if sv, sok := v.([]string); sok { //是数组先默认取第一个 v1 = sv[0] } //对值单位的处理 if moneyreg.MatchString(tk) { v1 += GetMoneyUnit(tk, v1) } //先清理key //u.Debug(1, k, v1) k = ClearKey(k, 2) //u.Debug(2, k) //取标准key res := u.GetTags(k) if len(res) == 0 && tk != k { res = u.GetTags(tk) } //log.Println(k, res) // if len(res) == 0 { // go u.AddtoNoMatchMap(tk) // } //当取到标准化值时,放入数组 if len(res) > 0 { b = true for _, t1 := range res { k1 = append(k1, t1.Value) weight = append(weight, t1.Weight) } //k1 = res[0].Value } //没有取到标准化key时,对中标金额和中标单位的逻辑处理 if !b { if filter_zbje_k.MatchString(k) && !filter_zbje_kn.MatchString(k) && filter_zbje_v.MatchString(v1) { if tabletag == "" { returntag = "中标情况" } k1 = append(k1, "中标金额") weight = append(weight, -100) b = true } else if filter_zbdw_ky.MatchString(k) && !filter_zbdw_kn.MatchString(k) && filter_zbdw_v.MatchString(v1) { k1 = append(k1, "中标单位") weight = append(weight, -100) if tabletag == "" { returntag = "中标情况" } b = true } } //对上一步没有取到标准化key的进一步处理 if !b { if tabletag == "" { } if filter_tag_zb.MatchString(tabletag) || filter_tag_zb.MatchString(tabledesc) { //u.Debug(v1, k, "-----", filter_zbdw_jd.MatchString(k), filter_zbdw_v.MatchString(v1)) if filter_zbje_jd.MatchString(k) && !filter_zbje_kn.MatchString(k) && filter_zbje_v.MatchString(v1) { k1 = append(k1, "中标金额") weight = append(weight, -100) b = true } /*else if filter_zbdw_jd.MatchString(k) && filter_zbdw_v.MatchString(v1) { k1 = append(k1, "中标单位") weight = append(weight, -100) b = true }*/ } } return } //对解析后的表格的kv进行过滤 func (table *Table) KVFilter() { //1.标准化值查找 //2.对数组的处理 //3.对分包的处理 //4.对KV的处理 //判断表格是否有用,调用abandontable正则数组进行判断 //遍历每一行 winnertag := iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.Tag) //table标签 if !winnertag { winnertag = iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.TableResult.BlockTag) //块标签 } for _, tr := range table.TRs { for _, td := range tr.TDs { //fmt.Println(td.BH, td.MustBH, td.Val, td.SortKV.Map) bc := false if !td.BH { //表头是否是无用内容 if td.HeadTd != nil { bc, _, _, _, _ = CheckCommon(td.HeadTd.Val, "abandontable") } } if !bc { //td元素有内嵌kv,遍历放入table的Kv中 if len(td.SortKV.Keys) > 0 { for _, k3 := range td.SortKV.Keys { _val := td.SortKV.Map[k3] //thisFlag := false if td.HeadTd != nil && len([]rune(k3)) < 4 { k3 = td.HeadTd.Val + k3 } if table.SortKV.Map[k3] == nil { //u.Debug(k3, _val) //if !thisFlag || (thisFlag && table.SortKV.Map[k3] == nil) { table.SortKV.AddKey(k3, _val) } } } } //td有子表格的处理 //u.Debug(td.BH, td.Val, td.SonTableResult) if td.SonTableResult != nil { //u.Debug(td.SonTableResult.SortKV.Map, "-------", td.SonTableResult.Tabs) for _, k3 := range td.SonTableResult.SortKV.Keys { if table.StandKV[k3] == "" || td.SonTableResult.SortKVWeight[k3] > table.StandKVWeight[k3] { table.StandKV[k3] = qutil.ObjToString(td.SonTableResult.SortKV.Map[k3]) table.StandKVWeight[k3] = td.SonTableResult.SortKVWeight[k3] } } //中标候选人排序 if table.WinnerOrder == nil || len(table.WinnerOrder) == 0 { table.WinnerOrder = td.SonTableResult.WinnerOrder } else { winnerOrderEntity.Merge(table.WinnerOrder, td.SonTableResult.WinnerOrder) } } } } as := NewSortMap() //表格描述处理,对成交结果的处理 for _, k := range table.SortKV.Keys { if regexp.MustCompile("(成交|中标|候选|排名|名次|供应商排序)").MatchString(k) { table.Desc += "成交结果," } } //遍历table.sortkv,进行过滤处理,并放入标准化KV中,如果值是数组跳到下一步处理 for _, k := range table.SortKV.Keys { if regexp.MustCompile("^单价").MatchString(k) { continue } v := table.SortKV.Map[k] if _, ok := v.(string); ok { k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "") k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) //qutil.Debug(k, v, k1, w1, v1, tag, b) if b { //降低冒号值的权重 if MhSpilt.MatchString(v1) { for pos, _ := range k1 { w1[pos] -= 50 } } if tag != "" && table.Tag == "" { table.Tag = tag } for pos, k2 := range k1 { if table.StandKV[k2] == "" || w1[pos] > table.StandKVWeight[k2] { table.StandKV[k2] = v1 //本节点 table.StandKVWeight[k2] = w1[pos] } // else if k2 == "中标金额" { // // u.Debug(qutil.Float64All(v1), qutil.Float64All(table.StandKV[k2])) // if qutil.Float64All(v1) > qutil.Float64All(table.StandKV[k2]) { // table.StandKV[k2] = v1 // } // } } } } else { //u.Debug(k, v, "---------") as.AddKey(k, v) } } //处理值是数组的kv放入标准化kv中 checkKey := map[int]bool{} for kn, k := range as.Keys { v := as.Map[k] if vm, ok := v.([]map[string]interface{}); ok && k == NullTxtBid { if table.WinnerOrder == nil { table.WinnerOrder = []map[string]interface{}{} } table.WinnerOrder = append(table.WinnerOrder, vm...) } else { //增加候选人排序逻辑 if table.WinnerOrder == nil && !checkKey[kn] { if vs1, ok := v.([]string); ok { smap := make([]map[string]interface{}, len(vs1)) for n1, _ := range vs1 { smap[n1] = map[string]interface{}{} } //hadSort := false tmpEntname := make([]string, len(vs1)) tmpPrice := make([]string, len(vs1)) for kn1, k := range as.Keys[kn:] { v := as.Map[k] if ContactType["采购单位"].MatchString(k) || ContactType["代理机构"].MatchString(k) { continue } //目前对数组数据的key做判断,但是某些额可以是不满足情况的 //载明内容:[第一中标候选人 第二中标候选人] id:5d00587da5cb26b9b75e367b if vs, ok := v.([]string); ok && len(vs) == len(vs1) { //数组值的个数相同 res, _, _, _, repl := CheckCommon(k, "bidorder") kv := "" if !res { kt := u.GetTags(filterThText.ReplaceAllString(ClearKey(k, 2), "")) if kt.Len() > 0 { kv = kt[0].Value } } //qutil.Debug(k, res, repl, kv, "--", vs) if !res && kv == "" { //key未验证出,验证数组的val值 checkKey[kn+kn1] = true if winnertag { //如果是中标信息 在根据val数组信息解析候选人 for vsk, vsv := range vs { if NullTdReg.MatchString(vsv) { //数据先验证val是否有排序 //hadSort = true smap[vsk]["sortstr"] = vsv smap[vsk]["sort"] = GetBidSort(vsv, vsk+1) } else if findCandidate2.MatchString(vsv) && tmpEntname[vsk] == "" { //数据验证val是否是候选人 entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string) if entname != "" { tmpEntname[vsk] = entname } } else { //验证val时如果数组中的第一条数据既不满足sort或者entname 判定此数组数据错误 break } } } } if res || kv != "" { //连续往下找几个key checkKey[kn+kn1] = true SORT: if repl == "sort" { //hadSort = true for vsk, vsv := range vs { smap[vsk]["sortstr"] = vsv smap[vsk]["sort"] = GetBidSort(vsv, vsk+1) } } else if repl == "entname" || kv == "中标单位" { for vsk, vsv := range vs { if winnerReg6.MatchString(vsv) { //k:中标候选人 v:["第一名","第二名"] repl = "sort" goto SORT } // if entname, _ := smap[vsk]["entname"].(string); entname != "" || len([]rune(vsv)) < 3 { // break // } // entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string) // if entname != "" { // smap[vsk]["entname"] = entname // if tmpEntname[vsk] != "" || len([]rune(vsv)) < 4 { //排除 单位:["台","个","套"] break } entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string) if entname != "" { tmpEntname[vsk] = entname } } } else if kv == "中标金额" { for vsk, vsv := range vs { //过滤price 2348273.432元(万元)-->2348273.432 //tmp1, _ := smap[vsk]["price"].(string) tmp1 := tmpPrice[vsk] p1num := numberReg2.FindString(tmp1) p2num := numberReg2.FindString(vsv) p1 := qutil.Float64All(p1num) p2 := qutil.Float64All(p2num) if p2 > p1 { //smap[vsk]["price"] = winnerOrderEntity.clear("中标金额", vsv+GetMoneyUnit(k, vsv)) price := winnerOrderEntity.clear("中标金额", vsv+GetMoneyUnit(k, vsv)) if pricestr, _ := price.(string); len(pricestr) < 30 && len(pricestr) > 0 { tmpPrice[vsk] = pricestr } } } } } } else { //break } } newSmap := []map[string]interface{}{} //qutil.Debug("smap=======", smap) //qutil.Debug("tmpEntname--", len(tmpEntname), tmpEntname) //qutil.Debug("tmpPrice--", len(tmpPrice), tmpPrice) for n, smap_v := range smap { //if hadSort { //有排序,再添加entname和price if len(tmpEntname) > 0 && n < len(tmpEntname) && tmpEntname[n] != "" { smap_v["entname"] = tmpEntname[n] if len(tmpPrice) > 0 && n < len(tmpPrice) && tmpPrice[n] != "" { smap_v["price"] = tmpPrice[n] } } //} else if len(tmpEntname) > 0 { //fmt.Println("table winnerorder only has entname", tmpEntname) //} if len(smap_v) > 2 { //只有排序信息 sort和sortstr newSmap = append(newSmap, smap_v) } } if len(newSmap) > 0 { table.WinnerOrder = newSmap } } } k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) if b { if tag != "" && table.Tag == "" { table.Tag = tag } for pos, k2 := range k1 { if table.StandKV[k2] == "" || w1[pos] > table.StandKVWeight[k2] { table.StandKV[k2] = v1 //本节点 table.StandKVWeight[k2] = w1[pos] } // else if k2 == "中标金额" { // if qutil.Float64All(v1) > qutil.Float64All(table.StandKV[k2]) { // table.StandKV[k2] = v1 // } // } } } } } // if filterTableWror.MatchString(table.Tag) { table.WinnerOrder = nil } // if len(table.WinnerOrder) > 0 || !table.BPackage { winnerOrder := []map[string]interface{}{} maxSort := 0 //调整顺序 for i := 0; i < 2; i++ { for _, v := range table.WinnerOrder { sortstr, _ := v["sortstr"].(string) if (i == 0 && sortstr == "") || (i == 1 && sortstr != "") { continue } sort, _ := v["sort"].(int) if i == 0 { if maxSort == 0 || sort > maxSort { maxSort = sort } } else { maxSort++ v["sort"] = maxSort } winnerOrder = append(winnerOrder, v) } if len(winnerOrder) == len(table.WinnerOrder) { break } } table.WinnerOrder = winnerOrder winnerOrder = []map[string]interface{}{} L: for _, tr := range table.TRs { for _, td := range tr.TDs { winnerOrder = winnerOrderEntity.Find(td.Val, true, 3) if len(winnerOrder) > 0 { break L } } } if len(table.WinnerOrder) > 0 { //中标候选人合并 winnerOrderEntity.Merge(table.WinnerOrder, winnerOrder) if table.StandKV["中标单位"] == "" { ent := table.WinnerOrder[0]["entname"] if ent != nil { table.StandKV["中标单位"], _ = ent.(string) table.StandKVWeight["中标单位"] = -25 } } } else if !table.BPackage { //没有table.WinnerOrder也没有分包 将td中的WinnerOrder赋值给table.WinnerOrder if len(winnerOrder) > 1 { table.WinnerOrder = winnerOrder } } } //对中标候选人进行排序 winnerOrderEntity.Order(table.WinnerOrder) //该表格有一个分包,并且有中标候选人排序的情况下,把中标候选人放到包里面 if table.BlockPackage != nil && table.BlockPackage.Keys != nil && len(table.BlockPackage.Keys) == 1 { if table.BlockPackage.Map != nil { onePkgKey := table.BlockPackage.Keys[0] onePkg, _ := table.BlockPackage.Map[onePkgKey].(*u.BlockPackage) if onePkg != nil && onePkg.WinnerOrder != nil && len(onePkg.WinnerOrder) == 0 { onePkg.WinnerOrder = table.WinnerOrder table.BlockPackage.Map[onePkgKey] = onePkg } } } } //表格结果合并到父表格集中 func (table *Table) MergerToTableresult() { //对多包表格的多包值的合并处理 if table.BPackage { table.TableResult.IsMultiPackage = true for k, v := range table.BlockPackage.Map { package1 := table.TableResult.PackageMap.Map[k] if package1 == nil { table.TableResult.PackageMap.AddKey(k, v) } else { bp := package1.(*u.BlockPackage) if bp.TableKV == nil { bp.TableKV = u.NewJobKv() } v1 := v.(*u.BlockPackage) if v1.TableKV != nil && v1.TableKV.Kv != nil { for k2, v2 := range v1.TableKV.Kv { if bp.TableKV == nil { bp.TableKV = u.NewJobKv() } if bp.TableKV.Kv[k2] == "" || (v1.TableKV.KvTag[k2] != nil && bp.TableKV.KvTag[k2] != nil && v1.TableKV.KvTag[k2].Weight > bp.TableKV.KvTag[k2].Weight) { //可能会报错 assignment to entry in nil map bp.TableKV.Kv[k2] = v2 bp.Text += fmt.Sprintf("%v:%v\n", k2, v2) } } } bp.WinnerOrder = v1.WinnerOrder //table.TableResult.PackageMap.AddKey(k, v) } } // str := "" // for _, k := range table.TableResult.PackageMap.Keys { // v := table.TableResult.PackageMap.Map[k].(*u.BlockPackage) // str += fmt.Sprintf("包号:%s,中标人:%s,中标价:%s,预算:%s,文本:%s,排名:%v ---\t", v.Index, v.TableKV["中标单位"]+v.ColonKV["中标单位"], v.TableKV["中标金额"]+v.ColonKV["中标金额"], v.TableKV["预算"]+v.ColonKV["预算"], v.Text, v.WinnerOrder) // } // u.Debug(table, table.TableResult, str) } //遍历标准key到tableresult.sortkv中 for k, v := range table.StandKV { if table.TableResult.SortKV.Map[k] == nil || table.StandKVWeight[k] > table.TableResult.SortKVWeight[k] || strings.Contains(table.Tag, "变更") { v = strings.Replace(v, "__", "", -1) if table.TableResult.SortKV.Map[k] == nil { table.TableResult.SortKV.AddKey(k, v) //父集 } else { table.TableResult.SortKV.ReplaceKey(k, v, k) } table.TableResult.SortKVWeight[k] = table.StandKVWeight[k] } else if table.TableResult.SortKV.Map[k] != nil { //u.Debug(k, v, table.TableResult.SortKV.Map[k], "..............") } } //表格的块标签 if table.TableResult.BlockTag == "" && table.Tag != "" { table.TableResult.BlockTag = table.Tag } //中标候选人(多个table,现在默认取第一个table的信息,考虑需不需要多个table分析合并数据?) if table.TableResult.WinnerOrder == nil || len(table.TableResult.WinnerOrder) == 0 { table.TableResult.WinnerOrder = table.WinnerOrder } //增加brand 并列table if len(table.BrandData) > 0 { for _, v := range table.BrandData { if len(v) > 0 { table.TableResult.BrandData = append(table.TableResult.BrandData, v) } } } } /** 解析表格入口 返回:汇总表格对象 **/ func AnalyTableV2(tabs []*goquery.Selection, toptype, blockTag, con string, itype int, _id interface{}) (tabres *TableResult) { defer qutil.Catch() //u.Debug(con) if itype == 1 { //修复表格 con = RepairCon(con) } //生成tableresult对象 tabres = NewTableResult(_id, toptype, blockTag, con, itype) //可以有多个table for _, table := range tabs { //隐藏表格跳过 if IsHide(table) { continue } tabres.GoqueryTabs = append(tabres.GoqueryTabs, table) } //解析表格集 tabres.Analy() return } //开始解析表格集 func (ts *TableResult) Analy() { tabs := []*Table{} contactFormat := &u.ContactFormat{ IndexMap: map[int]string{}, MatchMap: map[string]map[string]bool{}, } for _, table := range ts.GoqueryTabs { tn := NewTable(ts.Html, ts, table) //核心模块 ts := tn.Analy(contactFormat) for _, tab := range ts { tabs = append(tabs, tab) //fmt.Println("tab.SortKV.Map", tab.SortKV.Keys) } //tn.SonTables = append(tn.SonTables, tn) } //统一合并,考虑统一多表格是多包的情况---新增 if len(tabs) > 1 { pns := map[string]string{} pnarr := []string{} for _, table := range tabs { pn := table.StandKV["项目名称"] if pn != "" && TitleReg.MatchString(pn) { pnarr = append(pnarr, pn) matchres := TitleReg.FindAllStringSubmatch(pn, -1) if len(matchres) == 1 && len(matchres[0]) > 0 { v1 := u.PackageNumberConvert(matchres[0][0]) pns[v1] = matchres[0][0] bp := &u.BlockPackage{} bp.Index = v1 bp.Origin = matchres[0][0] bp.TableKV = u.NewJobKv() for _, k := range []string{"中标金额", "中标单位", "预算", "成交状态", "项目名称", "项目编号", "采购范围"} { bp.TableKV.Kv[k] = table.StandKV[k] } bp.WinnerOrder = table.WinnerOrder if table.BlockPackage.Map[v1] == nil { table.BPackage = true table.BlockPackage.AddKey(v1, bp) } } } } if len(tabs) == len(pns) { //多个表格,每个表格都是一个分包 http://www.cxzwfw.gov.cn/info/1009/6963.htm //项目名称、项目编号、采购单位、招标机构、预算 pname := projectnameReg.ReplaceAllString(pnarr[0], "") btrue := true for _, pn := range pnarr[1:] { pn = projectnameReg.ReplaceAllString(pn, "") //u.Debug(pn, pname) if pn != pname { //项目名称不一致 btrue = false break } } if btrue { ts.SortKV.AddKey("项目名称", pname) ts.SortKVWeight["项目名称"] = 100 for _, table := range tabs { table.BPackage = true //预算、中标金额、NullTxtBid成交供应商排名 中标单位 成交状态 if table.BlockPackage != nil && len(table.BlockPackage.Keys) == 1 { bp := table.BlockPackage.Map[table.BlockPackage.Keys[0]].(*u.BlockPackage) if table.TableResult.WinnerOrder != nil { bp.WinnerOrder = table.WinnerOrder } if bp != nil && table.StandKV != nil { if bp.TableKV == nil { bp.TableKV = u.NewJobKv() } for nk, k := range []string{"中标金额", "中标单位", "预算", "成交状态", "项目名称", "项目编号", "采购范围"} { bp.TableKV.Kv[k] = table.StandKV[k] if nk < 4 { delete(table.StandKV, k) } } } } } } } } for _, table := range tabs { table.MergerToTableresult() // for k, v := range table.TableResult.SortKV.Map { // qutil.Debug(k, "=====", v) // } } } //解析表格 func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table { //查找表体中的tr对象 trs := table.Goquery.ChildrenFiltered("tbody,thead,tfoot").ChildrenFiltered("tr") if trs.Size() == 0 { trs = table.Goquery.ChildrenFiltered("tr") } //num := 0 //遍历tr数组 trs.Each(func(n int, sel *goquery.Selection) { //隐藏行不处理 if IsHide(sel) { return } //遍历每行的td tds := sel.ChildrenFiltered("td,th") TR := NewTR(table) tdTextIsNull := true tds.Each(func(m int, selm *goquery.Selection) { //对隐藏列不处理!!! if IsHide(selm) { return } //进入每一个单元格 td := NewTD(selm, TR, table) //num++ TR.AddTD(td) if td.Val != "" { //删除一个tr,tr中所有td是空值的 tdTextIsNull = false } }) //tr中所有td的内容为空 将tr删除 if !tdTextIsNull { table.AddTR(TR) } }) //重置行列 table.ComputeRowColSpan() // for n, tr := range table.TRs { // for m, td := range tr.TDs { // qutil.Debug(td.BH, n, m, td.Text, td.StartRow, td.EndRow, td.StartCol, td.EndCol) // } // } tm := []map[string]interface{}{} tmk := map[string]bool{} tmn := map[int]map[string]interface{}{} for rownum, tr := range table.TRs { if len(tr.TDs) == 1 && table.ColNum > 1 { td := tr.TDs[0] if td.StartCol == 0 && td.EndCol+1 == table.ColNum && len([]rune(td.Val)) > 1 && len([]rune(td.Val)) < 50 { con, m1, b := CheckMultiPackage(td.Val, "") if b { for k, _ := range m1 { numstr := u.PackageNumberConvert(k) m2 := map[string]interface{}{ "tag": con, //"num": numstr, //"numtxt": v[0], "startrow": rownum, } tmk[numstr] = true tmn[rownum] = m2 tm = append(tm, m2) break } } } } } //拆表 ts := []*Table{} if len(tmk) > 1 && len(tmk) == len(tm) { var tab1 *Table for rownum, tr := range table.TRs { if tab1 == nil { tab1 = NewTable("", table.TableResult, table.Goquery) tab1.BSplit = true if tmn[rownum] != nil { tab1.StandKV["项目名称"] = tmn[rownum]["tag"].(string) tab1.StandKVWeight["项目名称"] = -100 } ts = append(ts, tab1) } if tmn[rownum] != nil { tab1.Tag = tmn[rownum]["tag"].(string) } else { tab1.AddTR(tr) } if tmn[rownum+1] != nil { tab1 = nil } } } else { ts = append(ts, table) } for n, table := range ts { if len(table.TRs) > 0 { //删除尾部空行 for len(table.TRs) > 0 { npos := len(table.TRs) tailTR := table.TRs[npos-1] //最后一个tr bspace := true for _, v := range tailTR.TDs { if v.Val != "" || v.SonTableResult != nil || len(v.SortKV.Keys) > 0 { bspace = false break } } if bspace { table.TRs = table.TRs[:npos-1] } else { break } } //table.Print() //校对表格 table.Adjust() //查找表格的标签 table.FindTag() //log.Println(table.TableResult.Id, table.Html) //分割表格 if table.BSplit { if !table.BHeader && n > 0 { for i := n - 1; i > -1; i-- { if ts[i].BHeader { if ts[i].BFirstRow { //取第一行插入到 table.InsertTR(ts[i].TRs[0]) table.Adjust() } break } } } } //对没有表头表格的处理 _, _, b := CheckMultiPackage(table.Tag, "") if b { table.StandKV["项目名称"] = table.Tag table.StandKVWeight["项目名称"] = -100 } table.TdContactFormat(contactFormat) //开始查找kv,核心模块 table.FindKV() //table中抽取品牌 if u.IsBrandGoods { table.analyBrand() } //判断是否是多包,并处理分包的 table.CheckMultiPackageByTable() str := "\n" for k, v := range table.StandKV { str += fmt.Sprintf("_==___%s:%v\n", k, v) if table.TableResult.SortKV.Map[k] == nil { table.TableResult.SortKV.AddKey(k, v) table.TableResult.SortKVWeight[k] = table.StandKVWeight[k] } } res, _, _, _, _ := CheckCommon(table.Tag, "abandontable") if !res { //过滤、标准化、合并kv table.KVFilter() } str = "\n" for k, v := range table.StandKV { str += fmt.Sprintf("_____%s:%v\n", k, v) if table.TableResult.SortKV.Map[k] == nil { table.TableResult.SortKV.AddKey(k, v) table.TableResult.SortKVWeight[k] = table.StandKVWeight[k] } } //u.Debug(str) } } return ts } func (table *Table) Adjust() { table.TDNum = func() int { n := 0 for _, tr := range table.TRs { n += len(tr.TDs) } return n }() //有多少行 table.RowNum = len(table.TRs) // for k1, tr := range table.TRs { // for k2, td := range tr.TDs { // qutil.Debug(k1, k2, td.Val, td.Rowspan, td.Colspan, td.ColPos, tr.RowPos) // } // } //计算行列起止位置,跨行跨列处理 table.ComputeRowColSpan() // for k1, tr := range table.TRs { // for k2, td := range tr.TDs { // qutil.Debug(k1, k2, td.Val, td.StartRow, td.EndRow, td.StartCol, td.EndCol) // } // } //大概计算每个起止行列的概率 table.GetKeyRation() /* for k, v := range table.StartAndEndRation { for k1, v1 := range v.Poss { bs, _ := json.Marshal(v1) str := "" for _, td := range v.Tdmap[v1] { str += "__" + td.Val + fmt.Sprintf("%d_%d_%d_%d", td.StartRow, td.EndRow, td.StartCol, td.EndCol) } qutil.Debug(k, k1, string(bs), v.Rationmap[v1], str) } } */ //u.Debug("tdnum:", num, table.RowNum, table.ColNum) //是否是规则的表格,单元各个数=行数*列数 table.Brule = table.TDNum == table.RowNum*table.ColNum count := 0 for _, trs := range table.TRs { for _, td := range trs.TDs { if td.BH { count++ } } } if float32(count)/float32(table.TDNum) < 0.85 { //精确计算起止行列是表头的概率 table.ComputeRowColIsKeyRation() bhead := false L: for i, tr := range table.TRs { for _, td := range tr.TDs { if td.BH { //qutil.Debug("----=====---", td.Val, len(table.TRs[len(table.TRs)-1].TDs), i, len(table.TRs)-1) if i == len(table.TRs)-1 && len(table.TRs[len(table.TRs)-1].TDs) == 2 { res, _, _, _, _ := CheckCommon(td.Val, "abandontable") if res { //删除此行 table.TRs = table.TRs[:len(table.TRs)-1] table.Adjust() return } } bhead = true break L } } } table.BHeader = bhead } } //计算行/列表格的结束位置 StartRow=0 EndRow=0 func (table *Table) ComputeRowColSpan() { mapRC := map[int]map[int]int{} //记录第几行pos,起始列对应的合并值 for k, v := range table.TRs { nk := 0 //nk列的起始,k行的起始||如果有合并,起始就不是0 ball := true rowspans := v.TDs[0].Rowspan //某一行第一个td的rowspan for _, v1 := range v.TDs { if v1.Rowspan != rowspans { ball = false break } } for _, v1 := range v.TDs { if ball { v1.Rowspan = 1 } mc := mapRC[k] for { if mc != nil && mc[nk] > 0 { nk += mc[nk] } else { break } } v1.StartCol = nk nk += v1.Colspan - 1 v1.EndCol = nk if nk >= table.ColNum { table.ColNum = nk + 1 } nk++ v1.StartRow = k v1.EndRow = k + v1.Rowspan - 1 ck := fmtkey("c", v1.StartCol, v1.EndCol) tdcs := table.StartAndEndRation[ck] if tdcs == nil { tdcs = NewTDRationScope(ck) table.StartAndEndRation[ck] = tdcs table.StartAndEndRationKSort.AddKey(ck, 1) } tdcs.Addtd(v1) rk := fmtkey("r", v1.StartRow, v1.EndRow) tdrs := table.StartAndEndRation[rk] if tdrs == nil { tdrs = NewTDRationScope(rk) table.StartAndEndRation[rk] = tdrs table.StartAndEndRationKSort.AddKey(rk, 1) } tdrs.Addtd(v1) if v1.Rowspan > 1 { for i := 1; i < v1.Rowspan; i++ { r := k + i if r < len(table.TRs) { mc := mapRC[r] if mc == nil { mc = map[int]int{} } mc[v1.StartCol] = v1.Colspan mapRC[r] = mc } } } } } } func fmtkey(t string, start, end int) string { return fmt.Sprintf("%s_%d_%d", t, start, end) } func (table *Table) FindTag() { //查找每个table的标签,如果有标签可按标签处理,否则根据表格去判断 if table.Tag != "" { return } t1, _ := table.Goquery.OuterHtml() html := table.Html pos := strings.Index(html, t1) if pos <= 0 { doc, _ := goquery.NewDocumentFromReader(strings.NewReader(table.Html)) html, _ = doc.Html() pos = strings.Index(html, t1) } //u.Debug("--------", t1, "====\n\n\n\n=====", html) if pos > 0 { tcon := html[:pos] tcon = cut.ClearHtml(tcon) tcon = ClearTagReg.ReplaceAllString(tcon, "") //u.Debug(pos, "-----------", tcon) strs := ttagreg.FindStringSubmatch(tcon) if len(strs) > 0 { table.Tag = strs[0] //u.Debug(table.Tag) } } if table.Tag == "" { table.Tag = table.TableResult.BlockTag } //u.Debug(table.Tag) } //计算r/c_start_end的概率 func (table *Table) GetKeyRation() { for _, vn := range table.StartAndEndRationKSort.Keys { v := table.StartAndEndRation[vn] for _, v1 := range v.Poss { count := 0 n := 0 for _, td := range v.Tdmap[v1] { n++ if td.BH { count++ } } v.Rationmap[v1] = float32(count) / float32(n) } } } //计算行列是表头的概率调用GetKeyRation func (table *Table) ComputeRowColIsKeyRation() { //增加对跨行校正限止 // u.Debug(table.Brule, table.ColNum, table.RowNum, table.TDNum) bkeyfirstrow := false bkeyfirstcol := false if table.Brule { //不存在跨行跨列的情况,规则表格 checkCompute := map[string]bool{} for k, tr := range table.TRs { rk := fmtkey("r", tr.TDs[0].StartRow, tr.TDs[0].EndRow) if k == 0 { //第1行的概率 ck := fmtkey("c", tr.TDs[0].StartCol, tr.TDs[0].EndCol) //u.Debug(table.BFirstRow, "--", table.StartAndEndRation[rk], table.StartAndEndRation[ck]) ration1, _ := table.StartAndEndRation[rk].GetTDRation(tr.TDs[0]) ration2, _ := table.StartAndEndRation[ck].GetTDRation(tr.TDs[0]) if (len(tr.TDs) == 2 && ration2 < 0.55) && (len(tr.TDs) == 2 && ration1 > 0.5) { //第一行为key bkeyfirstrow = true ball := true for _, td := range tr.TDs { if MoneyReg.MatchString(td.Val) { bkeyfirstrow = false ball = false td.BH = false break } } for _, td := range tr.TDs { if ball { td.BH = true td.KeyDirect = 1 td.KVDirect = 2 } } } else if ration2 > 0.55 { //第1列 bkeyfirstcol = true if !checkCompute[ck] { checkCompute[ck] = true //重置第1列 for _, tr1 := range table.TRs { for _, td1 := range tr1.TDs { if td1.StartCol == 0 { if !MoneyReg.MatchString(td1.Val) { td1.BH = true td1.KeyDirect = 2 td1.KVDirect = 1 } } } } } } if !bkeyfirstrow && !bkeyfirstcol { if len(tr.TDs) > 1 && ration1 > ration2 && ration1 > 0.5 { bkeyfirstrow = true for _, td := range tr.TDs { if !MoneyReg.MatchString(td.Val) { td.BH = true td.KeyDirect = 1 td.KVDirect = 2 } } } else if tr.Table.ColNum > 1 && ration2 > 0.5 { bkeyfirstcol = true if !checkCompute[ck] { checkCompute[ck] = true //重置第1列 for _, tr1 := range table.TRs { for _, td1 := range tr1.TDs { if td1.StartCol == 0 { if !MoneyReg.MatchString(td1.Val) { td1.BH = true td1.KeyDirect = 2 td1.KVDirect = 1 } } } } } } } } else { if bkeyfirstrow { //第一列的概率 ration1, _ := table.StartAndEndRation[rk].GetTDRation(tr.TDs[0]) if k == 1 || ration1 < checkval { for _, td := range tr.TDs { if !td.MustBH { td.BH = false td.KeyDirect = 0 td.KVDirect = 0 } } } //else {for _, td := range tr.TDs {}} } else { //列在起作用 if bkeyfirstcol { for _, td := range tr.TDs { ck := fmtkey("c", td.StartCol, td.EndCol) ration1, _ := table.StartAndEndRation[ck].GetTDRation(td) if !checkCompute[ck] { checkCompute[ck] = true if ration1 >= checkval && td.ColPos != 1 { for _, tr1 := range table.TRs { for _, td1 := range tr1.TDs { if td1.StartCol == td.StartCol { if !MoneyReg.MatchString(td1.Val) { td1.BH = true td1.KeyDirect = 2 td1.KVDirect = 1 } } } } } else { for _, tr1 := range table.TRs[1:] { for _, td1 := range tr1.TDs[1:] { if td1.StartCol == td.StartCol && !td1.MustBH { td1.BH = false td1.KeyDirect = 0 td1.KVDirect = 0 } } } } } } } } } } } //qutil.Debug("table.Brule", table.Brule, !bkeyfirstcol && !bkeyfirstrow) if !table.Brule || (!bkeyfirstcol && !bkeyfirstrow) { //断行问题,虽然同列或同行,但中间被跨行截断,表格方向调整 for _, k := range table.StartAndEndRationKSort.Keys { v := table.StartAndEndRation[k] //横向判断,要判断最多的方向,否则会出现不定的情况(map遍历问题) k1 := k[:1] for _, v2 := range v.Poss { lentds := len(v.Tdmap[v2]) if v.Rationmap[v2] > checkval { for _, td := range v.Tdmap[v2] { if td.KeyDirect == 0 && !MoneyReg.MatchString(td.Val) { if k1 == "r" { ck := fmtkey("c", td.StartCol, td.EndCol) rt := table.StartAndEndRation[ck] //clen := 0 var fv float32 var tdn []*TD if rt != nil { fv, tdn = rt.GetTDRation(td) //clen = len(tdn) } if lentds > 1 { if ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" { td.KeyDirect = 1 td.KVDirect = 2 td.BH = true } } } else { ck := fmtkey("r", td.StartRow, td.EndRow) rt := table.StartAndEndRation[ck] var fv float32 var tdn []*TD //clen := 0 if rt != nil { fv, tdn = rt.GetTDRation(td) //clen = len(tdn) } if lentds > 1 { if ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" { td.KeyDirect = 2 td.KVDirect = 1 td.BH = true } } } } else { break } } } else if v.Rationmap[v2] < 0.5 && len(v.Tdmap[v2]) > 3 { for _, td := range v.Tdmap[v2] { // u.Debug(td.Val, "-----", td.BH) if td.KeyDirect == 0 && td.BH && !td.MustBH { if k1 == "r" { ck := fmtkey("c", td.StartCol, td.EndCol) rt := table.StartAndEndRation[ck] clen := 0 var fv float32 var tdn []*TD if rt != nil { fv, tdn = rt.GetTDRation(td) clen = len(tdn) } if lentds >= clen && lentds > 1 { if (tdn != nil && v.Rationmap[v2] < fv) || tdn == nil { td.BH = false } } } else { ck := fmtkey("r", td.StartRow, td.EndRow) rt := table.StartAndEndRation[ck] var fv float32 var tdn []*TD clen := 0 if rt != nil { fv, tdn = rt.GetTDRation(td) clen = len(tdn) } if lentds >= clen && lentds > 1 { if (tdn != nil && v.Rationmap[v2] < fv) || tdn == nil { td.BH = false } } } } else { break } } } } } } table.GetKeyRation() if len(table.TRs) > 0 && len(table.TRs[0].TDs) > 0 { t0 := table.TRs[0].TDs[0] key := fmtkey("r", t0.StartRow, t0.EndRow) r, t := table.StartAndEndRation[key].GetTDRation(t0) if r > 0.9 && len(t) > 1 { table.BFirstRow = true } for k, tr := range table.TRs { if len(tr.TDs) == 1 && tr.TDs[0].StartCol == 0 && tr.TDs[0].EndCol+1 == table.ColNum { tr.TDs[0].BH = false tr.TDs[0].KVDirect = 0 sv := FindKv(tr.TDs[0].Val, "", 2) _, resm := colonkvEntity.entrance(tr.TDs[0].Val, "", 2) for k, v := range resm { sv.AddKey(k, v) } if len(sv.Keys) > 0 { for k1, v1 := range sv.Map { if tr.TDs[0].SortKV.Map[k1] == nil { table.SortKV.AddKey(k1, v1) } } } else if table.Tag == "" && k == 0 && len(tr.TDs[0].Val) > 11 { table.Tag = tr.TDs[0].Val } // subVal := tdval_reg.FindAllStringSubmatch(tr.TDs[0].Val, -1) // //u.Debug(tr.TDs[0].Val, subVal) // if len(subVal) > 0 { // for _, subv1 := range subVal { // if len(subv1) == 3 { // table.SortKV.AddKey(subv1[1], subv1[2]) // } // } // } else if k == 0 && len(tr.TDs[0].Val) > 11 { // table.Tag = tr.TDs[0].Val // } } // for _, td := range tr.TDs { // u.Debug(td.BH, td.Val, "----") // } } } } //查找表格的kv,调用FindTdVal func (table *Table) FindKV() { //判断全是key的表格不再查找 if table.BHeader { //只要一个是key即为true direct := If(table.BFirstRow, 2, 1).(int) //kv,2查找方向,向上查找 vdirect := If(direct == 2, 1, 2).(int) //控制跨行表格 bcon := false //增加表格切块判断,只判断切块分包 //控制中标人排序方向 bodirect := 0 //控制中标人排序数值 sort := 1 //开始抽取 for _, tr := range table.TRs { if len(tr.TDs) == 1 { bcon = false td := tr.TDs[0] if td.StartCol == 0 && td.EndCol+1 == table.ColNum && len([]rune(td.Val)) > 4 && len([]rune(td.Val)) < 50 { res, _, _, _, _ := CheckCommon(td.Val, "abandontable") if res { //以下内容丢弃 bcon = true } } } if bcon { continue } if tr.TDs[0].StartRow > 0 { numbh := 0 for _, td := range tr.TDs { if td.BH { numbh++ } } if numbh > 0 && numbh <= len(tr.TDs)/2 { direct, vdirect = 1, 2 } else { direct, vdirect = 2, 1 } } for _, td := range tr.TDs { /** rt := table.StartAndEndRation[fmtkey("r", td.StartCol, td.EndCol)] if rt != nil { r, t := rt.GetTDRation(td) u.Debug(td.BH, td.Val, r, t) } **/ // if td.Val == "电视" || td.Val == "电话机" || td.Val == "传真机" || td.Val == "音响" { //qutil.Debug("----td.Valtype", td.Valtype, "td.BH:", td.BH, "KVDirect:", td.KVDirect, "Val:", td.Val, "direct:", direct, "vdirect:", vdirect) // } if !td.BH && td.KVDirect < 3 { if !table.FindTdVal(td, direct, vdirect) { if !table.FindTdVal(td, vdirect, direct) { //都识别不到时,对第一、二中标候选人的处理 bo, res := GetBidOrder(td, bodirect, sort) if res { sort++ bodirect = bo } } } //fmt.Println("td:", td.Val, td.BH, td.HeadTd, td.KVDirect) } } } //qutil.Debug("FindKV", table.SortKV.Map) } else if len(table.TRs) > 0 { //没有表头的表格处理,默认纵向吧 res := make([][]string, len(table.TRs[0].TDs)) for n, _ := range res { res[n] = []string{} } for _, tr := range table.TRs { for n, td := range table.TRs[0].TDs { //第一行的所有td td1 := table.GetTdByRCNo(tr.TDs[0].StartRow, td.StartCol) if td1 != nil { res[n] = append(res[n], td1.Val) } else { res[n] = append(res[n], "") } } } //再拆值,类似http://www.ggzy.hi.gov.cn/cgzbgg/16553.jhtml第二列,有多个值 nmapkeys := []int{} nmap := map[int][]*u.Kv{} L: for _, r1 := range res { for n, r := range r1 { if len([]rune(r)) < 60 { // 长度小于60才去分 //res1, _ := GetKVAll(r, "", nil) res1, _ := colonkvEntity.entrance(r, "", 2) if res1 != nil { nmap[n] = res1 nmapkeys = append(nmapkeys, n) /** //截取串 for _k1, _ := range res1 { r = regexp.MustCompile(_k1+".*").ReplaceAllString(r, "") } r1[n] = r res[pos] = r1 **/ } else if nmap[n] != nil { //放空值 nmap[n] = append(nmap[n], &u.Kv{}) } } else { nmap = nil nmapkeys = nil break L } } } //调整 if len(nmap) > 0 { kmapkeys := []string{} kmap := map[string][]string{} for _, mk := range nmapkeys { //同是第n列 for pos, m1 := range nmap[mk] { k, v := m1.Key, m1.Value kv := kmap[k] if kv == nil { kv = []string{} } kv = append(kv, v) kmap[k] = kv kmapkeys = append(kmapkeys, k) for _, k := range kmapkeys { arr := kmap[k] if len(arr) < pos { arr = append(arr, "") kmap[k] = arr kmapkeys = append(kmapkeys, k) } } } } if len(kmap) > 0 { for _, k := range kmapkeys { table.SortKV.AddKey(k, kmap[k]) } } } //================= //解析值放到map中 for _, arr := range res { if len(arr) > 0 { v1 := arr[0] _, _, _, _, repl := CheckCommon(v1, "con") if repl == "ENT" { table.SortKV.AddKey("中标单位", arr) continue } else if repl == "BO" { table.SortKV.AddKey("排名", arr) continue } } } } //qutil.Debug("FindKV", table.SortKV.Map) } //获取中标人顺序 //direct 0默认 1横向 2纵向 func GetBidOrder(td *TD, direct, n int) (d int, res bool) { if td.Valtype != "BO" { return } if td.Rowspan > 1 { for i := 0; i < td.Rowspan; i++ { nextcol := 1 L1: for { vtd := td.TR.Table.GetTdByRCNo(td.StartRow+i, td.EndCol+nextcol) if vtd == nil { break L1 } nextcol += vtd.Colspan if filter_zbdw_v2.MatchString(vtd.Val) { arrbo := td.TR.Table.SortKV.Map[NullTxtBid] if arrbo == nil { arrbo = []map[string]interface{}{} td.TR.Table.SortKV.AddKey(NullTxtBid, arrbo) } a1 := arrbo.([]map[string]interface{}) a1 = append(a1, map[string]interface{}{ "entname": vtd.Val, "sortstr": td.Val, "sort": GetBidSort(td.Val, n), }) res = true td.TR.Table.SortKV.Map[NullTxtBid] = a1 } } } } else if td.Colspan > 1 { for i := 1; i < td.Colspan; i++ { nextcol := 0 L2: for { vtd := td.TR.Table.GetTdByRCNo(td.StartRow+i, td.StartCol+nextcol) if vtd == nil || vtd.Colspan >= td.Colspan { break L2 } nextcol += vtd.Colspan if filter_zbdw_v2.MatchString(vtd.Val) { arrbo := td.TR.Table.SortKV.Map[NullTxtBid] if arrbo == nil { arrbo = []map[string]interface{}{} td.TR.Table.SortKV.AddKey(NullTxtBid, arrbo) } a1 := arrbo.([]map[string]interface{}) a1 = append(a1, map[string]interface{}{ "entname": vtd.Val, "sortstr": td.Val, "sort": GetBidSort(td.Val, n), }) res = true td.TR.Table.SortKV.Map[NullTxtBid] = a1 } } } } else { rtd := td.TR.Table.GetTdByRCNo(td.StartRow, td.EndCol+1) btd := td.TR.Table.GetTdByRCNo(td.EndRow+1, td.StartCol) //if ((rtd != nil && !rtd.BH && rtd.Valtype == "BO") || direct == 1) && btd != nil && filter_zbdw_v.MatchString(btd.Val) { if ((rtd != nil && !rtd.BH) || direct == 1) && btd != nil && filter_zbdw_v2.MatchString(btd.Val) { d = 1 arrbo := td.TR.Table.SortKV.Map[NullTxtBid] if arrbo == nil { arrbo = []map[string]interface{}{} td.TR.Table.SortKV.AddKey(NullTxtBid, arrbo) } a1 := arrbo.([]map[string]interface{}) a1 = append(a1, map[string]interface{}{ "entname": btd.Val, "sortstr": td.Val, "sort": GetBidSort(td.Val, n), }) res = true td.TR.Table.SortKV.Map[NullTxtBid] = a1 //} else if ((btd != nil && !btd.BH && btd.Valtype == "BO") || direct == 2) && rtd != nil && filter_zbdw_v.MatchString(rtd.Val) { } else if ((btd != nil && !btd.BH) || direct == 2) && rtd != nil && filter_zbdw_v2.MatchString(rtd.Val) { d = 2 arrbo := td.TR.Table.SortKV.Map[NullTxtBid] if arrbo == nil { arrbo = []map[string]interface{}{} td.TR.Table.SortKV.AddKey(NullTxtBid, arrbo) } a1 := arrbo.([]map[string]interface{}) a1 = append(a1, map[string]interface{}{ "entname": rtd.Val, "sortstr": td.Val, "sort": GetBidSort(td.Val, n), }) res = true td.TR.Table.SortKV.Map[NullTxtBid] = a1 } } return } func GetBidSort(str string, n int) int { val := n if strings.Index(str, "首选") > -1 { val = 1 } else { val = winnerOrderEntity.toNumber(str, n) } return val } //查找每一个单元格的表头,调用FindNear func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) { near := table.FindNear(td, direct) // if near != nil { // fmt.Println("near----", near.Val, td.Val) // } // qutil.Debug(near != nil) // qutil.Debug(near.BH) // qutil.Debug(near.KeyDirect == vdirect, near.KeyDirect == 0) // qutil.Debug(near.KVDirect == direct, near.KVDirect == 0) // qutil.Debug(near.KVDirect < 3) if near != nil && near.BH && (near.KeyDirect == vdirect || near.KeyDirect == 0) && (near.KVDirect == direct || near.KVDirect == 0) && near.KVDirect < 3 { near.KVDirect = direct near.KeyDirect = vdirect td.KVDirect = direct key := near.Val if near.Val == "" { key = fmtkey("k", near.TR.RowPos, near.ColPos) } val := table.SortKV.Map[key] //qutil.Debug("====================", "key:", key, "val:", val) bthiskey := false if val != nil { curpos := table.SortKV.Index[key] thistr := table.kTD[curpos] if thistr != near { near.Val += "_" for table.SortKV.Map[near.Val] != nil { near.Val += "_" } key = near.Val //之前这个地方没有重置,导致把之前结果覆盖了 } else { bthiskey = true } } bfind := false barr := false varrpos := -1 if bthiskey { //处理是数组值,且有合并行或合并列的情况 kvscope,对数组值的处理 pos := table.SortKV.Index[key] mval := table.kvscope[pos] bvalfind := false if direct == 1 { //kv是横向 L1: for k3, v3 := range mval { for _, v4 := range v3 { if v4.EndRow+1 == td.StartRow && v4.EndCol == td.EndCol { varrpos = k3 bvalfind = true break L1 } } } } else { //kv是纵向 L2: for k3, v3 := range mval { for _, v4 := range v3 { if v4.EndCol+1 == td.StartCol && v4.EndRow == td.EndRow { varrpos = k3 bvalfind = true break L2 } } } } if vals, ok := val.([]string); ok { if near.Val == "" { bn := false for _, vs := range vals { if vs != "" && NullTdReg.MatchString(vs) { bn = true } else { bn = false break } } if bn { near.Val = NullTxtBid key = NullTxtBid bfind = true } } if bvalfind { vals[varrpos] = td.Val // += "__" + td.Val } else { vals = append(vals, td.Val) val = vals varrpos = len(vals) - 1 } } else if vals, ok := val.(string); ok { if bvalfind { val = td.Val //vals + "__" + td.Val } else { tval := []string{vals} tval = append(tval, td.Val) val = tval varrpos = 1 } } barr = true } else { val = td.Val } td.HeadTd = near if bfind { tkey := fmtkey("k", near.TR.RowPos, near.ColPos) table.SortKV.ReplaceKey(key, val, tkey) } else { table.SortKV.AddKey(key, val) //if table.SortKV.Map[key] != nil { pos := table.SortKV.Index[key] //qutil.Debug("=========", "key:", key, "val:", val, "pos:", pos) if barr { mval := table.kvscope[pos] if mval != nil { tds := mval[varrpos] if tds != nil { tds = append(tds, td) } else { tds = []*TD{td} } if varrpos > -1 { mval[varrpos] = tds table.kvscope[pos] = mval } } } else { table.kvscope[pos] = map[int][]*TD{ 0: []*TD{td}, } table.kTD[pos] = near } //} } b = true } return } //查找单元格的表头时,横向或纵向 func (table *Table) FindNear(td *TD, direct int) *TD { if direct == 1 && td.StartCol > 0 { //左临 tr := table.TRs[:td.TR.RowPos+1] for i := len(tr) - 1; i > -1; i-- { tds := tr[i].TDs for _, td1 := range tds { if td1.StartRow <= td.StartRow && td1.EndRow >= td.EndRow && td1.EndCol+1 == td.StartCol { //找到左临节点 if td1.BH { return td1 } else { if td1.HeadTd != nil && td1.HeadTd.KVDirect == direct { return td1.HeadTd } } } } } } else if direct == 2 && td.StartRow > 0 { //上临 tr := table.TRs[:td.TR.RowPos] for i := len(tr) - 1; i > -1; i-- { tds := tr[i].TDs for _, td1 := range tds { if td1.StartCol <= td.StartCol && td1.EndCol >= td.EndCol && td1.EndRow+1 == td.StartRow { //找到左临节点 if td1.BH { return td1 } else { if td1.HeadTd != nil && td1.HeadTd.KVDirect == direct { return td1.HeadTd } } } } } } return nil } //根据行号列号获取td对象 func (tn *Table) GetTdByRCNo(row, col int) *TD { for _, tr := range tn.TRs { for _, td := range tr.TDs { if td.StartCol <= col && td.EndCol >= col && td.StartRow <= row && td.EndRow >= row { return td } } } return nil } //判断表格是否是分包 func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) { pac := 0 val := 0 index = []string{} index_pos := []int{} //是数组且能找到标段之类的提示 arr_count := 0 key_index := -1 hasPkgTd := map[string]bool{} for in, k := range tn.SortKV.Keys { if excludeKey.MatchString(BracketsTextReg.ReplaceAllString(k, "")) { continue } v := tn.SortKV.Map[k] vs, bvs := v.([]string) if bvs { arr_count++ haspkgs := []string{} for in2, v1 := range vs { v1 = replPkgConfusion(v1) if len([]rune(v1)) < 8 && !moneyNum.MatchString(v1) && FindVal_1.MatchString(v1) { if key_index == -1 { key_index = in } else if key_index != in { break } index = append(index, FindVal_1.FindString(v1)) index_pos = append(index_pos, in2) val += 1 pac++ } else { if ok, v1new := isHasOnePkgAndNoKv(v1); ok { haspkgs = append(haspkgs, v1new) } } } /*处理这种情况: