package pretreated

import (
	"fmt"
	"jy/clear"
	u "jy/util"
	qutil "qfw/util"
	"regexp"
	"strings"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
)

// Package-level state: mostly the regular expressions used to classify and
// clean table tags, keys and cell values.
var (
	// Date unit that may appear in a key. The units are listed without "|":
	// inside a character class "|" is a literal pipe, so the former
	// `[年|月|日|天]` also matched a "|" character.
	dateReg *regexp.Regexp = regexp.MustCompile(`[年月日天]`)
	// Strips leading digits / Chinese numerals from item names.
	numclear = regexp.MustCompile("^[\\d一二三四五六七八九十.]+")
	num1     = regexp.MustCompile("(\\d)")
	// Removes unwanted content from a table title.
	tabletitleclear  = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、.,.。_/((人民币万元件个公斤户))]")
	tabletitleclear2 = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、,。_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕]*")
	// Removes whitespace, leading numbering and bracketed text found in a table key.
	tablekeyclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。_/]+|^[\\d一二三四五六七八九十]+[、.]*|[((【\\[].*?[))】\\]]")
	// Removes symbols from a table cell.
	tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、,。_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕¥$]*")
	// Detects that a key is a money amount (used for the "万元" unit handling).
	moneyreg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
	// Keys that must not be cleaned, e.g. discount or rate.
	no_clear_key_reg = regexp.MustCompile(`[((](费率|年|月|日|天|日历天|历天)[))]`)
	// Header detection from cell content: a cell holding a money amount is not a header.
	MoneyReg = regexp.MustCompile("^[\\s  ::0-9.万元()()人民币¥$]+$")
	GSReg    = regexp.MustCompile(".*公司.*")
	// Used when detecting packages.
	moneyNum = regexp.MustCompile("[元整¥万]")
	// Detects hidden tables.
	display = regexp.MustCompile("(?i).*?display\\s?[:]\\s?none.*")
	//---------------
	// Probability that a table is split into packages.
	// Scores the table tag.
	TableMultiPackageReg_4 = regexp.MustCompile("(标段|分包|包段|划分|子包|标包|合同段)")
	TableMultiPackageReg_2 = regexp.MustCompile("(概况|范围|情况|内容|详细|结果|信息)")
	// Filters table keys before package scoring.
	FilterKey_2 = regexp.MustCompile("招标|投标|项目")
	// Package scoring based on table keys.
	FindKey_2 = regexp.MustCompile("([分子][包标](号)?|标[号项段包](划分)?|包件?[号段名数]|包[组件])")
	FindKey_3 = regexp.MustCompile("(标段编号|标包)")
	// Package detection on values.
	FindVal_1  = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)")
	FindVal2_1 = regexp.MustCompile("([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)|^(设计|施工|监理|验收)[分子]?[标包]?[段号]?$")
	// Keys excluded before package detection (包件号?).
	excludeKey  = regexp.MustCompile("(标识|数量|分包个数|标段代码|涉及包号|分包数量|项目标号|规格|型号|招标范围|业绩|废标|标段选择要求)|(^编号$)|([^包段标]编号)") // 编号|划分
	excludeKey2 = regexp.MustCompile("包/[0-9]{0,4}[箱纸张]")
	excludeKey3 = regexp.MustCompile("(分包个数|每包[0-9]*元|标线|国标|享受一包服务)")
	//-------------
	cut = u.NewCut()
	// Strips HTML tags / trailing whitespace from a table tag.
	ClearTagReg = regexp.MustCompile("<[^>]*?>|[\\s\\n\\r]*$")
	// Finds the table tag.
	ttagreg = regexp.MustCompile("(?s)([^\\n::。,;\\s\u3000\u2003\u00a0]{2,30})[::]?[^::。;!\\n]{0,35}[\\s\\n]*$")
	// Probability threshold for treating a row/column as a header.
	checkval = float32(0.6)
	//tdval_reg = regexp.MustCompile(`([\p{Han}][\p{Han}\s、()\\(\\)]{1,9})[::]([^::\\n。]{5,60})(?:[;;,,.。\\n\\t\\s])?`)
	// Whitespace replacement.
	repSpace = regexp.MustCompile("[\\s\u3000\u2003\u00a0::]+|\\\\t+")
	// Table key/value handling.
	// Marks keys that cannot be standardized.
	filter_tag_zb = regexp.MustCompile("(中标|成交|投标)[\\p{Han}]{0,6}(情况|结果|信息|明细)?")
	// Winning amount.
	// Keys containing these words are standardized.
	filter_zbje_k = regexp.MustCompile("(中标|成交|总|拦标|合同|供[应货]商|报)[\\p{Han}、]{0,6}(价|额|[大小]写|[万亿]?元).{0,4}$")
	// Simple amount check.
	filter_zbje_jd = regexp.MustCompile("^[^(售|保证)]{0,4}(价|额).{0,4}$")
	// Budget amount.
	filter_ysje_jd = regexp.MustCompile("(预算|预控价|项目概.|项目信息)")
	// ...and keys containing these words are excluded.
	filter_zbje_kn = regexp.MustCompile("得分|打分|时间|业绩|须知|分|电话|要求|需求数量|发布规模$|第[2二3三4四5五]|地址|询价保证金|行号")
	// ...and the value must contain these characters.
	filter_zbje_v = regexp.MustCompile("[¥$$0-9一二三四五六七八九十,,〇零点..壹贰叁肆伍陆柒捌玖拾百佰千仟万亿億元圆角分整正()::大小写]{2,16}")
	// Winning bidder handling.
	// Keys containing these words are standardized.
	filter_zbdw_ky = regexp.MustCompile("(中标|成交|拦标|合同|选中|投标|拟|预|最终)[\\p{Han}、]{0,6}(供[应货]商|企业|单位|人|机构)(名称)?.{0,4}$")
	// Recognizes winning-bidder related information.
	filter_zbdw_info = regexp.MustCompile("(中标|成交|中选|供(货|应))[^候选]{0,}")
	// Simple check.
	filter_zbdw_jd = regexp.MustCompile("(投标|成交|中标|合同)(供应商|单位|人|名称).{0,4}$")
	// ...and must not contain these words.
	filter_zbdw_kn = regexp.MustCompile("第[2二3三4四5五]|得分|地址|询价保证金")
	// ...and the value must contain these words.
	filter_zbdw_v = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$|([^购]中心|办公|用品)")
	// ...and the value must contain these words.
	filter_zbdw_v2 = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$")
	//Tg = map[string]interface{}{}
	// Some tables have an empty header; values that are rankings are mapped to NullTxtBid.
	NullTdReg =
regexp.MustCompile("(首选|第[一二三四五1-5])(中标|成交)?(名(称)?|(候选|排序)?(人|单位|供应商))") NullTxtBid = "成交供应商排名" projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$") MhSpilt = regexp.MustCompile("[::]") //识别采购单位联系人、联系电话、代理机构联系人、联系电话 -- 名称有异常 ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公|单位)?(((联系)?(电话|方式|号码)([//及]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(详细)?(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|经办)人)|采购方代表") ContactInfoMustReg = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$") ContactType = map[string]*regexp.Regexp{ "采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|招标(服务)?|甲|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心(地址)?|业主|收料人|采购部"), "代理机构": regexp.MustCompile("(代理|受托|集中采购).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"), "中标单位": regexp.MustCompile("^((拟(定)?|预|最终|唯一)?(中标|成交|中选|供(货|应))((成交))?)[^候选]{0,2}(人|方|单位|公司|(服务|供应)?商|企业)"), "监督部门": regexp.MustCompile("投诉受理部门"), } ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$") MultipleValueSplitReg = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]") BuyerContacts = []string{"采购单位联系人", "采购单位联系电话", "采购单位联系地址"} FilterSerial = regexp.MustCompile(".+[、..::,]") underline = regexp.MustCompile("_+$") iswinnertabletag = regexp.MustCompile("(中标|候选人|成交|结果|磋商情况)") nswinnertabletag = regexp.MustCompile("评得分估|标的信息|班子成员") jsonReg = regexp.MustCompile(`\{.+:[^}]*\} `) // \{".*\":\".+\"} regHz = regexp.MustCompile("[\u4e00-\u9fa5]") winnerOrderAndBidResult = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)") WinnerOrderStr = regexp.MustCompile(`(集团|公司|学校|中心|家具城|门诊|[大中小]+学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$`) DoubtReg = regexp.MustCompile("(我中心|有(疑问|质疑|异议|意见)|(书面)?提出|不再受理|投诉|质疑|书面形式|监督|公示期(限)?)") ) //在解析时,判断表格元素是否隐藏 func IsHide(g *goquery.Selection) (b bool) { style, exists := g.Attr("style") if exists { b = display.MatchString(style) } return } //59.992664,33.495715,20.001306 var clearnum *regexp.Regexp = 
regexp.MustCompile("(([0-9.]{1,6}[,,]+){4,}|(\\d{6}[,,]\\d{2}.){2,})") //对表格的key进行标准化处理,多个k相同时,出现覆盖问题 //待扩展,暂不支持正则标签库 清理key func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}, isSite bool, codeSite string) (kvTags map[string][]*u.Tag, returntag string) { kvTags = map[string][]*u.Tag{} v1 := "" if sv, sok := v.(string); sok { //取KV v1 = sv } else if sv, sok := v.([]string); sok { //是数组先默认取第一个 if len(sv) >= 1 { v1 = sv[0] } } //对值单位的处理 (预算|费|价|额|规模|投资) if moneyreg.MatchString(k) { v1 += GetMoneyUnit(k, v1) } //先清理key //u.Debug(1, k, v1) //指定-key不清理 拦标价(费率或单价等) k1:="" if !no_clear_key_reg.MatchString(k) { k1 = ClearKey(k, 2) } //u.Debug(2, k) //取标准key if tabletag == "中标情况" { if k1=="价格" { k1="中标金额" } } res := u.GetTags(k1, isSite, codeSite) if len(res) == 0 && k1 != k { res = u.GetTags(k, isSite, codeSite) k1 = k } //log.Println(k, res) // if len(res) == 0 { // go u.AddtoNoMatchMap(tk) // } //当取到标准化值时,放入数组 if len(res) > 0 { for _, t1 := range res { //降低冒号值的权重 if MhSpilt.MatchString(v1) { t1.Weight -= 50 } if winnerOrderAndBidResult.MatchString(tabletag) && t1.Value == "采购单位联系人" { //处理table中项目负责人 kvTags[k] = append(kvTags[k], &u.Tag{Key: k, Value: v1, IsInvalid: true}) } else if regexp.MustCompile("(中标候选人|名单及其排序|排序)").MatchString(tabletag) && t1.Value == "采购单位" { kvTags[t1.Value] = append(kvTags[t1.Value], &u.Tag{Key: k1, Value: v1, Weight: t1.Weight - 150}) } else { kvTags[t1.Value] = append(kvTags[t1.Value], &u.Tag{Key: k1, Value: v1, Weight: t1.Weight}) } } res[0].IsInvalid = true //k1 = res[0].Value } /*else { kvTags[k] = append(kvTags[k], &u.Tag{Key: k, Value: v1, IsInvalid: true}) //没有取到标准化key时,对中标金额和中标单位的逻辑处理 if filter_zbje_k.MatchString(k) && !filter_zbje_kn.MatchString(k) && filter_zbje_v.MatchString(v1) && utf8.RuneCountInString(v1) < 20 { if tabletag == "" { returntag = "中标情况" } kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: k, Value: v1, Weight: -100, IsInvalid: true}) } else if filter_zbdw_ky.MatchString(k) && 
!filter_zbdw_kn.MatchString(k) && filter_zbdw_v.MatchString(v1) { kvTags["中标单位"] = append(kvTags["中标单位"], &u.Tag{Key: k, Value: v1, Weight: -100, IsInvalid: true}) if tabletag == "" { returntag = "中标情况" } } else if !filter_zbje_jd.MatchString(tabletag) && !filter_zbje_jd.MatchString(k) && utf8.RuneCountInString(v1) < 13 { //对上一步没有取到标准化key的进一步处理 if filter_tag_zb.MatchString(tabletag) || filter_tag_zb.MatchString(tabledesc) { //u.Debug(v1, k, "-----", filter_zbdw_jd.MatchString(k), filter_zbdw_v.MatchString(v1)) if filter_zbje_jd.MatchString(k) && !filter_zbje_kn.MatchString(k) && filter_zbje_v.MatchString(v1) { if filter_ysje_jd.MatchString(k) { kvTags["预算金额"] = append(kvTags["预算金额"], &u.Tag{Key: k, Value: v1, Weight: -100}) } else if !filter_zbdw_kn.MatchString(k) { kvTags["中标金额"] = append(kvTags["中标金额"], &u.Tag{Key: k, Value: v1, Weight: -100}) } } } } }*/ return } var glRex *regexp.Regexp = regexp.MustCompile("(成交|中标|候选|排名|名次|供应商排序|中标候选人|名单及其排序|排序)") var djReg *regexp.Regexp = regexp.MustCompile("^单价") //对解析后的表格的kv进行过滤 func (table *Table) KVFilter(isSite bool, codeSite string) { //1.标准化值查找 //2.对数组的处理 //3.对分包的处理 //4.对KV的处理 //判断表格是否有用,调用abandontable正则数组进行判断 //遍历每一行 table.analyTdKV() //1.遍历每行每列td的sortkv添加到table.SorkVK中;2.td有子表格的处理 as := NewSortMap() //遍历table.sortkv,进行过滤处理,并放入标准化KV中,如果值是数组跳到下一步处理 for _, k := range table.SortKV.Keys { //遍历所有key sort.kv //表格描述处理,对成交结果的处理 if k=="第一询价结果候选人" { //fmt.Println("标准化key") } if glRex.MatchString(k) { table.Desc += "成交结果," } if djReg.MatchString(k) { continue } v := table.SortKV.Map[k] if _, ok := v.(string); ok { //table.SortKV.Value为字符串,匹配抽取关键词table.SortKV.Key,匹配到添加k,v到table.StandKV,table.StandKVWeight k = pkgFilter.ReplaceAllString(k, "") k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "") kvTags, tag := CommonDataAnaly(k, table.Tag, table.Desc, v, isSite, codeSite) //对key标准化处理,没有找到会走中标 if tag != "" && table.Tag == "" { table.Tag = tag } MergeKvTags(table.StandKV, kvTags) } else { as.AddKey(k, 
v)
		}
	}
	// Core: candidate related.
	// Array-valued pairs are standardized and stored in StandKV.
	table.sortKVArr(as, isSite, codeSite)
	//
	if len(table.WinnerOrder) > 0 || !table.BPackage {
		winnerOrder := []map[string]interface{}{}
		maxSort := 0
		// Reorder: pass 0 keeps entries that have a "sortstr" and records the
		// highest "sort"; pass 1 appends the remaining entries, numbering them
		// after that maximum.
		for i := 0; i < 2; i++ {
			for _, v := range table.WinnerOrder {
				sortstr, _ := v["sortstr"].(string)
				if (i == 0 && sortstr == "") || (i == 1 && sortstr != "") {
					continue
				}
				sort, _ := v["sort"].(int)
				if i == 0 {
					if maxSort == 0 || sort > maxSort {
						maxSort = sort
					}
				} else {
					maxSort++
					v["sort"] = maxSort
				}
				winnerOrder = append(winnerOrder, v)
			}
			if len(winnerOrder) == len(table.WinnerOrder) {
				break
			}
		}
		table.WinnerOrder = winnerOrder
		if len(table.WinnerOrder) == 0 {
			winnerOrder = []map[string]interface{}{}
			// Walk every td looking for winners.
			for _, tr := range table.TRs {
				for _, td := range tr.TDs {
					winnerOrder = winnerOrderEntity.Find(td.Val, true, 3, isSite, codeSite)
					if len(winnerOrder) > 0 { // merge the candidates
						// NOTE(review): the result of Merge (if any) is not
						// assigned back — confirm Merge updates its first
						// argument in place.
						winnerOrderEntity.Merge(table.WinnerOrder, winnerOrder)
					}
				}
			}
		}
		if !table.BPackage { // no packages: take the candidates found in the tds
			if len(winnerOrder) > 1 {
				table.WinnerOrder = winnerOrder
			}
		}
	}
	// Sort the candidates.
	winnerOrderEntity.Order(table.WinnerOrder)
	// When the table has exactly one package, store the candidates in it.
	if table.BlockPackage != nil && table.BlockPackage.Keys != nil && len(table.BlockPackage.Keys) == 1 {
		if table.BlockPackage.Map != nil {
			onePkgKey := table.BlockPackage.Keys[0]
			onePkg, _ := table.BlockPackage.Map[onePkgKey].(*u.BlockPackage)
			// NOTE(review): the parenthesized condition is true for every
			// value of WinnerOrder (non-nil, or nil with length 0).
			if onePkg != nil && (onePkg.WinnerOrder != nil || len(onePkg.WinnerOrder) == 0) {
				onePkg.WinnerOrder = table.WinnerOrder
				table.BlockPackage.AddKey(onePkgKey, onePkg)
			}
		}
	}
}

var winMoneyReg *regexp.Regexp = regexp.MustCompile("(报价|投标价|投标总价)")

// sortKVArr handles the entries of table.SortKV whose value is an array
// (collected in as by KVFilter). It either builds table.WinnerOrder from
// parallel columns (ranking / company name / price), fills in missing prices
// of an existing WinnerOrder, or standardizes the pair into table.StandKV.
func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
	// winnertag: does the table look like a winner table? Checked in order on
	// the table tag, the block tag, the description and finally the raw HTML.
	winnertag := iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.Tag) // table tag
	if !winnertag {
		winnertag = iswinnertabletag.MatchString(table.TableResult.BlockTag) && !nswinnertabletag.MatchString(table.TableResult.BlockTag) // block tag
	}
	if !winnertag {
		winnertag = iswinnertabletag.MatchString(table.Desc)
	}
	if !winnertag {
		winnertag = iswinnertabletag.MatchString(table.Html)
	}
	// checkKey records the indexes of as.Keys already consumed by the
	// look-ahead below.
	checkKey := map[int]bool{}
	//tmpBidmout := []string{}
	//log.Println(tmpBidmout)
	for kn, k := range as.Keys { // keys whose value is an array
		v := as.Map[k]
		if vm, ok := v.([]map[string]interface{}); ok && k == NullTxtBid {
			if table.WinnerOrder == nil {
				table.WinnerOrder = []map[string]interface{}{}
			}
			table.WinnerOrder = append(table.WinnerOrder, vm...)
		} else {
			// Candidate ranking logic.
			if (table.WinnerOrder == nil || len(table.WinnerOrder) == 0) && !checkKey[kn] {
				if vs1, ok := v.([]string); ok {
					// One map per array element; names and prices are
					// collected separately and joined at the end.
					smap := make([]map[string]interface{}, len(vs1))
					for n1, _ := range vs1 {
						smap[n1] = map[string]interface{}{}
					}
					//hadSort := false
					tmpEntname := make([]string, len(vs1))
					tmpPrice := make([]string, len(vs1))
					// Look ahead from the current key; k and v are shadowed
					// inside this loop.
					for kn1, k := range as.Keys[kn:] {
						v := as.Map[k]
						if ContactType["采购单位"].MatchString(k) || ContactType["代理机构"].MatchString(k) {
							kvTags, _ := CommonDataAnaly(k, table.Tag, table.Desc, v, isSite, codeSite) // standardize the key
							for k := range kvTags {
								if table.StandKV[k] == nil {
									MergeKvTags(table.StandKV, kvTags)
								}
							}
							continue
						}
						// Keys of array data are checked here, though some
						// amounts may not satisfy the checks.
						// e.g. 载明内容:[第一中标候选人 第二中标候选人] id:5d00587da5cb26b9b75e367b
						if vs, ok := v.([]string); ok && len(vs) == len(vs1) { // same number of values
							res, _, _, _, repl := CheckCommon(k, "bidorder")
							kv := ""
							if !res {
								kt := u.GetTags(k, isSite, codeSite)
								if kt.Len() > 0 {
									if kt[0].Value == "单品报价" && winnertag {
										kv = "中标金额"
									} else {
										kv = kt[0].Value
									}
								}
							}
							if !res && kv == "" { // key not recognized: inspect the values
								checkKey[kn+kn1] = true
								if winnertag { // winner table: derive candidates from the values
									for vsk, vsv := range vs {
										if NullTdReg.MatchString(vsv) { // does the value carry a ranking?
											//hadSort = true
											smap[vsk]["sortstr"] = vsv
											smap[vsk]["sort"] = GetBidSort(vsv, vsk+1)
										} else if findCandidate2.MatchString(vsv) && tmpEntname[vsk] == "" { // is the value a candidate?
											entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string)
											if entname != "" {
												tmpEntname[vsk] = entname
											}
										} else if winMoneyReg.MatchString(k) && len(tmpPrice[vsk]) == 0 {
											kv = "中标金额"
										} else { // neither a ranking nor a name: treat the array as invalid
											break
										}
									}
								}
							}
							if res || kv != "" { // keep looking at the following keys
								checkKey[kn+kn1] = true
							SORT:
								if repl == "sort" {
									//hadSort = true
									for vsk, vsv := range vs {
										smap[vsk]["sortstr"] = vsv
										smap[vsk]["sort"] = GetBidSort(vsv, vsk+1)
										if findCandidate2.MatchString(vsv) && kv == "中标单位" && tmpEntname[vsk] == "" { // is the value a candidate?
											entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string)
											if entname != "" {
												tmpEntname[vsk] = entname
											}
										}
									}
								} else if repl == "entname" || kv == "中标单位" {
									for vsk, vsv := range vs {
										if winnerReg6.MatchString(vsv) { // k:中标候选人 v:["第一名","第二名"]
											// The "name" column actually holds
											// rankings: reprocess it as one.
											repl = "sort"
											goto SORT
										}
										//if entname, _ := smap[vsk]["entname"].(string); entname != "" || len([]rune(vsv)) < 3 {
										//	break
										//}
										//entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string)
										//if entname != "" {
										//	smap[vsk]["entname"] = entname
										if tmpEntname[vsk] != "" || len([]rune(vsv)) < 4 { // excludes 单位:["台","个","套"]
											break
										}
										entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string)
										if entname != "" {
											tmpEntname[vsk] = entname
										}
									}
								} else if kv == "中标金额" {
									for vsk, vsv := range vs {
										// Filter the price, e.g. 2348273.432元(万元)-->2348273.432
										//tmp1, _ := smap[vsk]["price"].(string)
										tmp1 := tmpPrice[vsk]
										p1num := numberReg2.FindString(tmp1)
										p2num := numberReg2.FindString(vsv)
										p1 := qutil.Float64All(p1num)
										p2 := qutil.Float64All(p2num)
										// Keep the larger of the stored and the new price.
										if p2 > p1 {
											//smap[vsk]["price"] = winnerOrderEntity.clear("中标金额", vsv+GetMoneyUnit(k, vsv))
											price := winnerOrderEntity.clear("中标金额", vsv+GetMoneyUnit(k, vsv))
											if pricestr, _ := price.(string); len(pricestr) < 30 && len(pricestr) > 0 {
												tmpPrice[vsk] = pricestr
											}
										}
									}
								} else if kv == "预算" {
									if strings.Contains(k, "万元") {
										for vsk, vsv := range vs {
											if !strings.Contains(vsv, "万元") {
												vs[vsk] = vsv + "万元"
											}
										}
									}
								}
							}
						} else {
							//break
						}
					}
					newSmap := []map[string]interface{}{}
					//qutil.Debug("smap=======", smap)
					//qutil.Debug("tmpEntname--", len(tmpEntname), tmpEntname)
					//qutil.Debug("tmpPrice--", len(tmpPrice), tmpPrice)
					for n, smap_v := range smap {
						//if hadSort { // with a ranking, add entname and price
						if len(tmpEntname) > 0 && n < len(tmpEntname) && tmpEntname[n] != "" {
							smap_v["entname"] = tmpEntname[n]
							if len(tmpPrice) > 0 && n < len(tmpPrice) && tmpPrice[n] != "" {
								smap_v["price"] = tmpPrice[n]
							}
						}
						//} else if len(tmpEntname) > 0 {
						//fmt.Println("table winnerorder only has entname", tmpEntname)
						//}
						//qutil.Debug("len-smap_v--", len(smap_v))
						if len(smap_v) > 2 { // entries holding only sort and sortstr are dropped
							newSmap = append(newSmap, smap_v)
						}
					}
					if len(newSmap) > 0 {
						table.WinnerOrder = newSmap
					}
				}
			} else if vsss, ok := v.([]string); ok {
				// A WinnerOrder already exists: a column of the same length
				// may supply missing prices (or override them for 总报价).
				if (len(table.WinnerOrder) > 0 && table.WinnerOrder[0]["price"] == nil && len(vsss) == len(table.WinnerOrder)) ||
					(len(table.WinnerOrder) > 0 && strings.Contains(k, "总报价") && len(vsss) == len(table.WinnerOrder)) {
					kv := ""
					if winMoneyReg.MatchString(k) {
						kv = "中标金额"
					} else {
						kt := u.GetTags(k, isSite, codeSite)
						if kt.Len() > 0 {
							if kt[0].Value == "单品报价" && winnertag {
								kv = "中标金额"
							} else {
								kv = kt[0].Value
							}
						}
					}
					if kv == "中标金额" {
						for i, vx := range vsss {
							p1num := numberReg2.FindString(vx)
							if strings.Contains(p1num, ",") && strings.Contains(p1num, ".") {
								p1num = strings.ReplaceAll(p1num, ",", "")
							}
							p1 := qutil.Float64All(p1num)
							if p1 > 0 {
								//smap[vsk]["price"] = winnerOrderEntity.clear("中标金额", vsv+GetMoneyUnit(k, vsv))
								price := winnerOrderEntity.clear(kv, vx+GetMoneyUnit(k, vx))
								if pricestr, _ := price.(string); len(pricestr) < 30 && len(pricestr) > 0 && !clearnum.MatchString(pricestr) {
									table.WinnerOrder[i]["price"] = pricestr
								}
							}
						}
					}
				} else if table.StandKV[k] == nil {
					kvTags, _ := CommonDataAnaly(k, table.Tag, table.Desc, v, isSite, codeSite) // standardize the key
					MergeKvTags(table.StandKV, kvTags)
				}
			}
		}
	}
	// Special handling -- combined candidates.
}

// analyTdKV: 1. adds the SortKV of every td of every row to table.SortKV;
// 2. handles tds that contain a nested table.
func (table *Table) analyTdKV() {
	// Walk every row.
	for
_, tr := range table.TRs {
		for _, td := range tr.TDs {
			//fmt.Println(td.BH, td.MustBH, td.Val, td.SortKV.Map)
			bc := false
			if !td.BH { // is the header useless content?
				if td.HeadTd != nil {
					bc, _, _, _, _ = CheckCommon(td.HeadTd.Val, "abandontable")
				}
			}
			if !bc {
				// The td has embedded key/value pairs: copy them to the table.
				if len(td.SortKV.Keys) > 0 {
					for _, k3 := range td.SortKV.Keys {
						_val := td.SortKV.Map[k3]
						//thisFlag := false
						// Short keys are prefixed with the header text.
						if td.HeadTd != nil && len([]rune(k3)) < 4 {
							k3 = td.HeadTd.Val + k3
						}
						if table.SortKV.Map[k3] == nil && _val != nil && _val != "" {
							//u.Debug(k3, _val)
							//if !thisFlag || (thisFlag && table.SortKV.Map[k3] == nil) {
							table.SortKV.AddKey(k3, _val)
						}
					}
				}
			}
			// Nested table handling.
			//u.Debug(td.BH, td.Val, td.SonTableResult)
			if td.SonTableResult != nil {
				//u.Debug(td.SonTableResult.SortKV.Map, "-------", td.SonTableResult.Tabs)
				for k3, v3 := range td.SonTableResult.KvTags {
					table.StandKV[k3] = append(table.StandKV[k3], v3...)
				}
				// Candidate ranking.
				if table.WinnerOrder == nil || len(table.WinnerOrder) == 0 {
					table.WinnerOrder = td.SonTableResult.WinnerOrder
				} else {
					winnerOrderEntity.Merge(table.WinnerOrder, td.SonTableResult.WinnerOrder)
				}
			}
		}
	}
}

// MergerToTableresult merges this table's results into its parent
// TableResult: packages, standardized key/values, block tag, candidate
// ranking, brand data and price/number data.
func (table *Table) MergerToTableresult() {
	// Merge the packages of a multi-package table.
	if table.BPackage {
		table.TableResult.IsMultiPackage = true
		for _, v2 := range table.BlockPackage.Keys {
			package1 := table.TableResult.PackageMap.Map[v2]
			if package1 == nil {
				// First time this package is seen: store it as is.
				table.TableResult.PackageMap.AddKey(v2, table.BlockPackage.Map[v2])
				if vvv, ok := table.BlockPackage.Map[v2].(*u.BlockPackage); ok {
					if vvv.TableKV != nil && len(vvv.TableKV.KvTags) > 0 {
						MergeKvTags(table.TableResult.KvTags, vvv.TableKV.KvTags)
					}
				}
				continue
			}
			// The package already exists: merge v1 (this table) into bp.
			// Checked assertions: an unexpected entry is skipped instead of
			// panicking and aborting the whole merge.
			bp, _ := package1.(*u.BlockPackage)
			v1, _ := table.BlockPackage.Map[v2].(*u.BlockPackage)
			if bp == nil || v1 == nil {
				continue
			}
			if bp.TableKV == nil {
				bp.TableKV = u.NewJobKv()
			}
			if v1.TableKV != nil && len(v1.TableKV.KvTags) > 0 {
				for k2, tags := range v1.TableKV.KvTags {
					if k2 == "" {
						continue
					}
					for _, v2v := range tags {
						if v2v.Value == "" {
							continue
						}
						// Reset per value: previously one duplicate made every
						// following value of the same key look like a duplicate.
						isExists := false
						for _, v2vv := range bp.TableKV.KvTags[k2] {
							if v2v.Value == v2vv.Value {
								isExists = true
								break
							}
						}
						if !isExists {
							bp.TableKV.KvTags[k2] = append(bp.TableKV.KvTags[k2], v2v)
							// Log the value itself, not the whole tag slice.
							bp.Text += fmt.Sprintf("%v:%v\n", k2, v2v.Value)
						}
					}
				}
			}
			if bp.Bidamount <= 0 && !bp.IsTrueBidamount {
				bp.Bidamount = v1.Bidamount
				bp.IsTrueBidamount = v1.IsTrueBidamount
			}
			if bp.Budget <= 0 && !bp.IsTrueBudget {
				bp.Budget = v1.Budget
				bp.IsTrueBudget = v1.IsTrueBudget
			}
			// Was "bp.Text += bp.Text", which doubled the existing text on
			// every merge; append the text of the merged-in package instead.
			bp.Text += v1.Text
			if len(v1.WinnerOrder) > 0 && len(bp.WinnerOrder) == 0 {
				bp.WinnerOrder = v1.WinnerOrder
			}
		}
	}
	// Copy the standardized keys to the table result.
	for fieldKey, v := range table.StandKV {
		for _, vv := range v {
			// Duration fields: append the date unit found in the key when the
			// value does not already contain it.
			if fieldKey == "项目周期" || fieldKey == "工期单位" || fieldKey == "工期时长" {
				dateStr := dateReg.FindString(vv.Key)
				if dateStr != "" && !strings.Contains(vv.Value, dateStr) {
					vv.Value = vv.Value + dateStr
				}
			}
			vv.Value = strings.Replace(vv.Value, "__", "", -1)
		}
	}
	MergeKvTags(table.TableResult.KvTags, table.StandKV)
	// Block tag of the table.
	if table.TableResult.BlockTag == "" && table.Tag != "" {
		table.TableResult.BlockTag = table.Tag
	}
	// Candidates: with several tables only the first table's data is kept for
	// now (open question: should the tables be merged?).
	if table.TableResult.WinnerOrder == nil || len(table.TableResult.WinnerOrder) == 0 {
		table.TableResult.WinnerOrder = table.WinnerOrder
	}
	// Brand data of sibling tables.
	if len(table.BrandData) > 0 {
		for _, v := range table.BrandData {
			if len(v) > 0 {
				table.TableResult.BrandData = append(table.TableResult.BrandData, v)
			}
		}
	}
	// Price and number data of sibling tables.
	if len(table.PriceNumberData) > 0 {
		for _, v := range table.PriceNumberData {
			if len(v) > 0 {
				table.TableResult.PriceNumberData = append(table.TableResult.PriceNumberData, v)
			}
		}
	}
	// NOTE(review): this loop can never run — it ranges over Keys only when
	// len(Keys) == 0. The condition was probably meant to be "> 0"; it is left
	// unchanged because enabling it would alter the extracted results.
	if table.BlockPackage != nil && len(table.BlockPackage.Keys) == 0 {
		for _, v := range table.BlockPackage.Keys {
			if table.BlockPackage.Map[v] != nil {
				if vvv, ok := table.BlockPackage.Map[v].(*u.BlockPackage); ok {
					if vvv.TableKV != nil && len(vvv.TableKV.KvTags) > 0 {
						for kk, vv := range vvv.TableKV.KvTags {
							if kk == "" {
								continue
							}
							if len(table.TableResult.KvTags[kk]) == 0 {
								table.TableResult.KvTags[kk] = vv
							}
						}
					}
				}
			}
		}
	}
}

// AnalyTableV2 is the entry point of table parsing. It builds a TableResult
// for the given table selection and analyses it. Analysis is skipped when
// blockTag matches fblbReg or when the table element is hidden. For itype 1
// the HTML is first passed through RepairCon.
func AnalyTableV2(tabs *goquery.Selection, toptype, blockTag, con string, itype int, _id interface{}, ruleBlock *u.RuleBlock, isSite bool, codeSite string) (tabres *TableResult) {
	defer qutil.Catch()
	//u.Debug(con)
	if itype == 1 {
		// Repair the table markup.
		con = RepairCon(con)
	}
	// Create the TableResult.
	tabres = NewTableResult(_id, toptype, blockTag, con, itype, ruleBlock)
	if fblbReg.MatchString(blockTag) {
		return
	}
	// Hidden tables are skipped.
	if IsHide(tabs) {
		return
	}
	tabres.GoqueryTabs = tabs
	// Analyse the table set.
	tabres.Analy(isSite, codeSite)
	return
}

// Analy analyses the table set of the TableResult.
func (ts *TableResult) Analy(isSite bool, codeSite string) {
	tabs := []*Table{}
	contactFormat := &u.ContactFormat{
		IndexMap: map[int]string{},
		MatchMap: map[string]map[string]bool{},
	}
	//for _, table := range ts.GoqueryTabs {
	tn := NewTable(ts.Html, ts, ts.GoqueryTabs)
	// Core module.
	tsw := tn.Analy(contactFormat, isSite, codeSite)
	for _, tab := range tsw {
		if len(tab.TRs) > 0 {
			tabs = append(tabs, tab)
		}
		//fmt.Println("tab.SortKV.Map", tab.SortKV.Keys)
	}
//tn.SonTables = append(tn.SonTables, tn) //} //统一合并,考虑统一多表格是多包的情况---新增 与子表格合并 if len(tabs) > 1 { pns := map[string]string{} pnarr := []string{} for _, table := range tabs { if len(table.StandKV["项目名称"]) == 0 { continue } pn := table.StandKV["项目名称"][0] if pn != nil && pn.Value != "" && TitleReg.MatchString(pn.Value) { pnarr = append(pnarr, pn.Value) matchres := TitleReg.FindAllStringSubmatch(pn.Value, -1) if len(matchres) == 1 && len(matchres[0]) > 0 { v1 := u.PackageNumberConvert(matchres[0][0]) pns[v1] = matchres[0][0] bp := &u.BlockPackage{} bp.Index = v1 bp.Origin = matchres[0][0] if bp.TableKV == nil { bp.TableKV = u.NewJobKv() } for _, k := range []string{"中标金额", "中标单位", "预算", "成交状态", "项目名称", "项目编号", "采购范围"} { if len(table.StandKV[k]) > 0 { bp.TableKV.KvTags[k] = append(bp.TableKV.KvTags[k], &u.Tag{Key: k, Value: table.StandKV[k][0].Value}) } } bp.WinnerOrder = table.WinnerOrder if table.BlockPackage.Map[v1] == nil { table.BPackage = true table.BlockPackage.AddKey(v1, bp) } else { table.BlockPackage.RemoveKey(v1) table.BlockPackage.AddKey(v1, bp) } } } } if len(tabs) == len(pns) { //多个表格,每个表格都是一个分包 http://www.cxzwfw.gov.cn/info/1009/6963.htm //项目名称、项目编号、采购单位、招标机构、预算 pname := projectnameReg.ReplaceAllString(pnarr[0], "") btrue := true for _, pn := range pnarr[1:] { pn = projectnameReg.ReplaceAllString(pn, "") //u.Debug(pn, pname) if pn != pname { //项目名称不一致 btrue = false break } } if btrue { ts.KvTags["项目名称"] = append(ts.KvTags["项目名称"], &u.Tag{Key: "项目名称", Value: pname, Weight: 100}) for _, table := range tabs { table.BPackage = true //预算、中标金额、NullTxtBid成交供应商排名 中标单位 成交状态 if table.BlockPackage != nil && len(table.BlockPackage.Keys) == 1 { bp := table.BlockPackage.Map[table.BlockPackage.Keys[0]].(*u.BlockPackage) if table.TableResult.WinnerOrder != nil { bp.WinnerOrder = table.WinnerOrder } if bp != nil && table.StandKV != nil { if bp.TableKV == nil { bp.TableKV = u.NewJobKv() } for nk, k := range []string{"中标金额", "中标单位", "预算", "成交状态", "项目名称", "项目编号", "采购范围"} { 
if len(table.StandKV[k]) > 0 { bp.TableKV.KvTags[k] = append(bp.TableKV.KvTags[k], &u.Tag{Key: k, Value: table.StandKV[k][0].Value}) } if nk < 4 { delete(table.StandKV, k) } } } } } } } } for _, table := range tabs { table.MergerToTableresult() MergeKvTags(ts.KvTags, table.TableResult.KvTags) if !table.Brule { ts.isUnRulesTab = true } } } //解析表格 func (table *Table) Analy(contactFormat *u.ContactFormat, isSite bool, codeSite string) []*Table { //查找表体中的tr对象 trs := table.Goquery.ChildrenFiltered("tbody,thead,tfoot").ChildrenFiltered("tr") if trs.Size() == 0 { trs = table.Goquery.ChildrenFiltered("tr") } ztb := table.Goquery.Find("table").Size() if ztb >= 9 { return []*Table{} } //遍历节点,初始化table 结构 TRs Sorts table.createTabe(trs, isSite, codeSite) if len(table.TRs) == 0 { return []*Table{} } //重置行列 table.ComputeRowColSpan() //对table结构体进行整体解析处理 ts := table.AnalyTables(contactFormat, isSite, codeSite) return ts } var fblbReg *regexp.Regexp = regexp.MustCompile("(废标|流标|负责人资格|负责人业绩|相关业绩|技术评分明细表|开标记录|附件[:0-9]|越南盾|技术分|填报项目业绩|未通过.*原因)") //遍历节点,初始化table 结构体 func (table *Table) createTabe(trs *goquery.Selection, isSite bool, codeSite string) { trs.Each(func(n int, sel *goquery.Selection) { //隐藏行不处理 if IsHide(sel) { return } //遍历每行的td tds := sel.ChildrenFiltered("td,th") TR := NewTR(table) tdTextIsNull := false var empty int tds.Each(func(m int, selm *goquery.Selection) { //对隐藏列不处理!!! 
if IsHide(selm) { return } //进入每一个单元格 td := NewTD(selm, TR, table, isSite, codeSite) //初始化td,kv处理,td中有table处理,td的方向 //num++ TR.AddTD(td) if td.Val == "" && td.SonTableResult == nil && len(td.SortKV.Map) == 0 { //删除一个tr,tr中所有td是空值的 empty++ if tds.Size() == empty { tdTextIsNull = true } } }) //向table添加每行不为空的tr if !tdTextIsNull { table.AddTR(TR) } }) } //对table进行整体解析处理 func (tn *Table) AnalyTables(contactFormat *u.ContactFormat, isSite bool, codeSite string) []*Table { ts := tn.tableSubDemolitionTable() //分包,拆表 for n, table := range ts { //处理每个table if len(table.TRs) > 0 { //删除尾部空白行 table.deleteTrimTr() //table.Print() //校对表格 table.Adjust(isSite, codeSite) //查找表格的标签,table.Tag字段 table.FindTag() //分割表格 table.bSplit(n, ts, isSite, codeSite) table.TdContactFormat(contactFormat, isSite, codeSite) //contactFormat,处理采购单位,代理机构 //开始查找kv,核心模块,table.SortKV table.FindKV(isSite, codeSite) //table中抽取品牌,table.BrandData if u.IsBrandGoods { table.analyBrand() } //table中抽取单价和个数 if u.IsPriceNumber { //qutil.Debug("======================抽取price和number===========") table.extractPriceNumber() } res, _, _, _, _ := CheckCommon(table.Tag, "abandontable") if !res { //过滤、标准化、合并kv,table.StandKV,table.StandKVWeight table.KVFilter(isSite, codeSite) } //对有表头表格的处理 if table.Tag != "" { co, m, b := CheckMultiPackage(table.Tag) //分包处理 if b { table.BPackage = b if len(table.BlockPackage.Map) == 0 { for _, av := range m { kv := u.NewJobKv() kv.KvTags = table.StandKV bd := u.PackageNumberConvert(av[0]) blockPackage := &u.BlockPackage{ Origin: av[0], Name: av[0], Text: co, TableKV: kv, Index: bd, } if bd != "" { table.BlockPackage.AddKey(bd, blockPackage) } else { table.BlockPackage.AddKey(av[0], blockPackage) } } } table.StandKV["项目名称"] = append(table.StandKV["项目名称"], &u.Tag{Key: "项目名称", Value: table.Tag, Weight: -300}) } } //判断是否是多包,并处理分包的//遍历td分块 table.CheckMultiPackageByTable(isSite, codeSite) //分包处理 //MergeKvTags(table.TableResult.KvTags, table.StandKV) } } return ts } //分包,拆表 func (table *Table) 
tableSubDemolitionTable() []*Table { tm := []map[string]interface{}{} tmk := map[string]bool{} tmn := map[int]map[string]interface{}{} for rownum, tr := range table.TRs { if len(tr.TDs) == 1 && table.ColNum > 1 { //tr里面有一列,table里面有多列 td := tr.TDs[0] //取每行第一个td //td开始列等于0 && td结束列+1等于table列数 && td长度大于1小于50 if td.StartCol == 0 && td.EndCol+1 == table.ColNum && len([]rune(td.Val)) > 1 && len([]rune(td.Val)) < 50 { con, m1, b := CheckMultiPackage(td.Val) //判断分包 if b { for k, _ := range m1 { numstr := u.PackageNumberConvert(k) m2 := map[string]interface{}{ "tag": con, //"num": numstr, //"numtxt": v[0], "startrow": rownum, } tmk[numstr] = true tmn[rownum] = m2 tm = append(tm, m2) break } } } } } //拆表 ts := []*Table{} if len(tmk) > 1 && len(tmk) == len(tm) { var tab1 *Table for rownum, tr := range table.TRs { if tab1 == nil { tab1 = NewTable("", table.TableResult, table.Goquery) tab1.BSplit = true if tmn[rownum] != nil { tab1.StandKV["项目名称"] = append(tab1.StandKV["项目名称"], &u.Tag{Key: "项目名称", Value: tmn[rownum]["tag"].(string), Weight: -100}) } ts = append(ts, tab1) } if tmn[rownum] != nil { tab1.Tag = tmn[rownum]["tag"].(string) } else { tab1.AddTR(tr) } if tmn[rownum+1] != nil { tab1 = nil } } } else { ts = append(ts, table) } return ts } //分割表格 func (table *Table) bSplit(n int, ts []*Table, isSite bool, codeSite string) { if table.BSplit { if !table.BHeader && n > 0 { for i := n - 1; i > -1; i-- { if ts[i].BHeader { if ts[i].BFirstRow { //取第一行插入到 table.InsertTR(ts[i].TRs[0]) table.Adjust(isSite, codeSite) } break } } } } } //删除尾部空白行 func (table *Table) deleteTrimTr() { for len(table.TRs) > 0 { npos := len(table.TRs) tailTR := table.TRs[npos-1] //最后一个tr,取最后一行 bspace := true for _, v := range tailTR.TDs { if v.Val != "" || v.SonTableResult != nil || len(v.SortKV.Keys) > 0 { bspace = false break } } //删除尾部空行,是空行的话就删除 if bspace { table.TRs = table.TRs[:npos-1] } else { break } } } //校对表格 func (table *Table) Adjust(isSite bool, codeSite string) { //计算行列起止位置,跨行跨列处理 
table.ComputeRowColSpan()
	// for k1, tr := range table.TRs {
	// 	for k2, td := range tr.TDs {
	// 		qutil.Debug(k1, k2, td.Val, td.StartRow, td.EndRow, td.StartCol, td.EndCol)
	// 	}
	// }
	// Rough header ratio of every row/column span.
	table.GetKeyRation()
	/*
		for k, v := range table.StartAndEndRation {
			for k1, v1 := range v.Poss {
				bs, _ := json.Marshal(v1)
				str := ""
				for _, td := range v.Tdmap[v1] {
					str += "__" + td.Val + fmt.Sprintf("%d_%d_%d_%d", td.StartRow, td.EndRow, td.StartCol, td.EndCol)
				}
				qutil.Debug(k, k1, string(bs), v.Rationmap[v1], str)
			}
		}
	*/
	//u.Debug("tdnum:", num, table.RowNum, table.ColNum)
	// A regular table has exactly rows*columns cells.
	table.Brule = table.TDNum == table.RowNum*table.ColNum
	count := 0
	for _, trs := range table.TRs {
		for _, td := range trs.TDs {
			if td.BH {
				count++
			}
		}
	}
	if float32(count)/float32(table.TDNum) < 0.85 {
		// Precise header ratio of rows/columns.
		table.ComputeRowColIsKeyRation(isSite, codeSite)
		bhead := false
	L:
		for i, tr := range table.TRs {
			for _, td := range tr.TDs {
				if td.BH {
					//qutil.Debug("----=====---", td.Val, len(table.TRs[len(table.TRs)-1].TDs), i, len(table.TRs)-1)
					if i == len(table.TRs)-1 && len(table.TRs[len(table.TRs)-1].TDs) == 2 {
						res, _, _, _, _ := CheckCommon(td.Val, "abandontable")
						if res {
							// Drop this (last) row and start over.
							table.TRs = table.TRs[:len(table.TRs)-1]
							table.Adjust(isSite, codeSite)
							return
						}
					}
					bhead = true
					break L
				}
			}
		}
		table.BHeader = bhead
	}
}

// ComputeRowColSpan computes StartRow/EndRow/StartCol/EndCol of every cell
// from its Rowspan/Colspan, registers the cell under its column-span and
// row-span keys in table.StartAndEndRation, and updates table.ColNum,
// table.TDNum (number of cells) and table.RowNum (number of rows). The first
// cell of the first row is always marked as a header.
func (table *Table) ComputeRowColSpan() {
	n := 0                         // total number of cells
	mapRC := map[int]map[int]int{} // row -> start column -> columns occupied by a cell spanning down from above
	for k, v := range table.TRs {
		// A row without cells has nothing to position; indexing TDs[0] below
		// would panic on it.
		if len(v.TDs) == 0 {
			continue
		}
		n += len(v.TDs)
		nk := 0 // current start column (not 0 when merged cells precede)
		ball := true
		rowspans := v.TDs[0].Rowspan // rowspan of the row's first cell
		for k1, v1 := range v.TDs {
			if k1 == 0 && k == 0 {
				table.TRs[k].TDs[k1].MustBH = true
				table.TRs[k].TDs[k1].BH = true
			}
			if v1.Rowspan != rowspans {
				ball = false
				break
			}
		}
		for _, v1 := range v.TDs {
			// When every cell of the row has the same rowspan, the span
			// carries no information and is reset to 1.
			if ball {
				v1.Rowspan = 1
			}
			// Skip the columns taken by cells spanning down from earlier rows.
			mc := mapRC[k]
			for mc != nil && mc[nk] > 0 {
				nk += mc[nk]
			}
			v1.StartCol = nk
			nk += v1.Colspan - 1
			v1.EndCol = nk
			if nk >= table.ColNum {
				table.ColNum = nk + 1
			}
			nk++
			v1.StartRow = k
			v1.EndRow = k + v1.Rowspan - 1
			ck := fmtkey("c", v1.StartCol, v1.EndCol)
			tdcs := table.StartAndEndRation[ck]
			if tdcs == nil {
				tdcs = NewTDRationScope(ck)
				table.StartAndEndRation[ck] = tdcs
				table.StartAndEndRationKSort.AddKey(ck, 1)
			}
			tdcs.Addtd(v1)
			rk := fmtkey("r", v1.StartRow, v1.EndRow)
			tdrs := table.StartAndEndRation[rk]
			if tdrs == nil {
				tdrs = NewTDRationScope(rk)
				table.StartAndEndRation[rk] = tdrs
				table.StartAndEndRationKSort.AddKey(rk, 1)
			}
			tdrs.Addtd(v1)
			// Reserve this cell's columns in the rows it spans into.
			if v1.Rowspan > 1 {
				for i := 1; i < v1.Rowspan; i++ {
					r := k + i
					if r < len(table.TRs) {
						mc := mapRC[r]
						if mc == nil {
							mc = map[int]int{}
						}
						mc[v1.StartCol] = v1.Colspan
						mapRC[r] = mc
					}
				}
			}
		}
	}
	table.TDNum = n
	table.RowNum = len(table.TRs)
}

// fmtkey builds the span key "<t>_<start>_<end>", e.g. "c_0_2".
func fmtkey(t string, start, end int) string {
	return fmt.Sprintf("%s_%d_%d", t, start, end)
}

// FindTag sets the table tag to the block tag of the parent TableResult when
// the table has no tag of its own.
func (table *Table) FindTag() {
	if table.Tag != "" {
		return
	}
	table.Tag = table.TableResult.BlockTag
	//u.Debug(table.Tag)
}

// GetKeyRation computes, for every registered row/column span and each of
// its positions, the fraction of cells flagged as header (BH) and stores it
// in Rationmap.
func (table *Table) GetKeyRation() {
	for _, vn := range table.StartAndEndRationKSort.Keys {
		v := table.StartAndEndRation[vn]
		for _, v1 := range v.Poss {
			count := 0
			n := 0
			for _, td := range v.Tdmap[v1] {
				n++
				if td.BH {
					count++
				}
			}
			// NOTE(review): yields NaN if a position has no cells — presumably
			// every entry of Poss has at least one cell; confirm.
			v.Rationmap[v1] = float32(count) / float32(n)
		}
	}
}

// ComputeRowColIsKeyRation computes the probability that rows/columns are
// headers (based on the ratios from GetKeyRation).
func (table *Table) ComputeRowColIsKeyRation(isSite bool, codeSite string) {
	// Limits the correction for row spans.
	// u.Debug(table.Brule, table.ColNum, table.RowNum, table.TDNum)
	bkeyfirstrow := false
	bkeyfirstcol := false
	if table.Brule { // regular table: no row or column spans
		checkCompute := map[string]bool{}
		for k, tr := range table.TRs {
			rk := fmtkey("r", tr.TDs[0].StartRow, tr.TDs[0].EndRow)
			if k == 0 { // ratio of the first row
				ck := fmtkey("c", tr.TDs[0].StartCol, tr.TDs[0].EndCol)
				//u.Debug(table.BFirstRow, "--", table.StartAndEndRation[rk], table.StartAndEndRation[ck])
ration1, _ := table.StartAndEndRation[rk].GetTDRation(tr.TDs[0]) ration2, _ := table.StartAndEndRation[ck].GetTDRation(tr.TDs[0]) if (len(tr.TDs) == 2 && ration2 < 0.55) && (len(tr.TDs) == 2 && ration1 > 0.5) { //第一行为key bkeyfirstrow = true ball := true for _, td := range tr.TDs { if MoneyReg.MatchString(td.Val) { bkeyfirstrow = false ball = false td.BH = false break } } for _, td := range tr.TDs { if ball { //td.BH = true td.KeyDirect = 1 td.KVDirect = 2 } } } else if ration2 > 0.55 { //第1列 bkeyfirstcol = true if !checkCompute[ck] { checkCompute[ck] = true //重置第1列 for _, tr1 := range table.TRs { for _, td1 := range tr1.TDs { if td1.StartCol == 0 { if !MoneyReg.MatchString(td1.Val) { //td1.BH = true td1.KeyDirect = 2 td1.KVDirect = 1 } } } } } } if !bkeyfirstrow && !bkeyfirstcol { if len(tr.TDs) > 1 && ration1 > ration2 && ration1 > 0.5 { bkeyfirstrow = true for _, td := range tr.TDs { if !MoneyReg.MatchString(td.Val) { //td.BH = true td.KeyDirect = 1 td.KVDirect = 2 } } } else if tr.Table.ColNum > 1 && ration2 > 0.5 { bkeyfirstcol = true if !checkCompute[ck] { checkCompute[ck] = true //重置第1列 for _, tr1 := range table.TRs { for _, td1 := range tr1.TDs { if td1.StartCol == 0 { if !MoneyReg.MatchString(td1.Val) { td1.BH = true td1.KeyDirect = 2 td1.KVDirect = 1 } } } } } } } } else { if bkeyfirstrow { //第一列的概率 ration1, _ := table.StartAndEndRation[rk].GetTDRation(tr.TDs[0]) if k == 1 || ration1 < checkval { for _, td := range tr.TDs { if !td.MustBH { td.BH = false td.KeyDirect = 0 td.KVDirect = 0 } } } //else {for _, td := range tr.TDs {}} } else { //列在起作用 if bkeyfirstcol { for _, td := range tr.TDs { ck := fmtkey("c", td.StartCol, td.EndCol) ration1, _ := table.StartAndEndRation[ck].GetTDRation(td) if !checkCompute[ck] { checkCompute[ck] = true if ration1 >= checkval && td.ColPos != 1 { for _, tr1 := range table.TRs { for _, td1 := range tr1.TDs { if td1.StartCol == td.StartCol { if !MoneyReg.MatchString(td1.Val) { td1.BH = true td1.KeyDirect = 2 td1.KVDirect = 1 } 
} } } } else { for _, tr1 := range table.TRs[1:] { for _, td1 := range tr1.TDs[1:] { if td1.StartCol == td.StartCol && !td1.MustBH { td1.BH = false td1.KeyDirect = 0 td1.KVDirect = 0 } } } } } } } } } } } //qutil.Debug("table.Brule", table.Brule, !bkeyfirstcol && !bkeyfirstrow) if !table.Brule || (!bkeyfirstcol && !bkeyfirstrow) { //断行问题,虽然同列或同行,但中间被跨行截断,表格方向调整 for _, k := range table.StartAndEndRationKSort.Keys { v := table.StartAndEndRation[k] //横向判断,要判断最多的方向,否则会出现不定的情况(map遍历问题) k1 := k[:1] for _, v2 := range v.Poss { lentds := len(v.Tdmap[v2]) if v.Rationmap[v2] > checkval { for _, td := range v.Tdmap[v2] { if td.KeyDirect == 0 && !MoneyReg.MatchString(td.Val) && !GSReg.MatchString(td.Val) { if k1 == "r" { ck := fmtkey("c", td.StartCol, td.EndCol) rt := table.StartAndEndRation[ck] //clen := 0 var fv float32 var tdn []*TD if rt != nil { fv, tdn = rt.GetTDRation(td) //clen = len(tdn) } if lentds > 1 { if ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" { td.KeyDirect = 1 td.KVDirect = 2 //td.BH = true } } } else { ck := fmtkey("r", td.StartRow, td.EndRow) rt := table.StartAndEndRation[ck] var fv float32 var tdn []*TD //clen := 0 if rt != nil { fv, tdn = rt.GetTDRation(td) //clen = len(tdn) } if lentds > 1 { if td.Valtype != "NOHEAD" && utf8.RuneCountInString(td.Val) < 15 && ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" { td.KeyDirect = 2 td.KVDirect = 1 td.BH = true } } } } else { break } } } else if v.Rationmap[v2] < 0.5 && len(v.Tdmap[v2]) > 3 { for _, td := range v.Tdmap[v2] { // u.Debug(td.Val, "-----", td.BH) if td.KeyDirect == 0 && td.BH && !td.MustBH { if k1 == "r" { ck := fmtkey("c", td.StartCol, td.EndCol) rt := table.StartAndEndRation[ck] clen := 0 var fv float32 var tdn []*TD if rt != nil { fv, tdn = rt.GetTDRation(td) clen = len(tdn) } if lentds >= clen && lentds > 1 { if (tdn != nil && v.Rationmap[v2] < fv) || tdn == nil { td.BH = false } } } else { ck := fmtkey("r", td.StartRow, td.EndRow) rt := 
table.StartAndEndRation[ck] var fv float32 var tdn []*TD clen := 0 if rt != nil { fv, tdn = rt.GetTDRation(td) clen = len(tdn) } if lentds >= clen && lentds > 1 { if (tdn != nil && v.Rationmap[v2] < fv) || tdn == nil { td.BH = false } } } } else { break } } } } } } table.GetKeyRation() if len(table.TRs) > 0 && len(table.TRs[0].TDs) > 0 { t0 := table.TRs[0].TDs[0] key := fmtkey("r", t0.StartRow, t0.EndRow) r, t := table.StartAndEndRation[key].GetTDRation(t0) if r > 0.9 && len(t) > 1 { table.BFirstRow = true } for k, tr := range table.TRs { if len(tr.TDs) == 1 && tr.TDs[0].StartCol == 0 && tr.TDs[0].EndCol+1 == table.ColNum { tr.TDs[0].BH = false tr.TDs[0].KVDirect = 0 sv := FindKv(tr.TDs[0].Val, "", 2) _, resm := colonkvEntity.entrance(tr.TDs[0].Val, "", nil, 2, isSite, codeSite) for k, v := range resm { sv.AddKey(k, v) } if len(sv.Keys) > 0 { for _, v1 := range sv.Keys { if tr.TDs[0].SortKV.Map[v1] == nil { table.SortKV.AddKey(v1, sv.Map[v1]) } } } else if table.Tag == "" && k == 0 && len(tr.TDs[0].Val) > 11 { table.Tag = tr.TDs[0].Val } } } } } //查找表格的kv,调用FindTdVal func (table *Table) FindKV(isSite bool, codeSite string) { //判断全是key的表格不再查找 if table.BHeader { //只要一个是key即为true direct := If(table.BFirstRow, 2, 1).(int) //kv,2查找方向,向上查找 vdirect := If(direct == 2, 1, 2).(int) //控制跨行表格 bcon := false //增加表格切块判断,只判断切块分包 //控制中标人排序方向 //bodirect := 0 //控制中标人排序数值 //sort := 1 nextdirect, nextvdirect := 0, 0 //开始抽取 //若第一排全为头-临时让第二排-新增 左临 查询,zhengkun tb_first_allhead := false for tr_index, tr := range table.TRs { if tr_index==6 { //fmt.Println("调试指定tr") } bcon = trSingleColumn(tr, bcon, table) //tr单列,是否丢弃内容 if bcon { continue } if tr.TDs[0].StartRow >= 0 { numbh := 0 for _, td := range tr.TDs { //log.Println(tr_index,kkk,td.Val) if td.BH { numbh++ } } if numbh != 0 && numbh == len(tr.TDs) { //5e0d53ef0cf41612e0640495 if tr_index==0 { tb_first_allhead = true } nextdirect, nextvdirect = 2, 1 continue } else if nextdirect > 0 && nextvdirect > 0 { direct, vdirect = 2, 1 } else if 
numbh > 0 && numbh <= len(tr.TDs)/2 { direct, vdirect = 1, 2 } else { direct, vdirect = 2, 1 } } for _, td := range tr.TDs { if !td.BH && td.KVDirect < 3 { if !table.FindTdVal(td, direct, vdirect) { //table.FindTdVal()存储了table.SortKV if !table.FindTdVal(td, vdirect, direct) { ////都识别不到时,对第一、二中标候选人的处理 //bo, res := GetBidOrder(td, bodirect, sort) //if res { // sort++ // bodirect = bo //} //if len(td.SortKV.Map) > 0 { // for _, tdv := range td.SortKV.Keys { // if tdv == "" || td.SortKV.Map[tdv] == "" { //value为空或者null不再添加到table.SortKV // continue // } // table.SortKV.AddKey(tdv, td.SortKV.Map[tdv]) // } //} } } if tb_first_allhead && tr_index==1 { //临时-让第二排-向左比对 if !table.FindTdVal(td, 1, 2) { //table.FindTdVal()存储了table.SortKV if !table.FindTdVal(td, vdirect, direct) { } } tb_first_allhead = false } //fmt.Println("td:", td.Val, td.BH, td.HeadTd, td.KVDirect) } } nextdirect, nextvdirect = 0, 0 } //qutil.Debug("FindKV", table.SortKV.Map) } else if len(table.TRs) > 0 { //没有表头的表格处理,默认纵向吧 res := initLongitudinalData(table) //拼装纵向数组 //再拆值,类似http://www.ggzy.hi.gov.cn/cgzbgg/16553.jhtml第二列,有多个值 nmapkeys := []int{} nmap := map[int][]*u.Kv{} L: for _, r1 := range res { for n, r := range r1 { if len([]rune(r)) < 60 { // 长度小于60才去分 //res1, _ := GetKVAll(r, "", nil) res1, _ := colonkvEntity.entrance(r, "", nil, 2, isSite, codeSite) if res1 != nil { nmap[n] = res1 nmapkeys = append(nmapkeys, n) /** //截取串 for _k1, _ := range res1 { r = regexp.MustCompile(_k1+".*").ReplaceAllString(r, "") } r1[n] = r res[pos] = r1 **/ } else if nmap[n] != nil { //放空值 nmap[n] = append(nmap[n], &u.Kv{}) } } else { nmap = nil nmapkeys = nil break L } } } //调整 if len(nmap) > 0 { kmapkeys := []string{} kmap := map[string][]string{} for _, mk := range nmapkeys { //同是第n列 for pos, m1 := range nmap[mk] { k, v := m1.Key, m1.Value kv := kmap[k] if kv == nil { kv = []string{} } kv = append(kv, v) kmap[k] = kv kmapkeys = append(kmapkeys, k) for _, k := range kmapkeys { arr := kmap[k] if len(arr) < pos { arr = 
append(arr, "") kmap[k] = arr kmapkeys = append(kmapkeys, k) } } } } if len(kmap) > 0 { for _, k := range kmapkeys { if len(kmap[k]) == 1 { table.SortKV.AddKey(k, kmap[k][0]) } else if len(kmap[k]) > 1 { table.SortKV.AddKey(k, kmap[k]) } } } } //================= //解析值放到map中 for _, arr := range res { if len(arr) > 0 { v1 := arr[0] _, _, _, _, repl := CheckCommon(v1, "con") if repl == "ENT" { table.SortKV.AddKey("中标单位", arr) continue } else if repl == "BO" { table.SortKV.AddKey("排名", arr) continue } } } } //qutil.Debug("Table-FindKV", table.SortKV.Map) } //初始化组装纵向数据 func initLongitudinalData(table *Table) [][]string { res := make([][]string, len(table.TRs[0].TDs)) //创建table第一行的列数长度 for n, _ := range res { res[n] = []string{} } for _, tr := range table.TRs { for n, td := range table.TRs[0].TDs { //第一行的所有td td1 := table.GetTdByRCNo(tr.TDs[0].StartRow, td.StartCol) //根据行号列号获取td对象 if td1 != nil { res[n] = append(res[n], td1.Val) } else { res[n] = append(res[n], "") } } } return res } //tr单列,是否丢弃内容 func trSingleColumn(tr *TR, bcon bool, table *Table) bool { if len(tr.TDs) == 1 { bcon = false td := tr.TDs[0] if td.StartCol == 0 && td.EndCol+1 == table.ColNum && len([]rune(td.Val)) > 4 && len([]rune(td.Val)) < 50 { res, _, _, _, _ := CheckCommon(td.Val, "abandontable") if res { //以下内容丢弃 bcon = true } } } return bcon } //获取中标人顺序 //direct 0默认 1横向 2纵向 func GetBidOrder(td *TD, direct, n int) (d int, res bool) { if td.Valtype != "BO" { return } if td.Rowspan > 1 { for i := 0; i < td.Rowspan; i++ { nextcol := 1 L1: for { vtd := td.TR.Table.GetTdByRCNo(td.StartRow+i, td.EndCol+nextcol) if vtd == nil { break L1 } nextcol += vtd.Colspan if filter_zbdw_v2.MatchString(vtd.Val) { arrbo := td.TR.Table.SortKV.Map[NullTxtBid] if arrbo == nil { arrbo = []map[string]interface{}{} td.TR.Table.SortKV.AddKey(NullTxtBid, arrbo) } a1 := arrbo.([]map[string]interface{}) a1 = append(a1, map[string]interface{}{ "entname": vtd.Val, "sortstr": td.Val, "sort": GetBidSort(td.Val, n), }) res = true 
td.TR.Table.SortKV.AddKey(NullTxtBid, a1) } } } } else if td.Colspan > 1 { for i := 1; i < td.Colspan; i++ { nextcol := 0 L2: for { vtd := td.TR.Table.GetTdByRCNo(td.StartRow+i, td.StartCol+nextcol) if vtd == nil || vtd.Colspan >= td.Colspan { break L2 } nextcol += vtd.Colspan if filter_zbdw_v2.MatchString(vtd.Val) { arrbo := td.TR.Table.SortKV.Map[NullTxtBid] if arrbo == nil { arrbo = []map[string]interface{}{} td.TR.Table.SortKV.AddKey(NullTxtBid, arrbo) } a1 := arrbo.([]map[string]interface{}) a1 = append(a1, map[string]interface{}{ "entname": vtd.Val, "sortstr": td.Val, "sort": GetBidSort(td.Val, n), }) res = true td.TR.Table.SortKV.AddKey(NullTxtBid, a1) } } } } else { rtd := td.TR.Table.GetTdByRCNo(td.StartRow, td.EndCol+1) btd := td.TR.Table.GetTdByRCNo(td.EndRow+1, td.StartCol) //if ((rtd != nil && !rtd.BH && rtd.Valtype == "BO") || direct == 1) && btd != nil && filter_zbdw_v.MatchString(btd.Val) { if ((rtd != nil && !rtd.BH) || direct == 1) && btd != nil && filter_zbdw_v2.MatchString(btd.Val) { d = 1 arrbo := td.TR.Table.SortKV.Map[NullTxtBid] if arrbo == nil { arrbo = []map[string]interface{}{} td.TR.Table.SortKV.AddKey(NullTxtBid, arrbo) } a1 := arrbo.([]map[string]interface{}) a1 = append(a1, map[string]interface{}{ "entname": btd.Val, "sortstr": td.Val, "sort": GetBidSort(td.Val, n), }) res = true td.TR.Table.SortKV.AddKey(NullTxtBid, a1) //} else if ((btd != nil && !btd.BH && btd.Valtype == "BO") || direct == 2) && rtd != nil && filter_zbdw_v.MatchString(rtd.Val) { } else if ((btd != nil && !btd.BH) || direct == 2) && rtd != nil && filter_zbdw_v2.MatchString(rtd.Val) { d = 2 arrbo := td.TR.Table.SortKV.Map[NullTxtBid] if arrbo == nil { arrbo = []map[string]interface{}{} td.TR.Table.SortKV.AddKey(NullTxtBid, arrbo) } a1 := arrbo.([]map[string]interface{}) a1 = append(a1, map[string]interface{}{ "entname": rtd.Val, "sortstr": td.Val, "sort": GetBidSort(td.Val, n), }) res = true td.TR.Table.SortKV.AddKey(NullTxtBid, a1) } } return } func GetBidSort(str 
string, n int) int { val := n if strings.Index(str, "首选") > -1 { val = 1 } else { val = winnerOrderEntity.toNumber(str, n) } return val } var cleardwReg *regexp.Regexp = regexp.MustCompile("[((]{1}\\d*[人元件个公斤户]/[人元件个公斤户][))]") var zbhxrReg *regexp.Regexp = regexp.MustCompile("(中标候选人|投标单位名称)") //查找每一个单元格的表头,调用FindNear func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) { if td.Val == "" || strings.TrimSpace(td.Val) == "" { return } near := table.FindNear(td, direct) if near != nil && near.BH && (near.KeyDirect == vdirect || near.KeyDirect == 0) && (near.KVDirect == direct || near.KVDirect == 0) && near.KVDirect < 3 { near.KVDirect = direct near.KeyDirect = vdirect td.KVDirect = direct key := repSpace.ReplaceAllString(near.Val, "") if key == "名称" && near.StartCol == 0 && near.Rowspan > 0 { for _, vn := range table.TRs[near.Rowspan-1].TDs { if strings.Contains(vn.Val, "代理") { key = "代理机构" break } else if strings.Contains(vn.Val, "招标") { key = "采购单位" break } else if strings.Contains(vn.Val, "中标") { key = "中标单位" break } } } else if zbhxrReg.MatchString(key) && findCandidate2.MatchString(td.Val) { key = "中标单位" } else if key == "单位名称" { tmpnewnear := table.FindNear(near, 2) if tmpnewnear != nil { if tmpnewnear.MustBH || tmpnewnear.BH { key = tmpnewnear.Val + near.Val } } else { tmpnewnear = table.FindNear(near, 1) if tmpnewnear != nil { if tmpnewnear.MustBH || tmpnewnear.BH { key = tmpnewnear.Val + near.Val } } } } if near.Val == "" { key = fmtkey("k", near.TR.RowPos, near.ColPos) } val := table.SortKV.Map[key] //qutil.Debug("====================", "key:", key, "val:", val) bthiskey := false if val != nil { curpos := table.SortKV.Index[key] thistr := table.kTD[curpos] if thistr != near { if strings.TrimSpace(near.Val) == "名称" && near.TR != nil && len(near.TR.TDs) > 0 && near.ColPos-1 >= 0 { rv := near.TR.TDs[near.ColPos-1].Val if near.ColPos > 0 && (strings.Contains(rv, "招标") || strings.Contains(rv, "代理") || strings.Contains(rv, "采购") || 
strings.Contains(rv, "中标")) { near = near.TR.TDs[near.ColPos-1] } } else { bthiskey = true } } else { bthiskey = true } } bfind := false barr := false varrpos := -1 if bthiskey { //处理是数组值,且有合并行或合并列的情况 kvscope,对数组值的处理 pos := table.SortKV.Index[key] mval := table.kvscope[pos] bvalfind := false if direct == 1 { //kv是横向 L1: for k3, v3 := range mval { for _, v4 := range v3 { if v4.EndRow+1 == td.StartRow && v4.EndCol == td.EndCol { varrpos = k3 bvalfind = true break L1 } } } } else { //kv是纵向 L2: for k3, v3 := range mval { for _, v4 := range v3 { if v4.EndCol+1 == td.StartCol && v4.EndRow == td.EndRow { varrpos = k3 bvalfind = true break L2 } } } } if vals, ok := val.([]string); ok { if near.Val == "" { bn := false for _, vs := range vals { if vs != "" && NullTdReg.MatchString(vs) { bn = true } else { bn = false break } } if bn { near.Val = NullTxtBid key = NullTxtBid bfind = true } } if bvalfind && varrpos > -1 && len(vals) > varrpos { tmapval := strings.TrimSpace(cleardwReg.ReplaceAllString(td.Val, "")) if tmapval == "" { vals = append(vals, td.Val) // 累加 } else { vals = append(vals, tmapval) // 累加 } val = vals //vals[varrpos] = td.Val // += "__" + td.Val } else { //添加时候去除空值和nil newVals := []string{} for _, isval := range vals { if isval == "" { continue } newVals = append(newVals, isval) } //vals = append(vals, td.Val) if td.Val != "" { newVals = append(newVals, td.Val) } val = newVals varrpos = len(vals) - 1 } } else if vals, ok := val.(string); ok && vals != "" && td.Val != "" { tmapval := strings.TrimSpace(cleardwReg.ReplaceAllString(vals, ""))//已存在的kv tmapvaltd := strings.TrimSpace(cleardwReg.ReplaceAllString(td.Val, "")) if bvalfind { //if tmapvaltd == "" { // val = td.Val //vals + "__" + td.Val //} else { // val = tmapvaltd //} if key=="中标单位" { //不能覆盖--- }else { if tmapvaltd == "" { val = td.Val //vals + "__" + td.Val } else { val = tmapvaltd } } } else{ if key=="中标单位" { //新增不能数组 }else { tval := []string{} if tmapval == "" { tval = append(tval, vals) } else { 
tval = append(tval, tmapval) } if tmapvaltd == "" { tval = append(tval, td.Val) } else { tval = append(tval, tmapvaltd) } val = tval varrpos = 1 } } } barr = true } else { if td.Val != "" { tmapval := strings.TrimSpace(cleardwReg.ReplaceAllString(td.Val, "")) if tmapval == "" { val = td.Val } else { val = tmapval } } else if len(near.SortKV.Map) == 1 && near.SortKV.Map[near.Val] != "" { val = near.SortKV.Map[near.Val] } } td.HeadTd = near if bfind { tkey := fmtkey("k", near.TR.RowPos, near.ColPos) table.SortKV.ReplaceKey(key, val, tkey) } else { if key == "单位名称" && len(near.TR.TDs) > 1 { if near.TR.TDs[0].Val != "序号" { key = near.TR.TDs[0].Val } } table.SortKV.AddKey(key, val) pos := table.SortKV.Index[key] if barr { mval := table.kvscope[pos] if mval != nil { tds := mval[varrpos] if tds != nil { tds = append(tds, td) } else { tds = []*TD{td} } if varrpos > -1 { mval[varrpos] = tds table.kvscope[pos] = mval } } } else { table.kvscope[pos] = map[int][]*TD{ 0: []*TD{td}, } table.kTD[pos] = near } } b = true } return } //查找单元格的表头时,横向或纵向 func (table *Table) FindNear(td *TD, direct int) *TD { if direct == 1 && td.StartCol > 0 { //左临 tr := table.TRs[:td.TR.RowPos+1] for i := len(tr) - 1; i > -1; i-- { tds := tr[i].TDs for _, td1 := range tds { if td1.StartRow <= td.StartRow && td1.EndRow >= td.EndRow && td1.EndCol+1 == td.StartCol { //找到左临节点 if td1.BH { return td1 } else { if td1.HeadTd != nil && td1.HeadTd.KVDirect == direct { return td1.HeadTd } } } } } } else if direct == 2 && td.StartRow > 0 { //上临 tr := table.TRs[:td.TR.RowPos] for i := len(tr) - 1; i > -1; i-- { tds := tr[i].TDs for it, td1 := range tds { if td1.StartCol <= td.StartCol && td1.EndCol >= td.EndCol && td1.EndRow+1 == td.StartRow { //找到左临节点 if td1.BH { return td1 } else if len(tr[i].TDs) == len(td.TR.TDs) && td1.HeadTd != nil && td1.HeadTd.KVDirect == direct { return td1.HeadTd } else if it > 0 && td1.Val == "" && td1.TR.TopTR == nil && len(td.TR.TDs)-(td.StartCol-1) > 0 && 
strings.Contains(td.TR.TDs[td.StartCol-1].Val, "中标候选人") { return tds[it-1] } else if td1.HeadTd != nil && td1.HeadTd.KVDirect == direct && td.Colspan == td1.Colspan && td.Rowspan == td.Rowspan { return td1.HeadTd } } else if td1.StartCol <= td.StartCol && td1.EndCol >= td.EndCol && td1.EndRow+1 == td.StartRow { } } } } return nil } //根据行号列号获取td对象 func (tn *Table) GetTdByRCNo(row, col int) *TD { for _, tr := range tn.TRs { for _, td := range tr.TDs { if td.StartCol <= col && td.EndCol >= col && td.StartRow <= row && td.EndRow >= row { return td } } } return nil } //判断表格是否是分包 func (tn *Table) CheckMultiPackageByTable(isSite bool, codeSite string) (b bool, index []string) { pac := 0 //包的数量 val := 0 //分值 index = []string{} //存储分包,使用tbale.SortKV的key和value使用正则等处理对值进行判断 index_pos := []int{} //下标 //是数组且能找到标段之类的提示 //arr_count := 0 //计数table.SortKV的value是数组的数量,后面没用 key_index := -1 hasPkgTd := map[string]bool{} //初始化CheckMultiPackageByTable方法需要的数据 key_index, index, index_pos, val, pac, hasPkgTd = initCheckMultiPackageByTable(tn, key_index, index, index_pos, val, pac, hasPkgTd) //key是分包的情况 //记录key对应的值 commonKeyVals := map[string][]string{} //记录key出现的次数 keyExistsCount := map[string]int{} if pac > 1 { val = 10 } else { //查找标签 if TableMultiPackageReg_4.MatchString(tn.Tag) { val += 4 } else if TableMultiPackageReg_2.MatchString(tn.Tag) { val += 4 } //根据table.SortKV的key判断是否分包,如果没有再根据value判断 val, index, index_pos = foundPacBySortKV(tn, val, index, index_pos, &keyExistsCount, &commonKeyVals, key_index, hasPkgTd) } // u.Debug(index) //过滤重复及标准化! 
standIndex := []string{} standIndex_pos := []int{} oldIndex := []string{} //存放包的原始值 brepeat := map[string]bool{} for k, v := range index { v = u.PackageNumberConvert(v) if !brepeat[v] { brepeat[v] = true standIndex = append(standIndex, v) standIndex_pos = append(standIndex_pos, index_pos[k]) oldIndex = append(oldIndex, index[k]) } } index = standIndex //有一个以上的包,并且相同的key出现一次以上,认为这个key是属于包里面的 if len(commonKeyVals) > 0 { for k, v := range commonKeyVals { if len(index) > 1 && keyExistsCount[k] < 2 { continue } tn.SortKV.AddKey(k, v) } } // isGoonNext := false if val > 4 && len(brepeat) > 0 { b = true //多包解析 if b { tn.BPackage = true //pnum := len(index) //根据数组index分包长度添加table.BlockPackage子包数组 for nk, v := range index { if tn.BlockPackage.Map[v] == nil { kv := u.NewJobKv() for tnk, tnv := range tn.StandKV { if nk >= len(tnv) { continue } else if len(index) == len(tnv) { //特殊处理- if tnk=="预算"&& codeSite=="ha_zmdszfcgw_cgxx" && len(tnv)>1{ isEqErr,budget_v := false,"" for bk,bv:=range tnv { if bk==0 { budget_v = bv.Value }else { if budget_v != bv.Value { isEqErr = true break } } } if isEqErr { kv.KvTags[tnk] = append(kv.KvTags[tnk], tnv[nk]) } }else { kv.KvTags[tnk] = append(kv.KvTags[tnk], tnv[nk]) } } } //kv.KvTags = tn.StandKV bp := &u.BlockPackage{} bp.Index = v //序号 (转换后编号,只有数字或字母) bp.Origin = oldIndex[nk] //包的原始值 bp.TableKV = kv //table kv (分出的对应的KV值) bp.Name = v if bp.TableKV != nil && bp.TableKV.KvTags != nil && len(bp.TableKV.KvTags) > 0 { for kc, cv := range bp.TableKV.KvTags { if kc == "预算" && bp.Budget <= 0 { moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""}) if len(moneys) > 0 { if vf, ok := moneys[0].(float64); ok { bp.Budget = vf bp.IsTrueBudget = moneys[len(moneys)-1].(bool) } else if vi, ok := moneys[0].(int); ok { bp.Budget = float64(vi) bp.IsTrueBudget = moneys[len(moneys)-1].(bool) } } } else if kc == "中标金额" && bp.Bidamount <= 0 { moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""}) if len(moneys) > 0 { if vf, ok := moneys[0].(float64); ok 
{ bp.Bidamount = vf bp.IsTrueBidamount = moneys[len(moneys)-1].(bool) } else if vi, ok := moneys[0].(int); ok { bp.Bidamount = float64(vi) bp.IsTrueBidamount = moneys[len(moneys)-1].(bool) } } } else if kc == "中标单位" && bp.Winner == "" { bp.Winner = cv[0].Value } //拼接内容 if !excludeKey.MatchString(kc) { bp.Text += fmt.Sprintf("%v:%v\n", kc, cv[0].Value) } } } tn.BlockPackage.AddKey(v, bp) //table子包数组 } } isGoonNext = tn.manyPackageProcessByIndex(index, standIndex_pos, isSite, codeSite) //多包处理,处理不同情况下的分包 } } else { isGoonNext = true } if isGoonNext { //没有处理成数组的情况下,继续调用正文查找分包的方法 tn.isGoonNext(isSite, codeSite) } //查找分包中的中标人排序 if tn.BlockPackage != nil && tn.BlockPackage.Keys != nil && len(tn.BlockPackage.Keys) > 0 { for _, v := range tn.BlockPackage.Keys { vv, ok := tn.BlockPackage.Map[v].(*u.BlockPackage) if ok && (vv.WinnerOrder == nil || len(vv.WinnerOrder) == 0) { vv.WinnerOrder = winnerOrderEntity.Find(vv.Text, true, 2, isSite, codeSite) } } } return } //多包处理,处理不同情况下的分包 func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int, isSite bool, codeSite string) (isGoonNext bool) { if len(index) == 1 { //是一个的情况 if len(tn.SortKV.Keys) < 10 && tn.ColNum < 10 && tn.RowNum < 4 { //table带排序的KV值小于10并且小于10列和小于4行 beq := true for _, v2 := range tn.SortKV.Keys { if _, ok := tn.SortKV.Map[v2].(string); !ok { beq = false break } } if beq { //统一处理为数组 td := tn.GetTdByRCNo(tn.RowNum-1, 0) if !td.BH && FindVal2_1.MatchString(td.Val) { for _, v2 := range tn.SortKV.Keys { tn.SortKV.AddKey(v2, []string{tn.SortKV.Map[v2].(string)}) } } else { //没有处理成数组的情况下,继续调用正文查找分包的方法 isGoonNext = true } } } } for _, k1 := range tn.SortKV.Keys { v1 := tn.SortKV.Map[k1] var v1_arr []string if vtmpv1, ok := v1.(string); ok { v1_arr = PreCon4.FindAllString(qutil.ObjToString(vtmpv1), -1) if len(v1_arr) > 0 { if dw := Precon4dw.FindString(vtmpv1); dw != "" { for i, v := range v1_arr { v1_arr[i] = v + dw } } } } else if vtmpv1s, ok := v1.([]string); ok { v1_arr = vtmpv1s } if len(v1_arr) 
> 0 && len(v1_arr) <= len(index) { //table.SortKV.Map.value数组小于等于分包index for k, v := range v1_arr { tn.assemblePackage(k1, v, index[k], isSite, codeSite) //组装解析到的分包 } } } return isGoonNext } //没有处理成数组的情况下,继续调用正文查找分包的方法 func (tn *Table) isGoonNext(isSite bool, codeSite string) { blockPackage := map[string]*u.BlockPackage{} for _, k := range tn.SortKV.Keys { if excludeKey.MatchString(k) || strings.Contains(k, "批复") || excludeKey3.MatchString(k) { continue } str := "" //拼装为冒号kv v := tn.SortKV.Map[k] nk := regReplAllSpace.ReplaceAllString(k, "") if vs, ok := v.([]string); ok { str += fmt.Sprintf("%s:%s\n", nk, strings.Join(vs, " ")) } else { str += fmt.Sprintf("%s:%s\n", nk, v) } if excludeKey2.MatchString(str) { continue } b, _ := divisionPackageChild(&blockPackage, str, tn.Tag, false, false, isSite, codeSite) //分块之后分包 if b && len(blockPackage) > 0 { tn.BPackage = true for mk, mv := range blockPackage { if tn.BlockPackage.Map[mk] == nil { tn.BlockPackage.AddKey(mk, mv) } else { bp := tn.BlockPackage.Map[mk].(*u.BlockPackage) if bp.TableKV == nil { bp.TableKV = u.NewJobKv() } if bp.SpaceKV == nil { bp.SpaceKV = u.NewJobKv() } for k2, v2 := range mv.ColonKV.KvTags { for _, v2v := range v2 { isExists := false for _, v2vv := range bp.TableKV.KvTags[k2] { if v2v.Value == v2vv.Value { isExists = true break } } if !isExists { bp.TableKV.KvTags[k2] = append(bp.TableKV.KvTags[k2], v2v) bp.Text += fmt.Sprintf("%v:%v\n", k2, v2) } } } for k2, v2 := range mv.SpaceKV.KvTags { for _, v2v := range v2 { isExists := false for _, v2vv := range bp.SpaceKV.KvTags[k2] { if v2v.Value == v2vv.Value { isExists = true break } } if !isExists { bp.SpaceKV.KvTags[k2] = append(bp.SpaceKV.KvTags[k2], v2v) bp.Text += fmt.Sprintf("%v:%v\n", k2, v2) } } } } } tn.BPackage = true tn.SortKV.RemoveKey(k) } } } //根据table.SortKV的key判断是否分包,如果没有再根据value判断 func foundPacBySortKV(tn *Table, val int, index []string, index_pos []int, keyExistsCount *map[string]int, commonKeyVals *map[string][]string, key_index 
int, hasPkgTd map[string]bool) (rval int, rindex []string, rindex_pos []int) { keyIsPkg := false for in, k := range tn.SortKV.Keys { if excludeKey.MatchString(BracketsTextReg.ReplaceAllString(k, "")) || excludeKey3.MatchString(k) || regFJWarap.MatchString(k) || regAZWarap.MatchString(k) { //判断分包前排除 continue } v := tn.SortKV.Map[k] //key是分包的情况 if ismatch := FindVal_1.MatchString(k); keyIsPkg || ismatch { if ismatch { keyIsPkg = true val += 4 pkgFlag := FindVal_1.FindString(k) //对值进行分包判断 k = strings.Replace(k, pkgFlag, "", -1) index = append(index, pkgFlag) index_pos = append(index_pos, len(index)) val += 1 //pac++ } else { k = strings.TrimRight(k, "_") } (*keyExistsCount)[k] = (*keyExistsCount)[k] + 1 (*commonKeyVals)[k] = append((*commonKeyVals)[k], qutil.ObjToString(v)) } else if k1 := FilterKey_2.ReplaceAllString(k, ""); FindKey_2.MatchString(k1) { val += 4 //value数组分包 if vs, bvs1 := v.([]string); bvs1 { L: for in2, v1 := range vs { if len([]rune(v1)) < 20 && !moneyNum.MatchString(v1) && FindVal2_1.MatchString(v1) { for _, serial := range tn.TableResult.RuleBlock.TitleRegs { if serial.MatchString(v1) { break L } } if key_index == -1 { key_index = in } else if key_index != in { break } index = append(index, v1) index_pos = append(index_pos, in2) val += 1 //pac++ } } } else if v1, ok := v.(string); ok && !hasPkgTd[k] { //value字符串分包 v1 = replPkgConfusion(v1) //替换分包中混淆的词 for _, v2 := range strings.Split(v1, "/") { if len([]rune(v2)) < 20 && !moneyNum.MatchString(v2) && FindVal2_1.MatchString(v2) { key_index = in index = append(index, v1) index_pos = append(index_pos, 0) val += 1 //pac++ underline := "" for { underline += "_" if tn.SortKV.Map[k+underline] == nil { break } else if v3, v2_ok := tn.SortKV.Map[k+underline].(string); v2_ok && v3 != "" { index = append(index, v3) index_pos = append(index_pos, 1) } else if v3, v2_ok := tn.SortKV.Map[k+underline].([]string); v2_ok { for v2_k, v2_v := range v3 { index = append(index, v2_v) index_pos = append(index_pos, v2_k+1) 
} } } break } } } if k1=="标段" && len(index)==0 { continue }else { break } } } return val, index, index_pos } //初始化CheckMultiPackageByTable方法需要的数据 func initCheckMultiPackageByTable(tn *Table, key_index int, index []string, index_pos []int, val int, pac int, hasPkgTd map[string]bool) (rkey_index int, rindex []string, rindex_pos []int, rval int, rpac int, rhasPkgTd map[string]bool) { for in, k := range tn.SortKV.Keys { //涉及包号|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号)就跳过 if excludeKey.MatchString(BracketsTextReg.ReplaceAllString(k, "")) || excludeKey3.MatchString(k) || strings.Contains(k, "批复") { continue } v := tn.SortKV.Map[k] if vs, bvs := v.([]string); bvs { //arr_count++ haspkgs := []string{} for in2, v1 := range vs { v1 = replPkgConfusion(v1) //替换分包中混淆的词 if len([]rune(v1)) < 8 && !moneyNum.MatchString(v1) && FindVal_1.MatchString(v1) { if key_index == -1 { key_index = in } else if key_index != in { break } index = append(index, FindVal_1.FindString(v1)) index_pos = append(index_pos, in2) val += 1 pac++ } else if FindKey_3.MatchString(k) { //5db2a101a5cb26b9b73054ac index = append(index, v1) index_pos = append(index_pos, in2) val += 1 pac++ } else { if ok, v1new := isHasOnePkgAndNoKv(v1); ok { //td的值里面有一个包,并且没有冒号kv haspkgs = append(haspkgs, v1new) } } } /*处理这种情况: 包一:xxxxxxxxx 包二:xxxxxxxxx */ if len(index) == 0 && len(haspkgs) > 0 && len(haspkgs) == len(vs) { for in2, v1 := range haspkgs { if key_index == -1 { key_index = in } else if key_index != in { break } index = append(index, v1) index_pos = append(index_pos, in2) val += 1 pac++ } } } else if v1, ok := v.(string); ok { v1 = replPkgConfusion(v1) //替换分包中混淆的词 if len([]rune(v1)) < 8 && !moneyNum.MatchString(v1) && FindVal_1.MatchString(v1) { key_index = in index = append(index, FindVal_1.FindString(v1)) index_pos = append(index_pos, 0) val += 1 pac++ } else if getTd := tn.GetTdByRCNo(0, tn.SortKV.Index[k]); getTd != nil && getTd.KVDirect == 2 { //纵向 /*处理这种情况: 包一:xxxxxxxxx */ if ok, v1new := 
isHasOnePkgAndNoKv(v1); ok { hasPkgTd[k] = true key_index = in index = append(index, v1new) index_pos = append(index_pos, 0) val += 1 pac++ } } } } return key_index, index, index_pos, val, pac, hasPkgTd } //组装解析到的分包,//key如果匹配到抽取关键词就添加到table.SortKV func (tn *Table) assemblePackage(k1, v1, key string, isSite bool, codeSite string) { bp := tn.BlockPackage.Map[key].(*u.BlockPackage) if bp.TableKV == nil { bp.TableKV = u.NewJobKv() } if v1 != "" { kvTags, _ := CommonDataAnaly(k1, "中标情况", "", v1, isSite, codeSite) //匹配抽取关键词 for k3, v3 := range kvTags { if bp.TableKV.KvTags[k3] == nil { bp.TableKV.KvTags[k3] = append(bp.TableKV.KvTags[k3], v3...) } else if k3 == "预算" && bp.Budget <= 0 { moneys := clear.ObjToMoney([]interface{}{v3[0].Value, ""}) if len(moneys) > 0 { if vf, ok := moneys[0].(float64); ok { bp.Budget = vf bp.IsTrueBudget = moneys[len(moneys)-1].(bool) } else if vi, ok := moneys[0].(int); ok { bp.Budget = float64(vi) bp.IsTrueBudget = moneys[len(moneys)-1].(bool) } } } else if k3 == "中标金额" && bp.Bidamount <= 0 { moneys := clear.ObjToMoney([]interface{}{v3[0].Value, ""}) if len(moneys) > 0 { if vf, ok := moneys[0].(float64); ok { bp.Bidamount = vf bp.IsTrueBidamount = moneys[len(moneys)-1].(bool) } else if vi, ok := moneys[0].(int); ok { bp.Bidamount = float64(vi) bp.IsTrueBidamount = moneys[len(moneys)-1].(bool) } } } } } k1 = regReplAllSpace.ReplaceAllString(k1, "") //拼接内容 if !excludeKey.MatchString(k1) { bp.Text += fmt.Sprintf("%v:%v\n", k1, v1) } tn.BlockPackage.AddKey(key, bp) } /** 之前爬虫过来的数据对table表格的抓取异常问题 查找并修正不规则表格的字符串,只对全文做处理,块内的表格不需要修正 **/ var thbf = regexp.MustCompile("(?i)") //需要保留thead var saveThead = regexp.MustCompile("(?is)(.+?)") var clearpkg = regexp.MustCompile("(标示|标识)") func RepairCon(con string) string { con = clearpkg.ReplaceAllString(con, "") res := saveThead.FindAllStringSubmatch(con, 1) th := "" if len(res) == 1 && len(res[0]) == 2 { th = u.TrimLeftSpace(res[0][1], "") } con = thbf.ReplaceAllString(con, "") con = u.TrimLeftSpace(con, 
"") itbody := strings.Index(con, " itbody { con = findpos(con, iLen, itbody) } } //保留第一个thead if th != "" { con = strings.Replace(con, th, ""+th+"", 1) } //u.Debug(con) return con } //修复表格 func findpos(con string, iLen, start int) (newcon string) { defer qutil.Catch() n := len(con) layer := 0 pos := 0 if start >= 0 { if iLen == 6 { for i := iLen + start; i < len(con); i++ { if con[i] == '<' && i+6 < n { str := con[i : i+6] if str == " 3 { pos = lasttr + 5 } else if pos > 0 { pos += 5 } if pos <= n && pos < len(con) && start < pos { newcon = con[:start] + "" + con[start:pos] + "
" + con[pos:] } } } if newcon == "" { newcon = con } return } //td的值里面有一个包,并且没有冒号kv func isHasOnePkgAndNoKv(v1 string) (bool, string) { v1s := FindVal_1.FindAllString(v1, -1) colonCount := len(regDivision.FindAllString(v1, -1)) if len(v1s) == 1 && colonCount < 2 { ispkgcolon := regexp.MustCompile(v1s[0] + "[::]").MatchString(v1) if (ispkgcolon && colonCount == 1) || (!ispkgcolon && colonCount == 0) { return true, v1s[0] } } return false, v1 } //替换分包中混淆的词 func replPkgConfusion(v1 string) string { v1 = PreReg.ReplaceAllString(v1, "") v1 = PreReg1.ReplaceAllString(v1, "") v1 = PreCon.ReplaceAllString(v1, "") v1 = PreCon2.ReplaceAllString(v1, "") return v1 } //对td中的值,进行再处理 func (tn *Table) TdContactFormat(contactFormat *u.ContactFormat, isSite bool, codeSite string) { //处理表格中的联系人信息 indexMap := contactFormat.IndexMap matchMap := contactFormat.MatchMap //qutil.Debug("==============================td=======================", indexMap, matchMap) weightMap := map[string]map[string]interface{}{} //权重 mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上 reCreate := false matchCount := 0 contactTypeTagMap := map[string]map[string][]interface{}{} //qutil.Debug("============================", mustMatchFirst, indexMap, matchMap) notMatchTrCount := 0 allAscFind := true //开启正序查询 //涉及变量allAscFind,indexMap if len(indexMap) == 0 { isCanAddToIndexMap := false matchPrevFlag := false prevCanAddToIndexMap := false LS: for _, tr := range tn.TRs { for td_index, td := range tr.TDs { thisTdKvs := tn.tdkv(td) //获取td冒号kv //qutil.Debug(td.Val, len(thisTdKvs)) // for _, v := range thisTdKvs { // qutil.Debug(v.Key, v.Value) // } if len(thisTdKvs) != 1 { preTdIndex := td_index - 1 if preTdIndex >= 0 { preTdVal := tr.TDs[td_index-1].Val tdType := "" //前一个td中是否是采购、代理、中标 for k, v := range ContactType { if v.MatchString(preTdVal) { tdType = k break } } if tdType != "" { for _, this := range thisTdKvs { if str := ContactInfoVagueReg.FindString(this.Key); str != "" { td.SortKV.AddKey(tdType+str, this.Value) } 
} } } continue } //1.处理带括号的()[]【】采购单位,代理机构;2.识别采购单位联系人、联系电话、代理机构联系人、联系电话 goOnFunc, isContinue, td_k := tn.tdKV(thisTdKvs[0].Key, &matchPrevFlag, &isCanAddToIndexMap, &indexMap, "LS") //qutil.Debug("goOnFunc---", goOnFunc, "isContinue---", isContinue, "indexMap---", indexMap, "isCanAddToIndexMap---", isCanAddToIndexMap) if !goOnFunc { break LS } if isContinue { continue } //采购单位,代理机构,中标单位 //qutil.Debug("td_k---", td_k, HasOrderContactType(td_k)) for _, k := range HasOrderContactType(td_k) { if !ContactType[k].MatchString(td_k) { //不是采购单位,代理机构,中标单位跳过 continue } if len(indexMap) == 0 { //qutil.Debug("isCanAddToIndexMap---", isCanAddToIndexMap, "prevCanAddToIndexMap---", prevCanAddToIndexMap, len(tr.TDs)) if isCanAddToIndexMap || (prevCanAddToIndexMap && len(tr.TDs) == 1) { myPrevTdVal := "" if td_index-2 >= 0 { myPrevTdVal = tr.TDs[td_index-2].Val if myPrevTdVal != "" && len([]rune(myPrevTdVal)) < 10 && ContactInfoMustReg.MatchString(myPrevTdVal) { matchPrevFlag = true } } indexMap[0] = k break } } else { indexMap = map[int]string{} break LS } } } prevCanAddToIndexMap = isCanAddToIndexMap isCanAddToIndexMap = false } if len(indexMap) > 0 { allAscFind = false } } ////// L: for tr_index, tr := range tn.TRs { thisTrHasMatch := false jumpNextTd := false for td_index, td := range tr.TDs { //和|以?及|与|、多个词和在一起 jumpNextTd, thisTrHasMatch = tn.tdsMultipleWords(jumpNextTd, td, td_index, tr, thisTrHasMatch, indexMap) //分块之后的kv thisTdKvs := kvAfterDivideBlock("", td.Text, 3, tn.TableResult.RuleBlock, isSite, codeSite) if len(thisTdKvs) == 0 { thisTdKvs = tn.tdkv(td) //获取冒号kv } tdAscFind := true //开启td正序查询 //qutil.Debug("---", td.Val, len(thisTdKvs), len(indexMap)) if len(thisTdKvs) == 0 { continue } else if allAscFind && len(thisTdKvs) >= 3 && len(indexMap) == 0 { //采购人在联系人、电话后面的处理 tdAscFind = tn.hasIndexMap(thisTdKvs, &indexMap, tdAscFind) } //qutil.Debug(len(thisTdKvs), len(tr.TDs)) // if len(thisTdKvs) >= 2 && len(tr.TDs) == 1 { //td中包含多个kv值 5d6b2aa2a5cb26b9b73e79d2 // 
tmpIndexMap := map[int]string{} // start := 0 // for _, td_kv := range thisTdKvs { // qutil.Debug(td_kv.Key) // for _, k := range HasOrderContactType(td_kv.Key) { // tmpIndexMap[start] = k // start++ // } // } // indexMap = tmpIndexMap // } prevKey := "" oldIndexMapLength := len(indexMap) thidTdIndex := td_index //notmatchCount := 0 kvTitle := "" //qutil.Debug("indexMap++++++++++++++++++", indexMap, "len(thisTdKvs)", len(thisTdKvs), oldIndexMapLength) if len(thisTdKvs) >= 2 { //td中有多个kv重置indexMap indexMap = map[int]string{} allAscFind = true } for _, td_kv := range thisTdKvs { iscontinue := false td_v := td_kv.Value td_k := FilterContactKey(td_kv.Key) //带括号()[]的采购单位,代理机构处理 td_k_length := len([]rune(td_k)) if td_k_length < 2 || td_k_length > 15 { continue } //都为正序查询 //qutil.Debug("td_k+++", td_k, "td_v+++", td_v, "allAscFind+++", allAscFind, "tdAscFind+++", tdAscFind) if allAscFind && tdAscFind { //都为正序查询处理 matchCount, weightMap, matchMap, thisTrHasMatch, indexMap, iscontinue, reCreate, thidTdIndex = tn.asdFind(td_k, matchCount, weightMap, matchMap, td, thisTrHasMatch, td_kv, indexMap, iscontinue, reCreate, thidTdIndex, isSite, codeSite) } //qutil.Debug("indexMap++++++", indexMap, len(indexMap), "iscontinue+++", iscontinue) if iscontinue { continue } //不在同一块中 //qutil.Debug(td_kv.Title, kvTitle, !ContactInfoMustReg.MatchString(td_kv.Key)) if td_kv.Title != "" && kvTitle != td_kv.Title && len(indexMap) > 0 && !ContactInfoMustReg.MatchString(td_kv.Key) { thidTdIndex = 0 matchMap = map[string]map[string]bool{} indexMap = map[int]string{} } kvTitle = td_kv.Title //qutil.Debug(td_k_length, td_k_length, len(indexMap)) if td_k_length < 2 || td_k_length > 10 { continue } if len(indexMap) > 0 { //没有识别到采购单位联系人、联系电话、代理机构联系人、联系电话 if !ContactInfoMustReg.MatchString(td_k) { //notmatchCount++ //if notmatchCount < len(indexMap)*2 && false {//false??????? 
// notmatchCount = 0 // thidTdIndex = 0 // indexMap = map[int]string{} // matchMap = map[string]map[string]bool{} //} if mustMatchFirst { //indexMap初始值大于0 break L } continue } reCreate = true index := td_index //oldIndexMapLength原来的indexMap等于0 ,现在的indexMap大于1 if oldIndexMapLength == 0 && len(indexMap) > 1 { if prevKey != td_k { prevKey = td_k index = td_index } else if prevKey == td_k { index++ } } //kv.value为空 if filterValue.MatchString(td_v) { thisTrHasMatch = true continue } //u.Debug(indexMap, td_k, td_v, matchMap, index, modle) //myContactType myContactType := indexMap[index] //qutil.Debug(indexMap, index, myContactType) if myContactType == "" && len(indexMap) == 1 { _, onlyContactType := u.FirstKeyValueInMap(indexMap) myContactType, _ = onlyContactType.(string) } //qutil.Debug("indexMap+++", indexMap, "index+++", index, "myContactType+++", myContactType) if myContactType == "" { continue } matchCount++ if matchMap[myContactType] == nil { matchMap[myContactType] = map[string]bool{} } if IsContactKvHandle(ContactInfoMustReg.FindString(td_k), matchMap[myContactType]) { continue } matchMap[myContactType][ContactInfoMustReg.FindString(td_k)] = true if ContactType[myContactType].MatchString(td_k) { continue } thisTrHasMatch = true //modle modle(thisTdKvs, td, myContactType, td_k, td_v, &contactTypeTagMap, tn, &weightMap, tr_index, td_index, isSite, codeSite) } } //qutil.Debug("map===", td.SortKV.Map) } if allAscFind && !thisTrHasMatch { notMatchTrCount++ if notMatchTrCount >= 2 { notMatchTrCount = 0 indexMap = map[int]string{} } } } //u.Debug("end", matchCount, indexMap, matchMap) if matchCount == 0 { indexMap = map[int]string{} matchMap = map[string]map[string]bool{} } (*contactFormat).IndexMap = indexMap (*contactFormat).MatchMap = matchMap // for _, tr := range tn.TRs { // for _, td := range tr.TDs { // qutil.Debug("td.sort.map---", td.SortKV.Map) // } // } } //modle func modle(thisTdKvs []*u.Kv, td *TD, myContactType, td_k, td_v string, contactTypeTagMap 
*map[string]map[string][]interface{}, tn *Table, weightMap *map[string]map[string]interface{}, tr_index, td_index int, isSite bool, codeSite string) { modle := 0 if len(thisTdKvs) == 1 { if regReplAllSpace.ReplaceAllString(thisTdKvs[0].Value, "") == "" { modle = 1 } else { modle = 2 } } if modle == 1 { td.Text = myContactType + td_k td.Val = td.Text } else { // if !strings.HasSuffix(td_k, "方式") { kvTags := GetKvTags([]*u.Kv{&u.Kv{Key: myContactType + td_k, Value: td_v}}, "", BuyerContacts, isSite, codeSite) if len(kvTags) == 1 { tagVal, _ := u.FirstKeyValueInMap(kvTags) if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(td_v) { return } if (*contactTypeTagMap)[myContactType] == nil { (*contactTypeTagMap)[myContactType] = map[string][]interface{}{} } myOldKeyArray := (*contactTypeTagMap)[myContactType][tagVal] if myOldKeyArray != nil { tn.TRs[myOldKeyArray[0].(int)].TDs[myOldKeyArray[1].(int)].SortKV.RemoveKey(myContactType + myOldKeyArray[2].(string)) } else { (*contactTypeTagMap)[myContactType][tagVal] = make([]interface{}, 3) } if (*weightMap)[myContactType] == nil { (*weightMap)[myContactType] = map[string]interface{}{} } (*weightMap)[myContactType][tagVal] = 1 (*contactTypeTagMap)[myContactType][tagVal] = []interface{}{tr_index, td_index, td_k} } } td.SortKV.AddKey(myContactType+td_k, td_v) } } //都为正序查询 func (tn *Table) asdFind(td_k string, matchCount int, weightMap map[string]map[string]interface{}, matchMap map[string]map[string]bool, td *TD, thisTrHasMatch bool, td_kv *u.Kv, indexMap map[int]string, iscontinue bool, reCreate bool, thidTdIndex int, isSite bool, codeSite string) (int, map[string]map[string]interface{}, map[string]map[string]bool, bool, map[int]string, bool, bool, int) { for _, k := range HasOrderContactType(td_k) { //采购单位,代理机构,中标单位 if !ContactType[k].MatchString(td_k) { //没有匹配到采购单位,代理机构,中标单位 continue } matchCount++ if weightMap[k] == nil { weightMap[k] = map[string]interface{}{} } //匹配到进行处理 if 
ContactInfoVagueReg.MatchString(td_k) { thisTrHasMatch = tn.matchContactType(&matchMap, k, td_k, td_kv.Value, td, &weightMap, thisTrHasMatch, isSite, codeSite) } else if k == "采购单位" { //打标签,权重高的重新覆盖 kvTags := GetKvTags([]*u.Kv{td_kv}, "", []string{"采购单位"}, isSite, codeSite) tagVal, weightVal := u.FirstKeyValueInMap(kvTags) if tagVal == k { if weightMap[k][k] == nil || (weightVal != nil && weightVal.(int) >= weightMap[k][k].(int)) || len(matchMap[k]) == 0 { weightMap[k][k] = weightVal.(int) matchMap[k] = map[string]bool{} indexMap = map[int]string{} } } } if u.IsMapHasValue(k, indexMap) { //map中是否存在value thisTrHasMatch = true iscontinue = true continue } if reCreate { indexMap = map[int]string{} reCreate = false } indexMap[thidTdIndex] = k iscontinue = true thisTrHasMatch = true thidTdIndex++ break } if len(indexMap) == 0 && td_kv.PrevLine != "" { //td_kv.PrevLine prevLine := FilterSerial.ReplaceAllString(td_kv.PrevLine, "") for k, v := range ContactType { //采购单位,代理机构正则 if u.IsArrayHasValue(prevLine, v.FindAllString(prevLine, -1)) { indexMap[thidTdIndex] = k thisTrHasMatch = true thidTdIndex++ } } } if len(indexMap) == 0 && td_kv.Title != "" { //td_kv.Title if titleMatchType := ContactTypeTitleMatch(td_kv.Title); titleMatchType != "" { thidTdIndex = 0 matchMap = map[string]map[string]bool{} indexMap = map[int]string{1: titleMatchType} // for i, t := range titleMatchType { // indexMap[i+1] = t // } } } return matchCount, weightMap, matchMap, thisTrHasMatch, indexMap, iscontinue, reCreate, thidTdIndex } //匹配到进行处理 func (tn *Table) matchContactType(matchMap *map[string]map[string]bool, k string, td_k string, td_v string, td *TD, weightMap *map[string]map[string]interface{}, thisTrHasMatch bool, isSite bool, codeSite string) bool { if (*matchMap)[k] == nil { (*matchMap)[k] = map[string]bool{} } isAddToMatchMap := true if !strings.HasSuffix(td_k, "方式") { kvTags := GetKvTags([]*u.Kv{&u.Kv{Key: td_k, Value: td_v}}, "", BuyerContacts, isSite, codeSite) if len(kvTags) == 1 { 
tagVal, weightVal := u.FirstKeyValueInMap(kvTags) if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(td_v) { isAddToMatchMap = false } if td.SortKV.Map[tagVal] != nil { if (*weightMap)[k][tagVal] == nil || (weightVal != nil && weightVal.(int) >= (*weightMap)[k][tagVal].(int)) { (*weightMap)[k][tagVal] = weightVal.(int) td.SortKV.AddKey(tagVal, td_v) thisTrHasMatch = true } } else { (*weightMap)[k][tagVal] = weightVal.(int) } } } if isAddToMatchMap && !filterValue.MatchString(td_v) && td_v != "" { (*matchMap)[k][ContactInfoVagueReg.FindString(td_k)] = true } return thisTrHasMatch } //采购人在联系人、电话后面的处理 func (tn *Table) hasIndexMap(thisTdKvs []*u.Kv, indexMap *map[int]string, tdAscFind bool) bool { //采购人在联系人、电话后面的处理 isCanAddToIndexMap := false LL: for _, td_kv := range thisTdKvs { //1.处理带括号的()[]【】采购单位,代理机构;2.识别采购单位联系人、联系电话、代理机构联系人、联系电话 goOnFunc, isContinue, td_k := tn.tdKV(td_kv.Key, nil, &isCanAddToIndexMap, indexMap, "LL") if !goOnFunc { break LL } if isContinue { continue } if len(*indexMap) == 0 { for _, k := range HasOrderContactType(td_k) { //采购单位,代理机构 if !ContactType[k].MatchString(td_k) { continue } if isCanAddToIndexMap && len(*indexMap) == 0 { (*indexMap)[0] = k break } } } } if len(*indexMap) > 0 { tdAscFind = false } return tdAscFind } //和|以?及|与|、多个词和在一起 func (tn *Table) tdsMultipleWords(jumpNextTd bool, td *TD, td_index int, tr *TR, thisTrHasMatch bool, indexMap map[int]string) (jump, thisTr bool) { if !jumpNextTd && len([]rune(td.Text)) >= 5 && len([]rune(td.Text)) <= 15 && regSplit.MatchString(td.Text) && td_index+1 < len(tr.TDs) { thisTdVals := regSplit.Split(td.Text, -1) nextTdVals := MultipleValueSplitReg.Split(tr.TDs[td_index+1].Val, -1) if len(thisTdVals) == len(nextTdVals) { //本次和下个长度相等 for _, k := range HasOrderContactType(td.Text) { //采购单位,代理机构 if ContactType[k].MatchString(td.Text) { //采购单位,代理机构 for thisTdVals_k, thisTdVals_v := range thisTdVals { thisTdVals_v = strings.TrimSpace(thisTdVals_v) if 
ContactType[k].MatchString(thisTdVals_v) { //采购单位,代理机构 thisTrHasMatch = true tr.TDs[td_index+1].SortKV.AddKey(thisTdVals_v, nextTdVals[thisTdVals_k]) continue } if !ContactInfoMustReg.MatchString(thisTdVals_v) { continue } jumpNextTd = true thisTrHasMatch = true tr.TDs[td_index+1].SortKV.AddKey(k+thisTdVals_v, nextTdVals[thisTdVals_k]) } break } } if len(indexMap) > 0 { _, onlyContactType := u.FirstKeyValueInMap(indexMap) if myContactType, _ := onlyContactType.(string); myContactType != "" { for thisTdVals_k, thisTdVals_v := range thisTdVals { thisTdVals_v = strings.TrimSpace(thisTdVals_v) if ContactInfoMustReg.MatchString(thisTdVals_v) { jumpNextTd = true thisTrHasMatch = true tr.TDs[td_index+1].SortKV.AddKey(myContactType+thisTdVals_v, nextTdVals[thisTdVals_k]) } } } } } } else { jumpNextTd = false } return jumpNextTd, thisTrHasMatch } //采购单位,代理机构 func (tn *Table) tdHasOrderContactType(td_k string, indexMap *map[int]string, tr *TR, prevCanAddToIndexMap, isCanAddToIndexMap, matchPrevFlag *bool, td_index int) (gotoFunc bool) { for _, k := range HasOrderContactType(td_k) { //采购单位,代理机构 if !ContactType[k].MatchString(td_k) { continue } if len(*indexMap) == 0 { if (*isCanAddToIndexMap) || (*prevCanAddToIndexMap && len(tr.TDs) == 1) { myPrevTdVal := "" if td_index-2 >= 0 { myPrevTdVal = tr.TDs[td_index-2].Val } if myPrevTdVal != "" && len([]rune(myPrevTdVal)) < 10 && ContactInfoMustReg.MatchString(myPrevTdVal) { (*matchPrevFlag) = true } (*indexMap)[0] = k break } } else { (*indexMap) = map[int]string{} return false } } return true } //1.处理带括号的()[]【】采购单位,代理机构;2.识别采购单位联系人、联系电话、代理机构联系人、联系电话 func (tn *Table) tdKV(key string, matchPrevFlag, isCanAddToIndexMap *bool, indexMap *map[int]string, gotoName string) (goOnFunc, isContinue bool, td_k string) { td_k = FilterContactKey(key) //带括号的()[]【】采购单位,代理机构处理 td_k_length := len([]rune(td_k)) if td_k_length < 2 || td_k_length > 15 { return true, true, "" //继续执行,跳过当前循环 } isContinue = ContactInfoMustReg.MatchString(td_k) 
//识别采购单位联系人、联系电话、代理机构联系人、联系电话 if isContinue || (ContactInfoVagueReg.MatchString(td_k) && u.IsMapHasValue(td_k, ContactType)) { if gotoName == "LS" && !(*matchPrevFlag) && len(*indexMap) > 0 { (*indexMap) = map[int]string{} return false, false, "" //中断外层循环 } if gotoName == "LL" && len(*indexMap) > 0 { (*indexMap) = map[int]string{} return false, false, "" } (*isCanAddToIndexMap) = true } return true, false, td_k //继续执行,不结束当前循环,返回处理后的值 } //获取td冒号kv func (tn *Table) tdkv(td *TD) []*u.Kv { thisTdKvs := colonkvEntity.GetKvs(td.Text, tn.Desc, 2) //获取冒号kv //获取冒号 if len(thisTdKvs) == 0 { tdValue := regReplAllSpace.ReplaceAllString(td.Text, "") //去除空格换行 if tdValue != "" && len([]rune(tdValue)) < 10 { thisTdKvs = append(thisTdKvs, &u.Kv{ Key: tdValue, Value: "", }) } } return thisTdKvs } func (table *Table) extractPriceNumber() { lineMapArr := make(map[string]*SortMap) lineMap := make(map[string]*SortMap) lineMapArr, lineMap = initLineMapLineMapArr(table) //不同数据类型的数据组合 //qutil.Debug("lineMapArr----", lineMapArr) if len(lineMapArr) > 0 { for _, arrMap := range lineMapArr { resultArrMap := table.matchMapArrPrinceNumber(arrMap) //最终数据 //qutil.Debug("resultArrMap-------------------", resultArrMap) //处理数组长度不一致情况 if len(resultArrMap) > 0 { numLen := len(resultArrMap["number"]) priceLen := len(resultArrMap["price"]) itemLen := len(resultArrMap["item"]) maxNum := numLen //获取最大长度 if numLen == 0 { //没有 maxNum = priceLen } //取个数数据的长度为基准(数据长度可能不一致) if numLen != priceLen && numLen > 0 && priceLen > 0 { //有number和price数据且长度不同,进行数据增减补齐 if priceLen > numLen { //price多,删 tmpArr := resultArrMap["price"] resultArrMap["price"] = tmpArr[:numLen] } else if priceLen < numLen { //price少,补空 for { resultArrMap["price"] = append(resultArrMap["price"], "") //qutil.Debug("=============price==============") if len(resultArrMap["price"]) == numLen { break } } } } if maxNum > 0 && itemLen > 0 && maxNum != itemLen { //有price或者number,item长度保持一致 if itemLen > maxNum { tmpArr := resultArrMap["item"] 
resultArrMap["item"] = tmpArr[:maxNum] } else if itemLen < maxNum { for { resultArrMap["item"] = append(resultArrMap["item"], "") //qutil.Debug("=============item==============") if len(resultArrMap["item"]) == maxNum { break } } } } //封装成一一对应数据 /* { "price" :["123","125"], "number" :["1","12"] } 转换为: [ {"price":"123","number":"1"}, {"price":"125","number":"12"} ] */ finishData := []map[string]interface{}{} //qutil.Debug("maxNum--------------------", maxNum) for t := 0; t < maxNum; t++ { tmpMap := make(map[string]interface{}) if len(resultArrMap["number"]) > 0 { number := resultArrMap["number"][t] tmpMap["number"] = number } if len(resultArrMap["price"]) > 0 { price := resultArrMap["price"][t] tmpMap["price"] = price } if len(resultArrMap["item"]) > 0 { item := resultArrMap["item"][t] runeItem := []rune(qutil.ObjToString(item)) if len(runeItem) > 50 { tmpMap["item"] = string(runeItem[:50]) } else { tmpMap["item"] = item } } finishData = append(finishData, tmpMap) } //qutil.Debug(finishData) if len(finishData) > 0 { table.PriceNumberData = append(table.PriceNumberData, finishData) } } } } if len(lineMap) > 0 { for _, strMap := range lineMap { resultStrMap := table.matchMapPrinceNumber(strMap) //qutil.Debug("resultStrMap---", resultStrMap) if len(resultStrMap) > 0 { if resultStrMap["price"] != nil || resultStrMap["number"] != nil { //有price或者number在保存 if item := qutil.ObjToString(resultStrMap["item"]); item != "" { //item过长截取 runeItem := []rune(item) if len(runeItem) > 50 { resultStrMap["item"] = string(runeItem[:50]) } } finishData := []map[string]interface{}{} finishData = append(finishData, resultStrMap) //qutil.Debug("finishData---", finishData) if len(finishData) > 0 { table.PriceNumberData = append(table.PriceNumberData, finishData) } } } } } //qutil.Debug("table.PriceNumberData---------", table.PriceNumberData) } //数组数据抽取price和number func (table *Table) matchMapArrPrinceNumber(arrMap *SortMap) map[string][]interface{} { result := make(map[string][]interface{}) 
//最终存储数据 for _, key := range arrMap.Keys { val := arrMap.Map[key].([]string) for f, reg := range u.PriceNumberReg { key = tabletitleclear2.ReplaceAllString(key, "") //过滤 if reg.MatchString(key) { //匹配成功 //qutil.Debug("arr----key", key, val, f) tmp := []interface{}{} if f == "price" { tmp = dealPriceInterface(key, val...) //处理金额,单位转换 } else if f == "number" { tmp = dealNumberInterface(val...) //处理数量 } else { for _, v := range val { tmp = append(tmp, v) } } if len(tmp) > 0 { result[f] = tmp } } } } return result } //字符串数据抽取price和number func (table *Table) matchMapPrinceNumber(strMap *SortMap) map[string]interface{} { result := make(map[string]interface{}) for _, key := range strMap.Keys { val := qutil.ObjToString(strMap.Map[key]) for f, reg := range u.PriceNumberReg { key = tabletitleclear2.ReplaceAllString(key, "") //过滤 if reg.MatchString(key) { //匹配成功 //qutil.Debug("str----key", key, val) if f == "price" { if len(regHz.FindAllString(val, -1)) > 5 { //price中汉字过多视为内容错误 continue } tmp := dealPriceInterface(key, val)[0] //处理金额,单位转换 result[f] = tmp } else if f == "number" { tmp := dealNumberInterface(val)[0] result[f] = tmp } else { result[f] = val } } } } return result } //table中抽取品牌,table.BrandData func (table *Table) analyBrand() { //5c2d8c05a5cb26b9b782572b //产品名称 品牌 规格 单价 单位 数量 小计 质保期 lineMapArr := make(map[string]*SortMap) lineMap := make(map[string]*SortMap) brandRule := u.BrandRules //初始化lineMapArr,lineMap; lineMapArr, lineMap = initLineMapLineMapArr(table) //处理数组数据后,匹配必须title和替换要保存的title //qutil.Debug("lineMapArr----", len(lineMapArr)) if len(lineMapArr) > 0 { for _, aMap := range lineMapArr { maxNum := 0 //记录最大长度 arrcount1 := 0 //记录key是否存在必须title(数组数据) arrcount2 := 0 ka := make(map[string][]string) //最终存储数据 //qutil.Debug("aMap.Keys----", aMap.Keys) //匹配商品规则 arrcount1, arrcount2, ka = table.matchMapArrBrandRule(aMap, brandRule, ka, arrcount1, arrcount2) //找最终存储数据的最小len(arr) // for _, vf := range ka { // //找最短的数组 // lenVal := len(vf) // if minNum == 0 || minNum 
> lenVal { //maxNum = len(最短数组) // minNum = lenVal // } // } //找最终存储数据的最大len(arr),小的补空 for _, vf1 := range ka { lenVal := len(vf1) if lenVal > maxNum { maxNum = lenVal } } //table.BrandData商品存储 finishKa := make(map[string][]string) for vf2K, vf2 := range ka { if len(vf2) < maxNum { if vf2K == "unitprice" { //价格的当前总数比最大的总数小就跳过,可能是总价格而不是单个的价格 continue } lenMv := maxNum - len(vf2) for i := 0; i < lenMv; i++ { vf2 = append(vf2, "") } } finishKa[vf2K] = vf2 } hasKey(table, arrcount1) //是否匹配到table中的标题 //qutil.Debug("finishKa----", finishKa) if arrcount1 >= 1 { if arrcount1+arrcount2 == 1 { //删除只匹配到一个价钱(总价) delete(finishKa, "unitprice") } finishData := dealArrData(maxNum, finishKa) table.BrandData = append(table.BrandData, finishData) //存储table.BrandData } } } //处理string数据后,匹配必须title和替换要保存的title //qutil.Debug("lineMap----", len(lineMap)) if len(lineMap) > 0 { for _, sMap := range lineMap { strcount1 := 0 //记录key是否存在必须title(字符串数据) strcount2 := 0 endStrMap := make(map[string]string) //qutil.Debug(k, "aMap.Keys----", sMap.Keys) //匹配商品规则 strcount1, strcount2, endStrMap = table.matchMapBrandRule(sMap, brandRule, endStrMap, strcount1, strcount2) //原始字符串数据处理 hasKey(table, strcount1) //是否匹配到table中的标题 //qutil.Debug("endStrMap----", endStrMap) if strcount1 >= 1 { if strcount1+strcount2 == 1 { //删除只匹配到一个价钱(总价) delete(endStrMap, "unitprice") } finishData := dealStrData(endStrMap) //处理数据 if len(finishData) > 0 { table.BrandData = append(table.BrandData, finishData) } } } } } //字符串匹配商品规则 func (table *Table) matchMapBrandRule(sMap *SortMap, brandRule map[string]map[string]string, endStrMap map[string]string, strcount1, strcount2 int) (int, int, map[string]string) { for _, k1 := range sMap.Keys { match := false //记录must是否匹配到 v1 := qutil.ObjToString(sMap.Map[k1]) // for k1, v1 := range sMap { //qutil.Debug(k1, "++++++++++", v1) if v1 == "" { continue } //匹配必须title for nameM, r := range brandRule["must"] { if convert(k1, r) { //匹配成功 v1tmp1 := v1 match = true if nameM == "itemname" || nameM 
== "modal" { //特殊处理itemname hasGoods(table, v1) if nameM == "itemname" { v1tmp1 = filterItem(v1)[0] //过滤itemname if v1tmp1 == "" { break } } } if nameM == "brandname" || nameM == "modal" { //特殊处理brandname if endStrMap["brandname"] == "" { brand, allNull := hasBrand(table, v1) if !allNull { endStrMap["brandname"] = brand[0] } } } //unitprice if nameM == "unitprice" { //处理金额 v1tmp1 = dealPrice(k1, v1)[0] } if nameM != "brandname" && endStrMap[nameM] == "" { endStrMap[nameM] = v1tmp1 } strcount1++ } } //替换其它要保存字段 if !match { for nameR, r := range brandRule["replace"] { if convert(k1, r) { //匹配成功 v1tmp2 := v1 //totalprice if nameR == "totalprice" { //处理金额 v1tmp2 = dealPrice(k1, v1)[0] } //number if nameR == "number" { //处理数量 varr1, uname1 := dealNumber(v1) v1tmp2 = varr1[0] //从number中获取到的单位 if endStrMap["unitname"] == "" && uname1[0] != "" { endStrMap["unitname"] = uname1[0] } } if v1tmp2 != "" { endStrMap[nameR] = v1tmp2 } strcount2++ } } } //} } return strcount1, strcount2, endStrMap } //数组匹配商品规则 func (table *Table) matchMapArrBrandRule(aMap *SortMap, brandRule map[string]map[string]string, ka map[string][]string, arrcount1, arrcount2 int) (int, int, map[string][]string) { for _, k0 := range aMap.Keys { match := false //记录must是否匹配到 v0 := aMap.Map[k0].([]string) //匹配必须title for nameM, r := range brandRule["must"] { if convert(k0, r) { //匹配成功 v0tmp1 := v0 match = true if len(ka[nameM]) != 0 && strings.Contains(k0, "描述") { //防止k0匹配到多次 和特殊情况 物料名称 物料描述同时出现 continue } if nameM == "itemname" || nameM == "modal" { hasGoods(table, v0...) //判断itemname和modal中有没有商品 if nameM == "itemname" { v0tmp1 = filterItem(v0...) //过滤itemname } } if nameM == "brandname" || nameM == "modal" { if len(ka["brandname"]) == 0 { brand, allNull := hasBrand(table, v0...) if !allNull { ka["brandname"] = brand } } } //unitprice if nameM == "unitprice" { //处理金额 v0tmp1 = dealPrice(k0, v0...) 
} if nameM != "brandname" && len(ka[nameM]) == 0 { ka[nameM] = v0tmp1 } arrcount1++ } } //替换其它要保存字段 if !match { //must未匹配,匹配replace for nameR, r := range brandRule["replace"] { if convert(k0, r) { //匹配成功 v0tmp2 := v0 //totalprice if nameR == "totalprice" { //处理金额 v0tmp2 = dealPrice(k0, v0...) } //number if nameR == "number" { //处理数量 uname0 := []string{} v0tmp2, uname0 = dealNumber(v0...) if len(ka["unitname"]) == 0 && len(uname0) != 0 { ka["unitname"] = uname0 } } if len(v0tmp2) > 0 { ka[nameR] = v0tmp2 } arrcount2++ } } } } return arrcount1, arrcount2, ka } //初始化lineMapArr,lineMap func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMap map[string]*SortMap) { lineMapArr = make(map[string]*SortMap) lineMap = make(map[string]*SortMap) for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序 val := table.SortKV.Map[key] key = regReplAllSpace.ReplaceAllString(key, "") key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉 //qutil.Debug(key, "---------------------------", val) if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]} /* { "商品":["",""], "商品_"["",""], } */ valArr, allempty := filterVal(realTypeVal...) 
//过滤数据 if allempty { continue } realTypeVal = valArr line := underline.FindString(key) lineValMap1 := lineMapArr[line] // i := 1 // L: // for { //去除数组空数据 // last := realTypeVal[len(realTypeVal)-i] // if last == "" { // i++ // if i > len(realTypeVal) { // break // } // goto L // } else { // break // } // } // dislodgeNull := realTypeVal[:(len(realTypeVal) - i + 1)] //去除数组中空数据 if len(realTypeVal) > 0 { if lineValMap1 == nil { tmp := NewSortMap() tmp.AddKey(key, realTypeVal) lineMapArr[line] = tmp } else { lineValMap1.AddKey(key, realTypeVal) } } //qutil.Debug("lineMapArr---", lineMapArr[line].Keys, lineMapArr[line].Map) } else if realTypeVal, b := val.(string); b { //val为字符串 {"数量":"1"} /* { "商品:"",名称:"", "商品_:"",名称_:"", "商品__:"",名称__:"", } */ valArr, allempty := filterVal(realTypeVal) //过滤数据 if allempty { continue } realTypeVal = valArr[0] line := underline.FindString(key) lineValMap2 := lineMap[line] if lineValMap2 == nil { tmp := NewSortMap() tmp.AddKey(key, realTypeVal) lineMap[line] = tmp } else { lineValMap2.AddKey(key, realTypeVal) } //qutil.Debug("lineMap---", lineMap[line].Keys, lineMap[line].Map) } else { // "_id" : ObjectId("5c2c3802a5cb26b9b78646c4")5c2b0551a5cb26b9b7cb05db否5c2a42e6a5cb26b9b763ba5a采购人:一、采购人5c2b06f5a5cb26b9b7cc4409 //成交供应商排名 [map[entname:昆明合优科技有限公司 sortstr:第一中标候选人 sort:1] map[sort:2 entname:昆明厚起科技有限公司 sortstr:第二中标候选人] map[entname:云南远安科技发展有限公司 sortstr:第三中标候选人 sort:3]] //qutil.Debug("err data:", key, val) } } return lineMapArr, lineMap } func dealArrData(maxNum int, ka map[string][]string) []map[string]string { for k2, v2 := range ka { //处理数组长度不相等,使长度一致 if len(v2) > maxNum { ka[k2] = v2[:maxNum] } } finalData := assembleData(ka, 1) if len(finalData) > 0 { return finalData } return nil } func dealStrData(kv map[string]string) []map[string]string { finalData := []map[string]string{} if len(kv) > 0 { finalData = assembleData(kv, 2) } return finalData } //组装数据,每一行的数据为一数据集合 func assembleData(m interface{}, n int) []map[string]string { defer 
qutil.Catch() /* { "itemname":["计算机","打印机","机柜"], "number" :["1","12","4"] } */ datas := []map[string]string{} if n == 1 { //数组数据 realTypeM := m.(map[string][]string) //根据数组数据的顺序 将多个数组中索引相同的数据拼装成一个map,并将这多个map放入一个arr /* arr1 ["a1","b1","c1"] arr2 ["a2","b2","c2"] [ {"a1","a2"}, {"b1","b2"}, {"c1","c2"} ] */ //start for k3, v3 := range realTypeM { for _, val := range v3 { data := make(map[string]string) data[k3] = val datas = append(datas, data) } break } for i, data := range datas { for k4, v4 := range realTypeM { if i < len(v4) { //数组数据长度不一致 if v4[i] != " " { data[k4] = v4[i] } else { delete(data, k4) } } else { fmt.Println("err table") } } datas[i] = data } //end for _, fdv := range datas { //清除空数据和只含特殊符号的数据 for fmk, fmv := range fdv { if tabletdclear.ReplaceAllString(fmv, "") == "" { delete(fdv, fmk) } } } } else { //字符串数据 realTypeM := m.(map[string]string) datas = append(datas, realTypeM) } return datas } ////组装数据,每一行的数据为一数据集合 //func assembleData(m interface{}, n int) []map[string]string { // defer qutil.Catch() // /* // { // "itemname":["计算机","打印机","机柜"], // "number" :["1","12","4"] // } // */ // datas := []map[string]string{} // switch reflect.TypeOf(m).String() { // case "map[string][]string": //数组数据 // realTypeM := m.(map[string][]string) // //根据数组数据的顺序 将多个数组中索引相同的数据拼装成一个map,并将这多个map放入一个arr // /* // arr1 ["a1","b1","c1"] // arr2 ["a2","b2","c2"] // [ // {"a1","a2"}, // {"b1","b2"}, // {"c1","c2"} // ] // */ // //start // for k3, v3 := range realTypeM { // for _, val := range v3 { // data := make(map[string]string) // data[k3] = val // datas = append(datas, data) // } // break // } // for i, data := range datas { // for k4, v4 := range realTypeM { // if i < len(v4) { //数组数据长度不一致 // if v4[i] != " " { // data[k4] = v4[i] // } else { // delete(data, k4) // //continue // } // } else { // fmt.Println("err table") // //continue // } // } // datas[i] = data // } // //end // // for _, fdv := range datas { //清除空数据和只含特殊符号的数据 // // for fmk, fmv := range fdv { // // if 
tabletdclear.ReplaceAllString(fmv, "") == "" { // // delete(fdv, fmk) // // } // // } // // } // case "map[string]string": //字符串数据 // realTypeM := m.(map[string]string) // datas = append(datas, realTypeM) // default: // } // return datas //} func convert(key, r string) bool { defer qutil.Catch() flag := false key = tabletitleclear.ReplaceAllString(key, "") reg, err := regexp.Compile(r) if err != nil { fmt.Println("reg err:", err) return false } flag = reg.MatchString(key) return flag } func hasKey(table *Table, n int) { defer qutil.Catch() if table.TableResult.HasKey == 1 { return } if n >= 1 { table.TableResult.HasKey = 1 } } func hasGoods(table *Table, data ...string) { defer qutil.Catch() goodsArr := make([]string, len(data)) //fmt.Println("table.TableResult.HasGoods=====", table.TableResult.HasGoods) if table.TableResult.HasGoods == 1 { return } for i, d := range data { if d != "" { goods := u.GoodsGet.CheckSensitiveWord(d) //fmt.Println("goods======", goods) goodsArr[i] = goods if len(goods) > 0 { table.TableResult.HasGoods = 1 break } } } } //func hasBrand(table *Table, data ...string) { // defer qutil.Catch() // if table.TableResult.HasBrand == 1 { // return // } // for i, d := range data { // if d != "" { // brand := u.BrandGet.CheckSensitiveWord(d) // qutil.Debug(d, brand) // if brand != "" { // table.TableResult.HasBrand = 1 // break // } // } // } //} func hasBrand(table *Table, data ...string) ([]string, bool) { defer qutil.Catch() //fmt.Println("table.TableResult.HasBrand---------", table.TableResult.HasBrand) brandArr := make([]string, len(data)) // if table.TableResult.HasBrand == 1 { // return brandArr, 1 // } allNull := true for i, d := range data { //if d != "" { brand := u.BrandGet.CheckSensitiveWord(d) if brand != "" { allNull = false } //fmt.Println("brand======", brand) brandArr[i] = brand if len(brand) > 0 { table.TableResult.HasBrand = 1 } //} } return brandArr, allNull } var clearnn *regexp.Regexp = 
regexp.MustCompile("([\\d.]*)[\\n\\s]*[\\((][\\d.]+[)\\)]") //过滤td值 func filterVal(val ...string) ([]string, bool) { defer qutil.Catch() n := 0 //记录被过滤的个数 for i, v := range val { if len(clearnn.FindStringSubmatch(v)) > 0 { tmpv := clearnn.FindStringSubmatch(v)[1] if tmpv != "" { v = tmpv } } afterFilter := tabletdclear.ReplaceAllString(v, "") afterFilter = NullVal.ReplaceAllString(afterFilter, "") if afterFilter == "" { n++ } val[i] = afterFilter } allempty := false if n == len(val) { //所有都被过滤掉 allempty = true } return val, allempty } //过滤itemname全是数字 func filterItem(itemval ...string) []string { defer qutil.Catch() result := []string{} for _, v := range itemval { afterFilter := numclear.ReplaceAllString(v, "") if afterFilter != "" { result = append(result, v) } else { result = append(result, afterFilter) } } return result } //处理价格 func dealPriceInterface(key string, val ...string) (result []interface{}) { defer qutil.Catch() for _, v := range val { if num1.MatchString(v) { //含数字 tdIsWan := strings.Contains(v, "万") if !tdIsWan { if strings.Contains(key, "万") { v = v + "万" } } data := []interface{}{v, ""} money := clear.ObjToMoney(data)[0] result = append(result, money) } else { result = append(result, "") } } return } //处理number func dealNumberInterface(val ...string) (result []interface{}) { defer qutil.Catch() for _, v := range val { //1个 1.00个 n := numclear.FindString(v) if n == "" { result = append(result, "") } else if tmp := clear.NumChar[n]; tmp != nil { //一二三... 
result = append(result, tmp) } else { //数字 result = append(result, qutil.IntAll(strings.Split(n, ".")[0])) } } return } //处理价格 func dealPrice(key string, val ...string) []string { defer qutil.Catch() result := []string{} for _, v := range val { data := []interface{}{v, key} money := clear.ObjToMoney(data)[0] result = append(result, fmt.Sprintf("%v", money)) } // result := []string{} // for _, v := range val { //1.00万元 1元 2.25元/斤 // tmparr := strings.Split(v, ".") // tmparr[0] = moneyNum.ReplaceAllString(tmparr[0], "") // if iswan { // result = append(result, tmparr[0]+"0000") // } else { //td val值带万 // if strings.Contains(v, "万") { //价格中带有万 // result = append(result, tmparr[0]+"0000") // } else { // result = append(result, tmparr[0]) // } // } // } return result } //处理number func dealNumber(val ...string) ([]string, []string) { defer qutil.Catch() unitnameArr := []string{} result := []string{} for _, v := range val { //1个 1.00个 n := numclear.FindString(v) unitname := numclear.ReplaceAllString(v, "") //匹配个数后的单位 unitnameArr = append(unitnameArr, unitname) //val[i] = strings.Split(n, ".")[0] result = append(result, strings.Split(n, ".")[0]) } return result, unitnameArr } func (tn *Table) analyProNameAndItemNumber() { }