12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087 |
- package pretreated
- import (
- "fmt"
- "github.com/shopspring/decimal"
- "jy/clear"
- u "jy/util"
- qu "qfw/util"
- "regexp"
- "strings"
- )
- /*
- *
- Package-level variables: mostly the regular expressions used for classification by the table/text analysis.
- *
- */
- var (
- // Date-unit characters that may appear in a key.
- dateReg *regexp.Regexp = regexp.MustCompile(`[年|月|日|天]`) // NOTE(review): inside [...] the '|' is a literal, so this class also matches "|" — confirm that is intended
- // Strips the leading digits / Chinese numerals / dots from item names.
- numclear = regexp.MustCompile("^[\\d一二三四五六七八九十.]+")
- num1 = regexp.MustCompile("(\\d)") // matches any single digit
- // Strips unwanted content from table titles.
- tabletitleclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、.,.。_/((人民币万元件个公斤户))]")
- tabletitleclear2 = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、,。_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕]*")
- // Strips whitespace, leading numbering and bracketed text contained in table keys.
- tablekeyclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。_/]+|^[\\d一二三四五六七八九十]+[、.]*|[((【\\[].*?[))】\\]]")
- // Strips short phrases that were matched by the previous-stage kv extraction.
- tablekeyclear2 = regexp.MustCompile("(供应商信用融资|供应商公章|主要标的名称|中标人推荐理由|成交供应商推荐理由)")
- // Strips symbols from table td cells.
- tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n\u001c、,。_??;;~\\-#\\\\()(){}【】\\[\\]<>《》{}〔〕¥$]*")
- // Decides whether a key denotes an amount (used for the 万元 unit handling).
- moneyReg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
- // Special text that is treated as a table header.
- specHeadReg = regexp.MustCompile("(成交供应商|中选人)")
- // Keys that must not be cleaned, e.g. discount or rate keys.
- noClearKeyReg = regexp.MustCompile(`[((](费率|年|月|日|天|日历天|历天)[))]`)
- // Judges from the cell content whether it is a header: content that is only an amount is not a header.
- MoneyReg = regexp.MustCompile("^[\\s ::0-9.万元()()人民币¥$]+$")
- // Special-case values that cannot be a header.
- noStartHeadReg = regexp.MustCompile("^(\\d标段)$")
- GSReg = regexp.MustCompile(".*公司.*") // matches any text containing 公司
- // Used when judging packages.
- moneyNum = regexp.MustCompile("[元整¥万]")
- // Detects hidden tables (display:none).
- display = regexp.MustCompile("(?i).*?display\\s?[:]\\s?none.*")
- //---------------
- // Estimating the probability that a table describes multiple packages.
- // Scores packages from the table tag.
- TableMultiPackageReg_4 = regexp.MustCompile("(标段|分包|包段|划分|子包|标包|合同段)")
- TableMultiPackageReg_2 = regexp.MustCompile("(概况|范围|情况|内容|详细|结果|信息)")
- // Filters table keys before the package scoring.
- FilterKey_2 = regexp.MustCompile("招标|投标|项目")
- // Scores packages from the table keys.
- FindKey_2 = regexp.MustCompile("([分子][包标](号)?|标[号项段包](划分)?|包件?[号段名数]|包[组件])")
- FindKey_3 = regexp.MustCompile("(标段编号|标包|包件|包号)")
- // Package judgement on values.
- FindVal_1 = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|标的[一二三四五六七八九十1-9A-Za-z]+|((子|合同|分|施工|监理)?(包|包件|标)(段|号)?)[ \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)")
- FindVal2_1 = regexp.MustCompile("([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)|^(设计|施工|监理|验收)[分子]?[标包]?[段号]?$")
- // Keys excluded before the package judgement (open question in the original note: 包件号?).
- excludeKey = regexp.MustCompile("(标识|数量|分包个数|标段代码|涉及包号|分包数量|项目标号|规格|型号|招标范围|业绩|废标|标段选择要求)|(^编号$)|([^包段标]编号)") // candidate alternatives not currently in the pattern: 编号|划分
- excludeKey2 = regexp.MustCompile("包/[0-9]{0,4}[箱纸张]")
- excludeKey3 = regexp.MustCompile("(分包个数|每包[0-9]*元|标线|国标|享受一包服务)")
- //-------------
- cut = u.NewCut()
- // Regex that strips table tags (any HTML tag, plus trailing whitespace).
- ClearTagReg = regexp.MustCompile("<[^>]*?>|[\\s\\n\\r]*$")
- // Regex that finds the table tag (label text at the end of the preceding content).
- ttagreg = regexp.MustCompile("(?s)([^\\n::。,;\\s\u3000\u2003\u00a0]{2,30})[::]?[^::。;!\\n]{0,35}[\\s\\n]*$")
- // Probability threshold used when judging whether table cells are a header.
- checkval = float32(0.6)
- //tdval_reg = regexp.MustCompile(`([\p{Han}][\p{Han}\s、()\\(\\)]{1,9})[::]([^::\\n。]{5,60})(?:[;;,,.。\\n\\t\\s])?`)
- // Whitespace (and colon) replacement.
- repSpace = regexp.MustCompile("[\\s\u3000\u2003\u00a0::]+|\\\\t+")
- // Handling of table kv pairs.
- // Marks keys that cannot be standardized.
- filter_tag_zb = regexp.MustCompile("(中标|成交|投标)[\\p{Han}]{0,6}(情况|结果|信息|明细)?")
- // Winning bid amount.
- // Keys containing the following wording are standardized.
- filter_zbje_k = regexp.MustCompile("(中标|成交|总|拦标|合同|供[应货]商|报)[\\p{Han}、]{0,6}(价|额|[大小]写|[万亿]?元).{0,4}$")
- // Simple amount check.
- filter_zbje_jd = regexp.MustCompile("^[^(售|保证)]{0,4}(价|额).{0,4}$") // NOTE(review): [^(售|保证)] is a negated character class, not a negated word group — confirm intended
- // Budget amount.
- filter_ysje_jd = regexp.MustCompile("(预算|预控价|项目概.|项目信息)")
- // ...and keys containing the following wording are excluded.
- filter_zbje_kn = regexp.MustCompile("得分|打分|时间|业绩|须知|分|电话|要求|需求数量|发布规模$|第[2二3三4四5五]|地址|询价保证金|行号")
- // ...and the value contains the following characters.
- filter_zbje_v = regexp.MustCompile("[¥$$0-9一二三四五六七八九十,,〇零点..壹贰叁肆伍陆柒捌玖拾百佰千仟万亿億元圆角分整正()::大小写]{2,16}")
- // Handling of the winning bidder.
- // Keys containing the following wording are standardized.
- filter_zbdw_ky = regexp.MustCompile("(中标|成交|拦标|合同|选中|投标|拟|预|最终)[\\p{Han}、]{0,6}(供[应货]商|企业|单位|人|机构)(名称)?.{0,4}$")
- // Recognizes winning-bidder related information.
- filter_zbdw_info = regexp.MustCompile("(中标|成交|中选|供(货|应))[^候选]{0,}") // NOTE(review): [^候选] excludes the single characters 候 and 选, not the word 候选 — confirm intended
- // Simple check.
- filter_zbdw_jd = regexp.MustCompile("(投标|成交|中标|合同)(供应商|单位|人|名称).{0,4}$")
- // ...and the key does not contain the following wording.
- filter_zbdw_kn = regexp.MustCompile("第[2二3三4四5五]|得分|地址|询价保证金") // the value-side filters follow
- // ...and the value contains the following wording.
- filter_zbdw_v = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$|([^购]中心|办公|用品)")
- // ...and the value contains the following wording (suffix-only variant).
- filter_zbdw_v2 = regexp.MustCompile("(公司|集团|研究院|设计院|局|厂|部|站|城|店|市|所|处)$")
- //Tg = map[string]interface{}{}
- // Some tables have no header (it is empty); values that are rankings are handled and mapped to NullTxtBid.
- NullTdReg = regexp.MustCompile("(首选|第[一二三四五1-5])(中标|成交)?(名(称)?|(候选|排序)?(人|单位|供应商))")
- NullTxtBid = "成交供应商排名"
- projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$")
- MhSpilt = regexp.MustCompile("[::]") // lowers the weight of the colon
- // Specific fields whose values are time-formatted.
- UnTimeSpiltKey = regexp.MustCompile("(招标文件获取截止时间|招标文件获取开始时间|报名截止时间|报名开始时间|投标文件递交开始时间|开工日期|竣工日期)")
- UnTimeSpiltValue = regexp.MustCompile("\\d{1,2}[::]\\d{1,2}")
- // Recognizes buyer contact person / phone and agency contact person / phone; the key names are irregular.
- ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公|单位)?(((联系)?(电话|方式|号码)([//及]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(详细)?(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|经办)人)|采购方代表")
- ContactInfoExcluReg = regexp.MustCompile("[商]名称$")
- ContactInfoMustReg = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$") // whole-string version of ContactInfoVagueReg
- ContactType = map[string]*regexp.Regexp{ // contact role name -> pattern for that role
- "采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|招标(服务)?|甲|建设|招标|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心(地址)?|业主|收料人|采购部"),
- "代理机构": regexp.MustCompile("(代理|受托|集中采购).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"),
- "中标单位": regexp.MustCompile("^((拟(定)?|预|最终|唯一)?(中标|成交|中选|供(货|应))((成交))?)[^候选]{0,2}(人|方|单位|公司|(服务|供应)?商|企业)"),
- "监督部门": regexp.MustCompile("投诉受理部门"),
- }
- ContactHeadReg = regexp.MustCompile("^(招标人|采购人)$")
- ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$")
- MultipleValueSplitReg = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]")
- BuyerContacts = []string{"采购单位联系人", "采购单位联系电话", "采购单位联系地址"}
- FilterSerial = regexp.MustCompile(".+[、..::,]")
- underline = regexp.MustCompile("_+$") // trailing underscores of a key
- iswinnertabletag = regexp.MustCompile("(中标|候选人|成交|结果|磋商情况)")
- nswinnertabletag = regexp.MustCompile("评得分估|标的信息|班子成员")
- jsonReg = regexp.MustCompile(`\{.+:[^}]*\} `) // \{".*\":\".+\"}
- regHz = regexp.MustCompile("[\u4e00-\u9fa5]") // a single character in the U+4E00..U+9FA5 range
- winnerOrderAndBidResult = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")
- WinnerOrderStr = regexp.MustCompile(`(集团|公司|学校|中心|家具城|门诊|[大中小]+学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$`)
- DoubtReg = regexp.MustCompile("(我中心|有(疑问|质疑|异议|意见)|(书面)?提出|不再受理|投诉|质疑|书面形式|监督|公示期(限)?)")
- // Added: package / table / sortKV patterns.
- budgetSortKVReg = regexp.MustCompile("(预算)")
- bidamountSortKVReg = regexp.MustCompile("(成交结果[((]万元[))]|成交金额|履约金额|中[标选]金额)")
- winnerSortKVReg = regexp.MustCompile("(投标人[((]供应商[))]名称)|供应商名称|中标候选人|中[标选]人|中[标选]单位")
- )
- var fblbReg *regexp.Regexp = regexp.MustCompile("(废标|流标|否决依据|未中标情况说明|负责人资格|负责人业绩|相关业绩|类似项目情况表|技术评分明细表|否决投标人投标的原因|开标记录|附件[:0-9]|越南盾|技术分[^公]|填报项目业绩|未通过.*原因)")
- // Sample of the comma-separated number runs targeted below: 59.992664,33.495715,20.001306
- var clearnum *regexp.Regexp = regexp.MustCompile("(([0-9.]{1,6}[,,]+){4,}|(\\d{6}[,,]\\d{2}.){2,})")
- var glRex *regexp.Regexp = regexp.MustCompile("(成交|中标|候选|排名|名次|供应商排序|中标候选人|名单及其排序|排序)")
- var djReg *regexp.Regexp = regexp.MustCompile("^单价") // text starting with 单价
- var hxrRex *regexp.Regexp = regexp.MustCompile("((成交|中标|中选)?候选人[弟|第][1-5一二三四五]名|[弟|第][1-5一二三四五][名]?(成交|中标|中选)?候选人)") // NOTE(review): [弟|第] also matches a literal '|' — confirm intended
- var winMoneyReg *regexp.Regexp = regexp.MustCompile("(报价|投标价|投标报价|评审价|投标总价|含税总价[((]元[))]|总金额)")
- var winNoMoneyReg *regexp.Regexp = regexp.MustCompile("(得分|时间|序号|分)")
- var cleardwReg *regexp.Regexp = regexp.MustCompile("[((]{1}\\d*[人元件个公斤户]/[人元件个公斤户][))]")
- var zbhxrReg *regexp.Regexp = regexp.MustCompile("(中标候选人|投标单位名称|候选人姓名|候选人名称)")
- var zbhxrSortReg_1 *regexp.Regexp = regexp.MustCompile("^[第|弟][12345一二三四五]名$")
- var zbhxrSortReg_2 *regexp.Regexp = regexp.MustCompile("^([12345一二三四五])$")
- var zbhxrSortReg_3 *regexp.Regexp = regexp.MustCompile("^([12345一二三四五])")
- var zbhxrSortNameReg *regexp.Regexp = regexp.MustCompile("(中标候选人[第|弟][123一二三]名)|[第|弟][123一二三]中标候选人")
- var zbhxrSecondReg *regexp.Regexp = regexp.MustCompile("(中标候选人[第|弟][2二]名)|[第|弟][2二]中标候选人")
- var clearnn *regexp.Regexp = regexp.MustCompile("([\\d.]*)[\\n\\s]*[\\((][\\d.]+[)\\)]") // group 1 is the number that precedes a parenthesised number
- // Keywords indicating that a package carries winner information.
- var pkgValidReg1 *regexp.Regexp = regexp.MustCompile("(中标单位|中标金额)[::]")
- var tableClearTextReg *regexp.Regexp = regexp.MustCompile("业绩[::].*") // 业绩 followed by a colon and the rest of the line
- // 特殊-爬虫文本-抽取单价数量-并计算
- func dealWithSpecStructToSpiderCode(text string) string {
- text = formattext50.ReplaceAllString(text, "$1&&$2")
- arr := strings.Split(text, "&&")
- if len(arr) == 2 {
- one := qu.Float64All(arr[0])
- two := qu.Float64All(arr[1])
- if one > 0 && two > 0 {
- return fmt.Sprintf("\n合同金额:%f\n", one*two)
- }
- }
- return ""
- }
- // 对比前后候选人的有效性-true -为新
- func thanWinnerOrderEffective(old_order []map[string]interface{}, new_order []map[string]interface{}) bool {
- if len(new_order) == 0 || new_order == nil {
- return false
- }
- if len(old_order) == 0 || old_order == nil {
- return true
- }
- old_info, new_info := old_order[0], new_order[0]
- if qu.IntAll(old_info["sort"]) > 1 {
- return true
- } //排序比对
- if qu.IntAll(new_info["sort"]) > 1 {
- return false
- }
- //金额比对 -
- isuse_1, isuse_2 := false, false
- if old_vf, ok := old_info["price"].(float64); ok && old_vf > 0.0 {
- isuse_1 = true
- } else {
- if old_vs, ok := old_info["price"].(string); ok && old_vs != "" {
- isuse_1 = true
- }
- }
- if new_vf, ok := new_info["price"].(float64); ok && new_vf > 0.0 {
- isuse_2 = true
- } else {
- if new_vs, ok := new_info["price"].(string); ok && new_vs != "" {
- isuse_2 = true
- }
- }
- if isuse_1 && !isuse_2 {
- return false
- }
- if !isuse_1 && isuse_2 {
- return true
- }
- //均正常-优先取新值
- return true
- }
// onlyExistsWinEntName reports whether winorder is a short candidate list
// (at most three entries) in which no entry carries a positive float64
// "price". Lists longer than three entries always yield false.
func onlyExistsWinEntName(winorder []map[string]interface{}) bool {
	if len(winorder) > 3 {
		return false
	}
	for _, candidate := range winorder {
		price, isFloat := candidate["price"].(float64)
		if isFloat && price > 0.0 {
			return false
		}
	}
	return true
}
- func thanExistsNewWinOrder(winorder []map[string]interface{}, new_winorder []map[string]interface{}) bool {
- if len(winorder) != len(new_winorder) {
- return false
- }
- isok := 0
- for k, v := range winorder {
- if qu.ObjToString(v["entname"]) == qu.ObjToString(new_winorder[k]["entname"]) {
- if new_price, ok := new_winorder[k]["price"].(float64); ok && new_price > 0.0 {
- isok++
- }
- }
- }
- if isok == len(winorder) {
- return true
- }
- return false
- }
- // 多供应商文本构建分包
- func dealWithMultiSuppliersText(con string) (bool, string) {
- startIndex := MultiStartReg.FindAllStringIndex(con, 1)
- endIndex := MultiEndReg.FindAllStringIndex(con, 1)
- if len(startIndex) == 1 && len(endIndex) == 1 {
- if len(startIndex[0]) > 1 && len(endIndex[0]) > 1 {
- t_start, t_end := startIndex[0][1], endIndex[0][0]
- if t_end > t_start {
- text := con[t_start:t_end]
- arr1 := SupplyInfoReg1.FindAllStringSubmatch(text, -1)
- if text1 := supplyInfoMethod(arr1, 2, 4); text1 != "" {
- return true, strings.ReplaceAll(con, text, text1)
- }
- arr2 := SupplyInfoReg2.FindAllStringSubmatch(text, -1)
- if text2 := supplyInfoMethod(arr2, 2, 4); text2 != "" {
- return true, strings.ReplaceAll(con, text, text2)
- }
- }
- }
- }
- return false, ""
- }
// supplyInfoMethod turns regexp submatch rows into package text. Each row
// becomes a "包N" heading (N counted from 1) followed by a 中标单位 line taken
// from column w_index and a 中标金额 line taken from column b_index.
//
// Fewer than two rows produce "" — a single supplier is not a multi-package
// case.
func supplyInfoMethod(arr [][]string, w_index int, b_index int) string {
	if len(arr) <= 1 {
		return ""
	}
	var out strings.Builder
	for idx, row := range arr {
		out.WriteString(fmt.Sprintf("包%d", idx+1))
		out.WriteString("\n中标单位:")
		out.WriteString(row[w_index])
		out.WriteString("\n中标金额:")
		out.WriteString(row[b_index])
		out.WriteString("\n")
	}
	return out.String()
}
- // AnalyStart is the analysis entry point. It repairs and normalizes job.Content through a fixed sequence of regexp rewrites, stores the cleaned text back on job, splits it into blocks (or builds one block from tables plus plain text when no blocks are found), and records blocks, packages, winner order and table flags on job. isSite and codeSite are passed through unchanged to the helper routines; codeSite additionally enables one spider-specific rewrite.
- func AnalyStart(job *u.Job, isSite bool, codeSite string) {
- con := job.Content
- // Tables in the full text are repaired first.
- con = RepairCon(con)
- // Normalize the body text.
- //con = preConReg1.ReplaceAllString(con, "${1}${2}")
- hisReg1_str := hisReg1.FindString(con)
- if hisReg1_str != "" && !strings.Contains(hisReg1_str, "中标候选人得分") {
- con = hisReg1.ReplaceAllString(con, "${4}")
- }
- hisReg2_str := hisReg2.FindString(con)
- if hisReg2_str != "" && !strings.Contains(hisReg2_str, "中标候选人得分") {
- con = hisReg2.ReplaceAllString(con, "${6}")
- }
- con = formattext.ReplaceAllString(con, "${1}:${2}")
- con = formattext2.ReplaceAllString(con, "${1}")
- con = formattext3.ReplaceAllString(con, "")
- con = formattext4.ReplaceAllString(con, "\n${1}:${2}\n")
- // Special formats that interfere with package / candidate extraction are rewritten.
- con = formattext5.ReplaceAllString(con, "中标金额:${2}\n")
- con = formattext6.ReplaceAllString(con, "$1$2")
- con = formattext7.ReplaceAllString(con, "$1$2")
- // Rewrite special structures into package / key:value text.
- con = formattext10.ReplaceAllString(con, "\n分包$3\n中标单位:$5 中标金额:$6\n")
- con = formattext11.ReplaceAllString(con, "${1}\n${2}\n预算金额:${4}\n${5}\n预算金额:${7}\n${8}\n")
- con = formattext12.ReplaceAllString(con, "\n${1}:${3}万元\n")
- con = formattext13.ReplaceAllString(con, "\n包一\n中标单位:${1}\n中标金额:${3}\n"+"包二\n中标单位:${2}\n中标金额:${4}\n")
- con = formattext14.ReplaceAllString(con, "\n包一\n中标单位:${1}\n中标金额:${2}\n"+"包二\n中标单位:${3}\n中标金额:${4}\n")
- // Multi-supplier text is restructured into package text.
- if m_b, m_c := dealWithMultiSuppliersText(con); m_b {
- con = m_c
- }
- // Project-performance (业绩) descriptions interfere with extraction, so they are removed.
- con = formattext20.ReplaceAllString(con, "\n")
- con = formattext21.ReplaceAllString(con, "")
- // Spider-specific special structure: compute the amount and prepend it to the content.
- if codeSite == "a_zgzfcgw_zfcghtgg_new" {
- str := formattext50.FindString(con)
- if str != "" {
- new_str := dealWithSpecStructToSpiderCode(str)
- if new_str != "" {
- con = new_str + con
- }
- }
- }
- con = formatText(con, "all")
- job.ContentClean = HtmlToText(job.Content) // derived from the untouched job.Content, not from con
- job.Content = con
- // Compute the table ratio; returns the table array and the ratio (the ratio is discarded here).
- tabs, _ := ComputeConRatio(con, 1)
- /*if len(tabs) > 0 {
- newcon, newtabs, newration := FindBigText(con, ration, tabs)
- if newcon != "" {
- con = newcon
- con = formatText(con, "all")
- tabs = newtabs
- ration = newration
- }
- }*/
- job.BlockPackage = map[string]*u.BlockPackage{}
- // Split into blocks and process the kv of each block.
- blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite)
- if len(blockArrays) > 0 { // blocks were found
- // Look for packages in the block text (skipped when job.IsFile is set).
- if !job.IsFile {
- job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) // find packages in the blocks
- }
- for _, bl := range blockArrays {
- if len([]rune(bl.Text)) > 80 { // blocks longer than 80 runes are split again into sub-blocks
- bl.Block, _ = DivideBlock(job.CategorySecond, bl.Text, 1, job.RuleBlock, isSite, codeSite)
- for _, bl_bl := range bl.Block {
- processTableInBlock(bl_bl, job, isSite, codeSite)
- }
- }
- FindProjectCode(bl.Text, job) // match the project code
- // Table-based package lookup ---
- isUnRulesTab := processTableInBlock(bl, job, isSite, codeSite) // process the tables
- if isUnRulesTab { // irregular table?
- job.IsUnRulesTab = isUnRulesTab
- }
- // Filter performance-related (业绩) lines out of the block text.
- bl.Text = tableClearTextReg.ReplaceAllString(bl.Text, "")
- // Added: when the tables yielded no winner candidates (or more than 8), parse them from the text — one full-text match.
- if (job.Winnerorder == nil || len(job.Winnerorder) == 0) || len(job.Winnerorder) > 8 {
- // No usable table split: plain-text matching.
- tmp_text := HtmlToText(bl.Text)
- bl.Winnerorder = winnerOrderEntity.Find(tmp_text, true, 1, isSite, codeSite)
- if thanWinnerOrderEffective(job.Winnerorder, bl.Winnerorder) {
- job.Winnerorder = bl.Winnerorder
- }
- }
- // No packages yet and the job is an attachment file: look for packages in the formatted text.
- if (job.BlockPackage == nil || len(job.BlockPackage) == 0) && job.IsFile {
- tmp_text := HtmlToText(bl.Text)
- job.BlockPackage = FindPackageFromText(job.Title, tmp_text, isSite, codeSite)
- }
- job.Block = append(job.Block, bl)
- }
- } else { // no blocks found: create a single block
- //log.Println(con)
- bl := &u.Block{}
- newCon := con
- //log.Println(con)
- if len(tabs) > 0 { // table parsing path
- job.HasTable = 1 // flag: the text contains a table
- newCon = TextAfterRemoveTable(con)
- //log.Println(newCon)
- if newCon != "" {
- job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite)
- }
- for i := 0; i < len(tabs); i++ {
- blockTag := ""
- if len(tabs[i].Nodes) > 0 {
- if tabs[i].Nodes[0].PrevSibling != nil {
- blockTag = tabs[i].Nodes[0].PrevSibling.Data
- }
- }
- // Flag: the text contains a table.
- // blockTag is the block tag (data of the node preceding the table, when there is one).
- // Process the table.
- tabres := AnalyTableV2(tabs[i], job.Category, blockTag, con, 1, job.SourceMid, job.RuleBlock, isSite, codeSite) // table parsing entry; returns the aggregated table object
- job.IsUnRulesTab = tabres.isUnRulesTab // NOTE(review): overwritten for every table, so only the last table's flag survives — confirm intended
- processTableResult(tabres, bl, job, isSite, codeSite)
- }
- } else {
- // Look for packages in the body text.
- job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite)
- }
- bl.Text = HtmlToText(con)
- FindProjectCode(bl.Text, job) // match the project code; below, text carrying no useful information is cut away
- if blTextReg.MatchString(bl.Text) && !unblTextReg.MatchString(bl.Text) {
- if strings.Index(bl.Text, "业绩") > 1 { // cut the text at the first 业绩 (only when it is found past byte offset 1)
- // If there is buyer information, keep it and put it in front.
- before_arr := []string{}
- if beforeTextReg.MatchString(bl.Text) {
- before_arr = beforeTextReg.FindAllString(bl.Text, -1)
- }
- bl.Text = bl.Text[:strings.Index(bl.Text, "业绩")]
- if len(before_arr) > 0 {
- bl.Text = strings.Join(before_arr, "\n") + bl.Text
- }
- }
- }
- // Special targeted handling: structure conversion via formattext100 (prepends a 中标金额 line in 万元).
- if formattext100.MatchString(bl.Text) {
- new_str := formattext100.FindString(bl.Text)
- new_str = formattext100.ReplaceAllString(new_str, "$1")
- bl.Text = fmt.Sprintf("中标金额:%s万元\n", new_str) + bl.Text
- }
- // Call the kv parsing helpers on the detail text.
- bl.Text = formatText(bl.Text, "all")
- // Colon-separated kv.
- bl.ColonKV = GetKVAll(bl.Text, "", nil, 1, isSite, codeSite)
- // Space-separated kv.
- bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil, isSite, codeSite)
- // Added: when the tables yielded no winner candidates (or more than 8), parse them from the body text.
- if job.Winnerorder == nil || len(job.Winnerorder) == 0 || len(job.Winnerorder) > 8 {
- bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
- if thanWinnerOrderEffective(job.Winnerorder, bl.Winnerorder) {
- job.Winnerorder = bl.Winnerorder
- }
- } else { // the tables gave candidate names but no amounts
- if onlyExistsWinEntName(job.Winnerorder) {
- new_winorder := winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
- if thanExistsNewWinOrder(job.Winnerorder, new_winorder) {
- job.Winnerorder = new_winorder
- }
- }
- }
- // If the table lookup produced packages without any useful value, search the body text again.
- if len(tabs) > 0 && job.BlockPackage != nil {
- if !isUsefulPackage(job.BlockPackage) { // the tables did not yield a useful package
- text_pkg := FindPackageFromText(job.Title, bl.Text, isSite, codeSite)
- if len(text_pkg) > 0 {
- job.BlockPackage = text_pkg
- }
- }
- }
- job.Block = append(job.Block, bl)
- }
- }
- // 是否有效分包
- func isUsefulPackage(pkg map[string]*u.BlockPackage) bool {
- if pkg == nil || len(pkg) == 0 {
- return false
- }
- for _, v := range pkg {
- p_winner := v.Winner
- p_budget := v.Budget
- p_bidamout := v.Bidamount
- if p_winner != "" || p_budget > float64(0) || p_bidamout > float64(0) {
- return true
- }
- }
- return false
- }
- // 核查候选人字段是否合理
- func verifyPackageWinnerOrder(wins []map[string]interface{}) bool {
- temp := map[string]string{}
- for k, v := range wins {
- if qu.IntAll(v["sort"]) != k+1 {
- return false
- }
- entname := qu.ObjToString(v["entname"])
- if temp[entname] == "" {
- temp[entname] = entname
- } else {
- return false
- }
- }
- return true
- }
// isRepeatArrString reports whether every element of arr1 equals the element
// at the same index in arr2.
//
// When arr2 is shorter than arr1 the slices cannot match, so false is
// returned; indexing arr2 in that case used to panic. All other inputs give
// the same result as before: an empty arr1 yields true, and a longer arr2 is
// compared only over the length of arr1.
func isRepeatArrString(arr1, arr2 []string) bool {
	if len(arr2) < len(arr1) {
		return false
	}
	for i, v := range arr1 {
		if v != arr2[i] {
			return false
		}
	}
	return true
}
- // 对sortkv重构
- func isResetUnitAmountSortKV(table *Table) {
- isUnitAmount := 0
- for _, k := range table.SortKV.Keys {
- v := table.SortKV.Map[k]
- if new_v, ok := v.(string); ok && (k == "中标金额" || k == "单位") {
- if k == "单位" && new_v == "万元" {
- isUnitAmount++
- }
- if k == "中标金额" && MoneyReg.MatchString(new_v) && !strings.Contains(new_v, "万") {
- isUnitAmount++
- }
- }
- }
- if isUnitAmount > 1 {
- table.SortKV.Map["中标金额"] = qu.ObjToString(table.SortKV.Map["中标金额"]) + "万元"
- }
- }
- func isResetUnitPriceSortKV(table *Table) {
- keyArr := []string{"序号", "数量", "单价"}
- isMatch := true
- for _, v := range keyArr {
- if _, ok := table.SortKV.Map[v].(string); !ok {
- isMatch = false
- break
- }
- }
- if isMatch && table.SortKV.Map["总价(元)"] == nil {
- if qu.ObjToString(table.SortKV.Map["序号"]) == "1" &&
- qu.ObjToString(table.SortKV.Map["数量"]) == "1" {
- table.SortKV.Map["总价(元)"] = table.SortKV.Map["单价"]
- table.SortKV.Keys = append(table.SortKV.Keys, "总价(元)")
- }
- }
- }
- func isResetAmountAggregateSortKV(table *Table) {
- keyGroup := [][]string{}
- keyGroup = append(keyGroup, []string{"序号", "标项名称", "总价(元)"})
- keyGroup = append(keyGroup, []string{"序号", "名称", "总价(元)"})
- keyGroup = append(keyGroup, []string{"序号", "服务内容", "验收金额(元)"})
- keyGroup = append(keyGroup, []string{"序号", "标项名称", "单价(元)", "数量"})
- for _, v := range keyGroup {
- if len(v) == 3 {
- arr1 := u.ConvertInterface(table.SortKV.Map[v[0]])
- arr2 := u.ConvertInterface(table.SortKV.Map[v[1]])
- arr3 := u.ConvertInterface(table.SortKV.Map[v[2]])
- if len(arr1) > 1 && len(arr1) == len(arr2) && len(arr1) == len(arr3) {
- amount := float64(0)
- for _, nv := range arr3 {
- amount = precisionFloat(amount, qu.Float64All(nv))
- }
- if amount > float64(0) {
- table.SortKV.Map[v[2]] = fmt.Sprintf("%f", amount)
- }
- break
- }
- }
- if len(v) == 4 {
- arr1 := u.ConvertInterface(table.SortKV.Map[v[0]])
- arr2 := u.ConvertInterface(table.SortKV.Map[v[1]])
- arr3 := u.ConvertInterface(table.SortKV.Map[v[2]])
- arr4 := u.ConvertInterface(table.SortKV.Map[v[3]])
- if len(arr1) > 1 && len(arr1) == len(arr2) && len(arr1) == len(arr3) && len(arr1) == len(arr4) {
- amount := float64(0)
- for kv, nv := range arr3 {
- amount = precisionFloat(amount, qu.Float64All(nv)*qu.Float64All(arr4[kv]))
- }
- if amount > float64(0) {
- if table.SortKV.Map["总价(元)"] == nil {
- table.SortKV.Map["总价(元)"] = fmt.Sprintf("%f", amount)
- table.SortKV.Keys = append(table.SortKV.Keys, "总价(元)")
- } else {
- table.SortKV.Map["总价(元)"] = fmt.Sprintf("%f", amount)
- }
- }
- break
- }
- }
- }
- }
- func isReseterialNumberSortKV(table *Table) {
- arr := u.ConvertInterface(table.SortKV.Map["序号"])
- if len(arr) > 5 {
- table.SortKV.Map["序号"] = arr[:3]
- }
- }
- func isResetWinnerRankingSortKV(table *Table) {
- if len(table.SortKV.Map) == 2 && table.SortKV.Map["中标人"] != nil && table.SortKV.Map["中标价格"] != nil {
- arr := u.ConvertInterface(table.SortKV.Map["中标人"])
- if len(arr) > 1 && len(arr) <= 3 {
- table.SortKV.Map["排名"] = []string{"1", "2"}
- table.SortKV.Keys = append(table.SortKV.Keys, "排名")
- }
- }
- }
- // 精度丢失-相加
- func precisionFloat(tmp1, tmp2 float64) float64 {
- n1 := decimal.NewFromFloat(tmp1)
- n2 := decimal.NewFromFloat(tmp2)
- decimalValue := n2.Add(n1)
- res, _ := decimalValue.Float64()
- return res
- }
- // 重置as~keys
- func resetAsKeysBidamount(as *SortMap) {
- keys, values_data := as.Keys, as.Map
- if len(keys) == 0 {
- return
- }
- k1, k2 := "投标报价(元)", "经评审的投标价(元)"
- value1, value2 := make([]string, 0), make([]string, 0)
- is_del := false
- if arr, ok := values_data[k1].([]string); ok && len(arr) > 0 {
- value1 = arr
- } else {
- return
- }
- if arr, ok := values_data[k2].([]string); ok && len(arr) > 0 {
- value2 = arr
- } else {
- return
- }
- if len(value1) == len(value2) && len(value1) > 0 {
- tmp_value := value2[0]
- price := winnerOrderEntity.clear("中标金额", tmp_value+GetMoneyUnit(k1, tmp_value))
- if pricestr, _ := price.(string); len(pricestr) < 30 && len(pricestr) > 0 {
- is_del = true
- }
- }
- if is_del {
- as.Map[k1] = as.Map[k2]
- }
- }
- // 判断是否特殊候选人结构表格
- func judgmentWinnerOrderHeaderInfo(TRs []*TR) bool {
- if len(TRs) < 3 {
- return false
- }
- //是否含有指定关键词
- TR_0 := TRs[0]
- isLen := 0
- for k, v := range TRs {
- if k > 0 {
- if len(v.TDs) == len(TR_0.TDs) {
- isLen++
- }
- if isLen >= 2 {
- break
- }
- }
- }
- if isLen < 2 {
- return false
- }
- textArr := [][]string{}
- textArr = append(textArr, []string{"投标人", "中标候选人排序", "投标报价(万元)"})
- textArr = append(textArr, []string{"投标人", "中标候选人排序", "投标总报价(万元)"})
- for _, arr := range textArr {
- isok := 0
- for _, v := range arr {
- for _, v1 := range TR_0.TDs {
- if v1.Val == v {
- isok++
- break
- }
- }
- }
- if isok == 3 {
- return true
- }
- }
- return false
- }
- // 预算标签-不一定为分包
- func isUnRealBudgetBp(tnv []*u.Tag) bool {
- if len(tnv) != 2 {
- return false
- }
- key_1, key_2 := tnv[0].Key, tnv[1].Key
- value_1, value_2 := tnv[0].Value, tnv[1].Value
- if value_1 != value_2 {
- if strings.Contains(key_1, "项目总投资") && strings.Contains(key_2, "项目投资") {
- return true
- }
- if strings.Contains(key_2, "项目总投资") && strings.Contains(key_1, "项目投资") {
- return true
- }
- }
- return false
- }
- // initLineMapLineMapArr initializes and fills lineMapArr and lineMap from table.SortKV. Entries are grouped by the run of trailing underscores of their (space-stripped) key: []string values go into lineMapArr, string values into lineMap. Values are passed through filterVal first and entries whose values are all empty afterwards are skipped; values of any other type are ignored.
- func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMap map[string]*SortMap) {
- lineMapArr = make(map[string]*SortMap)
- lineMap = make(map[string]*SortMap)
- for _, key := range table.SortKV.Keys { // iterate table.SortKV.Keys rather than table.SortKV.Map so that the table header order is kept
- val := table.SortKV.Map[key]
- key = regReplAllSpace.ReplaceAllString(key, "")
- key = strings.Replace(key, "", "", -1) // handles a special character seen in a 采购量 key that survives the upper-layer whitespace cleanup; NOTE(review): the search string looks empty here — presumably it holds an invisible character, verify against the original file
- //qu.Debug(key, "---------------------------", val)
- if realTypeVal, ok := val.([]string); ok { // val is an array, e.g. {"数量":["1","2","3"]}
- /*
- {
- "商品":["",""],
- "商品_"["",""],
- }
- */
- valArr, allempty := filterVal(realTypeVal...) // filter the data
- if allempty {
- continue
- }
- realTypeVal = valArr
- line := underline.FindString(key) // trailing underscores of the key identify the group
- lineValMap1 := lineMapArr[line]
- // i := 1
- // L:
- // for { // drop empty array data
- // last := realTypeVal[len(realTypeVal)-i]
- // if last == "" {
- // i++
- // if i > len(realTypeVal) {
- // break
- // }
- // goto L
- // } else {
- // break
- // }
- // }
- // dislodgeNull := realTypeVal[:(len(realTypeVal) - i + 1)] // drop empty data from the array
- if len(realTypeVal) > 0 {
- if lineValMap1 == nil {
- tmp := NewSortMap()
- tmp.AddKey(key, realTypeVal)
- lineMapArr[line] = tmp
- } else {
- lineValMap1.AddKey(key, realTypeVal)
- }
- }
- //qu.Debug("lineMapArr---", lineMapArr[line].Keys, lineMapArr[line].Map)
- } else if realTypeVal, b := val.(string); b { // val is a string, e.g. {"数量":"1"}
- /*
- {
- "商品:"",名称:"",
- "商品_:"",名称_:"",
- "商品__:"",名称__:"",
- }
- */
- valArr, allempty := filterVal(realTypeVal) // filter the data
- if allempty {
- continue
- }
- realTypeVal = valArr[0]
- line := underline.FindString(key) // trailing underscores of the key identify the group
- lineValMap2 := lineMap[line]
- if lineValMap2 == nil {
- tmp := NewSortMap()
- tmp.AddKey(key, realTypeVal)
- lineMap[line] = tmp
- } else {
- lineValMap2.AddKey(key, realTypeVal)
- }
- //qu.Debug("lineMap---", lineMap[line].Keys, lineMap[line].Map)
- } else {
- // Sample ids noted by the original author: "_id" : ObjectId("5c2c3802a5cb26b9b78646c4")5c2b0551a5cb26b9b7cb05db否5c2a42e6a5cb26b9b763ba5a采购人:一、采购人5c2b06f5a5cb26b9b7cc4409
- // Sample of a value that is neither string nor []string: 成交供应商排名 [map[entname:昆明合优科技有限公司 sortstr:第一中标候选人 sort:1] map[sort:2 entname:昆明厚起科技有限公司 sortstr:第二中标候选人] map[entname:云南远安科技发展有限公司 sortstr:第三中标候选人 sort:3]]
- //qu.Debug("err data:", key, val)
- }
- }
- return lineMapArr, lineMap
- }
- func dealArrData(maxNum int, ka map[string][]string) []map[string]string {
- for k2, v2 := range ka {
- //处理数组长度不相等,使长度一致
- if len(v2) > maxNum {
- ka[k2] = v2[:maxNum]
- }
- }
- finalData := assembleData(ka, 1)
- if len(finalData) > 0 {
- return finalData
- }
- return nil
- }
- func dealStrData(kv map[string]string) []map[string]string {
- finalData := []map[string]string{}
- if len(kv) > 0 {
- finalData = assembleData(kv, 2)
- }
- return finalData
- }
- // 组装数据,每一行的数据为一数据集合
- func assembleData(m interface{}, n int) []map[string]string {
- defer qu.Catch()
- /*
- {
- "itemname":["计算机","打印机","机柜"],
- "number" :["1","12","4"]
- }
- */
- datas := []map[string]string{}
- if n == 1 { //数组数据
- realTypeM := m.(map[string][]string)
- //根据数组数据的顺序 将多个数组中索引相同的数据拼装成一个map,并将这多个map放入一个arr
- /*
- arr1 ["a1","b1","c1"]
- arr2 ["a2","b2","c2"]
- [
- {"a1","a2"},
- {"b1","b2"},
- {"c1","c2"}
- ]
- */
- //start
- for k3, v3 := range realTypeM {
- for _, val := range v3 {
- data := make(map[string]string)
- data[k3] = val
- datas = append(datas, data)
- }
- break
- }
- for i, data := range datas {
- for k4, v4 := range realTypeM {
- if i < len(v4) { //数组数据长度不一致
- if v4[i] != " " {
- data[k4] = v4[i]
- } else {
- delete(data, k4)
- }
- } else {
- fmt.Println("err table")
- }
- }
- datas[i] = data
- }
- //end
- for _, fdv := range datas { //清除空数据和只含特殊符号的数据
- for fmk, fmv := range fdv {
- if tabletdclear.ReplaceAllString(fmv, "") == "" {
- delete(fdv, fmk)
- }
- }
- }
- } else { //字符串数据
- realTypeM := m.(map[string]string)
- datas = append(datas, realTypeM)
- }
- return datas
- }
- func convert(key, r string) bool {
- defer qu.Catch()
- flag := false
- key = tabletitleclear.ReplaceAllString(key, "")
- reg, err := regexp.Compile(r)
- if err != nil {
- fmt.Println("reg err:", err)
- return false
- }
- flag = reg.MatchString(key)
- return flag
- }
- func hasKey(table *Table, n int) {
- defer qu.Catch()
- if table.TableResult.HasKey == 1 {
- return
- }
- if n >= 1 {
- table.TableResult.HasKey = 1
- }
- }
- func hasGoods(table *Table, data ...string) {
- defer qu.Catch()
- goodsArr := make([]string, len(data))
- //fmt.Println("table.TableResult.HasGoods=====", table.TableResult.HasGoods)
- if table.TableResult.HasGoods == 1 {
- return
- }
- for i, d := range data {
- if d != "" {
- goods := u.GoodsGet.CheckSensitiveWord(d)
- //fmt.Println("goods======", goods)
- goodsArr[i] = goods
- if len(goods) > 0 {
- table.TableResult.HasGoods = 1
- break
- }
- }
- }
- }
- func hasBrand(table *Table, data ...string) ([]string, bool) {
- defer qu.Catch()
- //fmt.Println("table.TableResult.HasBrand---------", table.TableResult.HasBrand)
- brandArr := make([]string, len(data))
- // if table.TableResult.HasBrand == 1 {
- // return brandArr, 1
- // }
- allNull := true
- for i, d := range data {
- //if d != "" {
- brand := u.BrandGet.CheckSensitiveWord(d)
- if brand != "" {
- allNull = false
- }
- //fmt.Println("brand======", brand)
- brandArr[i] = brand
- if len(brand) > 0 {
- table.TableResult.HasBrand = 1
- }
- //}
- }
- return brandArr, allNull
- }
- // 过滤td值
- func filterVal(val ...string) ([]string, bool) {
- defer qu.Catch()
- n := 0 //记录被过滤的个数
- for i, v := range val {
- if len(clearnn.FindStringSubmatch(v)) > 0 {
- tmpv := clearnn.FindStringSubmatch(v)[1]
- if tmpv != "" {
- v = tmpv
- }
- }
- afterFilter := tabletdclear.ReplaceAllString(v, "")
- afterFilter = NullVal.ReplaceAllString(afterFilter, "")
- if afterFilter == "" {
- n++
- }
- val[i] = afterFilter
- }
- allempty := false
- if n == len(val) { //所有都被过滤掉
- allempty = true
- }
- return val, allempty
- }
- // 过滤itemname全是数字
- func filterItem(itemval ...string) []string {
- defer qu.Catch()
- result := []string{}
- for _, v := range itemval {
- afterFilter := numclear.ReplaceAllString(v, "")
- if afterFilter != "" {
- result = append(result, v)
- } else {
- result = append(result, afterFilter)
- }
- }
- return result
- }
- // 处理价格
- func dealPriceInterface(key string, val ...string) (result []interface{}) {
- defer qu.Catch()
- for _, v := range val {
- if num1.MatchString(v) { //含数字
- tdIsWan := strings.Contains(v, "万")
- if !tdIsWan {
- if strings.Contains(key, "万") {
- v = v + "万"
- }
- }
- data := []interface{}{v, ""}
- money := clear.ObjToMoney(data)[0]
- result = append(result, money)
- } else {
- result = append(result, "")
- }
- }
- return
- }
- // 处理number
- func dealNumberInterface(val ...string) (result []interface{}) {
- defer qu.Catch()
- for _, v := range val { //1个 1.00个
- n := numclear.FindString(v)
- if n == "" {
- result = append(result, "")
- } else if tmp := clear.NumChar[n]; tmp != nil { //一二三...
- result = append(result, tmp)
- } else { //数字
- result = append(result, qu.IntAll(strings.Split(n, ".")[0]))
- }
- }
- return
- }
- // 处理价格
- func dealPrice(key string, val ...string) []string {
- defer qu.Catch()
- result := []string{}
- for _, v := range val {
- data := []interface{}{v, key}
- money := clear.ObjToMoney(data)[0]
- result = append(result, fmt.Sprintf("%v", money))
- }
- // result := []string{}
- // for _, v := range val { //1.00万元 1元 2.25元/斤
- // tmparr := strings.Split(v, ".")
- // tmparr[0] = moneyNum.ReplaceAllString(tmparr[0], "")
- // if iswan {
- // result = append(result, tmparr[0]+"0000")
- // } else { //td val值带万
- // if strings.Contains(v, "万") { //价格中带有万
- // result = append(result, tmparr[0]+"0000")
- // } else {
- // result = append(result, tmparr[0])
- // }
- // }
- // }
- return result
- }
- // 处理number
- func dealNumber(val ...string) ([]string, []string) {
- defer qu.Catch()
- unitnameArr := []string{}
- result := []string{}
- for _, v := range val { //1个 1.00个
- n := numclear.FindString(v)
- unitname := numclear.ReplaceAllString(v, "") //匹配个数后的单位
- unitnameArr = append(unitnameArr, unitname)
- //val[i] = strings.Split(n, ".")[0]
- result = append(result, strings.Split(n, ".")[0])
- }
- return result, unitnameArr
- }
// isPkgRegexArr reports whether con matches the expected structure: exactly
// one match of regs[0] and exactly one match of regs[1].
//
// Fewer than two expressions, or a nil expression in either of the first two
// slots, yields false; indexing regs or calling a nil expression used to
// panic.
func isPkgRegexArr(regs []*regexp.Regexp, con string) bool {
	if len(regs) < 2 || regs[0] == nil || regs[1] == nil {
		return false
	}
	// Asking for at most two matches is enough to tell "exactly one" from
	// "several" without scanning for every match.
	if len(regs[0].FindAllStringIndex(con, 2)) != 1 {
		return false
	}
	return len(regs[1].FindAllStringIndex(con, 2)) == 1
}
|