package clean import ( "fmt" util "jygit.jydev.jianyu360.cn/data_processing/common_utils" "math" "regexp" "strconv" "strings" "unicode/utf8" ) var ( moneyReg1 = regexp.MustCompile("([\\s ,]+)") moneyReg2 = regexp.MustCompile("^([0-9.]+)E([1-7])$") numReg1 = regexp.MustCompile("([0-9\\.]+)") ) var unpkvBidamountReg = regexp.MustCompile("^([Xx]\\+[1-9\\.]+元/每)") var specBidamountReg = regexp.MustCompile("^([0-9.]+)E([1-7])$") var regUnitMoneyClean = regexp.MustCompile("^(.*单价[0-9.]+元[/][袋|块])[,,](含税总价[0-9.]+[万元]+)[.。]$") var blackMoneyClean = regexp.MustCompile("^([0-9.]+以下[万]?|分)$") var impactMoneyClean = regexp.MustCompile("(分二串口|分站模块)") // 大写金额补充 var impactMoneyeplenish = regexp.MustCompile("^([壹贰叁肆伍陆柒捌玖]分)") // 特殊金额-格式-重置 var resetAamountReg = regexp.MustCompile("[.](0|00)[.](0|00)") var regPercentMoney, _ = regexp.Compile(`[0-9.]+[((]?[%|%][))]?`) var regQianw, _ = regexp.Compile(`\d{1,2}千万`) var kxjsReg = regexp.MustCompile("[0-9][E|e]{1}[-—+]{1}[0-9]{1,2}") var regOperator, _ = regexp.Compile(`[*|+|)*)]`) var regNumFloat, _ = regexp.Compile(`([1-9]\d*|0)(\.\d+)?`) var regStrUnit, _ = regexp.Compile(`[元|万|亿]`) var regStrJe = regexp.MustCompile(`([1-9]\d*|0)(\.\d_+)?[\s|元|万|亿]{0,3}`) var regStrChar = `[〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]` var moneyRegChar, _ = regexp.Compile(regStrChar) var contentUnit, _ = regexp.Compile(`(万元|单位/万)`) var numCapitals, _ = regexp.Compile(`([〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]{4,40})`) var moneyUnitRegBool = regexp.MustCompile(`(中标金额|成交金额|合同金额|中标价|成交价|成交价格|中标\(成交\)金额|投标报价|中标标价|成交结果)?[::\s]?(0|零|0.0|¥0)+(0|\.)*[\s]?(万|元|){0,2}[\s]?((人民币))?$`) var cutAllSpace, _ = regexp.Compile(`\s*`) var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n", "\u0001"} var moneyClearSpidercode map[string]interface{} var moneyChar = map[string]interface{}{ //"〇": "0", "零": "0",壹贰叁肆伍陆柒捌玖 "一": float64(1), "壹": float64(1), "二": float64(2), "贰": float64(2), "三": float64(3), "叁": float64(3), "四": float64(4), "肆": float64(4), "五": float64(5), "伍": float64(5), "六": float64(6), "陆": float64(6), "七": float64(7), "柒": float64(7), "八": float64(8), "捌": float64(8), "九": float64(9), "玖": float64(9), "十": float64(10), "拾": float64(10), "百": float64(100), "佰": float64(100), "千": float64(1000), "仟": float64(1000), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000), "零": float64(0), "点": ".", "角": float64(0.1), "分": float64(0.01), } var NumChar = map[string]interface{}{ "一": 1, "二": 1, "三": 1, "四": 1, "五": 1, "六": 1, "七": 1, "八": 1, "久": 1, "十": 1, } var moneyUnit = map[string]float64{ "元": float64(1), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000), //单位 } func init() { regOperator, _ = regexp.Compile(`[*|+|)*)]`) regNumFloat, _ = regexp.Compile(`([1-9]\d*|0)(\.\d+)?`) regStrUnit, _ = regexp.Compile(`[元|万|亿]`) regStrJe = regexp.MustCompile(`([1-9]\d*|0)(\.\d_+)?[\s|元|万|亿]{0,3}`) regStrChar = `[〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]` moneyRegChar, _ = regexp.Compile(regStrChar) contentUnit, _ = regexp.Compile(`(万元|单位/万)`) numCapitals, _ = regexp.Compile(`([〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]{4,40})`) regQianw, _ = regexp.Compile(`\d{1,2}千万`) kxjsReg = regexp.MustCompile("[0-9][E|e]{1}[-—+]{1}[0-9]{1,2}") regPercentMoney, _ = regexp.Compile(`[0-9.]+[((]?[%|%][))]?`) } // 金额转换 func CleanMoney(data []interface{}) float64 { //isfindUnit := true tmpstr := (data)[0] totmpstr := "" if _, ok := tmpstr.(float64); ok { totmpstr = fmt.Sprintf("%f", tmpstr) } else { totmpstr = util.ObjToString(tmpstr) } //去除空格 totmpstr = strings.ReplaceAll(totmpstr, " ", "") (data)[0] = totmpstr //特殊转换-科学计数法 if specBidamountReg.MatchString(totmpstr) { price := util.Float64All(specBidamountReg.ReplaceAllString(totmpstr, "${1}")) if unit := util.Float64All(specBidamountReg.ReplaceAllString(totmpstr, "${2}")); unit > 0.0 && price > 0.0 { totmpstr = fmt.Sprintf("%f", math.Pow(10, unit)*price) (data)[0] = totmpstr } } //异常替换 if unpkvBidamountReg.MatchString(totmpstr) { totmpstr = unpkvBidamountReg.ReplaceAllString(totmpstr, "") (data)[0] = totmpstr } if resetAamountReg.MatchString(totmpstr) { totmpstr = resetAamountReg.ReplaceAllString(totmpstr, ".0") (data)[0] = totmpstr } //单位指定 if regUnitMoneyClean.MatchString(totmpstr) { totmpstr = regUnitMoneyClean.ReplaceAllString(totmpstr, "$2") (data)[0] = totmpstr } //特殊替换 if impactMoneyClean.MatchString(totmpstr) { totmpstr = impactMoneyClean.ReplaceAllString(totmpstr, "") (data)[0] = totmpstr } //大写金额补充 if impactMoneyeplenish.MatchString(totmpstr) { totmpstr = "零元" + totmpstr (data)[0] = totmpstr } //黑名单 if blackMoneyClean.MatchString(totmpstr) { totmpstr = "" (data)[0] = totmpstr } //未含税总价1454400.00元,税率6%,含税总价1541664.00元 Percent := regPercentMoney.FindAllString(totmpstr, -1) for _, v := range Percent { totmpstr = strings.ReplaceAll(totmpstr, v, "") } totmpstr = strings.ReplaceAll(totmpstr, "_", "") (data)[0] = totmpstr //过滤到%相关数字 if utf8.RuneCountInString(totmpstr) > 100 { //过长-字符无有效金额 (data)[0] = 0 data = append(data, false) return 0.0 } if utf8.RuneCountInString(totmpstr) > 20 { if numCapitals.MatchString(totmpstr) { tmpstr = numCapitals.FindString(totmpstr) } else if regStrJe.MatchString(totmpstr) { tmpstr = regStrJe.FindString(totmpstr) } else { (data)[0] = 0 data = append(data, false) return 0.0 } } ret := capitalMoney(data)[0] if ret.(float64) < float64(10000) || ret.(float64) > float64(50000000000) { ret2, _ := numMoney(data) //isfindUnit = b if ret2[0].(float64) > ret.(float64) { ret = ret2[0] } } f, _ := strconv.ParseFloat(strconv.FormatFloat(ret.(float64), 'f', 4, 64), 64) //if f < 1 { // f = 0 //} //若果金额小于50,全文检索单位:万 // if f < 50 && f > 0 && isfindUnit { // rep := contentUnit.FindAllStringIndex(fmt.Sprint(data[1]), -1) // if len(rep) > 0 { // f = f * 10000 // } // } data[0] = f if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(tmpstr)) { data = append(data, false) return 0.0 } data = append(data, true) if len(data) > 0 { return util.Float64All(data[0]) } else { return 0.0 } } // 数字金额转换 func numMoney(data []interface{}) ([]interface{}, bool) { tmp := fmt.Sprintf("%f", data[0]) tmp = strings.ReplaceAll(tmp, "(不含税)", "") //费率转换% ‰ flv := float64(1) if strings.HasSuffix(tmp, "%") { flv = 0.01 } else if strings.HasSuffix(tmp, "‰") { flv = 0.001 } repUnit := float64(1) if regQianw.MatchString(tmp) { tmp = strings.Replace(tmp, "千万", "万", -1) repUnit = float64(1000) } tmp = replaceSymbol(tmp, []string{",", ",", "(", ")", "(", ")", ":", "\n"}) tmp = replaceString(tmp, []string{"万元", "亿元", "."}, []string{"万", "亿", "."}) tmp = fmt.Sprint(CutAllSpace([]interface{}{tmp, data[1]})[0]) rets := regNumFloat.FindAllString(tmp, -1) fnums := []float64{} unitstrs := []string{} if len(rets) > 0 { pindex := 0 //单位前置 for k, v := range rets { f, err := strconv.ParseFloat(v, 64) if err == nil { fnums = append(fnums, f) index := strings.Index(tmp, v) //单位后置 start := index + len(v) end := start + 3 //log.Println("vvv", tmp, v, pindex, index, start) if k > 0 { if start >= pindex+3 { pstart := pindex + 3 if pstart >= index { pstart = index } if len(tmp) > end { unitstrs = append(unitstrs, tmp[pstart:index]+tmp[start:end]) } else { unitstrs = append(unitstrs, tmp[pstart:index]+tmp[start:]) } } else { if len(tmp) > end { unitstrs = append(unitstrs, tmp[start:end]) } else { unitstrs = append(unitstrs, tmp[start:]) } } } else { if len(tmp) > end { if index-3 >= 0 { unitstrs = append(unitstrs, tmp[index-3:index]+tmp[start:end]) } else { unitstrs = append(unitstrs, tmp[start:end]) } } else { if index-3 >= 0 { unitstrs = append(unitstrs, tmp[index-3:index]+tmp[start:]) } else { unitstrs = append(unitstrs, tmp[start:]) } } } pindex = start } } } //log.Println("unitstrs", fnums, unitstrs) unit := float64(0) fnum := float64(0) for k, v := range fnums { fnum = v units := regStrUnit.FindAllString(unitstrs[k], -1) for _, v := range units { if moneyUnit[v] != 0 { unit = moneyUnit[v] break } } if unit != float64(0) { //取第一个 break } } fnum = fnum * repUnit if unit == float64(0) { data[0] = fnum * flv } else { data[0] = fnum * unit * flv } if unit == 10000 { return data, false } else { return data, true } } // 大写数子金额转换 func capitalMoney(data []interface{}) []interface{} { nodes := []float64{} node := float64(0) tmp := float64(0) decimals := 0.0 ishaspoint := false //是否含小数点 fnum := float64(0) end := false str := fmt.Sprint(data[0]) //提取第一个大写信息 if strings.Contains(str, "壹") { str = strings.ReplaceAll(str, "一", "壹") } strmatch := numCapitals.FindAllStringSubmatch(str, -1) if len(strmatch) > 0 { str = strmatch[0][0] } suffixUnit := float64(1) if strings.HasSuffix(str, "万") || strings.HasSuffix(str, "万元") || strings.HasSuffix(str, "万元整") { index := strings.LastIndex(str, "万") str = str[0:index] suffixUnit = float64(10000) } yy := false moneyRegChar.ReplaceAllStringFunc(str, func(key string) string { if key == "元" || key == "圆" || key == "点" { ishaspoint = true } if v, ok := moneyChar[key].(float64); ok && !end { if ishaspoint && v > 10 { //排除后面有其他的单位 return "" } //fmt.Println(key, v, fnum) if v < 10 && v >= 0 { if ishaspoint { //小数部分 if v >= 1 { fnum = v } else if v < 1 && v > 0 { decimals += fnum * v } } else { if tmp != float64(0) { node += tmp } tmp = float64(v) } } else if v == 10000 || v == 100000000 { //单位万、亿 if tmp != float64(0) { node += tmp tmp = float64(0) } nodes = append(nodes, node*float64(v)) if v == 100000000 { yy = true } node = float64(0) } else { if v == 10 && tmp == 0 { tmp = 1 } tmp = tmp * float64(v) node += tmp tmp = float64(0) } } if key == "整" || key == "正" || key == "分" { end = true } return "" }) if yy { nodes = append(nodes, node*suffixUnit, tmp) } else { nodes = append(nodes, node, tmp) } ret := float64(0) for _, v := range nodes { ret += v } if yy { return []interface{}{(ret + decimals), data[1]} } else { return []interface{}{(ret + decimals) * suffixUnit, data[1]} } } // 过滤符号 func replaceSymbol(con string, rep []string) string { for _, v := range rep { con = strings.Replace(con, v, "", -1) } return con } // 符号替换 func replaceString(con string, ret, rep []string) string { for k, v := range ret { if len(rep) > k { con = strings.Replace(con, v, rep[k], -1) } } return con } // 清理所有空白符 func CutAllSpace(data []interface{}) []interface{} { tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "") tmp = replaceSymbol(tmp, spaces) data[0] = tmp return data }