|
- package clean
- import (
- "fmt"
- "github.com/shopspring/decimal"
- util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "math"
- "regexp"
- "strconv"
- "strings"
- "unicode/utf8"
- )
// Patterns below are compiled exactly once, at package initialisation, with
// regexp.MustCompile so that an invalid pattern fails loudly at start-up
// instead of leaving a nil *regexp.Regexp behind.

// General-purpose patterns: separator runs (whitespace/commas), a
// "<number>E<1-7>" exponent form, and a run of digits and dots.
var (
	moneyReg1 = regexp.MustCompile("([\\s ,]+)")
	moneyReg2 = regexp.MustCompile("^([0-9.]+)E([1-7])$")
	numReg1   = regexp.MustCompile("([0-9\\.]+)")
)

// unpkvBidamountReg matches a leading "X+<digits>元/每" formula prefix, which
// CleanMoney strips before parsing.
var unpkvBidamountReg = regexp.MustCompile("^([Xx]\\+[1-9\\.]+元/每)")

// specBidamountReg matches a whole string of the form "<number>E<1-7>";
// CleanMoney expands it to a plain decimal number.
var specBidamountReg = regexp.MustCompile("^([0-9.]+)E([1-7])$")

// regUnitMoneyClean matches "...单价<n>元/<袋|块>,含税总价<n><万元>." and
// captures the tax-inclusive total as group 2.
var regUnitMoneyClean = regexp.MustCompile("^(.*单价[0-9.]+元[/][袋|块])[,,](含税总价[0-9.]+[万元]+)[.。]$")

// blackMoneyClean matches values that carry no usable amount ("<n>以下",
// optionally followed by 万, or a lone "分"); CleanMoney blanks them.
var blackMoneyClean = regexp.MustCompile("^([0-9.]+以下[万]?|分)$")

// impactMoneyClean matches the phrases "分二串口" and "分站模块", which
// CleanMoney removes before parsing.
var impactMoneyClean = regexp.MustCompile("(分二串口|分站模块)")

// impactMoneyeplenish matches a string that starts with a capital digit
// directly followed by 分; CleanMoney prefixes such text with "零元".
var impactMoneyeplenish = regexp.MustCompile("^([壹贰叁肆伍陆柒捌玖]分)")

// resetAamountReg matches a doubled fractional part such as ".0.0" or
// ".00.00"; CleanMoney collapses it to ".0".
var resetAamountReg = regexp.MustCompile("[.](0|00)[.](0|00)")

// regPercentMoney matches a number followed by a percent sign, optionally
// wrapped in parentheses; CleanMoney removes every such match.
var regPercentMoney = regexp.MustCompile(`[0-9.]+[((]?[%|%][))]?`)

// regQianw matches one or two digits followed by "千万".
var regQianw = regexp.MustCompile(`\d{1,2}千万`)

// kxjsReg matches a signed exponent fragment such as "1E+5" or "3e-12".
var kxjsReg = regexp.MustCompile("[0-9][E|e]{1}[-—+]{1}[0-9]{1,2}")

// regOperator matches a single character out of '*', '|', '+' and ')'.
var regOperator = regexp.MustCompile(`[*|+|)*)]`)

// regNumFloat matches a non-negative decimal number without leading zeros.
var regNumFloat = regexp.MustCompile(`([1-9]\d*|0)(\.\d+)?`)

// regStrUnit matches a single unit character (元, 万 or 亿); the class also
// contains a literal '|'.
var regStrUnit = regexp.MustCompile(`[元|万|亿]`)

// regStrJe matches a number followed by up to three characters out of
// whitespace, '|', 元, 万 and 亿.
var regStrJe = regexp.MustCompile(`([1-9]\d*|0)(\.\d_+)?[\s|元|万|亿]{0,3}`)

// regStrChar is the character class of every character that may occur in a
// Chinese capital-numeral money expression.
var regStrChar = `[〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]`

// moneyRegChar matches one character of regStrChar at a time.
var moneyRegChar = regexp.MustCompile(regStrChar)

// contentUnit matches the unit hints "万元" and "单位/万".
var contentUnit = regexp.MustCompile(`(万元|单位/万)`)

// numCapitals matches a run of 4 to 40 capital-numeral money characters.
var numCapitals = regexp.MustCompile(`([〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]{4,40})`)

// moneyUnitRegBool matches text that ends in an explicit zero amount,
// optionally preceded by a price label; CleanMoney uses it to tell a genuine
// zero from a failed parse.
var moneyUnitRegBool = regexp.MustCompile(`(中标金额|成交金额|合同金额|中标价|成交价|成交价格|中标\(成交\)金额|投标报价|中标标价|成交结果)?[::\s]?(0|零|0.0|¥0)+(0|\.)*[\s]?(万|元|){0,2}[\s]?((人民币))?$`)

// cutAllSpace matches runs of whitespace as defined by the regexp class \s.
var cutAllSpace = regexp.MustCompile(`\s*`)

// spaces lists additional blank or control characters removed by CutAllSpace.
var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n", "\u0001"}

// moneyClearSpidercode is declared here but never assigned in this section.
var moneyClearSpidercode map[string]interface{}

// moneyChar maps numeral and magnitude characters to their numeric value.
// "点" maps to the string "." and therefore fails a float64 type assertion.
var moneyChar = map[string]interface{}{
	"一": float64(1), "壹": float64(1), "二": float64(2), "贰": float64(2), "三": float64(3), "叁": float64(3), "四": float64(4), "肆": float64(4), "五": float64(5), "伍": float64(5),
	"六": float64(6), "陆": float64(6), "七": float64(7), "柒": float64(7), "八": float64(8), "捌": float64(8), "九": float64(9), "玖": float64(9), "十": float64(10), "拾": float64(10),
	"百": float64(100), "佰": float64(100), "千": float64(1000), "仟": float64(1000), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
	"零": float64(0), "点": ".", "角": float64(0.1), "分": float64(0.01),
}

// NumChar is a membership set of the lowercase Chinese numerals 一 through 十.
// "九" is the actual numeral for nine; "久" is kept only so that existing
// lookups keep their result (NOTE(review): it looks like a typo for "九").
var NumChar = map[string]interface{}{
	"一": 1, "二": 1, "三": 1, "四": 1, "五": 1, "六": 1, "七": 1, "八": 1, "九": 1, "久": 1, "十": 1,
}

// moneyUnit maps a unit character to its multiplier in yuan.
var moneyUnit = map[string]float64{
	"元": float64(1), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
}
- // 转换金额
- func ConvertMoney(money float64, unit string) float64 {
- if strings.Contains(unit, "万") && money > 0.0 {
- //倍率
- num1 := decimal.NewFromFloat(money)
- num2 := decimal.NewFromFloat(10000)
- decimalValue := num1.Mul(num2)
- res, _ := decimalValue.Float64()
- if res < 1000000000.0 {
- return res
- }
- }
- if strings.Contains(unit, "亿") && money > 0.0 {
- //倍率
- num1 := decimal.NewFromFloat(money)
- num2 := decimal.NewFromFloat(100000000)
- decimalValue := num1.Mul(num2)
- res, _ := decimalValue.Float64()
- if res < 1000000000.0 {
- return res
- }
- }
- return money
- }
- // 金额转换
- func CleanMoney(data []interface{}) (float64, bool) {
- isFindUnit := false
- tmpstr := (data)[0]
- totmpstr := ""
- if _, ok := tmpstr.(float64); ok {
- totmpstr = fmt.Sprintf("%f", tmpstr)
- } else {
- totmpstr = util.ObjToString(tmpstr)
- }
- //去除空格
- totmpstr = strings.ReplaceAll(totmpstr, " ", "")
- (data)[0] = totmpstr
- //特殊转换-科学计数法
- if specBidamountReg.MatchString(totmpstr) {
- price := util.Float64All(specBidamountReg.ReplaceAllString(totmpstr, "${1}"))
- if unit := util.Float64All(specBidamountReg.ReplaceAllString(totmpstr, "${2}")); unit > 0.0 && price > 0.0 {
- totmpstr = fmt.Sprintf("%f", math.Pow(10, unit)*price)
- (data)[0] = totmpstr
- }
- }
- //异常替换
- if unpkvBidamountReg.MatchString(totmpstr) {
- totmpstr = unpkvBidamountReg.ReplaceAllString(totmpstr, "")
- (data)[0] = totmpstr
- }
- if resetAamountReg.MatchString(totmpstr) {
- totmpstr = resetAamountReg.ReplaceAllString(totmpstr, ".0")
- (data)[0] = totmpstr
- }
- //单位指定
- if regUnitMoneyClean.MatchString(totmpstr) {
- totmpstr = regUnitMoneyClean.ReplaceAllString(totmpstr, "$2")
- (data)[0] = totmpstr
- }
- //特殊替换
- if impactMoneyClean.MatchString(totmpstr) {
- totmpstr = impactMoneyClean.ReplaceAllString(totmpstr, "")
- (data)[0] = totmpstr
- }
- //大写金额补充
- if impactMoneyeplenish.MatchString(totmpstr) {
- totmpstr = "零元" + totmpstr
- (data)[0] = totmpstr
- }
- //黑名单
- if blackMoneyClean.MatchString(totmpstr) {
- totmpstr = ""
- (data)[0] = totmpstr
- }
- //未含税总价1454400.00元,税率6%,含税总价1541664.00元
- Percent := regPercentMoney.FindAllString(totmpstr, -1)
- for _, v := range Percent {
- totmpstr = strings.ReplaceAll(totmpstr, v, "")
- }
- totmpstr = strings.ReplaceAll(totmpstr, "_", "")
- (data)[0] = totmpstr //过滤到%相关数字
- if utf8.RuneCountInString(totmpstr) > 100 { //过长-字符无有效金额
- (data)[0] = 0
- data = append(data, false)
- return 0.0, isFindUnit
- }
- if utf8.RuneCountInString(totmpstr) > 20 {
- if numCapitals.MatchString(totmpstr) {
- tmpstr = numCapitals.FindString(totmpstr)
- } else if regStrJe.MatchString(totmpstr) {
- tmpstr = regStrJe.FindString(totmpstr)
- } else {
- (data)[0] = 0
- data = append(data, false)
- return 0.0, isFindUnit
- }
- }
- //是否发现单位
- if strings.Contains(fmt.Sprint(data[0]), "万") || strings.Contains(fmt.Sprint(data[0]), "亿") {
- isFindUnit = true
- }
- ret := capitalMoney(data)[0]
- if ret.(float64) < float64(10000) || ret.(float64) > float64(50000000000) {
- ret2, _ := numMoney(data)
- //isfindUnit = b
- if ret2[0].(float64) > ret.(float64) {
- ret = ret2[0]
- }
- }
- f := util.Float64All(ret)
- //f, _ := strconv.ParseFloat(strconv.FormatFloat(ret.(float64), 'f', 4, 64), 64)
- //if f < 1 {
- // f = 0
- //}
- //若果金额小于50,全文检索单位:万
- // if f < 50 && f > 0 && isfindUnit {
- // rep := contentUnit.FindAllStringIndex(fmt.Sprint(data[1]), -1)
- // if len(rep) > 0 {
- // f = f * 10000
- // }
- // }
- data[0] = util.Float64All(ret)
- if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(tmpstr)) {
- data = append(data, false)
- return 0.0, isFindUnit
- }
- data = append(data, true)
- if len(data) > 0 {
- return util.Float64All(data[0]), isFindUnit
- } else {
- return 0.0, isFindUnit
- }
- }
- // 数字金额转换
- func numMoney(data []interface{}) ([]interface{}, bool) {
- tmp := ""
- if _, ok := data[0].(float64); ok {
- tmp = fmt.Sprintf("%f", data[0])
- } else {
- tmp = util.ObjToString(data[0])
- }
- tmp = strings.ReplaceAll(tmp, "(不含税)", "")
- //费率转换% ‰
- flv := float64(1)
- if strings.HasSuffix(tmp, "%") {
- flv = 0.01
- } else if strings.HasSuffix(tmp, "‰") {
- flv = 0.001
- }
- repUnit := float64(1)
- if regQianw.MatchString(tmp) {
- tmp = strings.Replace(tmp, "千万", "万", -1)
- repUnit = float64(1000)
- }
- tmp = replaceSymbol(tmp, []string{",", ",", "(", ")", "(", ")", ":", "\n"})
- tmp = replaceString(tmp, []string{"万元", "亿元", "."}, []string{"万", "亿", "."})
- tmp = fmt.Sprint(CutAllSpace([]interface{}{tmp, data[1]})[0])
- rets := regNumFloat.FindAllString(tmp, -1)
- fnums := []float64{}
- unitstrs := []string{}
- if len(rets) > 0 {
- pindex := 0 //单位前置
- for k, v := range rets {
- f, err := strconv.ParseFloat(v, 64)
- if err == nil {
- fnums = append(fnums, f)
- index := strings.Index(tmp, v)
- //单位后置
- start := index + len(v)
- end := start + 3
- //log.Println("vvv", tmp, v, pindex, index, start)
- if k > 0 {
- if start >= pindex+3 {
- pstart := pindex + 3
- if pstart >= index {
- pstart = index
- }
- if len(tmp) > end {
- unitstrs = append(unitstrs, tmp[pstart:index]+tmp[start:end])
- } else {
- unitstrs = append(unitstrs, tmp[pstart:index]+tmp[start:])
- }
- } else {
- if len(tmp) > end {
- unitstrs = append(unitstrs, tmp[start:end])
- } else {
- unitstrs = append(unitstrs, tmp[start:])
- }
- }
- } else {
- if len(tmp) > end {
- if index-3 >= 0 {
- unitstrs = append(unitstrs, tmp[index-3:index]+tmp[start:end])
- } else {
- unitstrs = append(unitstrs, tmp[start:end])
- }
- } else {
- if index-3 >= 0 {
- unitstrs = append(unitstrs, tmp[index-3:index]+tmp[start:])
- } else {
- unitstrs = append(unitstrs, tmp[start:])
- }
- }
- }
- pindex = start
- }
- }
- }
- //log.Println("unitstrs", fnums, unitstrs)
- unit := float64(0)
- fnum := float64(0)
- for k, v := range fnums {
- fnum = v
- units := regStrUnit.FindAllString(unitstrs[k], -1)
- for _, v := range units {
- if moneyUnit[v] != 0 {
- unit = moneyUnit[v]
- break
- }
- }
- if unit != float64(0) { //取第一个
- break
- }
- }
- fnum = fnum * repUnit
- if unit == float64(0) {
- num1 := decimal.NewFromFloat(fnum)
- num2 := decimal.NewFromFloat(flv)
- decimalValue := num1.Mul(num2)
- decimal_res, _ := decimalValue.Float64()
- data[0] = decimal_res
- } else {
- num1 := decimal.NewFromFloat(fnum)
- num2 := decimal.NewFromFloat(unit)
- num3 := decimal.NewFromFloat(flv)
- decimalValue := num1.Mul(num2).Mul(num3)
- decimal_res, _ := decimalValue.Float64()
- data[0] = decimal_res
- }
- if unit == 10000 {
- return data, false
- } else {
- return data, true
- }
- }
- // 大写数子金额转换
- func capitalMoney(data []interface{}) []interface{} {
- nodes := []float64{}
- node := float64(0)
- tmp := float64(0)
- decimals := 0.0
- ishaspoint := false //是否含小数点
- fnum := float64(0)
- end := false
- str := fmt.Sprint(data[0])
- //提取第一个大写信息
- if strings.Contains(str, "壹") {
- str = strings.ReplaceAll(str, "一", "壹")
- }
- strmatch := numCapitals.FindAllStringSubmatch(str, -1)
- if len(strmatch) > 0 {
- str = strmatch[0][0]
- }
- suffixUnit := float64(1)
- if strings.HasSuffix(str, "万") || strings.HasSuffix(str, "万元") || strings.HasSuffix(str, "万元整") {
- index := strings.LastIndex(str, "万")
- str = str[0:index]
- suffixUnit = float64(10000)
- }
- yy := false
- moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
- if key == "元" || key == "圆" || key == "点" {
- ishaspoint = true
- }
- if v, ok := moneyChar[key].(float64); ok && !end {
- if ishaspoint && v > 10 { //排除后面有其他的单位
- return ""
- }
- //fmt.Println(key, v, fnum)
- if v < 10 && v >= 0 {
- if ishaspoint { //小数部分
- if v >= 1 {
- fnum = v
- } else if v < 1 && v > 0 {
- decimals += fnum * v
- }
- } else {
- if tmp != float64(0) {
- node += tmp
- }
- tmp = float64(v)
- }
- } else if v == 10000 || v == 100000000 { //单位万、亿
- if tmp != float64(0) {
- node += tmp
- tmp = float64(0)
- }
- nodes = append(nodes, node*util.Float64All(v))
- if v == 100000000 {
- yy = true
- }
- node = float64(0)
- } else {
- if v == 10 && tmp == 0 {
- tmp = 1
- }
- tmp = tmp * util.Float64All(v)
- node += tmp
- tmp = float64(0)
- }
- }
- if key == "整" || key == "正" || key == "分" {
- end = true
- }
- return ""
- })
- if yy {
- nodes = append(nodes, node*suffixUnit, tmp)
- } else {
- nodes = append(nodes, node, tmp)
- }
- ret := float64(0)
- for _, v := range nodes {
- ret += v
- }
- if yy {
- return []interface{}{(ret + decimals), data[1]}
- } else {
- return []interface{}{(ret + decimals) * suffixUnit, data[1]}
- }
- }
// replaceSymbol deletes every occurrence of each entry of rep from con. The
// entries are applied one after another, in slice order.
func replaceSymbol(con string, rep []string) string {
	cleaned := con
	for i := 0; i < len(rep); i++ {
		cleaned = strings.ReplaceAll(cleaned, rep[i], "")
	}
	return cleaned
}
// replaceString substitutes every occurrence of ret[i] in con with rep[i],
// pair by pair in slice order. Entries of ret that have no counterpart in rep
// are ignored.
func replaceString(con string, ret, rep []string) string {
	pairs := len(ret)
	if len(rep) < pairs {
		pairs = len(rep)
	}
	for i := 0; i < pairs; i++ {
		con = strings.ReplaceAll(con, ret[i], rep[i])
	}
	return con
}
- // 清理所有空白符
- func CutAllSpace(data []interface{}) []interface{} {
- tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "")
- tmp = replaceSymbol(tmp, spaces)
- data[0] = tmp
- return data
- }
|