wangchengcheng
/
jyps


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
							package main

import (
	util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
	"regexp"
	"strconv"
	"strings"
)

// TagMatching describes one tagging rule: the tag to emit plus the keyword
// lists, and the matchers built from them, that TaskTags evaluates against a
// record. excludeKeyReg holds the matchers used for the global exclusion check.
type TagMatching struct {
	tagName       string        // tag name
	tagCode       string        // tag value (saved)
	matchField    []string      // record fields the keywords are matched against, e.g. title, detail
	matchKey      string        // keywords to match, comma-separated, e.g. "部队,国防,军事,军用"
	matchKeyReg   []*RegexpInfo // matchers for the keywords
	matchOutKey   string        // special words that must be excluded while matching the keywords
	matchOutReg   []*RegexpInfo // matchers for the keyword exclusion words
	addField      []string      // record fields for the additional words
	addKey        string        // keywords for the additional words
	addKeyReg     []*RegexpInfo // matchers for the additional words
	addOutKey     string        // words to exclude when the additional words match; the excluded field is still addKey
	addOutReg     []*RegexpInfo // matchers for the additional-word exclusion words
	excludeField  []string      // record fields checked for the global exclusion words
	excludeKey    string        // keywords for the global exclusion
	excludeKeyReg []*RegexpInfo
	clearKey      []string // cleanup words; applied to the same fields as the keywords
	buyerclass    string   // buyer (purchasing unit) type field
}

// RegexpInfo pairs one keyword expression with its compiled matcher.
type RegexpInfo struct {
	// keyStr is the keyword expression exactly as it appeared in the rule.
	keyStr string
	// regs is the compiled pattern for a plain keyword. It is nil for
	// composite expressions ("a&&b", "a&!b"), which are evaluated with
	// substring checks on keyStr instead.
	regs *regexp.Regexp
}

// GetRegex turns a comma-separated keyword list into matchers.
//
// Each non-empty segment yields one RegexpInfo, in input order:
//   - a segment containing "&&" or "&!" is kept as a composite expression
//     with a nil regs;
//   - any other segment is compiled as a case-insensitive regular expression.
//     If the segment is not a valid pattern (for example "C++"), it is
//     matched literally instead of panicking.
//
// Empty segments (an empty key, "a,,b", a trailing comma) are skipped; they
// would otherwise compile to a pattern that matches every text. The result is
// nil when key holds no usable segment.
func GetRegex(key string) []*RegexpInfo {
	var infos []*RegexpInfo
	for _, s := range strings.Split(key, ",") {
		if s == "" {
			continue
		}
		info := &RegexpInfo{keyStr: s}
		if !strings.Contains(s, "&&") && !strings.Contains(s, "&!") {
			re, err := regexp.Compile(".*(?i)" + s + ".*")
			if err != nil {
				// The keyword contains regex metacharacters that do not
				// form a valid pattern: fall back to a literal match.
				// A quoted literal always compiles.
				re = regexp.MustCompile(".*(?i)" + regexp.QuoteMeta(s) + ".*")
			}
			info.regs = re
		}
		infos = append(infos, info)
	}
	return infos
}

// TaskTags evaluates every rule in regs against the record tmp and returns
// the names of the tags that apply.
//
// For each rule, in order:
//  1. Exclusion: if any excludeField value matches excludeKeyReg, the
//     function returns immediately with the tags collected so far.
//  2. Cleanup: every clearKey word is removed from each non-empty matchField
//     value. The cleaned value is written back into tmp, so the caller's
//     map is modified.
//  3. Keywords: each non-empty matchField value is tested. A hit on
//     matchOutReg, or on addOutReg once the keyword matched, also returns
//     immediately. Otherwise the value must match matchKeyReg and, when
//     addField and addKeyReg are both configured, addKeyReg as well; then
//     tagName is appended (once per matching field).
func TaskTags(tmp map[string]interface{}, regs []TagMatching) (tags []string) {
	for i := range regs {
		rule := &regs[i]

		// 1. Global exclusion words.
		if len(rule.excludeField) > 0 && len(rule.excludeKeyReg) > 0 {
			for _, field := range rule.excludeField {
				text := util.ObjToString(tmp[field])
				if text != "" && getRegsResult(text, rule.excludeKeyReg) {
					return
				}
			}
		}

		// 2. Cleanup words: strip unwanted words from the matched fields.
		if len(rule.clearKey) > 0 && len(rule.matchField) > 0 {
			for _, word := range rule.clearKey {
				for _, field := range rule.matchField {
					text := util.ObjToString(tmp[field])
					if text == "" {
						continue
					}
					tmp[field] = strings.ReplaceAll(text, word, "")
				}
			}
		}

		// 3. Keywords.
		if len(rule.matchField) == 0 || len(rule.matchKeyReg) == 0 {
			continue
		}
		needsAddWord := len(rule.addField) > 0 && len(rule.addKeyReg) > 0
		for _, field := range rule.matchField {
			text := util.ObjToString(tmp[field])
			if text == "" {
				continue
			}
			// An exclusion word of the keyword set ends the whole run.
			if len(rule.matchOutReg) > 0 && getRegsResult(text, rule.matchOutReg) {
				return
			}
			if !getRegsResult(text, rule.matchKeyReg) {
				continue
			}
			// So does an exclusion word of the additional-word set.
			if len(rule.addOutReg) > 0 && getRegsResult(text, rule.addOutReg) {
				return
			}
			// The additional words are checked on the same field value.
			if needsAddWord && !getRegsResult(text, rule.addKeyReg) {
				continue
			}
			tags = append(tags, rule.tagName)
		}
	}

	return
}

// getRegsResult reports whether data satisfies at least one entry of regs.
//
// An entry is satisfied when:
//   - its compiled pattern is non-nil and matches data; or
//   - its keyStr contains "&&" and data contains every "&&"-separated part; or
//   - its keyStr contains "&!" and data contains the part before the first
//     "&!" but not the part after it (e.g. "军队&!点军队").
//
// It returns false for an empty regs.
func getRegsResult(data string, regs []*RegexpInfo) (res bool) {
	for _, info := range regs {
		if info.regs != nil && info.regs.MatchString(data) {
			return true
		}

		expr := info.keyStr

		// "a&&b": all parts are required.
		if strings.Contains(expr, "&&") {
			missing := false
			for _, part := range strings.Split(expr, "&&") {
				if !strings.Contains(data, part) {
					missing = true
					break
				}
			}
			if !missing {
				return true
			}
		}

		// "a&!b": a is required, b is forbidden.
		if strings.Contains(expr, "&!") {
			parts := strings.Split(expr, "&!")
			required, forbidden := parts[0], parts[1]
			if strings.Contains(data, required) && !strings.Contains(data, forbidden) {
				return true
			}
		}
	}
	return false
}

// Patterns and lookup tables shared by the money parsing helpers.
//
// The character classes list their members directly: inside [...] a "|" is
// a literal pipe, not an alternation, so it must not be used as a separator.
var (
	// regStrChar is the character class of Chinese capital-numeral symbols.
	regStrChar = `[〇零点壹贰叁肆伍陆柒捌玖拾百佰千仟万亿億元圆角分整正]`
	// moneyRegChar matches a single capital-numeral symbol.
	moneyRegChar = regexp.MustCompile(regStrChar)
	// numCapitals matches a run of 4 to 40 capital-numeral symbols.
	numCapitals = regexp.MustCompile("(" + regStrChar + "{4,40})")
	// regNumFloat matches a decimal number without leading zeros.
	regNumFloat = regexp.MustCompile(`([1-9]\d*|0)(\.\d+)?`)
	// regStrUnit matches one money unit character.
	regStrUnit = regexp.MustCompile(`[元万亿]`)
	// contentUnit matches a "ten-thousand" unit declaration in running text.
	contentUnit = regexp.MustCompile(`(万元|单位/万)`)
	// regQianw matches a one- or two-digit amount in units of 千万.
	regQianw = regexp.MustCompile(`\d{1,2}千万`)

	// cutAllSpace matches runs of whitespace.
	cutAllSpace = regexp.MustCompile(`\s*`)
	// spaces lists further blank characters that are removed as plain strings.
	spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}

	// moneyChar maps a numeral symbol to its numeric value; "点" maps to the
	// string "." instead of a number. "〇" has no entry here (a mapping for
	// it was commented out).
	moneyChar = map[string]interface{}{
		"一": float64(1), "壹": float64(1), "二": float64(2), "贰": float64(2), "三": float64(3), "叁": float64(3), "四": float64(4), "肆": float64(4), "五": float64(5), "伍": float64(5),
		"六": float64(6), "陆": float64(6), "七": float64(7), "柒": float64(7), "八": float64(8), "捌": float64(8), "九": float64(9), "玖": float64(9), "十": float64(10), "拾": float64(10),
		"百": float64(100), "佰": float64(100), "千": float64(1000), "仟": float64(1000), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
		"零": float64(0), "点": ".", "角": float64(0.1), "分": float64(0.01),
	}
	// moneyUnit maps a unit character to its multiplier.
	moneyUnit = map[string]float64{
		"元": float64(1), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
	}
)

// currencyItem maps a currency marker found in text to the canonical
// currency name.
var currencyItem = map[string]string{
	"人民币": "人民币",
	"rmb": "人民币",
	"RMB": "人民币",
	"$":   "美元",
	"＄":   "美元",
	"美元":  "美元",
	"港元":  "港币",
	"港币":  "港币",
	"澳币":  "澳币",
	"澳元":  "澳币",
}

// GetCurrency returns the currency named in text.
//
// It returns "" for an empty text and "人民币" when no marker from
// currencyItem occurs. When several markers occur, the one that appears
// earliest in text wins (the longer marker on a tie), so the result does not
// depend on map iteration order.
func GetCurrency(text string) (currency string) {
	if text == "" {
		return ""
	}
	currency = "人民币"
	bestPos, bestLen := -1, 0
	for marker, name := range currencyItem {
		pos := strings.Index(text, marker)
		if pos < 0 {
			continue
		}
		earlier := bestPos < 0 || pos < bestPos
		longerAtSamePos := pos == bestPos && len(marker) > bestLen
		if earlier || longerAtSamePos {
			bestPos, bestLen, currency = pos, len(marker), name
		}
	}
	return currency
}

// ObjToMoney converts a money description in text to a number.
//
// The capital-numeral reading (capitalMoney) is tried first. When it is
// below 10000 or above 50000000000, the digit reading (numMoney) is also
// computed and the larger of the two is kept. The value is then rounded to
// four decimal places. Finally, if the result lies strictly between 0 and 50,
// the flag from numMoney allows it (it stays true when numMoney was not
// consulted), and text mentions "万元" or "单位/万", the value is multiplied
// by 10000.
func ObjToMoney(text string) float64 {
	amount := capitalMoney(text)
	searchUnit := true

	if amount < float64(10000) || amount > float64(50000000000) {
		fromDigits, flag := numMoney(text)
		searchUnit = flag
		if fromDigits > amount {
			amount = fromDigits
		}
	}

	// Round to 4 decimals through a format/parse round trip; the formatted
	// string always parses, so the error is ignored.
	rounded, _ := strconv.ParseFloat(strconv.FormatFloat(amount, 'f', 4, 64), 64)

	// Small amounts: look for a "ten-thousand" unit anywhere in the text.
	if rounded > 0 && rounded < 50 && searchUnit && contentUnit.MatchString(text) {
		rounded = rounded * 10000
	}
	return rounded
}

// capitalMoney parses an amount written with Chinese capital numerals and
// returns it as a float64.
//
// If numCapitals matches, only its first match is parsed; otherwise the whole
// text is scanned. A trailing "万", "万元" or "万元整" is cut off at the last
// "万" and applied as a x10000 multiplier to the final result. The remaining
// symbols are walked one at a time and accumulated:
//   - tmp holds the pending digit, node the current section, and nodes the
//     sections already closed by 万 or 亿;
//   - once 元, 圆 or 点 has been seen, digits are treated as fractional and
//     only contribute through a following 角 (0.1) or 分 (0.01);
//   - 整, 正 or 分 ends the amount; later symbols are ignored.
func capitalMoney(text string) float64 {
	nodes := []float64{}
	node := float64(0)
	tmp := float64(0)
	decimals := 0.0
	ishaspoint := false // set once the decimal separator (元, 圆 or 点) has been seen
	fnum := float64(0)
	end := false
	// Keep only the first capital-numeral run, if there is one.
	strmatch := numCapitals.FindAllStringSubmatch(text, -1)
	if len(strmatch) > 0 {
		text = strmatch[0][0]
	}
	suffixUnit := float64(1)
	if strings.HasSuffix(text, "万") || strings.HasSuffix(text, "万元") || strings.HasSuffix(text, "万元整") {
		index := strings.LastIndex(text, "万")
		text = text[0:index]
		suffixUnit = float64(10000)
	}
	// ReplaceAllStringFunc is used only to visit each symbol in order; its
	// result is discarded.
	moneyRegChar.ReplaceAllStringFunc(text, func(key string) string {
		if key == "元" || key == "圆" || key == "点" {
			ishaspoint = true
		}
		// Symbols without a float64 entry in moneyChar (including "点",
		// which maps to a string) skip the accumulation below.
		if v, ok := moneyChar[key].(float64); ok && !end {
			if ishaspoint && v > 10 { // ignore further units after the decimal separator
				return ""
			}
			if v < 10 && v >= 0 {
				if ishaspoint { // fractional part
					if v >= 1 {
						// Remember the digit until its 角/分 weight arrives.
						fnum = v
					} else if v < 1 && v > 0 {
						decimals += fnum * v
					}
				} else {
					// A new digit: flush the pending one into the section first.
					if tmp != float64(0) {
						node += tmp
					}
					tmp = float64(v)
				}
			} else if v == 10000 || v == 100000000 { // units 万 and 亿 close the current section
				if tmp != float64(0) {
					node += tmp
					tmp = float64(0)
				}
				nodes = append(nodes, node*float64(v))
				node = float64(0)
			} else {
				// 拾 / 佰 / 仟 scale the pending digit; a bare 拾 counts as 10.
				if v == 10 && tmp == 0 {
					tmp = 1
				}
				tmp = tmp * float64(v)
				node += tmp
				tmp = float64(0)
			}
		}
		if key == "整" || key == "正" || key == "分" {
			end = true
		}
		return ""
	})
	// Add the unfinished section and the pending digit.
	nodes = append(nodes, node, tmp)
	ret := float64(0)
	for _, v := range nodes {
		ret += v
	}
	return (ret + decimals) * suffixUnit
}

// numMoney extracts an amount written with Arabic digits from text.
//
// The text is normalized first: "N千万" becomes "N万" (the factor 1000 is
// re-applied at the end), separators and brackets are removed, "万元" and
// "亿元" are shortened to "万" and "亿", the full-width "．" becomes ".", and
// all whitespace is stripped.
//
// Every number found by regNumFloat gets a unit window made of the text just
// before it and the 3 bytes after it (3 bytes is the UTF-8 length of one
// unit character such as 元, 万 or 亿):
//   - for the first number, the 3 bytes before it, when available;
//   - for a later number, everything between the previous number's 3-byte
//     suffix window and this number, provided this number ends at least
//     3 bytes after the previous one.
//
// Numbers are examined in order and the first one whose window contains a
// unit from moneyUnit determines the result (number x unit). When no window
// has a unit, the last number is returned unscaled.
//
// moneyFloat is the amount (0 when text has no number). flag is false only
// when the unit found is 万 (10000) and true otherwise.
func numMoney(text string) (moneyFloat float64, flag bool) {
	repUnit := float64(1)
	if regQianw.MatchString(text) {
		text = strings.Replace(text, "千万", "万", -1)
		repUnit = float64(1000)
	}
	text = replaceSymbol(text, []string{",", "，", "（", "）", "(", ")", "：", "\n"})
	text = replaceString(text, []string{"万元", "亿元", "．"}, []string{"万", "亿", "."})
	text = CutAllSpace(text)

	var fnums []float64
	var unitstrs []string
	pindex := 0 // end offset of the previously accepted number
	// Use the match offsets directly: searching the text for the matched
	// digits again would find an earlier occurrence when a number repeats or
	// is a prefix of a previous one, and attach the wrong unit window.
	for k, loc := range regNumFloat.FindAllStringIndex(text, -1) {
		index, start := loc[0], loc[1]
		f, err := strconv.ParseFloat(text[index:start], 64)
		if err != nil {
			continue
		}
		fnums = append(fnums, f)

		// Unit after the number: at most 3 bytes.
		end := start + 3
		if end > len(text) {
			end = len(text)
		}
		after := text[start:end]

		// Unit before the number.
		before := ""
		if k > 0 {
			if start >= pindex+3 {
				pstart := pindex + 3
				if pstart >= index {
					pstart = index
				}
				before = text[pstart:index]
			}
		} else if index-3 >= 0 {
			before = text[index-3 : index]
		}

		unitstrs = append(unitstrs, before+after)
		pindex = start
	}

	unit := float64(0)
	fnum := float64(0)
	for k, v := range fnums {
		fnum = v
		for _, u := range regStrUnit.FindAllString(unitstrs[k], -1) {
			if moneyUnit[u] != 0 {
				unit = moneyUnit[u]
				break
			}
		}
		if unit != float64(0) { // take the first number that has a unit
			break
		}
	}

	fnum = fnum * repUnit
	if unit == float64(0) {
		moneyFloat = fnum
	} else {
		moneyFloat = fnum * unit
	}
	flag = unit != 10000
	return
}

// 清理所有空白符
func CutAllSpace(text string) string {
	tmp := cutAllSpace.ReplaceAllString(text, "")
	tmp = replaceSymbol(tmp, spaces)
	return tmp
}

// replaceString applies pairwise substitutions to con: every occurrence of
// ret[i] is replaced by rep[i]. Pairs are applied in index order, each one on
// the output of the previous one. Entries of ret that have no counterpart in
// rep are ignored.
func replaceString(con string, ret, rep []string) string {
	pairs := len(ret)
	if len(rep) < pairs {
		pairs = len(rep)
	}
	for i := 0; i < pairs; i++ {
		con = strings.ReplaceAll(con, ret[i], rep[i])
	}
	return con
}

// replaceSymbol deletes every occurrence of each string in rep from con and
// returns the result.
func replaceSymbol(con string, rep []string) string {
	for i := range rep {
		con = strings.ReplaceAll(con, rep[i], "")
	}
	return con
}

// MatchField reports whether any string in values satisfies any keyword
// expression in keys.
//
// Each element of keys is a comma-separated keyword list in the format
// accepted by GetRegex. Matching is delegated to getRegsResult, so composite
// expressions ("a&&b", "a&!b") are evaluated as well; they have no compiled
// pattern, and calling MatchString on them directly would dereference a nil
// *regexp.Regexp. It returns false when keys or values is empty.
func MatchField(keys []string, values []string) (ok bool) {
	var matchKeyRegs []*RegexpInfo
	for _, key := range keys {
		matchKeyRegs = append(matchKeyRegs, GetRegex(key)...)
	}
	if len(matchKeyRegs) == 0 {
		return false
	}

	for _, val := range values {
		if getRegsResult(val, matchKeyRegs) {
			return true
		}
	}
	return false
}

// SumFields counts how many entries of keys are present as keys of val.
// Presence is all that matters (a nil value still counts), and a key listed
// twice is counted twice.
func SumFields(keys []string, val map[string]interface{}) (res int) {
	for i := range keys {
		if _, present := val[keys[i]]; !present {
			continue
		}
		res++
	}
	return res
}

// containsChinese reports whether str contains at least one character in the
// range U+4E00..U+9FA5.
//
// The range is checked rune by rune, which gives the same answer as the
// pattern `[\x{4e00}-\x{9fa5}]+` without compiling a regular expression on
// every call.
func containsChinese(str string) bool {
	for _, r := range str {
		if r >= 0x4e00 && r <= 0x9fa5 {
			return true
		}
	}
	return false
}