Browse Source

抽取逻辑

zhangjinkun 6 years ago
parent
commit
db244f0095

+ 1 - 0
src/github.com/dlclark/regexp2

@@ -0,0 +1 @@
+Subproject commit 902a5ce7a7812e2ba9f73b9d96c09d5136df39cd

BIN
src/github.com/dlclark/regexp2.zip


+ 3 - 0
src/jy/clear/README.MD

@@ -0,0 +1,3 @@
+**此目录用于放置N多的清理方法**
+可以在配置中调用,
+主要是收尾工作。

+ 56 - 0
src/jy/clear/clear.go

@@ -0,0 +1,56 @@
+package clear
+
+import (
+	"fmt"
+	"regexp"
+	"sync"
+)
+
+// clearfns is the registry of cleanup functions, keyed by the name given to BindFn.
+var clearfns = make(map[string]func(data []interface{}) []interface{})
+// lock is held by BindFn while it writes to clearfns.
+var lock sync.RWMutex
+
+// init registers every built-in cleanup function under the name that
+// DoClearFn uses to look it up.
+func init() {
+	builtins := []struct {
+		name string
+		fn   func(data []interface{}) []interface{}
+	}{
+		{"cutspace", CutSpace},                 // trim leading and trailing whitespace
+		{"cutallspace", CutAllSpace},           // remove all whitespace
+		{"toint", ObjToInt},                    // convert to int
+		{"tofloat", ObjToFloat},                // convert to float
+		{"totimestamp", ObjToTimestamp},        // convert to a timestamp
+		{"tomoney", ObjToMoney},                // convert a money amount
+		{"getcurrency", GetCurrency},           // detect the currency
+		{"cutSymbol", CutSymbol},               // strip symbols
+		{"cutNotPrs", CutNotPrs},               // unpaired brackets: drop the text on the unbalanced side
+		{"rateToFloat", RateToFloat},           // rate to decimal fraction
+		{"clearAllWord", ClearAllWord},         // clear values made only of Chinese characters or symbols
+		{"clearMaxAmount", ClearMaxAmount},     // drop amounts of one trillion or more
+		{"clearProjectName", ClearProjectName}, // clean a project name
+		{"getPhone", GetPhone},                 // extract a phone number
+	}
+	for _, b := range builtins {
+		BindFn(b.name, b.fn)
+	}
+}
+
+// BindFn registers fn as the cleanup function named fnname, replacing any
+// function already stored under that name. The write happens while the
+// package lock is held.
+func BindFn(fnname string, fn func(data []interface{}) []interface{}) {
+	lock.Lock()
+	defer lock.Unlock()
+	clearfns[fnname] = fn
+}
+
+// DoClearFn runs the cleanup functions named in clear, in order, feeding the
+// result of each one into the next, and returns the final data. Names with no
+// registered function are skipped.
+//
+// The registry lookup is done under the read lock because BindFn mutates the
+// same map under the write lock; an unguarded concurrent map read is a data
+// race. The lock is released before the cleanup function runs so that a
+// cleanup function may itself call BindFn without deadlocking.
+func DoClearFn(clear []string, data []interface{}) []interface{} {
+	for _, fnname := range clear {
+		lock.RLock()
+		fn, ok := clearfns[fnname]
+		lock.RUnlock()
+		if ok {
+			data = fn(data)
+		}
+	}
+	return data
+}
+
+// PhoneReg matches one or more phone numbers (optional bracketed area code,
+// digits, optional dash-separated parts, optional "转" extension) separated
+// by list punctuation or whitespace.
+//
+// NOTE(review): the pattern as previously written contained "—-\\-", which is
+// a descending character-class range and makes MustCompile panic at package
+// init. The duplicated "((", "))", ",,", ";;" next to it suggest that
+// full-width characters were folded to ASCII; they are restored here as \u
+// escapes (U+FF08, U+FF09, U+FF0D, U+FF0C, U+FF1B). Confirm against the
+// upstream file.
+var PhoneReg = regexp.MustCompile("((([(\uff08]\\d{3,4}[)\uff09])?(\\d{6,12}([×―—\uff0d\\-]+\\d{3,4})?|\\d{3,4}[×―—\uff0d\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—\uff0d\\-]+\\d{4})?|(\\d{2}[×―—\uff0d\\-])+\\d{8}[×―—\uff0d\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,\uff0c;\uff1b\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
+
+// GetPhone replaces data[0] with the first PhoneReg match found in its string
+// form, or with "" when nothing matches. An empty data slice is returned
+// unchanged instead of panicking.
+func GetPhone(data []interface{}) []interface{} {
+	if len(data) == 0 {
+		return data
+	}
+	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))
+	return data
+}

+ 150 - 0
src/jy/clear/cutspace.go

@@ -0,0 +1,150 @@
+package clear
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+)
+
+// cutSpace matches leading or trailing whitespace (compiled in init).
+var cutSpace *regexp.Regexp
+// cutAllSpace matches any run of whitespace (compiled in init).
+var cutAllSpace *regexp.Regexp
+// catSymbol is compiled in init; it is not referenced elsewhere in this file.
+var catSymbol *regexp.Regexp
+// spaces lists non-ASCII space characters (U+3000, U+2003, U+00A0) that the
+// cleanup functions remove via replaceSymbol.
+var spaces = []string{"\u3000", "\u2003", "\u00a0"}
+
+// init compiles the package-level whitespace patterns. The two patterns that
+// the cleanup functions rely on use MustCompile so that a broken pattern
+// fails at start-up instead of leaving a nil *Regexp behind.
+func init() {
+	cutSpace = regexp.MustCompile(`^\s*|\s*$`)
+	cutAllSpace = regexp.MustCompile(`\s*`)
+	// NOTE(review): `[]+` is not a valid pattern (missing closing bracket), so
+	// catSymbol ends up nil. The error stays ignored because the intended
+	// character set cannot be recovered from here and nothing in this file
+	// uses catSymbol; confirm the original pattern before relying on it.
+	catSymbol, _ = regexp.Compile(`[]+`)
+}
+
+// LableStr is a regular-expression source matching a few HTML entity names
+// (amp, nbsp, #8266) and tag-like "<...>" spans.
+var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)"
+
+// at and ed are the runes that open and close an HTML entity.
+var at = rune('&')
+var ed = rune(';')
+
+// lableMap maps a complete HTML entity to the rune it stands for.
+// CutLableStr looks entities up by their full "&name;" spelling, so the keys
+// must be the escaped forms.
+// NOTE(review): the keys had been reduced to the decoded characters ("&",
+// " ", ">"), which can never equal a collected "&...;" sequence; they are
+// restored to entity form here. Whether &nbsp; should map to a plain space
+// or to U+00A0 should be confirmed.
+var lableMap = map[string]rune{
+	"&amp;":  rune('&'),
+	"&nbsp;": rune(' '),
+	"&gt;":   rune('>'),
+	"&lt;":   rune('<'),
+}
+
+// CutLableStr decodes the HTML entities listed in lableMap inside con and
+// returns the result. Up to three passes are made so that doubly escaped
+// input such as "&amp;lt;" is fully decoded; the loop stops early once a pass
+// changes nothing.
+//
+// A candidate entity starts at '&' and ends at ';'. Candidates that are not
+// in lableMap, or that grow beyond six runes without a ';', are copied
+// through unchanged. A candidate still open at the end of the input is also
+// copied through; previously that tail was silently dropped.
+func CutLableStr(con string) string {
+	for i := 0; i < 3; i++ {
+		runes := make([]rune, 0, len(con))
+		var pools []rune // runes of the entity candidate being collected
+		bpool := false   // true while inside a candidate
+		for _, s := range con {
+			if !bpool && s == at {
+				bpool = true
+				pools = pools[:0]
+			}
+			if !bpool {
+				runes = append(runes, s)
+				continue
+			}
+			pools = append(pools, s)
+			switch {
+			case s == ed: // candidate complete
+				if lb, ok := lableMap[string(pools)]; ok {
+					runes = append(runes, lb)
+				} else {
+					runes = append(runes, pools...)
+				}
+				bpool = false
+			case len(pools) > 6: // too long to be a known entity
+				runes = append(runes, pools...)
+				bpool = false
+			}
+		}
+		if bpool {
+			// Unterminated candidate at end of input: keep its text.
+			runes = append(runes, pools...)
+		}
+		str1 := string(runes)
+		if i > 0 && con == str1 {
+			break
+		}
+		con = str1
+	}
+	return con
+}
+
+// CutSpace strips leading and trailing whitespace from the string form of
+// data[0], then removes every occurrence of the characters listed in spaces,
+// and stores the result back into data[0]. data is modified in place and
+// returned.
+// NOTE(review): the strings.Replace call below replaces a literal with what
+// appears to be the same literal, i.e. a no-op as written; the first argument
+// was presumably "&nbsp;" or U+00A0 before the text was normalized — confirm.
+func CutSpace(data []interface{}) []interface{} {
+	tmp := cutSpace.ReplaceAllString(strings.Replace(fmt.Sprint(data[0]), " ", " ", -1), "")
+	tmp = replaceSymbol(tmp, spaces)
+	//fmt.Println("cutspace", tmp)
+	data[0] = tmp
+	return data
+}
+
+// CutAllSpace removes all whitespace matched by cutAllSpace, plus the
+// characters listed in spaces, from the string form of data[0]. The result is
+// written back to data[0] and data is returned.
+func CutAllSpace(data []interface{}) []interface{} {
+	stripped := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "")
+	data[0] = replaceSymbol(stripped, spaces)
+	return data
+}
+
+// cutSymbolChars is the set of punctuation characters (written as the inside
+// of a regexp character class) stripped from both ends of a value.
+const cutSymbolChars = ",,;;::'\"“”。.\\??、/+=\\_—*&……\\^%$¥@!!`~·"
+
+// The two patterns are compiled once at package level; they used to be
+// recompiled on every CutSymbol call.
+var (
+	// cutSymbolStartReg matches leading closing brackets and punctuation.
+	cutSymbolStartReg = regexp.MustCompile("^[" + ")\\)>》】\\]}}〕" + cutSymbolChars + "]+")
+	// cutSymbolEndReg matches trailing opening brackets and punctuation.
+	cutSymbolEndReg = regexp.MustCompile("[" + "(\\(<《【\\[{{〔" + cutSymbolChars + "]+$")
+)
+
+// CutSymbol trims whitespace from data[0] (via CutSpace, which also updates
+// data[0] in place), strips leading closing-brackets/punctuation and trailing
+// opening-brackets/punctuation, trims whitespace again, and returns a new
+// two-element slice {cleaned value, data[1]}. data must have at least two
+// elements.
+func CutSymbol(data []interface{}) []interface{} {
+	value := fmt.Sprint(CutSpace(data)[0])
+	value = cutSymbolStartReg.ReplaceAllString(value, "")
+	value = cutSymbolEndReg.ReplaceAllString(value, "")
+	value = fmt.Sprint(CutSpace([]interface{}{value, data[1]})[0])
+	return []interface{}{value, data[1]}
+}
+
+// CutNotPrs handles brackets that appear without their partner by cutting
+// away the text on the unbalanced side. It starts the recursive helper
+// childCutNotPrs at depth 1.
+func CutNotPrs(data []interface{}) []interface{} {
+	const firstPass = 1
+	cleaned := childCutNotPrs(data, firstPass)
+	return cleaned
+}
+
+// childCutNotPrs removes text around unpaired brackets in the string form of
+// data[0] and stores the result back in data[0]. count is the recursion
+// depth; the function returns data untouched once count reaches 50 or the
+// value is empty. data[1] is passed along unchanged to the recursive call, so
+// data must have at least two elements when recursion happens.
+func childCutNotPrs(data []interface{}, count int) []interface{} {
+	value := fmt.Sprint(data[0])
+	if count >= 50 || value == "" {
+		return data
+	}
+	// Opening and closing patterns, paired by index.
+	// NOTE(review): entries 3 and 4 are identical in both lists, so their
+	// counts are always equal and they never trigger a cut — presumably the
+	// angle-bracket pair was meant to be split across the two lists; confirm.
+	startChars := []string{"[((]", "[\\[【]", "[{{]", "[<《]", "[>》]", "〔"}
+	endChars := []string{"[))]", "[\\]】]", "[}}]", "[<《]", "[>》]", "〕"}
+	for k, v := range startChars {
+		sReg := regexp.MustCompile(v)
+		eReg := regexp.MustCompile(endChars[k])
+		sIndex := sReg.FindAllStringIndex(value, -1)
+		eIndex := eReg.FindAllStringIndex(value, -1)
+		sCount := len(sIndex)
+		eCount := len(eIndex)
+		if sCount == eCount {
+			continue
+		}
+		// More openers than closers: drop everything up to and including the
+		// opener at position eCount.
+		if sCount > eCount {
+			value = value[sIndex[eCount][1]:]
+		}
+		// More closers than openers: cut the value just before the closer at
+		// position sCount.
+		if sCount < eCount {
+			value = value[:eIndex[sCount][0]]
+		}
+	}
+	// Handle crossed brackets: an opener with no closer after it up to the end
+	// of the value, or a closer near the start with no opener before it.
+	sReplReg := regexp.MustCompile("[((\\[【{{〔<《][^))\\]】}}〕>》]*$")
+	eReplReg := regexp.MustCompile("^[^((\\[【{{〔<《]*[))\\]】}}〕>》]")
+	if sReplReg.MatchString(value) || eReplReg.MatchString(value) {
+		value = sReplReg.ReplaceAllString(value, "")
+		value = eReplReg.ReplaceAllString(value, "")
+		// Removing text may expose further unpaired brackets, so repeat.
+		value = fmt.Sprint(childCutNotPrs([]interface{}{value, data[1]}, count+1)[0])
+	}
+	data[0] = value
+	return data
+}
+
+// allWordReg matches a value that consists solely of Chinese characters
+// (U+4E00..U+9FA5), the listed punctuation, and whitespace. It is compiled
+// once at package level instead of on every ClearAllWord call.
+var allWordReg = regexp.MustCompile("^[\u4e00-\u9fa5、,,.。??'\"“”‘’·~!@#¥$%…&*()()\\-—+=【】\\[\\]{}{}<>《》|\\/\\s]+$")
+
+// ClearAllWord blanks data[0] when its string form is made up entirely of
+// Chinese characters, punctuation and whitespace; any other value is kept
+// (as its string form). data is modified in place and returned. An empty
+// data slice is returned unchanged instead of panicking.
+func ClearAllWord(data []interface{}) []interface{} {
+	if len(data) == 0 {
+		return data
+	}
+	data[0] = allWordReg.ReplaceAllString(fmt.Sprint(data[0]), "")
+	return data
+}

+ 32 - 0
src/jy/clear/getratecurrency.go

@@ -0,0 +1,32 @@
+// getratecurrency
+package clear
+
+import (
+	"fmt"
+	"regexp"
+)
+
+// currency matches a single currency marker: "$", the full-width dollar sign
+// (U+FF04), or the word "美元".
+var currency *regexp.Regexp
+
+// encyitem maps each recognised marker to its currency name.
+// NOTE(review): the literal previously listed the key "$" twice, which Go
+// rejects as a duplicate map key; the second one is assumed to have been the
+// full-width dollar sign before the text was normalized — confirm.
+var encyitem = map[string]string{
+	"$": "美元", "\uff04": "美元", "美元": "美元",
+	// more currencies to be added
+}
+
+func init() {
+	// An alternation is used instead of a character class so that every match
+	// is exactly one key of encyitem. The former class `[$|$|美元]+` merged
+	// adjacent markers (e.g. "$$") into a key that is not in the map, and
+	// also matched a lone "|", "美" or "元".
+	currency = regexp.MustCompile(`\$|\x{FF04}|美元`)
+}
+
+// GetCurrency replaces data[0] with the currency detected in its string form:
+// the name of the last recognised marker, or "人民币" when none is found.
+// data is modified in place and returned; an empty slice is returned as is.
+func GetCurrency(data []interface{}) []interface{} {
+	if len(data) == 0 {
+		return data
+	}
+	val := "人民币"
+	for _, key := range currency.FindAllString(fmt.Sprint(data[0]), -1) {
+		if v := encyitem[key]; v != "" {
+			val = v
+		}
+	}
+	data[0] = val
+	return data
+}

+ 36 - 0
src/jy/clear/projectname.go

@@ -0,0 +1,36 @@
+// projectname
+package clear
+
+import (
+	"regexp"
+
+	"github.com/dlclark/regexp2"
+)
+
+// Patterns used to clean a project name.
+
+// clearPreRegNameCode captures an optional leading run (10-30 characters) of
+// brackets, ASCII letters, digits and separators as group 1 and the rest as
+// group 2.
+var clearPreRegNameCode = regexp.MustCompile(`([\(\)\-\[\]【】()a-zA-Z0-9_—::]{10,30})?(.+)?`)
+// clearEndRegNameCode captures a trailing run (8-100 characters) of the same
+// character set as group 2 and whatever precedes it as group 1.
+var clearEndRegNameCode = regexp.MustCompile(`(.+?)?([\(\)\-\[\]【】()a-zA-Z0-9_—::]{8,100})$`)
+// clearSymbol matches straight and curly double quotes; it is not used by
+// ClearProjectName in this file.
+var clearSymbol = regexp.MustCompile(`["“”]`)
+// noclearNum matches a value ending in a digit unless the value contains one
+// of 标段/包/子项目/升级改造 followed by 1-5 digits; group 1 is the value
+// without its last digit. It needs regexp2 because of the negative lookahead.
+// NOTE(review): the digit classes list 0-9 twice; the second set was
+// presumably full-width digits before the text was normalized — confirm.
+var noclearNum = regexp2.MustCompile(`^(?!.+(?:标段|包|子项目|升级改造)[0-9123456789]{1,5})(.*)[0-9123456789]$`, regexp2.None)
+var mustHan = regexp.MustCompile(`[\p{Han}]+`) // a project name must contain at least one Han character
+
+// ClearProjectName cleans a project name held in data[0] and returns a new
+// slice {cleaned name, data[1]}.
+//
+// Steps: trim whitespace with CutSpace (which also updates data[0]), remove a
+// leading and a trailing code-like run, blank the name if no Han character is
+// left, then repeatedly strip one trailing digit with noclearNum. At most five
+// stripping rounds are made, and the stripped name is only adopted when a
+// round leaves the text unchanged; otherwise the name from before the
+// stripping loop is returned.
+func ClearProjectName(data []interface{}) []interface{} {
+	trimmed := CutSpace(data)[0].(string)
+	value := clearEndRegNameCode.ReplaceAllString(
+		clearPreRegNameCode.ReplaceAllString(trimmed, "$2"), "$1")
+	if !mustHan.MatchString(value) {
+		value = ""
+	}
+	current := value
+	for round := 0; round < 5; round++ {
+		next, _ := noclearNum.Replace(current, "$1", -1, -1)
+		if next == current {
+			// Stable: nothing more to strip, adopt the stripped name.
+			value = current
+			break
+		}
+		current = next
+	}
+	return []interface{}{value, data[1]}
+}

+ 299 - 0
src/jy/clear/tonumber.go

@@ -0,0 +1,299 @@
+// tonumber
+package clear
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+var contentUnit *regexp.Regexp // unit hint searched for in the full text: 万元
+var regOperator *regexp.Regexp // operator symbols
+var regNumFloat *regexp.Regexp // extracts integers or floating-point numbers
+var regStrUnit *regexp.Regexp  // extracts the unit
+
+var moneyRegChar *regexp.Regexp // extracts Chinese numeral characters
+var numCapitals *regexp.Regexp  // filter for amounts written in Chinese capital numerals
+
+var regQianw *regexp.Regexp // amounts using the 千万 (ten million) unit
+
+// moneyChar maps a Chinese numeral or unit character to its numeric value;
+// "点" (decimal point) maps to the string "." rather than a float64.
+var moneyChar = map[string]interface{}{ //"〇": "0", "零": "0",
+	"一": float64(1), "壹": float64(1), "二": float64(2), "贰": float64(2), "三": float64(3), "叁": float64(3), "四": float64(4), "肆": float64(4), "五": float64(5), "伍": float64(5),
+	"六": float64(6), "陆": float64(6), "七": float64(7), "柒": float64(7), "八": float64(8), "捌": float64(8), "九": float64(9), "玖": float64(9), "十": float64(10), "拾": float64(10),
+	"百": float64(100), "佰": float64(100), "千": float64(1000), "仟": float64(1000), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000),
+	"零": float64(0), "点": ".", "角": float64(0.1), "分": float64(0.01),
+}
+// moneyUnit maps a unit character to its multiplier.
+var moneyUnit = map[string]float64{
+	"元": float64(1), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000), // units
+}
+
+// init compiles the package-level patterns used by the money conversion
+// functions. MustCompile is used so that an invalid pattern stops the program
+// at start-up rather than leaving a nil *Regexp that would panic on first use.
+func init() {
+	regOperator = regexp.MustCompile(`[*|+|)*)]`)
+	regNumFloat = regexp.MustCompile(`([1-9]\d*|0)(\.\d+)?`)
+	regStrUnit = regexp.MustCompile(`[元|万|亿]`)
+
+	// Characters that may appear in an amount written in Chinese numerals.
+	regStrChar := `[〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]`
+	moneyRegChar = regexp.MustCompile(regStrChar)
+	contentUnit = regexp.MustCompile(`(万元|单位/万)`)
+	// A run of 4 to 40 such characters is treated as a capital-numeral amount.
+	numCapitals = regexp.MustCompile(`([〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]{4,40})`)
+
+	regQianw = regexp.MustCompile(`\d{1,2}千万`)
+}
+
+// ObjToInt parses the string form of data[0] as a base-10 int and stores the
+// result in data[0]; any parse failure stores 0. data is modified in place
+// and returned.
+func ObjToInt(data []interface{}) []interface{} {
+	n, err := strconv.Atoi(fmt.Sprint(data[0]))
+	if err != nil {
+		n = 0
+	}
+	data[0] = n
+	return data
+}
+
+// ObjToFloat parses the string form of data[0] as a float64 rounded to four
+// decimal places and returns a new slice {value, data[1]}. The value is 0
+// when parsing fails. data must have at least two elements.
+func ObjToFloat(data []interface{}) []interface{} {
+	rounded := float64(0)
+	if parsed, err := strconv.ParseFloat(fmt.Sprint(data[0]), 64); err == nil {
+		// Round by formatting with four decimals and parsing the text back.
+		text := strconv.FormatFloat(parsed, 'f', 4, 64)
+		if again, err := strconv.ParseFloat(text, 64); err == nil {
+			rounded = again
+		}
+	}
+	return []interface{}{rounded, data[1]}
+}
+
+// ObjToMoney converts the amount described by data[0] into a float64 rounded
+// to four decimals and stores it in data[0].
+//
+// The capital-numeral reading (capitalMoney) is tried first. When it yields
+// less than 10000 or more than 50000000000, the digit reading (numMoney,
+// which also overwrites data[0]) is computed as well and the larger of the
+// two is kept. Results below 1 become 0. A result between 0 and 50 is
+// multiplied by 10000 when numMoney allowed a unit search (or was not run)
+// and the full text in data[1] matches contentUnit.
+func ObjToMoney(data []interface{}) []interface{} {
+	unitSearchAllowed := true
+	amount := capitalMoney(data)[0].(float64)
+	if amount < float64(10000) || amount > float64(50000000000) {
+		numeric, allowed := numMoney(data)
+		unitSearchAllowed = allowed
+		if n := numeric[0].(float64); n > amount {
+			amount = n
+		}
+	}
+	f, _ := strconv.ParseFloat(strconv.FormatFloat(amount, 'f', 4, 64), 64)
+	if f < 1 {
+		f = 0
+	}
+	// Small amounts: look in the full text for a "万" unit hint.
+	if f > 0 && f < 50 && unitSearchAllowed && contentUnit.MatchString(fmt.Sprint(data[1])) {
+		f = f * 10000
+	}
+	data[0] = f
+	return data
+}
+
+// numMoney converts an amount written with digits (plus an optional unit such
+// as 元/万/亿) into a float64, stores it in data[0] and returns data together
+// with a flag. The flag is false when the detected unit is 万 (10000) and true
+// otherwise; ObjToMoney uses it to decide whether to search the full text for
+// a unit hint.
+func numMoney(data []interface{}) ([]interface{}, bool) {
+	tmp := fmt.Sprint(data[0])
+	repUnit := float64(1)
+	// "N千万" is rewritten to "N万" and compensated with a factor of 1000.
+	if regQianw.MatchString(tmp) {
+		tmp = strings.Replace(tmp, "千万", "万", -1)
+		repUnit = float64(1000)
+	}
+	// Strip separators and brackets, shorten "万元"/"亿元" to the unit
+	// character, and remove all whitespace.
+	tmp = replaceSymbol(tmp, []string{",", ",", "(", ")", "(", ")", ":", "\n"})
+	tmp = replaceString(tmp, []string{"万元", "亿元", "."}, []string{"万", "亿", "."})
+	tmp = fmt.Sprint(CutAllSpace([]interface{}{tmp, data[1]})[0])
+	rets := regNumFloat.FindAllString(tmp, -1)
+	fnums := []float64{}
+	unitstrs := []string{}
+	// For every number found, collect the text around it in which a unit is
+	// later searched. Offsets are in bytes; a width of 3 presumably stands for
+	// one CJK character in UTF-8 — TODO confirm.
+	if len(rets) > 0 {
+		pindex := 0 // end offset of the previous number (for a unit placed before the number)
+		for k, v := range rets {
+			f, err := strconv.ParseFloat(v, 64)
+			if err == nil {
+				fnums = append(fnums, f)
+				// Note: Index returns the first occurrence of v in tmp.
+				index := strings.Index(tmp, v)
+				// unit placed after the number
+				start := index + len(v)
+				end := start + 3
+				//log.Println("vvv", tmp, v, pindex, index, start)
+				if k > 0 {
+					if start >= pindex+3 {
+						pstart := pindex + 3
+						if pstart >= index {
+							pstart = index
+						}
+						if len(tmp) > end {
+							unitstrs = append(unitstrs, tmp[pstart:index]+tmp[start:end])
+						} else {
+							unitstrs = append(unitstrs, tmp[pstart:index]+tmp[start:])
+						}
+					} else {
+						if len(tmp) > end {
+							unitstrs = append(unitstrs, tmp[start:end])
+						} else {
+							unitstrs = append(unitstrs, tmp[start:])
+						}
+					}
+				} else {
+					// First number: take up to 3 bytes before it and 3 after it.
+					if len(tmp) > end {
+						if index-3 >= 0 {
+							unitstrs = append(unitstrs, tmp[index-3:index]+tmp[start:end])
+						} else {
+							unitstrs = append(unitstrs, tmp[start:end])
+						}
+					} else {
+						if index-3 >= 0 {
+							unitstrs = append(unitstrs, tmp[index-3:index]+tmp[start:])
+						} else {
+							unitstrs = append(unitstrs, tmp[start:])
+						}
+					}
+				}
+				pindex = start
+			}
+		}
+	}
+	//log.Println("unitstrs", fnums, unitstrs)
+	// Pick the first number whose surrounding text contains a known unit; if
+	// none has one, fnum ends up as the last number and unit stays 0.
+	unit := float64(0)
+	fnum := float64(0)
+	for k, v := range fnums {
+		fnum = v
+		units := regStrUnit.FindAllString(unitstrs[k], -1)
+		for _, v := range units {
+			if moneyUnit[v] != 0 {
+				unit = moneyUnit[v]
+				break
+			}
+		}
+		if unit != float64(0) { // take the first one
+			break
+		}
+	}
+	fnum = fnum * repUnit
+	if unit == float64(0) {
+		data[0] = fnum
+	} else {
+		data[0] = fnum * unit
+	}
+	if unit == 10000 {
+		return data, false
+	} else {
+		return data, true
+	}
+}
+
+// capitalMoney converts an amount written in Chinese capital numerals found
+// in the string form of data[0] into a float64 and returns a new slice
+// {amount, data[1]}. data itself is not modified. Only the first run matched
+// by numCapitals is considered; when there is none, the whole string is
+// scanned.
+func capitalMoney(data []interface{}) []interface{} {
+	nodes := []float64{}    // completed sections, already multiplied by 万/亿
+	node := float64(0)      // running total of the current section
+	tmp := float64(0)       // last digit read, waiting for its multiplier
+	decimals := 0.0         // fractional part collected after 元/圆/点
+	ishaspoint := false     // whether the decimal point (or 元/圆) has been seen
+	fnum := float64(0)      // last digit read inside the fractional part
+	end := false            // set once 整/正/分 has been seen
+	str := fmt.Sprint(data[0])
+	// take the first capital-numeral run
+	strmatch := numCapitals.FindAllStringSubmatch(str, -1)
+	if len(strmatch) > 0 {
+		str = strmatch[0][0]
+	}
+	// Unit correction for input like 捌万伍仟肆佰捌拾贰万元整: every 万 other
+	// than the one in "万元" is turned into 亿.
+	if strings.Contains(str, "万元") {
+		str = strings.Replace(str, "万元", "#B#", -1)
+		str = strings.Replace(str, "万", "亿", -1)
+		str = strings.Replace(str, "#B#", "万元", -1)
+	}
+	// ReplaceAllStringFunc is used only to visit each matching character in
+	// order; its return value is discarded.
+	moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
+		if key == "元" || key == "圆" || key == "点" {
+			ishaspoint = true
+		}
+		if v, ok := moneyChar[key].(float64); ok && !end {
+			if ishaspoint && v > 10 { // ignore further units after the decimal point
+				return ""
+			}
+			//fmt.Println(key, v, fnum)
+			if v < 10 && v >= 0 {
+				if ishaspoint { // fractional part
+					if v >= 1 {
+						fnum = v
+					} else if v < 1 && v > 0 {
+						// 角/分: scale the preceding digit
+						decimals += fnum * v
+					}
+				} else {
+					if tmp != float64(0) {
+						node += tmp
+					}
+					tmp = float64(v)
+				}
+			} else if v == 10000 || v == 100000000 { // section units 万 and 亿
+				if tmp != float64(0) {
+					node += tmp
+					tmp = float64(0)
+				}
+				nodes = append(nodes, node*float64(v))
+				node = float64(0)
+			} else {
+				// 十/百/千: a bare 十 counts as 1 x 10
+				if v == 10 && tmp == 0 {
+					tmp = 1
+				}
+				tmp = tmp * float64(v)
+				node += tmp
+				tmp = float64(0)
+			}
+		}
+		if key == "整" || key == "正" || key == "分" {
+			end = true
+		}
+		return ""
+	})
+	nodes = append(nodes, node, tmp)
+	ret := float64(0)
+	for _, v := range nodes {
+		ret += v
+	}
+	return []interface{}{ret + decimals, data[1]}
+}
+
+// replaceSymbol deletes every occurrence of each string in rep from con,
+// processing the entries of rep one after another, and returns the result.
+func replaceSymbol(con string, rep []string) string {
+	for i := 0; i < len(rep); i++ {
+		con = strings.Replace(con, rep[i], "", -1)
+	}
+	return con
+}
+
+// replaceString replaces, in order, every occurrence of ret[i] in con with
+// rep[i] and returns the result. Entries of ret without a counterpart in rep
+// are ignored.
+func replaceString(con string, ret, rep []string) string {
+	pairs := len(ret)
+	if len(rep) < pairs {
+		pairs = len(rep)
+	}
+	for i := 0; i < pairs; i++ {
+		con = strings.Replace(con, ret[i], rep[i], -1)
+	}
+	return con
+}
+
+// RateToFloat converts a rate in con[0] to a decimal value. Whitespace is
+// removed first (CutAllSpace updates con[0] in place). If the text contains a
+// percent sign, the sign is removed, the number is parsed with ObjToFloat and
+// divided by 100, and the result is stored in con[0]. Otherwise the result of
+// ObjToFloat is returned as a new slice. con must have at least two elements.
+//
+// NOTE(review): the previous code tested for and removed "%" twice, which is
+// redundant as written; the second occurrence is assumed to have been the
+// full-width percent sign (U+FF05) before the text was normalized, and is
+// restored here — confirm.
+func RateToFloat(con []interface{}) []interface{} {
+	tmp := fmt.Sprint(CutAllSpace(con)[0])
+	if !strings.Contains(tmp, "%") && !strings.Contains(tmp, "\uff05") {
+		return ObjToFloat([]interface{}{tmp, con[1]})
+	}
+	tmp = strings.Replace(tmp, "%", "", -1)
+	tmp = strings.Replace(tmp, "\uff05", "", -1)
+	rep := ObjToFloat([]interface{}{tmp, con[1]})[0]
+	con[0] = rep.(float64) / 100
+	return con
+}
+
+// ClearMaxAmount resets data[0] to float64(0) when it holds a float64 of one
+// trillion (1000000000000) or more. Values of any other type, and smaller
+// amounts, are left untouched. data is modified in place and returned.
+func ClearMaxAmount(data []interface{}) []interface{} {
+	if amount, ok := data[0].(float64); ok && amount >= 1000000000000 {
+		data[0] = float64(0)
+	}
+	return data
+}

+ 131 - 0
src/jy/clear/totimestamp.go

@@ -0,0 +1,131 @@
+// totimestamp
+package clear
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+	"time"
+)
+
+// Patterns compiled in init: reg finds digit runs, regA finds single Chinese
+// or Arabic digit characters, regB recognises a date followed by a time
+// range, regC matches the start time of such a range, and regAfter detects an
+// afternoon ("下午") hour.
+var reg, regA, regB, regC, regAfter *regexp.Regexp
+
+const (
+	// T is the number of seconds in 365 days; ObjToTimestamp rejects
+	// timestamps more than T seconds in the future.
+	T = 365 * 86400
+)
+
+// item maps a Chinese numeral character or a digit (ASCII or full-width) to
+// the ASCII digits that replace it in chineseToNumber.
+//
+// "八" was missing from the table although regA matches it, so it was never
+// converted. The keys "1".."0" mapped ASCII digits to themselves, which does
+// nothing; they are presumably what is left of the full-width digits
+// (U+FF10..U+FF19) after the text was normalized — NOTE(review): confirm. The
+// full-width keys are added and the ASCII keys kept, so lookups that worked
+// before still work.
+var item = map[string]string{
+	"一": "1", "二": "2", "三": "3", "四": "4", "五": "5",
+	"六": "6", "七": "7", "八": "8", "九": "9", "十": "10", "零": "0", "〇": "0",
+	"1": "1", "2": "2", "3": "3", "4": "4", "5": "5",
+	"6": "6", "7": "7", "8": "8", "9": "9", "0": "0",
+	"\uff11": "1", "\uff12": "2", "\uff13": "3", "\uff14": "4", "\uff15": "5",
+	"\uff16": "6", "\uff17": "7", "\uff18": "8", "\uff19": "9", "\uff10": "0",
+}
+
+// init compiles the package-level time patterns. MustCompile is used so that
+// an invalid pattern fails at start-up instead of leaving a nil *Regexp.
+//
+// NOTE(review): several character classes listed the same ASCII character
+// twice (e.g. "::"), which suggests full-width variants were folded to ASCII.
+// The full-width colon (U+FF1A) and full-width digits (U+FF10..U+FF19) are
+// restored here with \x{...} escapes — confirm against the upstream file.
+func init() {
+	reg = regexp.MustCompile(`\d+`)
+	// One Chinese numeral or one ASCII/full-width digit. The "|" separators of
+	// the former class matched a literal "|" that was then left unchanged, so
+	// dropping them does not alter the conversion result.
+	regA = regexp.MustCompile(`[一二三四五六七八九十零〇0-9\x{FF10}-\x{FF19}]`)
+	// Date followed by a time range, e.g. 2016年12月7日上午9:00-11:30时.
+	regB = regexp.MustCompile(`\d+年\d+月\d+日((上|下)午)?\s*\d+[:\x{FF1A}时]\d+分?[-—]\d+[:\x{FF1A}时]\d+时?分?`)
+	// Start time of such a range, up to and including the dash.
+	regC = regexp.MustCompile(`\s*\d+[:\x{FF1A}时]\d+分?[-—]`)
+	// Afternoon marker followed by an hour. NOTE(review): "D?" matches an
+	// optional literal "D"; possibly "\D?" was intended.
+	regAfter = regexp.MustCompile(`(下午D?\d{1,2}[时|:|\x{FF1A}|h|H])`)
+}
+
+/*
+ObjToTimestamp converts a textual date/time in data[0] to a Unix timestamp
+(seconds, local time zone) and stores it in data[0]. Full-width input is
+supported through chineseToNumber. Accepted shapes ("%" stands for any
+non-digit separator):
+
+	20060102            -> timestamp (time of day set to 09:00)
+	20060102150405      -> timestamp
+	01%02               -> timestamp (current year)
+	2006%01%02          -> timestamp
+	2006%01%02%15       -> timestamp
+	2006%01%02%15%04    -> timestamp
+	2006%01%02%15%04%05 -> timestamp (seconds are ignored)
+
+A date that fails to parse, lies before 1970 or lies more than T seconds in
+the future yields the int 0. Input with no usable digit groups yields the
+int64 0 (plus the afternoon offset, if "下午" was detected). These result types
+are kept as they were for callers.
+
+Fixes over the previous version: the result of replacing the special space
+characters is no longer discarded; the 14-digit form is parsed with the
+"2006-01-02 15:04:05" layout (the former layout ended in a literal "00" and so
+rejected any non-zero seconds); the comma pattern is no longer compiled on
+every call.
+*/
+func ObjToTimestamp(data []interface{}) []interface{} {
+	tmp := fmt.Sprint(data[0])
+	// Time range such as 2016年12月7日上午9:00-11:30时: drop the start time so
+	// that the end time is the one parsed.
+	if regB.MatchString(tmp) {
+		tmp = regC.ReplaceAllString(tmp, "")
+	}
+	// e.g. 2017年11月13日下午3时30分: afternoon hours get twelve hours added.
+	addreptime := int64(0)
+	if regAfter.MatchString(tmp) {
+		addreptime = 12 * 60 * 60
+	}
+	// Remove ASCII and full-width commas, then turn special spaces into " ".
+	tmp = strings.NewReplacer(",", "", "\uff0c", "").Replace(tmp)
+	for _, v := range spaces {
+		tmp = strings.Replace(tmp, v, " ", -1)
+	}
+	tmps := reg.FindAllString(chineseToNumber(tmp), -1)
+	// Input like 2016-12-0909:30:00, where day and hour run together: split
+	// the third group into its first two digits and the remainder.
+	if len(tmps) > 2 && len(tmps[2]) > 2 {
+		day := tmps[2]
+		split := make([]string, 0, len(tmps)+1)
+		split = append(split, tmps[:2]...)
+		split = append(split, day[0:2], day[2:])
+		split = append(split, tmps[3:]...)
+		tmps = split
+	}
+	// parse returns the Unix time for value, or -1 when it does not match
+	// layout, so that a parse failure is rejected by the range check below.
+	parse := func(layout, value string) int64 {
+		t, err := time.ParseInLocation(layout, value, time.Local)
+		if err != nil {
+			return -1
+		}
+		return t.Unix()
+	}
+	timestamp := int64(0)
+	switch n := len(tmps); {
+	case n == 1 && len(tmps[0]) == 8:
+		d := tmps[0]
+		timestamp = parse("2006-01-02-15-04", d[0:4]+"-"+d[4:6]+"-"+d[6:8]+"-09-00")
+	case n == 1 && len(tmps[0]) == 14:
+		d := tmps[0]
+		timestamp = parse("2006-01-02 15:04:05",
+			d[0:4]+"-"+d[4:6]+"-"+d[6:8]+" "+d[8:10]+":"+d[10:12]+":"+d[12:14])
+	case n == 2:
+		timestamp = parse("2006-01-02",
+			fmt.Sprint(time.Now().Year())+"-"+MDhmsRepair(tmps[0])+"-"+MDhmsRepair(tmps[1]))
+	case n == 3:
+		timestamp = parse("2006-01-02",
+			tmps[0]+"-"+MDhmsRepair(tmps[1])+"-"+MDhmsRepair(tmps[2]))
+	case n == 4:
+		timestamp = parse("2006-01-02 15",
+			tmps[0]+"-"+MDhmsRepair(tmps[1])+"-"+MDhmsRepair(tmps[2])+" "+MDhmsRepair(tmps[3]))
+	case n >= 5:
+		timestamp = parse("2006-01-02 15:04",
+			tmps[0]+"-"+MDhmsRepair(tmps[1])+"-"+MDhmsRepair(tmps[2])+" "+MDhmsRepair(tmps[3])+":"+MDhmsRepair(tmps[4]))
+	}
+	if timestamp < 0 || timestamp > (time.Now().Unix()+T) {
+		data[0] = 0
+	} else {
+		data[0] = timestamp + addreptime
+	}
+	return data
+}
+
+// MDhmsRepair left-pads a one-character string with "0"; strings of any other
+// length are returned unchanged.
+func MDhmsRepair(t string) string {
+	if len(t) != 1 {
+		return t
+	}
+	return "0" + t
+}
+
+// chineseToNumber replaces every character of con matched by regA with its
+// entry in item; matched characters without an entry are left as they are.
+func chineseToNumber(con string) string {
+	return regA.ReplaceAllStringFunc(con, func(key string) string {
+		if digits := item[key]; digits != "" {
+			return digits
+		}
+		return key
+	})
+}

+ 172 - 65
src/jy/extract/extract.go

@@ -2,6 +2,7 @@ package extract
 
 import (
 	"encoding/json"
+	"jy/clear"
 	db "jy/mongodbutil"
 	"jy/pretreated"
 	ju "jy/util"
@@ -37,6 +38,7 @@ func StartExtractTaskId(taskId string) bool {
 		ext.InitRuleBacks()
 		ext.InitRuleCore()
 		ext.InitTag()
+		ext.InitClearFn()
 		//只启动一次taskId
 		go RunExtractTask(ext)
 	}
@@ -76,8 +78,8 @@ func RunExtractTask(ext *ExtractTask) {
 //信息预处理
 func PreInfo(doc map[string]interface{}) *ju.Job {
 	detail := ""
-	d1 := doc["detail"].(string)
-	d2 := doc["contenthtml"].(string)
+	d1, _ := doc["detail"].(string)
+	d2, _ := doc["contenthtml"].(string)
 	if len(d1) >= len(d2) || d2 == "" {
 		detail = d1
 	} else {
@@ -126,7 +128,7 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 	qu.Catch()
 	qu.Try(func() {
 		doc := *j.Data
-		//前置规则,结果覆盖doc属性
+		//全局前置规则,结果覆盖doc属性
 		for _, v := range e.RulePres {
 			doc = ExtRegPre(doc, j, v, e.TaskInfo)
 		}
@@ -147,7 +149,7 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 
 			//抽取-规则
 			for _, v := range vc.RuleCores {
-				ExtRegCore(tmp, j, v, e)
+				ExtRegCore(vc.ExtFrom, tmp, j, v, e)
 			}
 			//log.Println("抽取-规则", tmp)
 
@@ -161,10 +163,18 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 			for _, v := range e.RuleBacks {
 				ExtRegBack(j, v, e.TaskInfo)
 			}
-			bs, _ := json.Marshal(j.Result)
-			log.Println("抽取结果", string(bs))
 		}
-		//抽取结果保存 todo
+		//函数清理
+		for key, val := range j.Result {
+			for _, v := range val {
+				data := clear.DoClearFn(e.ClearFn[key], []interface{}{v.Value, j.Content})
+				v.Value = data[0]
+			}
+		}
+		bs, _ := json.Marshal(j.Result)
+		log.Println("抽取结果", j.SourceMid, string(bs))
+		//分析抽取结果并保存 todo
+		AnalysisSaveResult(j.Data, j.Result, e.TaskInfo.SaveColl)
 
 	}, func(err interface{}) {
 		log.Println(err)
@@ -198,11 +208,13 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
 }
 
 //抽取-规则
-func ExtRegCore(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
+func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
 	if in.IsLua {
 		lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
 		if in.IsHasFields { //lua脚本配置有属性字段
 			lua.KvMap = getKvByLuaFields(j, in, et.Tag)
+		} else {
+			lua.KvMap = map[string][]map[string]interface{}{}
 		}
 		lua.Block = j.Block
 		extinfo := lua.RunScript("core")
@@ -212,17 +224,22 @@ func ExtRegCore(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *Extra
 			}
 			if tmps, ok := v.([]map[string]interface{}); ok {
 				for _, tmp := range tmps {
-					j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["key"]), qu.ObjToString(tmp["type"]), tmp["value"]})
+					j.Result[k] = append(j.Result[k],
+						&ju.ExtField{k, qu.ObjToString(tmp["key"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"]})
 				}
 			}
 		}
-		AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+		if len(extinfo) > 0 {
+			AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+		}
 	} else {
+		//全文正则
+		text := qu.ObjToString(doc[extfrom])
 		if in.Field != "" {
-			//全文正则
-			text := qu.ObjToString(doc["detail"])
-			extinfo := extRegCoreToResult(text, j, in)
-			AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+			extinfo := extRegCoreToResult(extfrom, text, j, in)
+			if len(extinfo) > 0 {
+				AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+			}
 		}
 	}
 }
@@ -240,31 +257,65 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 				kvs2 := bl.ColonKV.Kvs_2
 				for _, tag := range tags {
 					for _, kv := range kvs {
-						if kv.Key == tag.Key {
-							text := ju.TrimLRSpace(kv.Value, "")
-							if text != "" {
-								kvmap[field] = append(kvmap[field], map[string]interface{}{
-									"value": text,
-									"type":  "colon1",
-									"field": field,
-									"key":   tag.Key,
-								})
+						if tag.Type == "string" {
+							if kv.Key == tag.Key {
+								text := ju.TrimLRSpace(kv.Value, "")
+								if text != "" {
+									kvmap[field] = append(kvmap[field], map[string]interface{}{
+										"value":     text,
+										"type":      "colon1",
+										"field":     field,
+										"key":       tag.Key,
+										"matchtype": "string",
+									})
+								}
+								break
+							}
+						} else if tag.Type == "regexp" {
+							if tag.Reg.MatchString(kv.Key) {
+								text := ju.TrimLRSpace(kv.Value, "")
+								if text != "" {
+									kvmap[field] = append(kvmap[field], map[string]interface{}{
+										"value":     text,
+										"type":      "colon1",
+										"field":     field,
+										"key":       tag.Key,
+										"matchtype": "regexp",
+									})
+								}
+								break
 							}
-							break
 						}
 					}
 					for _, kv := range kvs2 {
-						if kv.Key == tag.Key {
-							text := ju.TrimLRSpace(kv.Value, "")
-							if text != "" {
-								kvmap[field] = append(kvmap[field], map[string]interface{}{
-									"value": text,
-									"type":  "colon2",
-									"field": field,
-									"key":   tag.Key,
-								})
+						if tag.Type == "string" {
+							if kv.Key == tag.Key {
+								text := ju.TrimLRSpace(kv.Value, "")
+								if text != "" {
+									kvmap[field] = append(kvmap[field], map[string]interface{}{
+										"value":     text,
+										"type":      "colon2",
+										"field":     field,
+										"key":       tag.Key,
+										"matchtype": "string",
+									})
+								}
+								break
+							}
+						} else if tag.Type == "regexp" {
+							if tag.Reg.MatchString(kv.Key) {
+								text := ju.TrimLRSpace(kv.Value, "")
+								if text != "" {
+									kvmap[field] = append(kvmap[field], map[string]interface{}{
+										"value":     text,
+										"type":      "colon2",
+										"field":     field,
+										"key":       tag.Key,
+										"matchtype": "regexp",
+									})
+								}
+								break
 							}
-							break
 						}
 					}
 				}
@@ -274,17 +325,34 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 				kvs := bl.SpaceKV.Kvs
 				for _, tag := range tags {
 					for _, kv := range kvs {
-						if kv.Key == tag.Key {
-							text := ju.TrimLRSpace(kv.Value, "")
-							if text != "" {
-								kvmap[field] = append(kvmap[field], map[string]interface{}{
-									"value": text,
-									"type":  "space",
-									"field": field,
-									"key":   tag.Key,
-								})
+						if tag.Type == "string" {
+							if kv.Key == tag.Key {
+								text := ju.TrimLRSpace(kv.Value, "")
+								if text != "" {
+									kvmap[field] = append(kvmap[field], map[string]interface{}{
+										"value":     text,
+										"type":      "space",
+										"field":     field,
+										"key":       tag.Key,
+										"matchtype": "string",
+									})
+								}
+								break
+							}
+						} else if tag.Type == "regexp" {
+							if tag.Reg.MatchString(kv.Key) {
+								text := ju.TrimLRSpace(kv.Value, "")
+								if text != "" {
+									kvmap[field] = append(kvmap[field], map[string]interface{}{
+										"value":     text,
+										"type":      "space",
+										"field":     field,
+										"key":       tag.Key,
+										"matchtype": "regexp",
+									})
+								}
+								break
 							}
-							break
 						}
 					}
 				}
@@ -294,17 +362,34 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 				kv := bl.TableKV.Kv
 				for _, tag := range tags {
 					for k, val := range kv {
-						if k == tag.Key {
-							text := ju.TrimLRSpace(val, "")
-							if text != "" {
-								kvmap[field] = append(kvmap[field], map[string]interface{}{
-									"value": text,
-									"type":  "table",
-									"field": field,
-									"key":   tag.Key,
-								})
+						if tag.Type == "string" {
+							if k == tag.Key {
+								text := ju.TrimLRSpace(val, "")
+								if text != "" {
+									kvmap[field] = append(kvmap[field], map[string]interface{}{
+										"value":     text,
+										"type":      "table",
+										"field":     field,
+										"key":       tag.Key,
+										"matchtype": "string",
+									})
+								}
+								break
+							}
+						} else if tag.Type == "regexp" {
+							if tag.Reg.MatchString(k) {
+								text := ju.TrimLRSpace(val, "")
+								if text != "" {
+									kvmap[field] = append(kvmap[field], map[string]interface{}{
+										"value":     text,
+										"type":      "table",
+										"field":     field,
+										"key":       tag.Key,
+										"matchtype": "regexp",
+									})
+								}
+								break
 							}
-							break
 						}
 					}
 				}
@@ -315,7 +400,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 }
 
 //正则提取结果
-func extRegCoreToResult(text string, j *ju.Job, v *RegLuaInfo) map[string]interface{} {
+func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string]interface{} {
 	extinfo := map[string]interface{}{}
 	if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
 		apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
@@ -323,13 +408,16 @@ func extRegCoreToResult(text string, j *ju.Job, v *RegLuaInfo) map[string]interf
 			pos := apos[0]
 			for k, p := range v.RegCore.ExtractPos {
 				if len(pos) > p {
+					if pos[p] == -1 || pos[p+1] == -1 {
+						continue
+					}
 					val := text[pos[p]:pos[p+1]]
 					extinfo[k] = val
 					if val != "" {
 						if j.Result[v.Field] == nil {
 							j.Result[k] = [](*ju.ExtField){}
 						}
-						j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.RuleText, "regexp", val})
+						j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, "regexp", "regcontent", extfrom, val})
 					}
 				}
 			}
@@ -341,7 +429,7 @@ func extRegCoreToResult(text string, j *ju.Job, v *RegLuaInfo) map[string]interf
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
-			j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.RuleText, "regexp", val})
+			j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, "regexp", "regcontent", extfrom, val})
 		}
 	}
 	return extinfo
@@ -360,11 +448,13 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 			if tmps, ok := v.([]map[string]interface{}); ok {
 				j.Result[k] = [](*ju.ExtField){}
 				for _, tmp := range tmps {
-					j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["key"]), qu.ObjToString(tmp["type"]), tmp["value"]})
+					j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["key"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"]})
 				}
 			}
 		}
-		AddExtLog(j.SourceMid, result, extinfo, in, t) //抽取日志
+		if len(extinfo) > 0 {
+			AddExtLog(j.SourceMid, result, extinfo, in, t) //抽取日志
+		}
 	} else {
 		extinfo := map[string]interface{}{}
 		if in.Field != "" && j.Result[in.Field] != nil {
@@ -379,7 +469,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 				exts = append(exts, text)
 			}
 			extinfo[in.Field] = exts
-			AddExtLog(j.SourceMid, tmp, extinfo, in, t) //抽取日志
+			if len(extinfo) > 0 {
+				AddExtLog(j.SourceMid, tmp, extinfo, in, t) //抽取日志
+			}
 		} else {
 			for key, tmp := range j.Result {
 				exts := []interface{}{}
@@ -393,7 +485,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 				}
 				extinfo[key] = exts
 			}
-			AddExtLog(j.SourceMid, j.Result, extinfo, in, t) //抽取日志
+			if len(extinfo) > 0 {
+				AddExtLog(j.SourceMid, j.Result, extinfo, in, t) //抽取日志
+			}
 		}
 	}
 }
@@ -407,10 +501,12 @@ func getResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
 		}
 		for _, v := range val {
 			tmp := map[string]interface{}{
-				"field": v.Field,
-				"value": v.Value,
-				"type":  v.Type,
-				"key":   v.Key,
+				"field":     v.Field,
+				"value":     v.Value,
+				"type":      v.Type,
+				"matchtype": v.MatchType,
+				"key":       v.Key,
+				"extfrom":   v.ExtFrom,
 			}
 			result[key] = append(result[key], tmp)
 		}
@@ -466,3 +562,14 @@ func SaveExtLog() {
 	}
 	time.AfterFunc(10*time.Second, SaveExtLog)
 }
+
+// AnalysisSaveResult copies the extracted values from result into *doc, keyed
+// by field name, and then upserts the document into the totable collection,
+// selecting it by its "_id".
+func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, totable string) {
+	// still to be completed
+	for key, val := range result {
+		// NOTE(review): the original note says "take the first one for now",
+		// but this loop assigns every entry in turn, so the LAST entry of each
+		// field is the one left in doc — confirm which is intended.
+		for _, v := range val {
+			(*doc)[key] = v.Value
+		}
+	}
+	// The selector is built by string concatenation from the document's _id.
+	db.Mgo.Update(totable, `{"_id":"`+qu.BsonIdToSId((*doc)["_id"])+`"}`, doc, true, false)
+}

+ 45 - 14
src/jy/extract/extractInit.go

@@ -24,7 +24,7 @@ type ExtReg struct {
 }
 type RuleCore struct {
 	LuaLogic  string        //进入逻辑
-	IsBlock   bool          //是否分块
+	ExtFrom   string        //从哪个字段抽取
 	RulePres  []*RegLuaInfo //前置规则
 	RuleBacks []*RegLuaInfo //后置规则
 	RuleCores []*RegLuaInfo //抽取规则
@@ -38,18 +38,20 @@ type TaskInfo struct {
 	ProcessPool                  chan bool //任务进程池
 }
 type Tag struct { // Tag is one tag-library entry; ExtractTask.Tag keeps a list of them per field.
-	Type string // tag type: "string" for a plain string, "regexp" for a regular expression
-	Key  string //
+	Type string         // tag type: "string" for a plain string, "regexp" for a regular expression
+	Key  string         // tag text; for regexp tags InitTag stores the pattern source here
+	Reg  *regexp.Regexp // compiled form of Key; InitTag sets it for regexp tags
 }
 type ExtractTask struct { // ExtractTask groups one extraction task's state, rules, tag library and clear functions.
-	Id        string            // task id
-	IsRun     bool              // whether the task has been started
-	Content   string            // message content
-	TaskInfo  *TaskInfo         // task information
-	RulePres  []*RegLuaInfo     // pre-rules
-	RuleBacks []*RegLuaInfo     // post-rules
-	RuleCores []*RuleCore       // extraction rules
-	Tag       map[string][]*Tag // tag library
+	Id        string              // task id
+	IsRun     bool                // whether the task has been started
+	Content   string              // message content
+	TaskInfo  *TaskInfo           // task information
+	RulePres  []*RegLuaInfo       // pre-rules
+	RuleBacks []*RegLuaInfo       // post-rules
+	RuleCores []*RuleCore         // extraction rules
+	Tag       map[string][]*Tag   // tag library; InitTag appends entries under each tag's s_field
+	ClearFn   map[string][]string // clear functions: key (e.g. "budget") -> clear function names, set by InitClearFn
 }
 
 func init() {
@@ -138,7 +140,7 @@ func (e *ExtractTask) InitRuleCore() {
 		}
 		rcore := &RuleCore{}
 		rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
-		rcore.IsBlock, _ = vv["isblock"].(bool)            //是否分块
+		rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
 		//前置规则
 		rulePres := []*RegLuaInfo{}
 		plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
@@ -203,10 +205,13 @@ func (e *ExtractTask) InitRuleCore() {
 			}
 			if rinfo.IsLua {
 				rinfo.RuleText = v["s_luascript"].(string)
-				rinfo.LFields, _ = v["s_fields"].([]interface{})
+				//暂时提取全部属性
+				rinfo.LFields = getALLFields()
+				rinfo.IsHasFields = true
+				/*rinfo.LFields, _ = v["s_fields"].([]interface{})
 				if len(rinfo.LFields) > 0 {
 					rinfo.IsHasFields = true
-				}
+				}*/
 			} else {
 				rinfo.RuleText = v["s_rule"].(string)
 				rinfo.Field = v["s_field"].(string)
@@ -252,5 +257,31 @@ func (e *ExtractTask) InitTag() {
 		}
 	}
 	//正则标签库
+	list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"正则","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
+	for _, v := range *list {
+		field := qu.ObjToString(v["s_field"])
+		if tmp, ok := v["content"].([]interface{}); ok {
+			for _, key := range tmp {
+				tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
+				e.Tag[field] = append(e.Tag[field], tag)
+			}
+		}
+	}
+}
+
+// getALLFields returns the "s_field" value of every document in the "fields"
+// collection, in the order the query yields them.
+//
+// The result is always a non-nil slice. If the query hands back a nil list,
+// an empty slice is returned instead of dereferencing the nil pointer.
+func getALLFields() []interface{} {
+	list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1}`, false, -1, -1)
+	if list == nil {
+		// NOTE(review): the discarded second result presumably reports failure — confirm against db.Mgo.Find.
+		return []interface{}{}
+	}
+	fields := make([]interface{}, 0, len(*list))
+	for _, v := range *list {
+		fields = append(fields, v["s_field"])
+	}
+	return fields
+}
 
+// InitClearFn installs the task's clear-function table on e.ClearFn.
+// The table is currently fixed: "budget" runs "tomoney" followed by
+// "clearMaxAmount".
+func (e *ExtractTask) InitClearFn() {
+	e.ClearFn = map[string][]string{
+		"budget": {"tomoney", "clearMaxAmount"},
+	}
 }

+ 6 - 4
src/jy/util/article.go

@@ -17,10 +17,12 @@ type Job struct {
 }
 
 type ExtField struct { // ExtField is one extracted candidate value together with how it was matched.
-	Field string      // field (attribute) name
-	Key   string      // matched tag or regular expression
-	Type  string      // extraction type
-	Value interface{} // extraction result
+	Field     string      // field (attribute) name
+	Key       string      // matched tag, or regular-expression code
+	Type      string      // kv (subtypes: colon1, colon2, space, table) or regular expression (regexp)
+	MatchType string      // match type: 1: tag-library type (string, regexp), 2: full-text regex (regcontent)
+	ExtFrom   string      // extraction source (title, detail)
+	Value     interface{} // extraction result
 }
 
 //块

+ 1 - 0
src/web/templates/admin/rule_logiclist.html

@@ -104,6 +104,7 @@ $(function () {
 				tag=[{label:"名称",s_label:"s_name",placeholder:"",must:true},
 					{label:"描述",s_label:"s_descript",type:"tpl_text"},
 					{label:"启用",s_label:"isuse",type:"tpl_list_local",list:[{"s_name":"是","_id":true},{"s_name":"否","_id":false}],default:true},
+					{label:"标题抽取",s_label:"extfrom",type:"tpl_list_local",list:[{"s_name":"是","_id":true},{"s_name":"否","_id":false}],default:false},
 					{label:"是否适用",s_label:"s_luascript",type:"tpl_text",must:true},
 					{s_label:"_id",type:"tpl_hidden"},
 					{s_label:"s_version",type:"tpl_hidden"}]