Эх сурвалжийг харах

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

fengweiqiang 6 жил өмнө
parent
commit
1bca0092ce

+ 16 - 2
src/jy/admin/rulecheck.go

@@ -248,7 +248,17 @@ func checkBackReg(content, ruleText string) string {
 func checkCoreReg(field, content, ruleText string) map[string]string {
 	rep := map[string]string{}
 	qu.Try(func() {
-		tmp := strings.Split(ruleText, "__")
+		//处理正负数修正
+		ptmp := strings.Split(ruleText, "#")
+		sign := 0
+		if len(ptmp) == 2 {
+			if ptmp[1] == "正" {
+				sign = 1
+			} else if ptmp[1] == "负" {
+				sign = -1
+			}
+		}
+		tmp := strings.Split(ptmp[0], "__")
 		if len(tmp) == 2 {
 			epos := strings.Split(tmp[1], ",")
 			posm := map[string]int{}
@@ -279,7 +289,11 @@ func checkCoreReg(field, content, ruleText string) map[string]string {
 							continue
 						}
 						val := content[pos[p]:pos[p+1]]
-						rep[k] = val
+						if sign == -1 {
+							rep[k] = "-" + val
+						} else {
+							rep[k] = val
+						}
 					}
 				}
 			}

+ 9 - 1
src/jy/clear/tonumber.go

@@ -55,7 +55,15 @@ func ObjToInt(data []interface{}) []interface{} {
 
 //转float,精度小数点4位
 func ObjToFloat(data []interface{}) []interface{} {
-	tmp, err := strconv.ParseFloat(fmt.Sprint(data[0]), 64)
+	con := fmt.Sprint(data[0])
+	percent := strings.Contains(con, "%")
+	if percent {
+		con = strings.Replace(con, "%", "", -1)
+	}
+	tmp, err := strconv.ParseFloat(con, 64)
+	if percent {
+		tmp = tmp / 100
+	}
 	if err != nil {
 		return []interface{}{float64(0), data[1]}
 	} else {

+ 4 - 0
src/jy/extract/extract.go

@@ -944,6 +944,9 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 					tmps = append(tmps, tmp)
 					extinfo[k] = tmps
 					if strings.TrimSpace(val) != "" {
+						if v.RegCore.NumSign == -1 { //正负值修正
+							val = "-" + val
+						}
 						exfield := ju.ExtField{Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
 						if extfrom == "title" {
 							exfield.Score = 4
@@ -1308,6 +1311,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					map[string]interface{}{"$set": tmp},
 				}
 				e.BidArr = append(e.BidArr, tmparr)
+				e.BidTotal++
 			}
 			if b, ok := ju.Config["saveresult"].(bool); ok && b {
 				id := tmp["_id"]

+ 13 - 2
src/jy/extract/extractInit.go

@@ -28,6 +28,7 @@ type ExtReg struct {
 	Replace    string
 	Bextract   bool
 	ExtractPos map[string]int
+	NumSign    int //正负修正标记,例如浮动率(上浮正1、下浮负-1)
 }
 type RuleCore struct {
 	Field     string        //逻辑字段
@@ -77,6 +78,7 @@ type ExtractTask struct {
 	ResultArr    [][]map[string]interface{} //抽取结果详情
 	BidChanel    chan bool                  //抽取结果
 	BidArr       [][]map[string]interface{} //抽取结果
+	BidTotal     int                        //结果数量
 
 	RecogFieldMap map[string]map[string]interface{}   //识别字段
 	FidClassMap   map[string][]map[string]interface{} //分类
@@ -427,7 +429,16 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 			} else {
 				qu.Try(func() {
 					rinfo.RuleText = v["s_rule"].(string)
-					tmp := strings.Split(rinfo.RuleText, "__")
+					ptmp := strings.Split(rinfo.RuleText, "#")
+					sign := 0
+					if len(ptmp) == 2 {
+						if ptmp[1] == "正" {
+							sign = 1
+						} else if ptmp[1] == "负" {
+							sign = -1
+						}
+					}
+					tmp := strings.Split(ptmp[0], "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
 						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
@@ -447,7 +458,7 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 								posm[rinfo.Field] = qu.IntAll(ks[0])
 							}
 						}
-						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
+						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm, NumSign: sign}
 					} else {
 						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
 					}

+ 11 - 3
src/jy/extract/extractudp.go

@@ -118,7 +118,10 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 		ext.ResultSave(true)
 		ext.BidSave(true)
 		ext.IsRun = true
+	} else {
+		ext.BidTotal = 0
 	}
+	index := 0
 	if len(instanceId) > 0 { //分布式抽取进度
 		query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
 		count1 := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
@@ -163,6 +166,7 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					ext.TaskInfo.ProcessPool <- true
 					go ext.ExtractProcess(j, jf)
 					sid = _id
+					index++
 				}
 				db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
 					map[string]interface{}{"$set": map[string]interface{}{
@@ -189,6 +193,7 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					ext.TaskInfo.ProcessPool <- true
 					go ext.ExtractProcess(j, jf)
 					sidback = _id
+					index++
 				}
 				db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
 					map[string]interface{}{"$set": map[string]interface{}{
@@ -201,6 +206,7 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					"pagecurrent": i + 1,
 				}}, true, false)
 		}
+		log.Debug("抽取完成", "count:", count, "index:", index, "bidtotal:", ext.BidTotal)
 	} else { //普通抽取
 		query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
 		count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
@@ -214,7 +220,7 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 			query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid)}}
 			fmt.Printf("page=%d,query=%v", i+1, query)
 			list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
-			for k, v := range *list {
+			for _, v := range *list {
 				if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 					continue
 				}
@@ -232,13 +238,15 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					defer wg.Done()
 					ext.ExtractProcess(j, jf)
 				}()
-				if k%1000 == 0 {
-					log.Debug(i, k, _id)
+				index++
+				if index%1000 == 0 {
+					log.Debug("index:", index, "页码:", i+1, "_id:", _id)
 				}
 				sid = _id
 			}
 		}
 		wg.Wait()
 		ext.BidSave(false)
+		log.Debug("抽取完成", "count:", count, "index:", index, "bidtotal:", ext.BidTotal, "sid:", eid)
 	}
 }