zhangjinkun 6 жил өмнө
parent
commit
94bc394781

+ 89 - 34
src/jy/extract/extract.go

@@ -77,7 +77,7 @@ func RunExtractTask(ext *ExtractTask) {
 		time.Sleep(1 * time.Second)
 	}
 	//更新task.s_extlastid
-	db.Mgo.UpdateById("task", ext.TaskInfo.LastExtId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
+	db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
 	time.AfterFunc(30*time.Minute, func() { RunExtractTask(ext) })
 }
 
@@ -202,13 +202,13 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
 		for k, v := range extinfo { //结果覆盖原doc
 			doc[k] = v
 		}
-		AddExtLog(j.SourceMid, before, extinfo, in, t) //抽取日志
+		AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
 	} else {
 		key := qu.If(in.Field == "", "detail", in.Field).(string)
 		text := qu.ObjToString(doc[key])
 		extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
-		doc[key] = extinfo[key]                        //结果覆盖原doc
-		AddExtLog(j.SourceMid, before, extinfo, in, t) //抽取日志
+		doc[key] = extinfo[key]                                      //结果覆盖原doc
+		AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
 	}
 	return doc
 }
@@ -236,7 +236,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 			}
 		}
 		if len(extinfo) > 0 {
-			AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+			AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 		}
 	} else {
 		//全文正则
@@ -244,7 +244,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 		if in.Field != "" {
 			extinfo := extRegCoreToResult(extfrom, text, j, in)
 			if len(extinfo) > 0 {
-				AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+				AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 			}
 		}
 	}
@@ -272,7 +272,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 										"type":      "colon1",
 										"field":     field,
 										"key":       tag.Key,
-										"matchtype": "string",
+										"matchtype": "tag_string",
 									})
 								}
 								break
@@ -286,7 +286,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 										"type":      "colon1",
 										"field":     field,
 										"key":       tag.Key,
-										"matchtype": "regexp",
+										"matchtype": "tag_regexp",
 									})
 								}
 								break
@@ -303,7 +303,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 										"type":      "colon2",
 										"field":     field,
 										"key":       tag.Key,
-										"matchtype": "string",
+										"matchtype": "tag_string",
 									})
 								}
 								break
@@ -317,7 +317,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 										"type":      "colon2",
 										"field":     field,
 										"key":       tag.Key,
-										"matchtype": "regexp",
+										"matchtype": "tag_regexp",
 									})
 								}
 								break
@@ -340,7 +340,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 										"type":      "space",
 										"field":     field,
 										"key":       tag.Key,
-										"matchtype": "string",
+										"matchtype": "tag_string",
 									})
 								}
 								break
@@ -354,7 +354,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 										"type":      "space",
 										"field":     field,
 										"key":       tag.Key,
-										"matchtype": "regexp",
+										"matchtype": "tag_regexp",
 									})
 								}
 								break
@@ -377,7 +377,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 										"type":      "table",
 										"field":     field,
 										"key":       tag.Key,
-										"matchtype": "string",
+										"matchtype": "tag_string",
 									})
 								}
 								break
@@ -391,7 +391,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
 										"type":      "table",
 										"field":     field,
 										"key":       tag.Key,
-										"matchtype": "regexp",
+										"matchtype": "tag_regexp",
 									})
 								}
 								break
@@ -418,7 +418,14 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
 						continue
 					}
 					val := text[pos[p]:pos[p+1]]
-					extinfo[k] = val
+					extinfo[k] = map[string]interface{}{
+						"field":     v.Field,
+						"key":       v.Code,
+						"type":      "regexp",
+						"matchtype": "regcontent",
+						"extfrom":   extfrom,
+						"value":     val,
+					}
 					if val != "" {
 						if j.Result[v.Field] == nil {
 							j.Result[k] = [](*ju.ExtField){}
@@ -430,8 +437,15 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
 		}
 	} else {
 		val := v.RegCore.Reg.ReplaceAllString(text, "")
-		extinfo[v.Field] = val
 		if val != "" {
+			extinfo[v.Field] = map[string]interface{}{
+				"field":     v.Field,
+				"key":       v.Code,
+				"type":      "regexp",
+				"matchtype": "regcontent",
+				"extfrom":   extfrom,
+				"value":     val,
+			}
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
@@ -459,7 +473,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 			}
 		}
 		if len(extinfo) > 0 {
-			AddExtLog(j.SourceMid, result, extinfo, in, t) //抽取日志
+			AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
 		}
 	} else {
 		extinfo := map[string]interface{}{}
@@ -472,11 +486,18 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 					text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 				}
 				j.Result[in.Field][k].Value = text
-				exts = append(exts, text)
+				exts = append(exts, map[string]interface{}{
+					"field":     v.Field,
+					"key":       v.Key,
+					"type":      v.Type,
+					"matchtype": v.MatchType,
+					"extfrom":   v.ExtFrom,
+					"value":     text,
+				})
 			}
 			extinfo[in.Field] = exts
 			if len(extinfo) > 0 {
-				AddExtLog(j.SourceMid, tmp, extinfo, in, t) //抽取日志
+				AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
 			}
 		} else {
 			for key, tmp := range j.Result {
@@ -487,12 +508,19 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
 					j.Result[key][k].Value = text
-					exts = append(exts, text)
+					exts = append(exts, map[string]interface{}{
+						"field":     v.Field,
+						"key":       v.Key,
+						"type":      v.Type,
+						"matchtype": v.MatchType,
+						"extfrom":   v.ExtFrom,
+						"value":     text,
+					})
 				}
 				extinfo[key] = exts
 			}
 			if len(extinfo) > 0 {
-				AddExtLog(j.SourceMid, j.Result, extinfo, in, t) //抽取日志
+				AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
 			}
 		}
 	}
@@ -521,13 +549,14 @@ func getResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
 }
 
 //抽取日志
-func AddExtLog(sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
+func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
 	if !t.IsEtxLog {
 		return
 	}
 	logdata := map[string]interface{}{
 		"code":       v.Code,
 		"name":       v.Name,
+		"type":       ftype,
 		"ruletext":   v.RuleText,
 		"islua":      v.IsLua,
 		"field":      v.Field,
@@ -552,15 +581,15 @@ func SaveExtLog() {
 	lock.Unlock()
 	for k, v := range tmpLogs {
 		if len(v) < saveLimit {
-			k.DB.SaveBulk(k.TrackColl, v...)
+			db.Mgo.SaveBulk(k.TrackColl, v...)
 		} else {
 			for {
 				if len(v) > saveLimit {
 					tmp := v[:saveLimit]
-					k.DB.SaveBulk(k.TrackColl, tmp...)
+					db.Mgo.SaveBulk(k.TrackColl, tmp...)
 					v = v[saveLimit:]
 				} else {
-					k.DB.SaveBulk(k.TrackColl, v...)
+					db.Mgo.SaveBulk(k.TrackColl, v...)
 					break
 				}
 			}
@@ -569,20 +598,46 @@ func SaveExtLog() {
 	time.AfterFunc(10*time.Second, SaveExtLog)
 }
 
+type FieldValue struct { // FieldValue pairs a value with an integer count; it is not referenced in the hunks shown here.
+	Value interface{} // the value itself (any type)
+	Count int // presumably the number of occurrences of Value — TODO confirm against its users
+}
+
 // AnalysisSaveResult analyzes the extraction results and saves them.
 func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, task *TaskInfo) {
-	// to be completed
+	_id := qu.BsonIdToSId((*doc)["_id"]) // string form of the document _id, reused in both update selectors
+	// rank the results
+	values := map[string][]*ju.SortObject{}
 	for key, val := range result {
-		for _, v := range val { // for now, save the first one
-			(*doc)[key] = v.Value
-			if key == "budget" || key == "bidamount" {
-				if qu.Int64All(v.Value) > 0 {
-					break
-				}
-			} else {
+		fieldValue := map[string]int{} // occurrence count per candidate value (as a string) for this field
+		for _, v := range val {
+			value := qu.ObjToString(v.Value)
+			fieldValue[value] += 1
+		}
+		objects := []*ju.SortObject{}
+		for k, v := range fieldValue { // map iteration order is unspecified, so ties in count are ordered arbitrarily
+			tmp := &ju.SortObject{
+				Key:   k,
+				Value: v,
+			}
+			objects = append(objects, tmp)
+		}
+		values[key] = ju.ExtSort(objects) // ExtSort orders by Value (the count), highest first
+	}
+	// pick a value from the sorted results
+	tmp := map[string]interface{}{}
+	for key, val := range values {
+		for _, v := range val { // take the first one
+			if v.Key != "" { // skip empty candidates; a field with only empty values gets no entry in tmp
+				tmp[key] = v.Key
 				break
 			}
 		}
 	}
-	task.DB.Update(task.SaveColl, `{"_id":"`+qu.BsonIdToSId((*doc)["_id"])+`"}`, doc, true, false)
+	// save the extraction result
+	task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, doc, true, false) // NOTE(review): doc is written unchanged; the values selected into tmp are never copied into doc in this version — confirm intended
+	log.Println(tmp)
+	// save the extraction details
+	tmp["result"] = result // the raw per-field candidates are stored alongside the selected values
+	db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, tmp, true, false)
 }

+ 2 - 2
src/jy/util/article.go

@@ -18,9 +18,9 @@ type Job struct {
 
 type ExtField struct { // ExtField records one extracted value together with how and where it was matched.
 	Field     string      // attribute (field) name
-	Key       string      // matched tag, or regex code
+	Key       string      // matched tag (string or regex), or the regex / Lua code
 	Type      string      // kv (subtypes: colon1, colon2, space, table) or regex (regexp)
-	MatchType string      // match type: 1: tag-library type (string, regexp), 2: full-text regex "regcontent"
+	MatchType string      // match type: 1: tag-library type (tag_string, tag_regexp), 2: full-text regex "regcontent"
 	ExtFrom   string      // extraction source (title, detail)
 	Value     interface{} // extracted result
 }

+ 13 - 19
src/jy/util/script.go

@@ -4,7 +4,6 @@ package util
 import (
 	"encoding/json"
 	"fmt"
-	qu "qfw/util"
 
 	ljson "github.com/yuin/gopher-json"
 	"github.com/yuin/gopher-lua"
@@ -135,7 +134,7 @@ func MapToLuaTable2(l *lua.LState, obj map[string][]map[string]interface{}) *lua
 	for k, ms := range obj {
 		tab2 := l.NewTable()
 		for i, v := range ms {
-			tab2.Insert(i, MapToLuaTable(l, v))
+			tab2.Insert(i+1 /*加1防止顺序错乱*/, MapToLuaTable(l, v))
 		}
 		tab.RawSet(lua.LString(k), tab2)
 	}
@@ -145,30 +144,25 @@ func MapToLuaTable2(l *lua.LState, obj map[string][]map[string]interface{}) *lua
 func LuaTableToMap(param *lua.LTable) map[string]interface{} { // LuaTableToMap converts a Lua table into a Go map keyed by fmt.Sprint of each Lua key.
 	tmp := map[string]interface{}{}
 	param.ForEach(func(key, val lua.LValue) {
-		k := fmt.Sprint(key)
+		kk := fmt.Sprint(key)
 		if v, ok := val.(lua.LString); ok {
-			tmp[k] = string(v)
+			tmp[kk] = string(v)
 		} else if v, ok := val.(*lua.LTable); ok {
-			i := qu.IntAllDef(k, -1)
-			if i > -1 { // convert to an array
-				t := []map[string]interface{}{}
-				v.ForEach(func(k, inv lua.LValue) {
-					if vv, ok := inv.(*lua.LTable); ok {
-						t = append(t, LuaTableToMap(vv))
-					}
-				})
-				tmp[k] = t
-			} else {
-				tmp[k] = LuaTableToMap(v)
-			}
+			t := []map[string]interface{}{} // every table value is now stored as a slice of maps
+			v.ForEach(func(k, inv lua.LValue) {
+				if vv, ok := inv.(*lua.LTable); ok { // only entries that are themselves tables are kept; other entries are dropped
+					t = append(t, LuaTableToMap(vv))
+				}
+			})
+			tmp[kk] = t
 		} else if v, ok := val.(*lua.LBool); ok { // NOTE(review): asserts the pointer type *lua.LBool — verify booleans actually arrive as pointers, otherwise this branch never matches
 			if v.String() == "true" {
-				tmp[k] = true
+				tmp[kk] = true
 			} else {
-				tmp[k] = false
+				tmp[kk] = false
 			}
 		} else { // NOTE(review): v in this branch is the variable from the failed *lua.LBool assertion above, not val — confirm this is what should be stored
-			tmp[k] = v
+			tmp[kk] = v
 		}
 	})
 	return tmp
 }

+ 38 - 0
src/jy/util/sort.go

@@ -0,0 +1,38 @@
+// Sorting of extraction results.
+package util
+
+import (
+	"sort"
+)
+
+type SortObject struct { // SortObject is one entry to be ranked by ExtSort.
+	Key    string // label of the entry; not consulted by the sort
+	Value  int // sort weight; SortStruct.Less ranks larger values first
+	Object interface{} // optional payload; not read by Len, Less, Swap or ExtSort
+}
+
+type SortStruct []*SortObject // SortStruct implements sort.Interface over SortObject pointers, ordering by Value descending.
+
+func (list SortStruct) Len() int { // Len returns the number of entries (sort.Interface).
+	return len(list)
+}
+
+func (list SortStruct) Less(i, j int) bool { // Less reports whether entry i sorts before entry j: the larger Value comes first.
+	if list[i].Value > list[j].Value {
+		return true
+	} else { // equal Values are not ordered relative to each other
+		return false
+	}
+}
+
+func (list SortStruct) Swap(i, j int) { // Swap exchanges the entries at positions i and j (sort.Interface).
+	var temp *SortObject = list[i]
+	list[i] = list[j]
+	list[j] = temp
+}
+
+func ExtSort(list []*SortObject) []*SortObject { // ExtSort sorts list in place by Value, highest first, and returns it.
+	ls := SortStruct(list) // the conversion shares the backing array, so list itself is reordered
+	sort.Sort(ls) // sort.Sort is not guaranteed to be stable: entries with equal Value may end up in any order
+	return ls
+}