|
@@ -77,7 +77,7 @@ func RunExtractTask(ext *ExtractTask) {
|
|
|
time.Sleep(1 * time.Second)
|
|
|
}
|
|
|
//更新task.s_extlastid
|
|
|
- db.Mgo.UpdateById("task", ext.TaskInfo.LastExtId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
|
|
|
+ db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
|
|
|
time.AfterFunc(30*time.Minute, func() { RunExtractTask(ext) })
|
|
|
}
|
|
|
|
|
@@ -202,13 +202,13 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
|
|
|
for k, v := range extinfo { //结果覆盖原doc
|
|
|
doc[k] = v
|
|
|
}
|
|
|
- AddExtLog(j.SourceMid, before, extinfo, in, t) //抽取日志
|
|
|
+ AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
|
|
|
} else {
|
|
|
key := qu.If(in.Field == "", "detail", in.Field).(string)
|
|
|
text := qu.ObjToString(doc[key])
|
|
|
extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
|
|
|
- doc[key] = extinfo[key] //结果覆盖原doc
|
|
|
- AddExtLog(j.SourceMid, before, extinfo, in, t) //抽取日志
|
|
|
+ doc[key] = extinfo[key] //结果覆盖原doc
|
|
|
+ AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
|
|
|
}
|
|
|
return doc
|
|
|
}
|
|
@@ -236,7 +236,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
}
|
|
|
if len(extinfo) > 0 {
|
|
|
- AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
}
|
|
|
} else {
|
|
|
//全文正则
|
|
@@ -244,7 +244,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
if in.Field != "" {
|
|
|
extinfo := extRegCoreToResult(extfrom, text, j, in)
|
|
|
if len(extinfo) > 0 {
|
|
|
- AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -272,7 +272,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
"type": "colon1",
|
|
|
"field": field,
|
|
|
"key": tag.Key,
|
|
|
- "matchtype": "string",
|
|
|
+ "matchtype": "tag_string",
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -286,7 +286,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
"type": "colon1",
|
|
|
"field": field,
|
|
|
"key": tag.Key,
|
|
|
- "matchtype": "regexp",
|
|
|
+ "matchtype": "tag_regexp",
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -303,7 +303,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
"type": "colon2",
|
|
|
"field": field,
|
|
|
"key": tag.Key,
|
|
|
- "matchtype": "string",
|
|
|
+ "matchtype": "tag_string",
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -317,7 +317,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
"type": "colon2",
|
|
|
"field": field,
|
|
|
"key": tag.Key,
|
|
|
- "matchtype": "regexp",
|
|
|
+ "matchtype": "tag_regexp",
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -340,7 +340,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
"type": "space",
|
|
|
"field": field,
|
|
|
"key": tag.Key,
|
|
|
- "matchtype": "string",
|
|
|
+ "matchtype": "tag_string",
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -354,7 +354,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
"type": "space",
|
|
|
"field": field,
|
|
|
"key": tag.Key,
|
|
|
- "matchtype": "regexp",
|
|
|
+ "matchtype": "tag_regexp",
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -377,7 +377,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
"type": "table",
|
|
|
"field": field,
|
|
|
"key": tag.Key,
|
|
|
- "matchtype": "string",
|
|
|
+ "matchtype": "tag_string",
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -391,7 +391,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
"type": "table",
|
|
|
"field": field,
|
|
|
"key": tag.Key,
|
|
|
- "matchtype": "regexp",
|
|
|
+ "matchtype": "tag_regexp",
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -418,7 +418,14 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
|
|
|
continue
|
|
|
}
|
|
|
val := text[pos[p]:pos[p+1]]
|
|
|
- extinfo[k] = val
|
|
|
+ extinfo[k] = map[string]interface{}{
|
|
|
+ "field": v.Field,
|
|
|
+ "key": v.Code,
|
|
|
+ "type": "regexp",
|
|
|
+ "matchtype": "regcontent",
|
|
|
+ "extfrom": extfrom,
|
|
|
+ "value": val,
|
|
|
+ }
|
|
|
if val != "" {
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[k] = [](*ju.ExtField){}
|
|
@@ -430,8 +437,15 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
|
|
|
}
|
|
|
} else {
|
|
|
val := v.RegCore.Reg.ReplaceAllString(text, "")
|
|
|
- extinfo[v.Field] = val
|
|
|
if val != "" {
|
|
|
+ extinfo[v.Field] = map[string]interface{}{
|
|
|
+ "field": v.Field,
|
|
|
+ "key": v.Code,
|
|
|
+ "type": "regexp",
|
|
|
+ "matchtype": "regcontent",
|
|
|
+ "extfrom": extfrom,
|
|
|
+ "value": val,
|
|
|
+ }
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
@@ -459,7 +473,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
}
|
|
|
}
|
|
|
if len(extinfo) > 0 {
|
|
|
- AddExtLog(j.SourceMid, result, extinfo, in, t) //抽取日志
|
|
|
+ AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
|
|
|
}
|
|
|
} else {
|
|
|
extinfo := map[string]interface{}{}
|
|
@@ -472,11 +486,18 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
|
}
|
|
|
j.Result[in.Field][k].Value = text
|
|
|
- exts = append(exts, text)
|
|
|
+ exts = append(exts, map[string]interface{}{
|
|
|
+ "field": v.Field,
|
|
|
+ "key": v.Key,
|
|
|
+ "type": v.Type,
|
|
|
+ "matchtype": v.MatchType,
|
|
|
+ "extfrom": v.ExtFrom,
|
|
|
+ "value": text,
|
|
|
+ })
|
|
|
}
|
|
|
extinfo[in.Field] = exts
|
|
|
if len(extinfo) > 0 {
|
|
|
- AddExtLog(j.SourceMid, tmp, extinfo, in, t) //抽取日志
|
|
|
+ AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
|
|
|
}
|
|
|
} else {
|
|
|
for key, tmp := range j.Result {
|
|
@@ -487,12 +508,19 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
|
}
|
|
|
j.Result[key][k].Value = text
|
|
|
- exts = append(exts, text)
|
|
|
+ exts = append(exts, map[string]interface{}{
|
|
|
+ "field": v.Field,
|
|
|
+ "key": v.Key,
|
|
|
+ "type": v.Type,
|
|
|
+ "matchtype": v.MatchType,
|
|
|
+ "extfrom": v.ExtFrom,
|
|
|
+ "value": text,
|
|
|
+ })
|
|
|
}
|
|
|
extinfo[key] = exts
|
|
|
}
|
|
|
if len(extinfo) > 0 {
|
|
|
- AddExtLog(j.SourceMid, j.Result, extinfo, in, t) //抽取日志
|
|
|
+ AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -521,13 +549,14 @@ func getResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
|
|
|
}
|
|
|
|
|
|
//抽取日志
|
|
|
-func AddExtLog(sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
|
|
|
+func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
|
|
|
if !t.IsEtxLog {
|
|
|
return
|
|
|
}
|
|
|
logdata := map[string]interface{}{
|
|
|
"code": v.Code,
|
|
|
"name": v.Name,
|
|
|
+ "type": ftype,
|
|
|
"ruletext": v.RuleText,
|
|
|
"islua": v.IsLua,
|
|
|
"field": v.Field,
|
|
@@ -552,15 +581,15 @@ func SaveExtLog() {
|
|
|
lock.Unlock()
|
|
|
for k, v := range tmpLogs {
|
|
|
if len(v) < saveLimit {
|
|
|
- k.DB.SaveBulk(k.TrackColl, v...)
|
|
|
+ db.Mgo.SaveBulk(k.TrackColl, v...)
|
|
|
} else {
|
|
|
for {
|
|
|
if len(v) > saveLimit {
|
|
|
tmp := v[:saveLimit]
|
|
|
- k.DB.SaveBulk(k.TrackColl, tmp...)
|
|
|
+ db.Mgo.SaveBulk(k.TrackColl, tmp...)
|
|
|
v = v[saveLimit:]
|
|
|
} else {
|
|
|
- k.DB.SaveBulk(k.TrackColl, v...)
|
|
|
+ db.Mgo.SaveBulk(k.TrackColl, v...)
|
|
|
break
|
|
|
}
|
|
|
}
|
|
@@ -569,20 +598,46 @@ func SaveExtLog() {
|
|
|
time.AfterFunc(10*time.Second, SaveExtLog)
|
|
|
}
|
|
|
|
|
|
+type FieldValue struct { // FieldValue pairs a value with a count; NOTE(review): not referenced in the visible hunks of this change - confirm it is used elsewhere.
|
|
|
+ Value interface{} // the value itself (untyped)
|
|
|
+ Count int // presumably the number of occurrences of Value - TODO confirm against callers
|
|
|
+}
|
|
|
+
|
|
|
// AnalysisSaveResult analyzes the extraction result and saves it.
// For every field in result it counts how often each stringified value occurs,
// passes the (value, count) pairs through ju.ExtSort, and keeps the first
// non-empty value per field. doc is then written to task.SaveColl, and the
// picked values together with the raw result are written to "extract_result";
// both writes are keyed by the document's _id.
// NOTE(review): the removed lines copied a value into (*doc)[key]; the added
// lines only fill tmp, so doc is saved without the picked values - confirm
// this is intended.
|
|
|
func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, task *TaskInfo) {
|
|
|
- // TODO: to be completed
|
|
|
+ _id := qu.BsonIdToSId((*doc)["_id"]) // computed once; concatenated into both update selectors below
|
|
|
+ // Rank the candidate values of each field.
|
|
|
+ values := map[string][]*ju.SortObject{} // field name -> candidates as returned by ju.ExtSort
|
|
|
for key, val := range result {
|
|
|
- for _, v := range val { // for now, save the first one
|
|
|
- (*doc)[key] = v.Value
|
|
|
- if key == "budget" || key == "bidamount" {
|
|
|
- if qu.Int64All(v.Value) > 0 {
|
|
|
- break
|
|
|
- }
|
|
|
- } else {
|
|
|
+ fieldValue := map[string]int{} // stringified value -> number of occurrences for this field
|
|
|
+ for _, v := range val {
|
|
|
+ value := qu.ObjToString(v.Value)
|
|
|
+ fieldValue[value] += 1
|
|
|
+ }
|
|
|
+ objects := []*ju.SortObject{}
|
|
|
+ for k, v := range fieldValue { // map iteration order is random; the final order is whatever ju.ExtSort produces
|
|
|
+ tmp := &ju.SortObject{
|
|
|
+ Key: k,
|
|
|
+ Value: v,
|
|
|
+ }
|
|
|
+ objects = append(objects, tmp)
|
|
|
+ }
|
|
|
+ values[key] = ju.ExtSort(objects)
|
|
|
+ }
|
|
|
+ // Pick a value for each field from the sorted candidates.
|
|
|
+ tmp := map[string]interface{}{}
|
|
|
+ for key, val := range values {
|
|
|
+ for _, v := range val { // take the first one
|
|
|
+ if v.Key != "" { // skip candidates whose stringified value is empty
|
|
|
+ tmp[key] = v.Key
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- task.DB.Update(task.SaveColl, `{"_id":"`+qu.BsonIdToSId((*doc)["_id"])+`"}`, doc, true, false)
|
|
|
+ // Save the extraction result.
|
|
|
+ task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, doc, true, false) // NOTE(review): trailing true, false are presumably upsert/multi flags - confirm against the DB wrapper
|
|
|
+ log.Println(tmp)
|
|
|
+ // Save the extraction details.
|
|
|
+ tmp["result"] = result // the raw per-field candidates are stored alongside the picked values
|
|
|
+ db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, tmp, true, false)
|
|
|
}
|