|
@@ -2,6 +2,7 @@ package extract
|
|
|
|
|
|
import (
|
|
|
"encoding/json"
|
|
|
+ "jy/clear"
|
|
|
db "jy/mongodbutil"
|
|
|
"jy/pretreated"
|
|
|
ju "jy/util"
|
|
@@ -37,6 +38,7 @@ func StartExtractTaskId(taskId string) bool {
|
|
|
ext.InitRuleBacks()
|
|
|
ext.InitRuleCore()
|
|
|
ext.InitTag()
|
|
|
+ ext.InitClearFn()
|
|
|
//只启动一次taskId
|
|
|
go RunExtractTask(ext)
|
|
|
}
|
|
@@ -76,8 +78,8 @@ func RunExtractTask(ext *ExtractTask) {
|
|
|
//信息预处理
|
|
|
func PreInfo(doc map[string]interface{}) *ju.Job {
|
|
|
detail := ""
|
|
|
- d1 := doc["detail"].(string)
|
|
|
- d2 := doc["contenthtml"].(string)
|
|
|
+ d1, _ := doc["detail"].(string)
|
|
|
+ d2, _ := doc["contenthtml"].(string)
|
|
|
if len(d1) >= len(d2) || d2 == "" {
|
|
|
detail = d1
|
|
|
} else {
|
|
@@ -126,7 +128,7 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
|
|
|
qu.Catch()
|
|
|
qu.Try(func() {
|
|
|
doc := *j.Data
|
|
|
- //前置规则,结果覆盖doc属性
|
|
|
+ //全局前置规则,结果覆盖doc属性
|
|
|
for _, v := range e.RulePres {
|
|
|
doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
|
}
|
|
@@ -147,7 +149,7 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
|
|
|
|
|
|
//抽取-规则
|
|
|
for _, v := range vc.RuleCores {
|
|
|
- ExtRegCore(tmp, j, v, e)
|
|
|
+ ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
|
}
|
|
|
//log.Println("抽取-规则", tmp)
|
|
|
|
|
@@ -161,10 +163,18 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
|
|
|
for _, v := range e.RuleBacks {
|
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
|
}
|
|
|
- bs, _ := json.Marshal(j.Result)
|
|
|
- log.Println("抽取结果", string(bs))
|
|
|
}
|
|
|
- //抽取结果保存 todo
|
|
|
+ //函数清理
|
|
|
+ for key, val := range j.Result {
|
|
|
+ for _, v := range val {
|
|
|
+ data := clear.DoClearFn(e.ClearFn[key], []interface{}{v.Value, j.Content})
|
|
|
+ v.Value = data[0]
|
|
|
+ }
|
|
|
+ }
|
|
|
+ bs, _ := json.Marshal(j.Result)
|
|
|
+ log.Println("抽取结果", j.SourceMid, string(bs))
|
|
|
+ //分析抽取结果并保存 todo
|
|
|
+ AnalysisSaveResult(j.Data, j.Result, e.TaskInfo.SaveColl)
|
|
|
|
|
|
}, func(err interface{}) {
|
|
|
log.Println(err)
|
|
@@ -198,11 +208,13 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
|
|
|
}
|
|
|
|
|
|
//抽取-规则
|
|
|
-func ExtRegCore(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
|
|
|
+func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
|
|
|
if in.IsLua {
|
|
|
lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
|
|
|
if in.IsHasFields { //lua脚本配置有属性字段
|
|
|
lua.KvMap = getKvByLuaFields(j, in, et.Tag)
|
|
|
+ } else {
|
|
|
+ lua.KvMap = map[string][]map[string]interface{}{}
|
|
|
}
|
|
|
lua.Block = j.Block
|
|
|
extinfo := lua.RunScript("core")
|
|
@@ -212,17 +224,22 @@ func ExtRegCore(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *Extra
|
|
|
}
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
for _, tmp := range tmps {
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["key"]), qu.ObjToString(tmp["type"]), tmp["value"]})
|
|
|
+ j.Result[k] = append(j.Result[k],
|
|
|
+ &ju.ExtField{k, qu.ObjToString(tmp["key"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"]})
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ }
|
|
|
} else {
|
|
|
+ //全文正则
|
|
|
+ text := qu.ObjToString(doc[extfrom])
|
|
|
if in.Field != "" {
|
|
|
- //全文正则
|
|
|
- text := qu.ObjToString(doc["detail"])
|
|
|
- extinfo := extRegCoreToResult(text, j, in)
|
|
|
- AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ extinfo := extRegCoreToResult(extfrom, text, j, in)
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ AddExtLog(j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -240,31 +257,65 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
kvs2 := bl.ColonKV.Kvs_2
|
|
|
for _, tag := range tags {
|
|
|
for _, kv := range kvs {
|
|
|
- if kv.Key == tag.Key {
|
|
|
- text := ju.TrimLRSpace(kv.Value, "")
|
|
|
- if text != "" {
|
|
|
- kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
- "value": text,
|
|
|
- "type": "colon1",
|
|
|
- "field": field,
|
|
|
- "key": tag.Key,
|
|
|
- })
|
|
|
+ if tag.Type == "string" {
|
|
|
+ if kv.Key == tag.Key {
|
|
|
+ text := ju.TrimLRSpace(kv.Value, "")
|
|
|
+ if text != "" {
|
|
|
+ kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
+ "value": text,
|
|
|
+ "type": "colon1",
|
|
|
+ "field": field,
|
|
|
+ "key": tag.Key,
|
|
|
+ "matchtype": "string",
|
|
|
+ })
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if tag.Type == "regexp" {
|
|
|
+ if tag.Reg.MatchString(kv.Key) {
|
|
|
+ text := ju.TrimLRSpace(kv.Value, "")
|
|
|
+ if text != "" {
|
|
|
+ kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
+ "value": text,
|
|
|
+ "type": "colon1",
|
|
|
+ "field": field,
|
|
|
+ "key": tag.Key,
|
|
|
+ "matchtype": "regexp",
|
|
|
+ })
|
|
|
+ }
|
|
|
+ break
|
|
|
}
|
|
|
- break
|
|
|
}
|
|
|
}
|
|
|
for _, kv := range kvs2 {
|
|
|
- if kv.Key == tag.Key {
|
|
|
- text := ju.TrimLRSpace(kv.Value, "")
|
|
|
- if text != "" {
|
|
|
- kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
- "value": text,
|
|
|
- "type": "colon2",
|
|
|
- "field": field,
|
|
|
- "key": tag.Key,
|
|
|
- })
|
|
|
+ if tag.Type == "string" {
|
|
|
+ if kv.Key == tag.Key {
|
|
|
+ text := ju.TrimLRSpace(kv.Value, "")
|
|
|
+ if text != "" {
|
|
|
+ kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
+ "value": text,
|
|
|
+ "type": "colon2",
|
|
|
+ "field": field,
|
|
|
+ "key": tag.Key,
|
|
|
+ "matchtype": "string",
|
|
|
+ })
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if tag.Type == "regexp" {
|
|
|
+ if tag.Reg.MatchString(kv.Key) {
|
|
|
+ text := ju.TrimLRSpace(kv.Value, "")
|
|
|
+ if text != "" {
|
|
|
+ kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
+ "value": text,
|
|
|
+ "type": "colon2",
|
|
|
+ "field": field,
|
|
|
+ "key": tag.Key,
|
|
|
+ "matchtype": "regexp",
|
|
|
+ })
|
|
|
+ }
|
|
|
+ break
|
|
|
}
|
|
|
- break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -274,17 +325,34 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
kvs := bl.SpaceKV.Kvs
|
|
|
for _, tag := range tags {
|
|
|
for _, kv := range kvs {
|
|
|
- if kv.Key == tag.Key {
|
|
|
- text := ju.TrimLRSpace(kv.Value, "")
|
|
|
- if text != "" {
|
|
|
- kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
- "value": text,
|
|
|
- "type": "space",
|
|
|
- "field": field,
|
|
|
- "key": tag.Key,
|
|
|
- })
|
|
|
+ if tag.Type == "string" {
|
|
|
+ if kv.Key == tag.Key {
|
|
|
+ text := ju.TrimLRSpace(kv.Value, "")
|
|
|
+ if text != "" {
|
|
|
+ kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
+ "value": text,
|
|
|
+ "type": "space",
|
|
|
+ "field": field,
|
|
|
+ "key": tag.Key,
|
|
|
+ "matchtype": "string",
|
|
|
+ })
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if tag.Type == "regexp" {
|
|
|
+ if tag.Reg.MatchString(kv.Key) {
|
|
|
+ text := ju.TrimLRSpace(kv.Value, "")
|
|
|
+ if text != "" {
|
|
|
+ kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
+ "value": text,
|
|
|
+ "type": "space",
|
|
|
+ "field": field,
|
|
|
+ "key": tag.Key,
|
|
|
+ "matchtype": "regexp",
|
|
|
+ })
|
|
|
+ }
|
|
|
+ break
|
|
|
}
|
|
|
- break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -294,17 +362,34 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
kv := bl.TableKV.Kv
|
|
|
for _, tag := range tags {
|
|
|
for k, val := range kv {
|
|
|
- if k == tag.Key {
|
|
|
- text := ju.TrimLRSpace(val, "")
|
|
|
- if text != "" {
|
|
|
- kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
- "value": text,
|
|
|
- "type": "table",
|
|
|
- "field": field,
|
|
|
- "key": tag.Key,
|
|
|
- })
|
|
|
+ if tag.Type == "string" {
|
|
|
+ if k == tag.Key {
|
|
|
+ text := ju.TrimLRSpace(val, "")
|
|
|
+ if text != "" {
|
|
|
+ kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
+ "value": text,
|
|
|
+ "type": "table",
|
|
|
+ "field": field,
|
|
|
+ "key": tag.Key,
|
|
|
+ "matchtype": "string",
|
|
|
+ })
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if tag.Type == "regexp" {
|
|
|
+ if tag.Reg.MatchString(k) {
|
|
|
+ text := ju.TrimLRSpace(val, "")
|
|
|
+ if text != "" {
|
|
|
+ kvmap[field] = append(kvmap[field], map[string]interface{}{
|
|
|
+ "value": text,
|
|
|
+ "type": "table",
|
|
|
+ "field": field,
|
|
|
+ "key": tag.Key,
|
|
|
+ "matchtype": "regexp",
|
|
|
+ })
|
|
|
+ }
|
|
|
+ break
|
|
|
}
|
|
|
- break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -315,7 +400,7 @@ func getKvByLuaFields(j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string
|
|
|
}
|
|
|
|
|
|
//正则提取结果
|
|
|
-func extRegCoreToResult(text string, j *ju.Job, v *RegLuaInfo) map[string]interface{} {
|
|
|
+func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string]interface{} {
|
|
|
extinfo := map[string]interface{}{}
|
|
|
if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
|
|
|
apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
|
|
@@ -323,13 +408,16 @@ func extRegCoreToResult(text string, j *ju.Job, v *RegLuaInfo) map[string]interf
|
|
|
pos := apos[0]
|
|
|
for k, p := range v.RegCore.ExtractPos {
|
|
|
if len(pos) > p {
|
|
|
+ if pos[p] == -1 || pos[p+1] == -1 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
val := text[pos[p]:pos[p+1]]
|
|
|
extinfo[k] = val
|
|
|
if val != "" {
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[k] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.RuleText, "regexp", val})
|
|
|
+ j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, "regexp", "regcontent", extfrom, val})
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -341,7 +429,7 @@ func extRegCoreToResult(text string, j *ju.Job, v *RegLuaInfo) map[string]interf
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.RuleText, "regexp", val})
|
|
|
+ j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, "regexp", "regcontent", extfrom, val})
|
|
|
}
|
|
|
}
|
|
|
return extinfo
|
|
@@ -360,11 +448,13 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
j.Result[k] = [](*ju.ExtField){}
|
|
|
for _, tmp := range tmps {
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["key"]), qu.ObjToString(tmp["type"]), tmp["value"]})
|
|
|
+ j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["key"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"]})
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- AddExtLog(j.SourceMid, result, extinfo, in, t) //抽取日志
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ AddExtLog(j.SourceMid, result, extinfo, in, t) //抽取日志
|
|
|
+ }
|
|
|
} else {
|
|
|
extinfo := map[string]interface{}{}
|
|
|
if in.Field != "" && j.Result[in.Field] != nil {
|
|
@@ -379,7 +469,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
exts = append(exts, text)
|
|
|
}
|
|
|
extinfo[in.Field] = exts
|
|
|
- AddExtLog(j.SourceMid, tmp, extinfo, in, t) //抽取日志
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ AddExtLog(j.SourceMid, tmp, extinfo, in, t) //抽取日志
|
|
|
+ }
|
|
|
} else {
|
|
|
for key, tmp := range j.Result {
|
|
|
exts := []interface{}{}
|
|
@@ -393,7 +485,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
}
|
|
|
extinfo[key] = exts
|
|
|
}
|
|
|
- AddExtLog(j.SourceMid, j.Result, extinfo, in, t) //抽取日志
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ AddExtLog(j.SourceMid, j.Result, extinfo, in, t) //抽取日志
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -407,10 +501,12 @@ func getResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
|
|
|
}
|
|
|
for _, v := range val {
|
|
|
tmp := map[string]interface{}{
|
|
|
- "field": v.Field,
|
|
|
- "value": v.Value,
|
|
|
- "type": v.Type,
|
|
|
- "key": v.Key,
|
|
|
+ "field": v.Field,
|
|
|
+ "value": v.Value,
|
|
|
+ "type": v.Type,
|
|
|
+ "matchtype": v.MatchType,
|
|
|
+ "key": v.Key,
|
|
|
+ "extfrom": v.ExtFrom,
|
|
|
}
|
|
|
result[key] = append(result[key], tmp)
|
|
|
}
|
|
@@ -466,3 +562,14 @@ func SaveExtLog() {
|
|
|
}
|
|
|
time.AfterFunc(10*time.Second, SaveExtLog)
|
|
|
}
|
|
|
+
|
|
|
+// AnalysisSaveResult copies the extraction results into the source document
+// and then hands the document to db.Mgo.Update for the totable collection.
+//
+// doc is the source document and is modified in place. result maps a field
+// name to the candidates extracted for that field. totable is passed to
+// db.Mgo.Update as the target collection name.
+//
+// Still a work in progress: for now only the first candidate of each field
+// is saved.
+func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, totable string) {
+	// Dereferencing a nil pointer or assigning into a nil map would panic.
+	if doc == nil || *doc == nil {
+		return
+	}
+	for key, val := range result {
+		// Keep the first usable candidate. The earlier version assigned every
+		// candidate in turn, so the last one silently won.
+		for _, v := range val {
+			if v == nil {
+				continue
+			}
+			(*doc)[key] = v.Value
+			break
+		}
+	}
+	// The update is keyed on the string form of the document's _id.
+	// NOTE(review): the result of Update is discarded and the meaning of the
+	// trailing true/false flags is not visible here - confirm against
+	// jy/mongodbutil.
+	db.Mgo.Update(totable, `{"_id":"`+qu.BsonIdToSId((*doc)["_id"])+`"}`, doc, true, false)
+}
|