|
@@ -55,10 +55,15 @@ type ExtractTask struct {
|
|
|
RuleCores []*RuleCore //抽取规则
|
|
|
}
|
|
|
|
|
|
-var lock sync.RWMutex
|
|
|
-var ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
-var saveLimit = 200 //抽取日志批量保存
|
|
|
-var TaskList map[string]*ExtractTask //任务列表
|
|
|
+var (
|
|
|
+ lock sync.RWMutex
|
|
|
+ cut = ju.NewCut() // main-text extraction and cleanup helper
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} // extraction log entries, one slice per task
|
|
|
+ TaskList map[string]*ExtractTask // task list
|
|
|
+
|
|
|
+ saveLimit = 200 // extraction-log batch-save size
|
|
|
+ nfields = []string{"contenthtml"} // fields excluded when saving logs
|
|
|
+)
|
|
|
|
|
|
func init() {
|
|
|
TaskList = make(map[string]*ExtractTask)
|
|
@@ -100,18 +105,37 @@ func RunExtractTask(ext *ExtractTask) {
|
|
|
if !ext.IsRun {
|
|
|
return
|
|
|
}
|
|
|
+ var fields = `{"title":1,"detail":1,"contenthtml":1}`
|
|
|
query := bson.M{"_id": bson.M{"$gt": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
|
- list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, `{"title":1,"detail":1,"contenthtml":1}`, false, -1, -1)
|
|
|
+ list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, fields, false, -1, -1)
|
|
|
for _, v := range *list {
|
|
|
if !ext.IsRun {
|
|
|
break
|
|
|
}
|
|
|
+ v = PreInfo(v)
|
|
|
ext.TaskInfo.ProcessPool <- true
|
|
|
go ext.ExtractProcess(v)
|
|
|
}
|
|
|
time.AfterFunc(30*time.Minute, func() { RunExtractTask(ext) })
|
|
|
}
|
|
|
|
|
|
+// PreInfo normalizes a source document before extraction: it keeps the longer of doc["detail"] and doc["contenthtml"] (detail wins ties), passes it through ju.CutLableStr and cut.ClearHtml, stores the result in doc["detail"], deletes doc["contenthtml"], and returns the same, mutated map.
|
|
|
+func PreInfo(doc map[string]interface{}) map[string]interface{} {
|
|
|
+ detail := ""
|
|
|
+ d1, _ := doc["detail"].(string) // comma-ok: a missing or non-string field yields "" instead of a panic
|
|
|
+ d2, _ := doc["contenthtml"].(string) // comma-ok for the same reason
|
|
|
+ if len(d1) >= len(d2) || d2 == "" {
|
|
|
+ detail = d1
|
|
|
+ } else {
|
|
|
+ detail = d2
|
|
|
+ }
|
|
|
+ detail = ju.CutLableStr(detail)
|
|
|
+ detail = cut.ClearHtml(detail)
|
|
|
+ doc["detail"] = detail
|
|
|
+ delete(doc, "contenthtml") // only the cleaned detail text is carried forward
|
|
|
+ return doc
|
|
|
+}
|
|
|
+
|
|
|
//加载任务信息
|
|
|
func (e *ExtractTask) InitTaskInfo() {
|
|
|
task, _ := db.Mgo.FindById("task", e.Id, nil)
|
|
@@ -296,40 +320,40 @@ func (e *ExtractTask) ExtractProcess(doc map[string]interface{}) {
|
|
|
for _, v := range e.RulePres {
|
|
|
doc = ExtRegPre(doc, v, e.TaskInfo)
|
|
|
}
|
|
|
- log.Println("全局前置规则", doc)
|
|
|
+ //log.Println("全局前置规则", doc)
|
|
|
//抽取规则
|
|
|
for _, vc := range e.RuleCores {
|
|
|
+ tmp := ju.DeepCopy(doc, []string{}).(map[string]interface{})
|
|
|
//是否进入逻辑
|
|
|
- if !ju.Logic(vc.LuaLogic, doc) {
|
|
|
+ if !ju.Logic(vc.LuaLogic, tmp) {
|
|
|
continue
|
|
|
}
|
|
|
- data := map[string]interface{}{}
|
|
|
//抽取-前置规则
|
|
|
- tmpdoc := map[string]interface{}{}
|
|
|
for _, v := range vc.RulePres {
|
|
|
- tmpdoc = ExtRegPre(doc, v, e.TaskInfo)
|
|
|
+ tmp = ExtRegPre(tmp, v, e.TaskInfo)
|
|
|
}
|
|
|
- log.Println("抽取-前置规则", tmpdoc)
|
|
|
+ //log.Println("抽取-前置规则", tmp)
|
|
|
+
|
|
|
//抽取-规则
|
|
|
for _, v := range vc.RuleCores {
|
|
|
- data = ExtRegCore(tmpdoc, v, e.TaskInfo)
|
|
|
+ tmp = ExtRegCore(tmp, v, e.TaskInfo)
|
|
|
}
|
|
|
- log.Println("抽取-规则", data)
|
|
|
+ //log.Println("抽取-规则", tmp)
|
|
|
|
|
|
//抽取-后置规则
|
|
|
for _, v := range vc.RuleBacks {
|
|
|
- data = ExtRegBack(data, v, e.TaskInfo)
|
|
|
+ tmp = ExtRegBack(tmp, v, e.TaskInfo)
|
|
|
}
|
|
|
- log.Println("抽取-后置规则", data)
|
|
|
+ //log.Println("抽取-后置规则", tmp)
|
|
|
//全局后置规则
|
|
|
for _, v := range e.RuleBacks {
|
|
|
- data = ExtRegBack(data, v, e.TaskInfo)
|
|
|
+ tmp = ExtRegBack(tmp, v, e.TaskInfo)
|
|
|
}
|
|
|
- log.Println("全局后置规则", data)
|
|
|
+ //log.Println("全局后置规则", tmp)
|
|
|
|
|
|
//抽取结果赋值
|
|
|
- for k, v := range data {
|
|
|
- if k == "_id" {
|
|
|
+ for k, v := range tmp {
|
|
|
+ if k == "_id" || k == "detail" || k == "contenthtml" {
|
|
|
continue
|
|
|
}
|
|
|
if result[k] == nil {
|
|
@@ -352,37 +376,40 @@ func (e *ExtractTask) ExtractProcess(doc map[string]interface{}) {
|
|
|
|
|
|
//前置过滤
|
|
|
func ExtRegPre(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[string]interface{} {
|
|
|
+ before := ju.DeepCopy(doc, []string{}).(map[string]interface{}) // snapshot taken before the rule mutates doc; passed to AddExtLog as the "before" state. NOTE(review): copied even when t.IsEtxLog is false, where AddExtLog returns immediately
|
|
|
+ extinfo := map[string]interface{}{} // holds only the fields produced by this rule
|
|
|
if v.IsLua {
|
|
|
lua := ju.LuaScript{Code: v.Code, Name: v.Name, Doc: doc, Script: v.RuleText}
|
|
|
- data := lua.RunScript()
|
|
|
- AddExtLog(doc, data, v, t) //抽取日志
|
|
|
- for k, v := range data {
|
|
|
+ extinfo = lua.RunScript()
|
|
|
+ for k, v := range extinfo { // merge the script's output back into doc
|
|
|
doc[k] = v
|
|
|
}
|
|
|
+ AddExtLog(before, extinfo, v, t) // record the extraction log entry
|
|
|
} else {
|
|
|
- tmp := doc
|
|
|
key := qu.If(v.Field == "", "detail", v.Field).(string)
|
|
|
text := qu.ObjToString(doc[key])
|
|
|
- doc[key] = v.RegPreBac.Reg.ReplaceAllString(text, "")
|
|
|
- AddExtLog(tmp, doc, v, t) //抽取日志
|
|
|
+ extinfo[key] = v.RegPreBac.Reg.ReplaceAllString(text, "") // strip every match of the rule's regex from the field
|
|
|
+ doc[key] = extinfo[key] // write the result back into doc
|
|
|
+ AddExtLog(before, extinfo, v, t) // record the extraction log entry
|
|
|
}
|
|
|
return doc
|
|
|
}
|
|
|
|
|
|
//抽取-规则
|
|
|
func ExtRegCore(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[string]interface{} {
|
|
|
+ before := ju.DeepCopy(doc, nfields).(map[string]interface{}) // snapshot taken before the rule mutates doc; passed to AddExtLog as the "before" state. NOTE(review): nfields presumably lists keys to leave out of the copy — confirm against ju.DeepCopy
|
|
|
+ extinfo := map[string]interface{}{} // holds only the fields produced by this rule
|
|
|
if v.IsLua {
|
|
|
lua := ju.LuaScript{Code: v.Code, Name: v.Name, Doc: doc, Script: v.RuleText}
|
|
|
- data := lua.RunScript()
|
|
|
- AddExtLog(doc, data, v, t) //抽取日志
|
|
|
- for k, v := range data {
|
|
|
- doc[k] = v
|
|
|
+ extinfo = lua.RunScript()
|
|
|
+ for k, v := range extinfo {
|
|
|
+ doc[k] = v // merge the script's output back into doc
|
|
|
}
|
|
|
+ AddExtLog(before, extinfo, v, t) // record the extraction log entry
|
|
|
} else {
|
|
|
if v.Field == "" {
|
|
|
return doc
|
|
|
}
|
|
|
- tmp := doc
|
|
|
text := qu.ObjToString(doc["detail"])
|
|
|
if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
|
|
|
apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
|
|
@@ -390,47 +417,51 @@ func ExtRegCore(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[stri
|
|
|
pos := apos[0]
|
|
|
for k, p := range v.RegCore.ExtractPos {
|
|
|
if len(pos) > p {
|
|
|
- doc[k] = text[pos[p]:pos[p+1]]
|
|
|
- //log.Println(k, doc[k])
|
|
|
+ extinfo[k] = text[pos[p]:pos[p+1]] // NOTE(review): submatch indexes are -1 for a group that did not participate, which would panic here — confirm ExtractPos only names groups that always match
|
|
|
+ doc[k] = extinfo[k] // write the result back into doc
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
} else {
|
|
|
- doc[v.Field] = v.RegCore.Reg.ReplaceAllString(text, "")
|
|
|
+ extinfo[v.Field] = v.RegCore.Reg.ReplaceAllString(text, "") // detail text with every regex match removed
|
|
|
+ doc[v.Field] = extinfo[v.Field] // write the result back into doc
|
|
|
}
|
|
|
- AddExtLog(tmp, doc, v, t) //抽取日志
|
|
|
+ AddExtLog(before, extinfo, v, t) // record the extraction log entry
|
|
|
}
|
|
|
return doc
|
|
|
}
|
|
|
|
|
|
//后置过滤
|
|
|
func ExtRegBack(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[string]interface{} {
|
|
|
+ before := ju.DeepCopy(doc, nfields).(map[string]interface{}) // snapshot taken before the rule mutates doc; passed to AddExtLog as the "before" state. NOTE(review): nfields presumably lists keys to leave out of the copy — confirm against ju.DeepCopy
|
|
|
+ extinfo := map[string]interface{}{} // holds only the fields produced by this rule
|
|
|
if v.IsLua {
|
|
|
lua := ju.LuaScript{Code: v.Code, Name: v.Name, Doc: doc, Script: v.RuleText}
|
|
|
- data := lua.RunScript()
|
|
|
- AddExtLog(doc, data, v, t) //抽取日志
|
|
|
- for k, v := range data {
|
|
|
+ extinfo = lua.RunScript()
|
|
|
+ for k, v := range extinfo { // merge the script's output back into doc
|
|
|
doc[k] = v
|
|
|
}
|
|
|
+ AddExtLog(before, extinfo, v, t) // record the extraction log entry
|
|
|
} else {
|
|
|
- tmp := doc
|
|
|
if v.Field != "" && qu.ObjToString(doc[v.Field]) != "" {
|
|
|
- doc[v.Field] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(doc[v.Field]), v.RegPreBac.Replace)
|
|
|
+ extinfo[v.Field] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(doc[v.Field]), v.RegPreBac.Replace) // replace matches in the named field with the rule's replacement text
|
|
|
+ doc[v.Field] = extinfo[v.Field] // write the result back into doc
|
|
|
} else {
|
|
|
for k, val := range doc {
|
|
|
- if k == "_id" || k == "detail" || qu.ObjToString(val) == "" {
|
|
|
+ if k == "_id" || qu.ObjToString(val) == "" { // skip _id and empty values; "detail" is no longer exempt
|
|
|
continue
|
|
|
}
|
|
|
- doc[k] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(val), v.RegPreBac.Replace)
|
|
|
+ extinfo[k] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(val), v.RegPreBac.Replace) // apply the replacement to every remaining field
|
|
|
+ doc[k] = extinfo[k] // write the result back into doc
|
|
|
}
|
|
|
}
|
|
|
- AddExtLog(tmp, doc, v, t) //抽取日志
|
|
|
+ AddExtLog(before, extinfo, v, t) // record the extraction log entry
|
|
|
}
|
|
|
return doc
|
|
|
}
|
|
|
|
|
|
//抽取日志
|
|
|
-func AddExtLog(before, extinfo map[string]interface{}, v *RegLuaInfo, t *TaskInfo) {
|
|
|
+func AddExtLog(before map[string]interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
|
|
|
if !t.IsEtxLog {
|
|
|
return
|
|
|
}
|
|
@@ -439,6 +470,7 @@ func AddExtLog(before, extinfo map[string]interface{}, v *RegLuaInfo, t *TaskInf
|
|
|
"name": v.Name,
|
|
|
"ruletext": v.RuleText,
|
|
|
"islua": v.IsLua,
|
|
|
+ "field": v.Field,
|
|
|
"version": t.Version,
|
|
|
"taskname": t.Name,
|
|
|
"before": before,
|
|
@@ -475,5 +507,5 @@ func SaveExtLog() {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- time.AfterFunc(1*time.Minute, SaveExtLog)
|
|
|
+ time.AfterFunc(10*time.Second, SaveExtLog)
|
|
|
}
|