|
@@ -11,7 +11,7 @@ import (
|
|
|
type RegLuaInfo struct { //正则或脚本信息
|
|
|
Code, Name, Field string //
|
|
|
RuleText string //
|
|
|
- IsLua, IsHasFields bool //IsHasFields脚本配置有属性字段
|
|
|
+ IsLua, IsHasFields bool //IsHasFields正则配置有属性字段
|
|
|
RegPreBac *ExtReg //
|
|
|
RegCore *ExtReg //
|
|
|
LFields []interface{} //lua抽取字段属性组
|
|
@@ -25,17 +25,17 @@ type ExtReg struct {
|
|
|
type RuleCore struct {
|
|
|
LuaLogic string //进入逻辑
|
|
|
ExtFrom string //从哪个字段抽取
|
|
|
- RulePres []*RegLuaInfo //前置规则
|
|
|
- RuleBacks []*RegLuaInfo //后置规则
|
|
|
+ RulePres []*RegLuaInfo //抽取前置规则
|
|
|
+ RuleBacks []*RegLuaInfo //抽取后置规则
|
|
|
RuleCores []*RegLuaInfo //抽取规则
|
|
|
}
|
|
|
type TaskInfo struct {
|
|
|
- Name, Version, TrackColl string //名称、版本、追踪记录表
|
|
|
- FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
|
|
|
- SaveColl, LastExtId string //抽取结果表、上次抽取信息id
|
|
|
- DB *db.Pool //数据库连接池
|
|
|
- IsEtxLog bool //是否开启抽取日志
|
|
|
- ProcessPool chan bool //任务进程池
|
|
|
+ Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
|
|
|
+ FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
|
|
|
+ SaveColl, LastExtId string //抽取结果表、上次抽取信息id
|
|
|
+ DB *db.Pool //数据库连接池
|
|
|
+ IsEtxLog bool //是否开启抽取日志
|
|
|
+ ProcessPool chan bool //任务进程池
|
|
|
}
|
|
|
type Tag struct {
|
|
|
Type string //标签类型 string 字符串、regexp 正则
|
|
@@ -47,8 +47,8 @@ type ExtractTask struct {
|
|
|
IsRun bool //是否启动
|
|
|
Content string //信息内容
|
|
|
TaskInfo *TaskInfo //任务信息
|
|
|
- RulePres []*RegLuaInfo //前置规则
|
|
|
- RuleBacks []*RegLuaInfo //后置规则
|
|
|
+ RulePres []*RegLuaInfo //通用前置规则
|
|
|
+ RuleBacks []*RegLuaInfo //通用后置规则
|
|
|
RuleCores []*RuleCore //抽取规则
|
|
|
Tag map[string][]*Tag //标签库
|
|
|
ClearFn map[string][]string //清理函数
|
|
@@ -63,9 +63,11 @@ func init() {
|
|
|
func (e *ExtractTask) InitTaskInfo() {
|
|
|
task, _ := db.Mgo.FindById("task", e.Id, nil)
|
|
|
if len(*task) > 1 {
|
|
|
+ v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`"}`)
|
|
|
e.TaskInfo = &TaskInfo{
|
|
|
Name: (*task)["s_taskname"].(string),
|
|
|
Version: (*task)["s_version"].(string),
|
|
|
+ VersionId: qu.BsonIdToSId((*v)["_id"]),
|
|
|
TrackColl: (*task)["s_trackcoll"].(string),
|
|
|
FromDbAddr: (*task)["s_mgoaddr"].(string),
|
|
|
FromDB: (*task)["s_mgodb"].(string),
|
|
@@ -80,7 +82,7 @@ func (e *ExtractTask) InitTaskInfo() {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-//加载前置规则
|
|
|
+//加载通用前置规则
|
|
|
func (e *ExtractTask) InitRulePres() {
|
|
|
defer qu.Catch()
|
|
|
list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
@@ -105,7 +107,7 @@ func (e *ExtractTask) InitRulePres() {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-//加载后置规则
|
|
|
+//加载通用后置规则
|
|
|
func (e *ExtractTask) InitRuleBacks() {
|
|
|
defer qu.Catch()
|
|
|
list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
@@ -133,111 +135,117 @@ func (e *ExtractTask) InitRuleBacks() {
|
|
|
//加载抽取规则
|
|
|
func (e *ExtractTask) InitRuleCore() {
|
|
|
defer qu.Catch()
|
|
|
- list, _ := db.Mgo.Find("rule_logic", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
- for _, vv := range *list {
|
|
|
- if b, _ := vv["isuse"].(bool); !b {
|
|
|
+ vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`"}`, nil, nil, false, -1, -1)
|
|
|
+ for _, vinfo := range *vinfos {
|
|
|
+ if b, _ := vinfo["isuse"].(bool); !b {
|
|
|
continue
|
|
|
}
|
|
|
- rcore := &RuleCore{}
|
|
|
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
|
|
|
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
|
|
|
- //前置规则
|
|
|
- rulePres := []*RegLuaInfo{}
|
|
|
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
- for _, v := range *plist {
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
- Code: v["s_code"].(string),
|
|
|
- Name: v["s_name"].(string),
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
+ pid := qu.BsonIdToSId(vinfo["_id"])
|
|
|
+ list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`"}`, nil, nil, false, -1, -1)
|
|
|
+ for _, vv := range *list {
|
|
|
+ if b, _ := vv["isuse"].(bool); !b {
|
|
|
+ continue
|
|
|
}
|
|
|
- if rinfo.IsLua {
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
- } else {
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
- rinfo.Field = v["s_field"].(string)
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
- if len(tmp) == 2 {
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
|
|
|
+ rcore := &RuleCore{}
|
|
|
+ rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
|
|
|
+ rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
|
|
|
+ //前置规则
|
|
|
+ rulePres := []*RegLuaInfo{}
|
|
|
+ plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
|
|
|
+ for _, v := range *plist {
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
+ Code: v["s_code"].(string),
|
|
|
+ Name: v["s_name"].(string),
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
+ }
|
|
|
+ if rinfo.IsLua {
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
} else {
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
+ rinfo.Field = v["s_field"].(string)
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
+ if len(tmp) == 2 {
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
|
|
|
+ } else {
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
|
|
|
+ }
|
|
|
}
|
|
|
+ rulePres = append(rulePres, rinfo)
|
|
|
}
|
|
|
- rulePres = append(rulePres, rinfo)
|
|
|
- }
|
|
|
- rcore.RulePres = rulePres
|
|
|
+ rcore.RulePres = rulePres
|
|
|
|
|
|
- //后置规则
|
|
|
- ruleBacks := []*RegLuaInfo{}
|
|
|
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
- for _, v := range *blist {
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
- Code: v["s_code"].(string),
|
|
|
- Name: v["s_name"].(string),
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
- }
|
|
|
- if rinfo.IsLua {
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
- } else {
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
- rinfo.Field = v["s_field"].(string)
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
- if len(tmp) == 2 {
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
|
|
|
+ //后置规则
|
|
|
+ ruleBacks := []*RegLuaInfo{}
|
|
|
+ blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
|
|
|
+ for _, v := range *blist {
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
+ Code: v["s_code"].(string),
|
|
|
+ Name: v["s_name"].(string),
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
+ }
|
|
|
+ if rinfo.IsLua {
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
} else {
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
+ rinfo.Field = v["s_field"].(string)
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
+ if len(tmp) == 2 {
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
|
|
|
+ } else {
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
|
|
|
+ }
|
|
|
}
|
|
|
+ ruleBacks = append(ruleBacks, rinfo)
|
|
|
}
|
|
|
- ruleBacks = append(ruleBacks, rinfo)
|
|
|
- }
|
|
|
- rcore.RuleBacks = ruleBacks
|
|
|
+ rcore.RuleBacks = ruleBacks
|
|
|
|
|
|
- //抽取规则
|
|
|
- ruleCores := []*RegLuaInfo{}
|
|
|
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
- for _, v := range *clist {
|
|
|
- if b, _ := v["isuse"].(bool); !b {
|
|
|
- continue
|
|
|
- }
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
- Code: v["s_code"].(string),
|
|
|
- Name: v["s_name"].(string),
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
- }
|
|
|
- if rinfo.IsLua {
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
- //暂时提取全部属性
|
|
|
- rinfo.LFields = getALLFields()
|
|
|
- rinfo.IsHasFields = true
|
|
|
- /*rinfo.LFields, _ = v["s_fields"].([]interface{})
|
|
|
- if len(rinfo.LFields) > 0 {
|
|
|
+ //抽取规则
|
|
|
+ ruleCores := []*RegLuaInfo{}
|
|
|
+ clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
|
|
|
+ for _, v := range *clist {
|
|
|
+ if b, _ := v["isuse"].(bool); !b {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
+ Code: v["s_code"].(string),
|
|
|
+ Name: v["s_name"].(string),
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
+ }
|
|
|
+ if rinfo.IsLua {
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
+ //暂时提取全部属性
|
|
|
+ rinfo.LFields = getALLFields()
|
|
|
rinfo.IsHasFields = true
|
|
|
- }*/
|
|
|
- } else {
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
- rinfo.Field = v["s_field"].(string)
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
- if len(tmp) == 2 {
|
|
|
- epos := strings.Split(tmp[1], ",")
|
|
|
- posm := map[string]int{}
|
|
|
- for _, v := range epos {
|
|
|
- ks := strings.Split(v, ":")
|
|
|
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
|
|
|
- posm[ks[1]] = qu.IntAll(ks[0])
|
|
|
- } else { //(.*)招标公告__2
|
|
|
- posm[rinfo.Field] = qu.IntAll(ks[0])
|
|
|
+ /*rinfo.LFields, _ = v["s_fields"].([]interface{})
|
|
|
+ if len(rinfo.LFields) > 0 {
|
|
|
+ rinfo.IsHasFields = true
|
|
|
+ }*/
|
|
|
+ } else {
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
+ rinfo.Field = v["s_field"].(string)
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
+ if len(tmp) == 2 {
|
|
|
+ epos := strings.Split(tmp[1], ",")
|
|
|
+ posm := map[string]int{}
|
|
|
+ for _, v := range epos {
|
|
|
+ ks := strings.Split(v, ":")
|
|
|
+ if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
|
|
|
+ posm[ks[1]] = qu.IntAll(ks[0])
|
|
|
+ } else { //(.*)招标公告__2
|
|
|
+ posm[rinfo.Field] = qu.IntAll(ks[0])
|
|
|
+ }
|
|
|
}
|
|
|
+ rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
|
|
|
+ } else {
|
|
|
+ rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
|
|
|
}
|
|
|
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
|
|
|
- } else {
|
|
|
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
|
|
|
}
|
|
|
+ ruleCores = append(ruleCores, rinfo)
|
|
|
}
|
|
|
- ruleCores = append(ruleCores, rinfo)
|
|
|
+ rcore.RuleCores = ruleCores
|
|
|
+ //
|
|
|
+ e.RuleCores = append(e.RuleCores, rcore)
|
|
|
}
|
|
|
- rcore.RuleCores = ruleCores
|
|
|
-
|
|
|
- //
|
|
|
- e.RuleCores = append(e.RuleCores, rcore)
|
|
|
}
|
|
|
}
|
|
|
|