|
@@ -54,18 +54,19 @@ type TaskInfo struct {
|
|
TestLua bool //检查测试用
|
|
TestLua bool //检查测试用
|
|
}
|
|
}
|
|
type ExtractTask struct {
|
|
type ExtractTask struct {
|
|
- Id string //任务id
|
|
|
|
- IsRun bool //是否启动
|
|
|
|
- Content string //信息内容
|
|
|
|
- TaskInfo *TaskInfo //任务信息
|
|
|
|
- RulePres []*RegLuaInfo //通用前置规则
|
|
|
|
- RuleBacks []*RegLuaInfo //通用后置规则
|
|
|
|
- RuleCores []*RuleCore //抽取规则
|
|
|
|
- PkgRuleCores []*RuleCore //分包抽取规则
|
|
|
|
- Tag map[string][]*Tag //标签库
|
|
|
|
- ClearFn map[string][]string //清理函数
|
|
|
|
- IsExtractCity bool //是否开启城市抽取
|
|
|
|
- Fields map[string]int //抽取属性组
|
|
|
|
|
|
+ Id string //任务id
|
|
|
|
+ IsRun bool //是否启动
|
|
|
|
+ Content string //信息内容
|
|
|
|
+ TaskInfo *TaskInfo //任务信息
|
|
|
|
+ RulePres []*RegLuaInfo //通用前置规则
|
|
|
|
+ RuleBacks []*RegLuaInfo //通用后置规则
|
|
|
|
+ //RuleCores []*RuleCore //抽取规则
|
|
|
|
+ RuleCores map[string]map[string][]*RuleCore //分类抽取规则
|
|
|
|
+ PkgRuleCores []*RuleCore //分包抽取规则
|
|
|
|
+ Tag map[string][]*Tag //标签库
|
|
|
|
+ ClearFn map[string][]string //清理函数
|
|
|
|
+ IsExtractCity bool //是否开启城市抽取
|
|
|
|
+ Fields map[string]int //抽取属性组
|
|
|
|
|
|
IsFileField bool //是否开启附件抽取
|
|
IsFileField bool //是否开启附件抽取
|
|
FileFields map[string]int //抽取附件属性组
|
|
FileFields map[string]int //抽取附件属性组
|
|
@@ -91,6 +92,8 @@ type ExtractTask struct {
|
|
AreaProvinceGet *ju.DFA //省
|
|
AreaProvinceGet *ju.DFA //省
|
|
AreaSimGet *ju.DFA //市简称
|
|
AreaSimGet *ju.DFA //市简称
|
|
AreaStreet *ju.DFA //街道
|
|
AreaStreet *ju.DFA //街道
|
|
|
|
+
|
|
|
|
+ InfoType []map[string]interface{}
|
|
}
|
|
}
|
|
|
|
|
|
type ClearTaskInfo struct {
|
|
type ClearTaskInfo struct {
|
|
@@ -262,156 +265,183 @@ func (e *ExtractTask) InitRuleBacks() {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+func (e *ExtractTask) InfoTypeList() {
|
|
|
|
+ infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
|
|
|
|
+ infolist := *infolist1
|
|
|
|
+ for _, v := range infolist {
|
|
|
|
+ e.InfoType = append(e.InfoType, v)
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
|
|
//加载抽取规则
|
|
//加载抽取规则
|
|
func (e *ExtractTask) InitRuleCore() {
|
|
func (e *ExtractTask) InitRuleCore() {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
e.Fields = map[string]int{}
|
|
e.Fields = map[string]int{}
|
|
- e.RuleCores = []*RuleCore{}
|
|
|
|
- vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, vinfo := range *vinfos {
|
|
|
|
- if b, _ := vinfo["isuse"].(bool); !b {
|
|
|
|
|
|
+ infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
|
|
|
|
+ e.RuleCores=make(map[string]map[string][]*RuleCore)
|
|
|
|
+ for _, v := range *infolist {
|
|
|
|
+ topclass := qu.ObjToString(v["topclass"])
|
|
|
|
+ if v["subclass"] == nil {
|
|
|
|
+ e.RuleCores[topclass]=make(map[string][]*RuleCore)
|
|
|
|
+ for attr, _ := range v["fields"].(map[string]interface{}) {
|
|
|
|
+ vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
|
|
|
|
+ e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ for ca, fs := range v["subclass"].(map[string]interface{}) {
|
|
|
|
+ e.RuleCores[topclass+"_"+ca]=make(map[string][]*RuleCore)
|
|
|
|
+ for field, _ := range fs.(map[string]interface{}) {
|
|
|
|
+ vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
|
|
|
|
+ e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
|
|
|
|
+ maps := []*RuleCore{}
|
|
|
|
+ if b, _ := vinfo["isuse"].(bool); !b {
|
|
|
|
+ return nil
|
|
|
|
+ }
|
|
|
|
+ s_field := qu.ObjToString(vinfo["s_field"])
|
|
|
|
+ pid := qu.BsonIdToSId(vinfo["_id"])
|
|
|
|
+ list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, vv := range *list {
|
|
|
|
+ if b, _ := vv["isuse"].(bool); !b {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
- s_field := qu.ObjToString(vinfo["s_field"])
|
|
|
|
- pid := qu.BsonIdToSId(vinfo["_id"])
|
|
|
|
- list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, vv := range *list {
|
|
|
|
- if b, _ := vv["isuse"].(bool); !b {
|
|
|
|
- continue
|
|
|
|
|
|
+ rcore := &RuleCore{}
|
|
|
|
+ rcore.Field = s_field
|
|
|
|
+ rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
|
|
|
|
+ rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
|
|
|
|
+ //前置规则
|
|
|
|
+ rulePres := []*RegLuaInfo{}
|
|
|
|
+ plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, v := range *plist {
|
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
|
+ Field: qu.ObjToString(v["s_field"]),
|
|
|
|
+ Code: v["s_code"].(string),
|
|
|
|
+ Name: v["s_name"].(string),
|
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
}
|
|
}
|
|
- rcore := &RuleCore{}
|
|
|
|
- rcore.Field = s_field
|
|
|
|
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
|
|
|
|
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
|
|
|
|
- //前置规则
|
|
|
|
- rulePres := []*RegLuaInfo{}
|
|
|
|
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, v := range *plist {
|
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
|
- Field: qu.ObjToString(v["s_field"]),
|
|
|
|
- Code: v["s_code"].(string),
|
|
|
|
- Name: v["s_name"].(string),
|
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
- }
|
|
|
|
- if rinfo.IsLua {
|
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
|
|
+ if rinfo.IsLua {
|
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
+ rulePres = append(rulePres, rinfo)
|
|
|
|
+ } else {
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
+ var pattern string
|
|
|
|
+ if strings.Contains(tmp[0], "\\u") {
|
|
|
|
+ pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
+ } else {
|
|
|
|
+ pattern = tmp[0]
|
|
|
|
+ }
|
|
|
|
+ if len(tmp) == 2 {
|
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
|
|
|
|
+ } else {
|
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
|
|
|
|
+ }
|
|
rulePres = append(rulePres, rinfo)
|
|
rulePres = append(rulePres, rinfo)
|
|
- } else {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
- var pattern string
|
|
|
|
- if strings.Contains(tmp[0], "\\u") {
|
|
|
|
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
- } else {
|
|
|
|
- pattern = tmp[0]
|
|
|
|
- }
|
|
|
|
- if len(tmp) == 2 {
|
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
|
|
|
|
- } else {
|
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
|
|
|
|
- }
|
|
|
|
- rulePres = append(rulePres, rinfo)
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
- })
|
|
|
|
- }
|
|
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
+ })
|
|
}
|
|
}
|
|
- rcore.RulePres = rulePres
|
|
|
|
-
|
|
|
|
- //后置规则
|
|
|
|
- ruleBacks := []*RegLuaInfo{}
|
|
|
|
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, v := range *blist {
|
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
|
- Field: qu.ObjToString(v["s_field"]),
|
|
|
|
- Code: v["s_code"].(string),
|
|
|
|
- Name: v["s_name"].(string),
|
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
- }
|
|
|
|
- if rinfo.IsLua {
|
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
|
|
+ }
|
|
|
|
+ rcore.RulePres = rulePres
|
|
|
|
+
|
|
|
|
+ //后置规则
|
|
|
|
+ ruleBacks := []*RegLuaInfo{}
|
|
|
|
+ blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, v := range *blist {
|
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
|
+ Field: qu.ObjToString(v["s_field"]),
|
|
|
|
+ Code: v["s_code"].(string),
|
|
|
|
+ Name: v["s_name"].(string),
|
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
+ }
|
|
|
|
+ if rinfo.IsLua {
|
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
+ ruleBacks = append(ruleBacks, rinfo)
|
|
|
|
+ } else {
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
+ var pattern string
|
|
|
|
+ if strings.Contains(tmp[0], "\\u") {
|
|
|
|
+ pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
+ } else {
|
|
|
|
+ pattern = tmp[0]
|
|
|
|
+ }
|
|
|
|
+ if len(tmp) == 2 {
|
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
|
|
|
|
+ } else {
|
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
|
|
|
|
+ }
|
|
ruleBacks = append(ruleBacks, rinfo)
|
|
ruleBacks = append(ruleBacks, rinfo)
|
|
- } else {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
- var pattern string
|
|
|
|
- if strings.Contains(tmp[0], "\\u") {
|
|
|
|
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
- } else {
|
|
|
|
- pattern = tmp[0]
|
|
|
|
- }
|
|
|
|
- if len(tmp) == 2 {
|
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
|
|
|
|
- } else {
|
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
|
|
|
|
- }
|
|
|
|
- ruleBacks = append(ruleBacks, rinfo)
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
- })
|
|
|
|
- }
|
|
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
+ })
|
|
}
|
|
}
|
|
- rcore.RuleBacks = ruleBacks
|
|
|
|
|
|
+ }
|
|
|
|
+ rcore.RuleBacks = ruleBacks
|
|
|
|
|
|
- //抽取规则
|
|
|
|
- ruleCores := []*RegLuaInfo{}
|
|
|
|
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, v := range *clist {
|
|
|
|
- if b, _ := v["isuse"].(bool); !b {
|
|
|
|
- continue
|
|
|
|
- }
|
|
|
|
- field := qu.ObjToString(v["s_field"])
|
|
|
|
- e.Fields[field] = 1 //加入抽取属性组备用
|
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
|
- Field: field,
|
|
|
|
- Code: v["s_code"].(string),
|
|
|
|
- Name: v["s_name"].(string),
|
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
- }
|
|
|
|
- if rinfo.IsLua {
|
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
- //提取全部属性
|
|
|
|
- rinfo.LFields = getALLFields()
|
|
|
|
- ruleCores = append(ruleCores, rinfo)
|
|
|
|
- } else {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
- var pattern string
|
|
|
|
- if strings.Contains(tmp[0], "\\u") {
|
|
|
|
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
- } else {
|
|
|
|
- pattern = tmp[0]
|
|
|
|
- }
|
|
|
|
- if len(tmp) == 2 {
|
|
|
|
- epos := strings.Split(tmp[1], ",")
|
|
|
|
- posm := map[string]int{}
|
|
|
|
- for _, v := range epos {
|
|
|
|
- ks := strings.Split(v, ":")
|
|
|
|
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
|
|
|
|
- posm[ks[1]] = qu.IntAll(ks[0])
|
|
|
|
- } else { //(.*)招标公告__2
|
|
|
|
- posm[rinfo.Field] = qu.IntAll(ks[0])
|
|
|
|
- }
|
|
|
|
|
|
+ //抽取规则
|
|
|
|
+ ruleCores := []*RegLuaInfo{}
|
|
|
|
+ clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, v := range *clist {
|
|
|
|
+ if b, _ := v["isuse"].(bool); !b {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ field := qu.ObjToString(v["s_field"])
|
|
|
|
+ e.Fields[field] = 1 //加入抽取属性组备用
|
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
|
+ Field: field,
|
|
|
|
+ Code: v["s_code"].(string),
|
|
|
|
+ Name: v["s_name"].(string),
|
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
+ }
|
|
|
|
+ if rinfo.IsLua {
|
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
+ //提取全部属性
|
|
|
|
+ rinfo.LFields = getALLFields()
|
|
|
|
+ ruleCores = append(ruleCores, rinfo)
|
|
|
|
+ } else {
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
+ var pattern string
|
|
|
|
+ if strings.Contains(tmp[0], "\\u") {
|
|
|
|
+ pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
+ } else {
|
|
|
|
+ pattern = tmp[0]
|
|
|
|
+ }
|
|
|
|
+ if len(tmp) == 2 {
|
|
|
|
+ epos := strings.Split(tmp[1], ",")
|
|
|
|
+ posm := map[string]int{}
|
|
|
|
+ for _, v := range epos {
|
|
|
|
+ ks := strings.Split(v, ":")
|
|
|
|
+ if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
|
|
|
|
+ posm[ks[1]] = qu.IntAll(ks[0])
|
|
|
|
+ } else { //(.*)招标公告__2
|
|
|
|
+ posm[rinfo.Field] = qu.IntAll(ks[0])
|
|
}
|
|
}
|
|
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
|
|
|
|
- } else {
|
|
|
|
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
|
|
|
|
}
|
|
}
|
|
- ruleCores = append(ruleCores, rinfo)
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
- })
|
|
|
|
- }
|
|
|
|
|
|
+ rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
|
|
|
|
+ } else {
|
|
|
|
+ rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
|
|
|
|
+ }
|
|
|
|
+ ruleCores = append(ruleCores, rinfo)
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
+ })
|
|
}
|
|
}
|
|
- rcore.RuleCores = ruleCores
|
|
|
|
- //
|
|
|
|
- e.RuleCores = append(e.RuleCores, rcore)
|
|
|
|
}
|
|
}
|
|
|
|
+ rcore.RuleCores = ruleCores
|
|
|
|
+ //
|
|
|
|
+ maps = append(maps, rcore)
|
|
}
|
|
}
|
|
|
|
+ return maps
|
|
}
|
|
}
|
|
|
|
|
|
//加载分包抽取规则
|
|
//加载分包抽取规则
|