|
@@ -4,13 +4,15 @@ package extract
|
|
import (
|
|
import (
|
|
db "jy/mongodbutil"
|
|
db "jy/mongodbutil"
|
|
ju "jy/util"
|
|
ju "jy/util"
|
|
- "log"
|
|
|
|
qu "qfw/util"
|
|
qu "qfw/util"
|
|
"regexp"
|
|
"regexp"
|
|
"sort"
|
|
"sort"
|
|
"strconv"
|
|
"strconv"
|
|
"strings"
|
|
"strings"
|
|
|
|
+ "sync"
|
|
"time"
|
|
"time"
|
|
|
|
+
|
|
|
|
+ log "github.com/donnie4w/go-logger/logger"
|
|
)
|
|
)
|
|
|
|
|
|
type RegLuaInfo struct { //正则或脚本信息
|
|
type RegLuaInfo struct { //正则或脚本信息
|
|
@@ -53,22 +55,23 @@ type TaskInfo struct {
|
|
TestLua bool //检查测试用
|
|
TestLua bool //检查测试用
|
|
}
|
|
}
|
|
type ExtractTask struct {
|
|
type ExtractTask struct {
|
|
- Id string //任务id
|
|
|
|
- IsRun bool //是否启动
|
|
|
|
- Content string //信息内容
|
|
|
|
- TaskInfo *TaskInfo //任务信息
|
|
|
|
- RulePres []*RegLuaInfo //通用前置规则
|
|
|
|
- RuleBacks []*RegLuaInfo //通用后置规则
|
|
|
|
- RuleCores []*RuleCore //抽取规则
|
|
|
|
- PkgRuleCores []*RuleCore //分包抽取规则
|
|
|
|
- RuleBlock *ju.RuleBlock
|
|
|
|
- Tag map[string][]*Tag //标签库
|
|
|
|
- ClearFn map[string][]string //清理函数
|
|
|
|
- IsExtractCity bool //是否开启城市抽取
|
|
|
|
- Fields map[string]int //抽取属性组
|
|
|
|
-
|
|
|
|
- IsFileField bool //是否开启附件抽取
|
|
|
|
- FileFields map[string]int //抽取附件属性组
|
|
|
|
|
|
+ Id string //任务id
|
|
|
|
+ IsRun bool //是否启动
|
|
|
|
+ Content string //信息内容
|
|
|
|
+ TaskInfo *TaskInfo //任务信息
|
|
|
|
+ RulePres []*RegLuaInfo //通用前置规则
|
|
|
|
+ RuleBacks []*RegLuaInfo //通用后置规则
|
|
|
|
+ RuleBlock *ju.RuleBlock
|
|
|
|
+ //RuleCores []*RuleCore //抽取规则
|
|
|
|
+ RuleCores map[string]map[string][]*RuleCore //分类抽取规则
|
|
|
|
+ PkgRuleCores []*RuleCore //分包抽取规则
|
|
|
|
+ Tag map[string][]*Tag //标签库
|
|
|
|
+ ClearFn map[string][]string //清理函数
|
|
|
|
+ IsExtractCity bool //是否开启城市抽取
|
|
|
|
+ Fields map[string]int //抽取属性组
|
|
|
|
+
|
|
|
|
+ IsFileField bool //是否开启附件抽取
|
|
|
|
+ FileFields *sync.Map //抽取附件属性组
|
|
|
|
|
|
ResultChanel chan bool //抽取结果详情
|
|
ResultChanel chan bool //抽取结果详情
|
|
ResultArr [][]map[string]interface{} //抽取结果详情
|
|
ResultArr [][]map[string]interface{} //抽取结果详情
|
|
@@ -91,6 +94,8 @@ type ExtractTask struct {
|
|
AreaProvinceGet *ju.DFA //省
|
|
AreaProvinceGet *ju.DFA //省
|
|
AreaSimGet *ju.DFA //市简称
|
|
AreaSimGet *ju.DFA //市简称
|
|
AreaStreet *ju.DFA //街道
|
|
AreaStreet *ju.DFA //街道
|
|
|
|
+
|
|
|
|
+ InfoType []map[string]interface{}
|
|
}
|
|
}
|
|
|
|
|
|
type ClearTaskInfo struct {
|
|
type ClearTaskInfo struct {
|
|
@@ -156,11 +161,11 @@ func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
|
|
//加载任务信息
|
|
//加载任务信息
|
|
func (e *ExtractTask) InitTaskInfo() {
|
|
func (e *ExtractTask) InitTaskInfo() {
|
|
task, _ := db.Mgo.FindById("task", e.Id, nil)
|
|
task, _ := db.Mgo.FindById("task", e.Id, nil)
|
|
- log.Println("task", task)
|
|
|
|
|
|
+ log.Debug("task", task)
|
|
if len(*task) > 1 {
|
|
if len(*task) > 1 {
|
|
v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
|
|
v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
|
|
strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
|
|
strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
|
|
- log.Println("s_mgosavecoll", strs)
|
|
|
|
|
|
+ log.Debug("s_mgosavecoll", strs)
|
|
if len(strs) < 3 {
|
|
if len(strs) < 3 {
|
|
return
|
|
return
|
|
} else {
|
|
} else {
|
|
@@ -183,7 +188,7 @@ func (e *ExtractTask) InitTaskInfo() {
|
|
e.IsExtractCity = (*v)["isextractcity"].(bool)
|
|
e.IsExtractCity = (*v)["isextractcity"].(bool)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- log.Println(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
|
|
|
|
|
|
+ log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
|
|
} else {
|
|
} else {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
@@ -220,7 +225,7 @@ func (e *ExtractTask) InitRulePres() {
|
|
}
|
|
}
|
|
e.RulePres = append(e.RulePres, rinfo)
|
|
e.RulePres = append(e.RulePres, rinfo)
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
- log.Println(rinfo.Code, rinfo.Field, err)
|
|
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
})
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -257,161 +262,188 @@ func (e *ExtractTask) InitRuleBacks() {
|
|
}
|
|
}
|
|
e.RuleBacks = append(e.RuleBacks, rinfo)
|
|
e.RuleBacks = append(e.RuleBacks, rinfo)
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
- log.Println(rinfo.Code, rinfo.Field, err)
|
|
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
})
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+func (e *ExtractTask) InfoTypeList() {
|
|
|
|
+ infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
|
|
|
|
+ infolist := *infolist1
|
|
|
|
+ for _, v := range infolist {
|
|
|
|
+ e.InfoType = append(e.InfoType, v)
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
|
|
//加载抽取规则
|
|
//加载抽取规则
|
|
func (e *ExtractTask) InitRuleCore() {
|
|
func (e *ExtractTask) InitRuleCore() {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
e.Fields = map[string]int{}
|
|
e.Fields = map[string]int{}
|
|
- e.RuleCores = []*RuleCore{}
|
|
|
|
- vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, vinfo := range *vinfos {
|
|
|
|
- if b, _ := vinfo["isuse"].(bool); !b {
|
|
|
|
|
|
+ infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
|
|
|
|
+ e.RuleCores = make(map[string]map[string][]*RuleCore)
|
|
|
|
+ for _, v := range *infolist {
|
|
|
|
+ topclass := qu.ObjToString(v["topclass"])
|
|
|
|
+ if v["subclass"] == nil {
|
|
|
|
+ e.RuleCores[topclass] = make(map[string][]*RuleCore)
|
|
|
|
+ for attr, _ := range v["fields"].(map[string]interface{}) {
|
|
|
|
+ vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
|
|
|
|
+ e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ for ca, fs := range v["subclass"].(map[string]interface{}) {
|
|
|
|
+ e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
|
|
|
|
+ for field, _ := range fs.(map[string]interface{}) {
|
|
|
|
+ vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
|
|
|
|
+ e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
|
|
|
|
+ maps := []*RuleCore{}
|
|
|
|
+ if b, _ := vinfo["isuse"].(bool); !b {
|
|
|
|
+ return nil
|
|
|
|
+ }
|
|
|
|
+ s_field := qu.ObjToString(vinfo["s_field"])
|
|
|
|
+ pid := qu.BsonIdToSId(vinfo["_id"])
|
|
|
|
+ list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, vv := range *list {
|
|
|
|
+ if b, _ := vv["isuse"].(bool); !b {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
- s_field := qu.ObjToString(vinfo["s_field"])
|
|
|
|
- pid := qu.BsonIdToSId(vinfo["_id"])
|
|
|
|
- list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, vv := range *list {
|
|
|
|
- if b, _ := vv["isuse"].(bool); !b {
|
|
|
|
- continue
|
|
|
|
|
|
+ rcore := &RuleCore{}
|
|
|
|
+ rcore.Field = s_field
|
|
|
|
+ rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
|
|
|
|
+ rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
|
|
|
|
+ //前置规则
|
|
|
|
+ rulePres := []*RegLuaInfo{}
|
|
|
|
+ plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, v := range *plist {
|
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
|
+ Field: qu.ObjToString(v["s_field"]),
|
|
|
|
+ Code: v["s_code"].(string),
|
|
|
|
+ Name: v["s_name"].(string),
|
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
}
|
|
}
|
|
- rcore := &RuleCore{}
|
|
|
|
- rcore.Field = s_field
|
|
|
|
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
|
|
|
|
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
|
|
|
|
- //前置规则
|
|
|
|
- rulePres := []*RegLuaInfo{}
|
|
|
|
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, v := range *plist {
|
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
|
- Field: qu.ObjToString(v["s_field"]),
|
|
|
|
- Code: v["s_code"].(string),
|
|
|
|
- Name: v["s_name"].(string),
|
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
- }
|
|
|
|
- if rinfo.IsLua {
|
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
|
|
+ if rinfo.IsLua {
|
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
+ rulePres = append(rulePres, rinfo)
|
|
|
|
+ } else {
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
+ var pattern string
|
|
|
|
+ if strings.Contains(tmp[0], "\\u") {
|
|
|
|
+ pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
+ } else {
|
|
|
|
+ pattern = tmp[0]
|
|
|
|
+ }
|
|
|
|
+ if len(tmp) == 2 {
|
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
|
|
|
|
+ } else {
|
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
|
|
|
|
+ }
|
|
rulePres = append(rulePres, rinfo)
|
|
rulePres = append(rulePres, rinfo)
|
|
- } else {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
- var pattern string
|
|
|
|
- if strings.Contains(tmp[0], "\\u") {
|
|
|
|
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
- } else {
|
|
|
|
- pattern = tmp[0]
|
|
|
|
- }
|
|
|
|
- if len(tmp) == 2 {
|
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
|
|
|
|
- } else {
|
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
|
|
|
|
- }
|
|
|
|
- rulePres = append(rulePres, rinfo)
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Println(rinfo.Code, rinfo.Field, err)
|
|
|
|
- })
|
|
|
|
- }
|
|
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
+ })
|
|
}
|
|
}
|
|
- rcore.RulePres = rulePres
|
|
|
|
|
|
+ }
|
|
|
|
+ rcore.RulePres = rulePres
|
|
|
|
|
|
- //后置规则
|
|
|
|
- ruleBacks := []*RegLuaInfo{}
|
|
|
|
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, v := range *blist {
|
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
|
- Field: qu.ObjToString(v["s_field"]),
|
|
|
|
- Code: v["s_code"].(string),
|
|
|
|
- Name: v["s_name"].(string),
|
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
- }
|
|
|
|
- if rinfo.IsLua {
|
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
|
|
+ //后置规则
|
|
|
|
+ ruleBacks := []*RegLuaInfo{}
|
|
|
|
+ blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, v := range *blist {
|
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
|
+ Field: qu.ObjToString(v["s_field"]),
|
|
|
|
+ Code: v["s_code"].(string),
|
|
|
|
+ Name: v["s_name"].(string),
|
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
+ }
|
|
|
|
+ if rinfo.IsLua {
|
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
+ ruleBacks = append(ruleBacks, rinfo)
|
|
|
|
+ } else {
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
+ var pattern string
|
|
|
|
+ if strings.Contains(tmp[0], "\\u") {
|
|
|
|
+ pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
+ } else {
|
|
|
|
+ pattern = tmp[0]
|
|
|
|
+ }
|
|
|
|
+ if len(tmp) == 2 {
|
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
|
|
|
|
+ } else {
|
|
|
|
+ rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
|
|
|
|
+ }
|
|
ruleBacks = append(ruleBacks, rinfo)
|
|
ruleBacks = append(ruleBacks, rinfo)
|
|
- } else {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
- var pattern string
|
|
|
|
- if strings.Contains(tmp[0], "\\u") {
|
|
|
|
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
- } else {
|
|
|
|
- pattern = tmp[0]
|
|
|
|
- }
|
|
|
|
- if len(tmp) == 2 {
|
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
|
|
|
|
- } else {
|
|
|
|
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
|
|
|
|
- }
|
|
|
|
- ruleBacks = append(ruleBacks, rinfo)
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Println(rinfo.Code, rinfo.Field, err)
|
|
|
|
- })
|
|
|
|
- }
|
|
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
+ })
|
|
}
|
|
}
|
|
- rcore.RuleBacks = ruleBacks
|
|
|
|
|
|
+ }
|
|
|
|
+ rcore.RuleBacks = ruleBacks
|
|
|
|
|
|
- //抽取规则
|
|
|
|
- ruleCores := []*RegLuaInfo{}
|
|
|
|
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
- for _, v := range *clist {
|
|
|
|
- if b, _ := v["isuse"].(bool); !b {
|
|
|
|
- continue
|
|
|
|
- }
|
|
|
|
- field := qu.ObjToString(v["s_field"])
|
|
|
|
- e.Fields[field] = 1 //加入抽取属性组备用
|
|
|
|
- rinfo := &RegLuaInfo{
|
|
|
|
- Field: field,
|
|
|
|
- Code: v["s_code"].(string),
|
|
|
|
- Name: v["s_name"].(string),
|
|
|
|
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
- }
|
|
|
|
- if rinfo.IsLua {
|
|
|
|
- rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
- //提取全部属性
|
|
|
|
- rinfo.LFields = getALLFields()
|
|
|
|
- ruleCores = append(ruleCores, rinfo)
|
|
|
|
- } else {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
- tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
- var pattern string
|
|
|
|
- if strings.Contains(tmp[0], "\\u") {
|
|
|
|
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
- } else {
|
|
|
|
- pattern = tmp[0]
|
|
|
|
- }
|
|
|
|
- if len(tmp) == 2 {
|
|
|
|
- epos := strings.Split(tmp[1], ",")
|
|
|
|
- posm := map[string]int{}
|
|
|
|
- for _, v := range epos {
|
|
|
|
- ks := strings.Split(v, ":")
|
|
|
|
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
|
|
|
|
- posm[ks[1]] = qu.IntAll(ks[0])
|
|
|
|
- } else { //(.*)招标公告__2
|
|
|
|
- posm[rinfo.Field] = qu.IntAll(ks[0])
|
|
|
|
- }
|
|
|
|
|
|
+ //抽取规则
|
|
|
|
+ ruleCores := []*RegLuaInfo{}
|
|
|
|
+ clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, v := range *clist {
|
|
|
|
+ if b, _ := v["isuse"].(bool); !b {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ field := qu.ObjToString(v["s_field"])
|
|
|
|
+ e.Fields[field] = 1 //加入抽取属性组备用
|
|
|
|
+ rinfo := &RegLuaInfo{
|
|
|
|
+ Field: field,
|
|
|
|
+ Code: v["s_code"].(string),
|
|
|
|
+ Name: v["s_name"].(string),
|
|
|
|
+ IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
|
|
|
|
+ }
|
|
|
|
+ if rinfo.IsLua {
|
|
|
|
+ rinfo.RuleText = v["s_luascript"].(string)
|
|
|
|
+ //提取全部属性
|
|
|
|
+ rinfo.LFields = getALLFields()
|
|
|
|
+ ruleCores = append(ruleCores, rinfo)
|
|
|
|
+ } else {
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ rinfo.RuleText = v["s_rule"].(string)
|
|
|
|
+ tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
|
+ var pattern string
|
|
|
|
+ if strings.Contains(tmp[0], "\\u") {
|
|
|
|
+ pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
|
+ } else {
|
|
|
|
+ pattern = tmp[0]
|
|
|
|
+ }
|
|
|
|
+ if len(tmp) == 2 {
|
|
|
|
+ epos := strings.Split(tmp[1], ",")
|
|
|
|
+ posm := map[string]int{}
|
|
|
|
+ for _, v := range epos {
|
|
|
|
+ ks := strings.Split(v, ":")
|
|
|
|
+ if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
|
|
|
|
+ posm[ks[1]] = qu.IntAll(ks[0])
|
|
|
|
+ } else { //(.*)招标公告__2
|
|
|
|
+ posm[rinfo.Field] = qu.IntAll(ks[0])
|
|
}
|
|
}
|
|
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
|
|
|
|
- } else {
|
|
|
|
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
|
|
|
|
}
|
|
}
|
|
- ruleCores = append(ruleCores, rinfo)
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Println(rinfo.Code, rinfo.Field, err)
|
|
|
|
- })
|
|
|
|
- }
|
|
|
|
|
|
+ rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
|
|
|
|
+ } else {
|
|
|
|
+ rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
|
|
|
|
+ }
|
|
|
|
+ ruleCores = append(ruleCores, rinfo)
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
|
|
+ })
|
|
}
|
|
}
|
|
- rcore.RuleCores = ruleCores
|
|
|
|
- //
|
|
|
|
- e.RuleCores = append(e.RuleCores, rcore)
|
|
|
|
}
|
|
}
|
|
|
|
+ rcore.RuleCores = ruleCores
|
|
|
|
+ //
|
|
|
|
+ maps = append(maps, rcore)
|
|
}
|
|
}
|
|
|
|
+ return maps
|
|
}
|
|
}
|
|
|
|
|
|
//加载分包抽取规则
|
|
//加载分包抽取规则
|
|
@@ -464,7 +496,7 @@ func (e *ExtractTask) InitPkgCore() {
|
|
}
|
|
}
|
|
ruleBacks = append(ruleBacks, rinfo)
|
|
ruleBacks = append(ruleBacks, rinfo)
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
- log.Println(rinfo.Code, rinfo.Field, err)
|
|
|
|
|
|
+ log.Debug(rinfo.Code, rinfo.Field, err)
|
|
})
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -746,82 +778,83 @@ func (e *ExtractTask) InitDFA() {
|
|
}
|
|
}
|
|
|
|
|
|
//保存抽取详情数据
|
|
//保存抽取详情数据
|
|
-func (e *ExtractTask) ResultSave() {
|
|
|
|
|
|
+func (e *ExtractTask) ResultSave(init bool) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
- e.ResultChanel = make(chan bool, 5)
|
|
|
|
- e.ResultArr = [][]map[string]interface{}{}
|
|
|
|
- for {
|
|
|
|
- if len(e.ResultArr) > 500 {
|
|
|
|
- e.ResultChanel <- true
|
|
|
|
- arr := e.ResultArr[:500]
|
|
|
|
- go func(tmp *[][]map[string]interface{}) {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- db.Mgo.UpSertBulk("extract_result", *tmp...)
|
|
|
|
- <-e.ResultChanel
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Println(err)
|
|
|
|
- <-e.ResultChanel
|
|
|
|
- })
|
|
|
|
- }(&arr)
|
|
|
|
- e.ResultArr = e.ResultArr[500:]
|
|
|
|
- } else {
|
|
|
|
- e.ResultChanel <- true
|
|
|
|
- arr := e.ResultArr
|
|
|
|
- go func(tmp *[][]map[string]interface{}) {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- db.Mgo.UpSertBulk("extract_result", *tmp...)
|
|
|
|
- <-e.ResultChanel
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Println(err)
|
|
|
|
- <-e.ResultChanel
|
|
|
|
- })
|
|
|
|
- }(&arr)
|
|
|
|
- e.ResultArr = [][]map[string]interface{}{}
|
|
|
|
- time.Sleep(10 * time.Second)
|
|
|
|
- }
|
|
|
|
- if !e.IsRun {
|
|
|
|
- break
|
|
|
|
- }
|
|
|
|
|
|
+ if e.ResultArr == nil {
|
|
|
|
+ e.ResultArr = [][]map[string]interface{}{}
|
|
|
|
+ }
|
|
|
|
+ if init {
|
|
|
|
+ go func() {
|
|
|
|
+ for {
|
|
|
|
+ if len(e.ResultArr) > 500 {
|
|
|
|
+ arr := e.ResultArr[:500]
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ db.Mgo.UpSertBulk("extract_result", arr...)
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(err)
|
|
|
|
+ })
|
|
|
|
+ e.ResultArr = e.ResultArr[500:]
|
|
|
|
+ } else {
|
|
|
|
+ arr := e.ResultArr
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ db.Mgo.UpSertBulk("extract_result", arr...)
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(err)
|
|
|
|
+ })
|
|
|
|
+ e.ResultArr = [][]map[string]interface{}{}
|
|
|
|
+ }
|
|
|
|
+ time.Sleep(10 * time.Second)
|
|
|
|
+ }
|
|
|
|
+ }()
|
|
|
|
+ } else {
|
|
|
|
+ arr := e.ResultArr
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(err)
|
|
|
|
+ })
|
|
|
|
+ e.ResultArr = [][]map[string]interface{}{}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//保存抽取数据
|
|
//保存抽取数据
|
|
-func (e *ExtractTask) BidSave() {
|
|
|
|
|
|
+func (e *ExtractTask) BidSave(init bool) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
- e.BidChanel = make(chan bool, 5)
|
|
|
|
- e.BidArr = [][]map[string]interface{}{}
|
|
|
|
- for {
|
|
|
|
- if len(e.BidArr) > 500 {
|
|
|
|
- e.BidChanel <- true
|
|
|
|
- arr := e.BidArr[:500]
|
|
|
|
- go func(tmp *[][]map[string]interface{}) {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- db.Mgo.UpSertBulk(e.TaskInfo.ToColl, *tmp...)
|
|
|
|
- <-e.BidChanel
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Println(err)
|
|
|
|
- <-e.BidChanel
|
|
|
|
- })
|
|
|
|
- }(&arr)
|
|
|
|
- e.BidArr = e.BidArr[500:]
|
|
|
|
- } else {
|
|
|
|
- e.BidChanel <- true
|
|
|
|
- arr := e.BidArr
|
|
|
|
- go func(tmp *[][]map[string]interface{}) {
|
|
|
|
- qu.Try(func() {
|
|
|
|
- db.Mgo.UpSertBulk(e.TaskInfo.ToColl, *tmp...)
|
|
|
|
- <-e.BidChanel
|
|
|
|
- }, func(err interface{}) {
|
|
|
|
- log.Println(err)
|
|
|
|
- <-e.BidChanel
|
|
|
|
- })
|
|
|
|
- }(&arr)
|
|
|
|
- e.BidArr = [][]map[string]interface{}{}
|
|
|
|
- }
|
|
|
|
- if !e.IsRun {
|
|
|
|
- break
|
|
|
|
- }
|
|
|
|
- time.Sleep(10 * time.Second)
|
|
|
|
|
|
+ if e.BidArr == nil {
|
|
|
|
+ e.BidArr = [][]map[string]interface{}{}
|
|
|
|
+ }
|
|
|
|
+ if init {
|
|
|
|
+ go func() {
|
|
|
|
+ for {
|
|
|
|
+ if len(e.BidArr) > 500 {
|
|
|
|
+ arr := e.BidArr[:500]
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(err)
|
|
|
|
+ })
|
|
|
|
+ e.BidArr = e.BidArr[500:]
|
|
|
|
+ } else {
|
|
|
|
+ arr := e.BidArr
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(err)
|
|
|
|
+ })
|
|
|
|
+ e.BidArr = [][]map[string]interface{}{}
|
|
|
|
+ }
|
|
|
|
+ time.Sleep(10 * time.Second)
|
|
|
|
+ }
|
|
|
|
+ }()
|
|
|
|
+ } else {
|
|
|
|
+ arr := e.BidArr
|
|
|
|
+ qu.Try(func() {
|
|
|
|
+ e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
|
|
|
|
+ }, func(err interface{}) {
|
|
|
|
+ log.Debug(err)
|
|
|
|
+ })
|
|
|
|
+ e.BidArr = [][]map[string]interface{}{}
|
|
|
|
+ time.Sleep(1 * time.Second)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
@@ -868,7 +901,7 @@ func (e *ExtractTask) InitAuditRule() {
|
|
ru = string(rs[1 : len(rs)-1])
|
|
ru = string(rs[1 : len(rs)-1])
|
|
rureg, err = regexp.Compile(ru)
|
|
rureg, err = regexp.Compile(ru)
|
|
if err != nil {
|
|
if err != nil {
|
|
- log.Println("error---rule:", r)
|
|
|
|
|
|
+ log.Debug("error---rule:", r)
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
i_rule = append(i_rule, []interface{}{rureg}...)
|
|
i_rule = append(i_rule, []interface{}{rureg}...)
|
|
@@ -921,13 +954,14 @@ func (e *ExtractTask) InitFile() {
|
|
if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
|
|
if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
|
|
e.IsFileField = true
|
|
e.IsFileField = true
|
|
}
|
|
}
|
|
- efiled := make(map[string]int, 0)
|
|
|
|
|
|
+ syscefiled := new(sync.Map)
|
|
|
|
+
|
|
if (*ve)["s_filefileds"] != nil {
|
|
if (*ve)["s_filefileds"] != nil {
|
|
for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
|
|
for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
|
|
- efiled[vff.(string)] = 1
|
|
|
|
|
|
+ syscefiled.Store(vff.(string), 1)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- e.FileFields = efiled
|
|
|
|
|
|
+ e.FileFields = syscefiled
|
|
}
|
|
}
|
|
|
|
|
|
//加载清理任务信息
|
|
//加载清理任务信息
|
|
@@ -945,7 +979,7 @@ func (c *ClearTask) InitClearTaskInfo() {
|
|
IsCltLog: ju.Config["iscltlog"].(bool),
|
|
IsCltLog: ju.Config["iscltlog"].(bool),
|
|
ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
|
|
ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
|
|
}
|
|
}
|
|
- log.Println(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
|
|
|
|
|
|
+ log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
|
|
} else {
|
|
} else {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
@@ -996,7 +1030,6 @@ func (e *ExtractTask) InitBlockRule() {
|
|
}
|
|
}
|
|
b_reg, b_err := regexp.Compile(block_reg)
|
|
b_reg, b_err := regexp.Compile(block_reg)
|
|
t_reg, t_err := regexp.Compile(title_reg)
|
|
t_reg, t_err := regexp.Compile(title_reg)
|
|
- log.Println(block_reg, title_reg, b_err, t_err)
|
|
|
|
if b_err != nil || t_err != nil {
|
|
if b_err != nil || t_err != nil {
|
|
continue
|
|
continue
|
|
}
|
|
}
|