|
@@ -1,6 +1,7 @@
|
|
|
package extract
|
|
|
|
|
|
import (
|
|
|
+ "encoding/json"
|
|
|
db "jy/mongodbutil"
|
|
|
ju "jy/util"
|
|
|
"log"
|
|
@@ -24,9 +25,10 @@ type ExtReg struct {
|
|
|
Reg *regexp.Regexp
|
|
|
Replace string
|
|
|
Bextract bool
|
|
|
- ExtractPos int
|
|
|
+ ExtractPos map[string]int
|
|
|
}
|
|
|
type RuleCore struct {
|
|
|
+ LuaLogic string //进入逻辑
|
|
|
RulePres []*RegLuaInfo //前置规则
|
|
|
RuleBacks []*RegLuaInfo //后置规则
|
|
|
RuleCores []*RegLuaInfo //抽取规则
|
|
@@ -40,13 +42,13 @@ type TaskInfo struct {
|
|
|
ProcessPool chan bool //任务进程池
|
|
|
}
|
|
|
type ExtField struct {
|
|
|
- Field string //属性
|
|
|
- Value map[string]int //属性值:出现次数
|
|
|
- ExtNum int //抽取次数
|
|
|
+ Field string //属性
|
|
|
+ Value []interface{} //抽取结果
|
|
|
}
|
|
|
type ExtractTask struct {
|
|
|
Id string //任务id
|
|
|
IsRun bool //是否启动
|
|
|
+ Content string //信息内容
|
|
|
TaskInfo *TaskInfo //任务信息
|
|
|
RulePres []*RegLuaInfo //前置规则
|
|
|
RuleBacks []*RegLuaInfo //后置规则
|
|
@@ -134,7 +136,7 @@ func (e *ExtractTask) InitTaskInfo() {
|
|
|
//加载前置规则
|
|
|
func (e *ExtractTask) InitRulePres() {
|
|
|
defer qu.Catch()
|
|
|
- list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`"}`, `{"_id":-1}`, nil, false, -1, -1)
|
|
|
+ list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
for _, v := range *list {
|
|
|
rinfo := &RegLuaInfo{
|
|
|
Code: v["s_code"].(string),
|
|
@@ -159,7 +161,7 @@ func (e *ExtractTask) InitRulePres() {
|
|
|
//加载后置规则
|
|
|
func (e *ExtractTask) InitRuleBacks() {
|
|
|
defer qu.Catch()
|
|
|
- list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`"}`, `{"_id":-1}`, nil, false, -1, -1)
|
|
|
+ list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
for _, v := range *list {
|
|
|
rinfo := &RegLuaInfo{
|
|
|
Code: v["s_code"].(string),
|
|
@@ -184,15 +186,17 @@ func (e *ExtractTask) InitRuleBacks() {
|
|
|
//加载抽取规则
|
|
|
func (e *ExtractTask) InitRuleCore() {
|
|
|
defer qu.Catch()
|
|
|
- list, _ := db.Mgo.Find("rule_logic", `{"s_version":"`+e.TaskInfo.Version+`"}`, `{"_id":-1}`, nil, false, -1, -1)
|
|
|
+ list, _ := db.Mgo.Find("rule_logic", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
for _, vv := range *list {
|
|
|
if b, _ := vv["isuse"].(bool); !b {
|
|
|
continue
|
|
|
}
|
|
|
rcore := &RuleCore{}
|
|
|
+ //是否进入逻辑脚本
|
|
|
+ rcore.LuaLogic = qu.ObjToString(vv["s_luascript"])
|
|
|
//前置规则
|
|
|
rulePres := []*RegLuaInfo{}
|
|
|
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, `{"_id":-1}`, nil, false, -1, -1)
|
|
|
+ plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
for _, v := range *plist {
|
|
|
rinfo := &RegLuaInfo{
|
|
|
Code: v["s_code"].(string),
|
|
@@ -217,7 +221,7 @@ func (e *ExtractTask) InitRuleCore() {
|
|
|
|
|
|
//后置规则
|
|
|
ruleBacks := []*RegLuaInfo{}
|
|
|
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, `{"_id":-1}`, nil, false, -1, -1)
|
|
|
+ blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
for _, v := range *blist {
|
|
|
rinfo := &RegLuaInfo{
|
|
|
Code: v["s_code"].(string),
|
|
@@ -242,7 +246,7 @@ func (e *ExtractTask) InitRuleCore() {
|
|
|
|
|
|
//抽取规则
|
|
|
ruleCores := []*RegLuaInfo{}
|
|
|
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, `{"_id":-1}`, nil, false, -1, -1)
|
|
|
+ clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
for _, v := range *clist {
|
|
|
if b, _ := v["isuse"].(bool); !b {
|
|
|
continue
|
|
@@ -259,9 +263,19 @@ func (e *ExtractTask) InitRuleCore() {
|
|
|
rinfo.Field = v["s_field"].(string)
|
|
|
tmp := strings.Split(rinfo.RuleText, "__")
|
|
|
if len(tmp) == 2 {
|
|
|
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: qu.IntAll(tmp[1])}
|
|
|
+ epos := strings.Split(tmp[1], ",")
|
|
|
+ posm := map[string]int{}
|
|
|
+ for _, v := range epos {
|
|
|
+ ks := strings.Split(v, ":")
|
|
|
+ if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
|
|
|
+ posm[ks[1]] = qu.IntAll(ks[0])
|
|
|
+ } else { //(.*)招标公告__2
|
|
|
+ posm[rinfo.Field] = qu.IntAll(ks[0])
|
|
|
+ }
|
|
|
+ }
|
|
|
+ rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
|
|
|
} else {
|
|
|
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false, ExtractPos: 0}
|
|
|
+ rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
|
|
|
}
|
|
|
}
|
|
|
ruleCores = append(ruleCores, rinfo)
|
|
@@ -282,9 +296,13 @@ func (e *ExtractTask) ExtractProcess(doc map[string]interface{}) {
|
|
|
for _, v := range e.RulePres {
|
|
|
doc = ExtRegPre(doc, v, e.TaskInfo)
|
|
|
}
|
|
|
- log.Println("前置规则,detail", doc["detail"])
|
|
|
+ log.Println("全局前置规则", doc)
|
|
|
//抽取规则
|
|
|
for _, vc := range e.RuleCores {
|
|
|
+ //是否进入逻辑
|
|
|
+ if !ju.Logic(vc.LuaLogic, doc) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
data := map[string]interface{}{}
|
|
|
//抽取-前置规则
|
|
|
tmpdoc := map[string]interface{}{}
|
|
@@ -303,6 +321,11 @@ func (e *ExtractTask) ExtractProcess(doc map[string]interface{}) {
|
|
|
data = ExtRegBack(data, v, e.TaskInfo)
|
|
|
}
|
|
|
log.Println("抽取-后置规则", data)
|
|
|
+ //全局后置规则
|
|
|
+ for _, v := range e.RuleBacks {
|
|
|
+ data = ExtRegBack(data, v, e.TaskInfo)
|
|
|
+ }
|
|
|
+ log.Println("全局后置规则", data)
|
|
|
|
|
|
//抽取结果赋值
|
|
|
for k, v := range data {
|
|
@@ -310,15 +333,16 @@ func (e *ExtractTask) ExtractProcess(doc map[string]interface{}) {
|
|
|
continue
|
|
|
}
|
|
|
if result[k] == nil {
|
|
|
- result[k] = &ExtField{Field: k, Value: map[string]int{qu.ObjToString(v): 1}, ExtNum: 1}
|
|
|
+ result[k] = &ExtField{Field: k, Value: []interface{}{v}}
|
|
|
} else {
|
|
|
- ef := result[k]
|
|
|
- ef.Value[qu.ObjToString(v)] += 1
|
|
|
- ef.ExtNum += 1
|
|
|
+ result[k].Value = append(result[k].Value, v)
|
|
|
}
|
|
|
}
|
|
|
- //抽取结果保存 todo
|
|
|
+ bs, _ := json.Marshal(result)
|
|
|
+ log.Println("抽取结果", string(bs))
|
|
|
}
|
|
|
+ //抽取结果保存 todo
|
|
|
+
|
|
|
}, func(err interface{}) {
|
|
|
log.Println(err)
|
|
|
<-e.TaskInfo.ProcessPool
|
|
@@ -364,8 +388,11 @@ func ExtRegCore(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[stri
|
|
|
apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
|
|
|
if len(apos) > 0 {
|
|
|
pos := apos[0]
|
|
|
- if len(pos)-1 > v.RegCore.ExtractPos {
|
|
|
- doc[v.Field] = text[pos[v.RegCore.ExtractPos]:pos[v.RegCore.ExtractPos+1]]
|
|
|
+ for k, p := range v.RegCore.ExtractPos {
|
|
|
+ if len(pos) > p {
|
|
|
+ doc[k] = text[pos[p]:pos[p+1]]
|
|
|
+ //log.Println(k, doc[k])
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
} else {
|
|
@@ -388,10 +415,13 @@ func ExtRegBack(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[stri
|
|
|
} else {
|
|
|
tmp := doc
|
|
|
if v.Field != "" && qu.ObjToString(doc[v.Field]) != "" {
|
|
|
- doc[v.Field] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(doc[v.Field]), "")
|
|
|
+ doc[v.Field] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(doc[v.Field]), v.RegPreBac.Replace)
|
|
|
} else {
|
|
|
for k, val := range doc {
|
|
|
- doc[k] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(val), "")
|
|
|
+ if k == "_id" || k == "detail" || qu.ObjToString(val) == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ doc[k] = v.RegPreBac.Reg.ReplaceAllString(qu.ObjToString(val), v.RegPreBac.Replace)
|
|
|
}
|
|
|
}
|
|
|
AddExtLog(tmp, doc, v, t) //抽取日志
|
|
@@ -400,7 +430,7 @@ func ExtRegBack(doc map[string]interface{}, v *RegLuaInfo, t *TaskInfo) map[stri
|
|
|
}
|
|
|
|
|
|
//抽取日志
|
|
|
-func AddExtLog(before, extifno map[string]interface{}, v *RegLuaInfo, t *TaskInfo) {
|
|
|
+func AddExtLog(before, extinfo map[string]interface{}, v *RegLuaInfo, t *TaskInfo) {
|
|
|
if !t.IsEtxLog {
|
|
|
return
|
|
|
}
|
|
@@ -412,10 +442,12 @@ func AddExtLog(before, extifno map[string]interface{}, v *RegLuaInfo, t *TaskInf
|
|
|
"version": t.Version,
|
|
|
"taskname": t.Name,
|
|
|
"before": before,
|
|
|
- "extinfo": extifno,
|
|
|
+ "extinfo": extinfo,
|
|
|
+ "sid": qu.BsonIdToSId(before["_id"]),
|
|
|
"comeintime": time.Now().Unix(),
|
|
|
}
|
|
|
lock.Lock()
|
|
|
+
|
|
|
ExtLogs[t] = append(ExtLogs[t], logdata)
|
|
|
lock.Unlock()
|
|
|
}
|
|
@@ -438,9 +470,10 @@ func SaveExtLog() {
|
|
|
v = v[saveLimit:]
|
|
|
} else {
|
|
|
k.DB.SaveBulk(k.TrackColl, v...)
|
|
|
+ break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- time.AfterFunc(2*time.Minute, SaveExtLog)
|
|
|
+ time.AfterFunc(1*time.Minute, SaveExtLog)
|
|
|
}
|