123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353 |
- // extractInit
- package extract
- import (
- db "jy/mongodbutil"
- "log"
- qu "qfw/util"
- "regexp"
- "strings"
- )
- type RegLuaInfo struct { //正则或脚本信息
- Code, Name, Field string //
- RuleText string //
- IsLua, IsHasFields bool //IsHasFields正则配置有属性字段
- RegPreBac *ExtReg //
- RegCore *ExtReg //
- LFields []interface{} //lua抽取字段属性组
- }
// ExtReg is a compiled regular-expression rule plus the data needed to apply
// it.
type ExtReg struct {
	// Reg is the compiled pattern.
	Reg *regexp.Regexp
	// Replace is the replacement text of a pre/back rule (empty when the rule
	// text carries none).
	Replace string
	// Bextract is true when the rule text carries extraction positions.
	Bextract bool
	// ExtractPos maps a field name to the position number parsed from the
	// rule text.
	ExtractPos map[string]int
}
- type RuleCore struct {
- Field string //逻辑字段
- LuaLogic string //进入逻辑
- ExtFrom string //从哪个字段抽取
- RulePres []*RegLuaInfo //抽取前置规则
- RuleBacks []*RegLuaInfo //抽取后置规则
- RuleCores []*RegLuaInfo //抽取规则
- }
- type TaskInfo struct {
- Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
- FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
- SaveColl, TestColl, LastExtId string //抽取结果表、测试结果表、上次抽取信息id
- DB *db.Pool //数据库连接池
- IsEtxLog bool //是否开启抽取日志
- ProcessPool chan bool //任务进程池
- TestLua bool //检查测试用
- }
// Tag is one entry of the tag library.
type Tag struct {
	// Type is the tag kind: "string" for a plain string, "regexp" for a
	// regular expression.
	Type string
	// Key is the tag text (the pattern source for regexp tags).
	Key string
	// Reg is the compiled pattern; the loader sets it only for regexp tags.
	Reg *regexp.Regexp
}
- type ExtractTask struct {
- Id string //任务id
- IsRun bool //是否启动
- Content string //信息内容
- TaskInfo *TaskInfo //任务信息
- RulePres []*RegLuaInfo //通用前置规则
- RuleBacks []*RegLuaInfo //通用后置规则
- RuleCores []*RuleCore //抽取规则
- Tag map[string][]*Tag //标签库
- ClearFn map[string][]string //清理函数
- }
- func init() {
- TaskList = make(map[string]*ExtractTask)
- go SaveExtLog()
- }
- //加载任务信息
- func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`"}`)
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- TrackColl: trackcoll,
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- TestColl: resultcoll,
- IsEtxLog: true,
- ProcessPool: make(chan bool, 1),
- }
- } else {
- return
- }
- }
- //加载任务信息
- func (e *ExtractTask) InitTaskInfo() {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- log.Println("task", task)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`"}`)
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- //TrackColl: (*task)["s_trackcoll"].(string),
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- SaveColl: (*task)["s_mgosavecoll"].(string),
- IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
- LastExtId: qu.ObjToString((*task)["s_extlastid"]),
- ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
- }
- log.Println(e.TaskInfo.Name, e.TaskInfo.ProcessPool)
- } else {
- return
- }
- }
- //加载通用前置规则
- func (e *ExtractTask) InitRulePres() {
- defer qu.Catch()
- list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- e.RulePres = append(e.RulePres, rinfo)
- }
- }
- //加载通用后置规则
- func (e *ExtractTask) InitRuleBacks() {
- defer qu.Catch()
- list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- e.RuleBacks = append(e.RuleBacks, rinfo)
- }
- }
- //加载抽取规则
- func (e *ExtractTask) InitRuleCore() {
- defer qu.Catch()
- vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`"}`, nil, nil, false, -1, -1)
- for _, vinfo := range *vinfos {
- if b, _ := vinfo["isuse"].(bool); !b {
- continue
- }
- pid := qu.BsonIdToSId(vinfo["_id"])
- list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`"}`, nil, nil, false, -1, -1)
- for _, vv := range *list {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- rcore := &RuleCore{}
- rcore.Field = vinfo["s_field"].(string)
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
- //前置规则
- rulePres := []*RegLuaInfo{}
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
- for _, v := range *plist {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- rinfo.Field = v["s_field"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- rulePres = append(rulePres, rinfo)
- }
- rcore.RulePres = rulePres
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- rinfo.Field = v["s_field"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- ruleBacks = append(ruleBacks, rinfo)
- }
- rcore.RuleBacks = ruleBacks
- //抽取规则
- ruleCores := []*RegLuaInfo{}
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`"}`, nil, nil, false, -1, -1)
- for _, v := range *clist {
- if b, _ := v["isuse"].(bool); !b {
- continue
- }
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- //暂时提取全部属性
- rinfo.LFields = getALLFields()
- rinfo.IsHasFields = true
- /*rinfo.LFields, _ = v["s_fields"].([]interface{})
- if len(rinfo.LFields) > 0 {
- rinfo.IsHasFields = true
- }*/
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- rinfo.Field = v["s_field"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else { //(.*)招标公告__2
- posm[rinfo.Field] = qu.IntAll(ks[0])
- }
- }
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
- } else {
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- ruleCores = append(ruleCores, rinfo)
- }
- rcore.RuleCores = ruleCores
- //
- e.RuleCores = append(e.RuleCores, rcore)
- }
- }
- }
- //加载标签库
- func (e *ExtractTask) InitTag() {
- defer qu.Catch()
- e.Tag = map[string][]*Tag{}
- //字符串标签库
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"字符串","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- for _, key := range tmp {
- tag := &Tag{Type: "string", Key: key.(string)}
- e.Tag[field] = append(e.Tag[field], tag)
- }
- }
- }
- //正则标签库
- list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"正则","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- for _, key := range tmp {
- tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
- e.Tag[field] = append(e.Tag[field], tag)
- }
- }
- }
- }
- //获取fields
- func getALLFields() []interface{} {
- fields := []interface{}{}
- list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1}`, false, -1, -1)
- for _, v := range *list {
- fields = append(fields, v["s_field"])
- }
- return fields
- }
- //加载clear函数
- func (e *ExtractTask) InitClearFn() {
- list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
- fn := map[string][]string{}
- for _, tmp := range *list {
- field := tmp["s_field"].(string)
- fns := tmp["clear"].([]interface{})
- if fn[field] == nil {
- fn[field] = []string{}
- }
- for _, v := range fns {
- fn[field] = append(fn[field], v.(string))
- }
- }
- e.ClearFn = fn
- }
|