|
- // extractInit
- package extract
- import (
- db "jy/mongodbutil"
- ju "jy/util"
- qu "qfw/util"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "sync"
- "time"
- log "github.com/donnie4w/go-logger/logger"
- )
- type RegLuaInfo struct { //正则或脚本信息
- Code, Name, Field string //
- RuleText string //
- IsLua bool //
- RegPreBac *ExtReg //
- RegCore *ExtReg //
- LFields map[string]string //lua抽取字段属性组
- }
// ExtReg bundles a compiled regular expression with the options that tell
// the rule engine how to apply it.
type ExtReg struct {
	Reg        *regexp.Regexp // compiled pattern
	Replace    string         // replacement text used by pre/post rules
	Bextract   bool           // true when capture-group positions were configured
	ExtractPos map[string]int // field name -> capture-group index
	// NumSign is a sign-correction marker, e.g. for a floating rate:
	// +1 for an upward float, -1 for a downward float.
	NumSign int
}
- type RuleCore struct {
- Field string //逻辑字段
- LuaLogic string //进入逻辑
- ExtFrom string //从哪个字段抽取
- RulePres []*RegLuaInfo //抽取前置规则
- RuleBacks []*RegLuaInfo //抽取后置规则
- RuleCores []*RegLuaInfo //抽取规则
- }
// Tag is one entry of the tag library.
type Tag struct {
	Type string         // tag kind: "string" for a literal, "regexp" for a regular expression
	Key  string         // literal text or regexp source
	Reg  *regexp.Regexp // compiled pattern; set for regexp tags
}
- type TaskInfo struct {
- Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
- FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
- ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
- TestColl, LastExtId string //测试结果表、上次抽取信息id
- FDB *db.Pool //数据库连接池
- TDB *db.Pool //数据库连接池
- IsEtxLog bool //是否开启抽取日志
- ProcessPool chan bool //任务进程池
- TestLua bool //检查测试用
- }
- type ExtractTask struct {
- Id string //任务id
- IsRun bool //是否启动
- Content string //信息内容
- TaskInfo *TaskInfo //任务信息
- RulePres []*RegLuaInfo //通用前置规则
- RuleBacks []*RegLuaInfo //通用后置规则
- RuleBlock *ju.RuleBlock
- //RuleCores []*RuleCore //抽取规则
- RuleCores map[string]map[string][]*RuleCore //分类抽取规则
- PkgRuleCores []*RuleCore //分包抽取规则
- Tag map[string][]*Tag //标签库
- ClearFn map[string][]string //清理函数
- IsExtractCity bool //是否开启城市抽取
- Fields map[string]int //抽取属性组
- IsFileField bool //是否开启附件抽取
- FileFields *sync.Map //抽取附件属性组
- ResultChanel chan bool //抽取结果详情
- ResultArr [][]map[string]interface{} //抽取结果详情
- BidChanel chan bool //抽取结果
- BidArr [][]map[string]interface{} //抽取结果
- BidTotal int //结果数量
- RecogFieldMap map[string]map[string]interface{} //识别字段
- FidClassMap map[string][]map[string]interface{} //分类
- CidRuleMap map[string][]map[string]interface{} //规则
- AuditFields []string //需要审核的字段名称
- ProvinceMap map[string]string //省全称简称(key:浙江省 val:浙江)
- ProvinceBriefMap map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
- CityMap map[string]string //市全称简称(key:杭州市 val:杭州)
- CityBriefMap map[string]*City //市简称对应的市信息(key:杭州 val:&City{})
- CityFullMap map[string]*City //市全称对应的市信息(key:杭州市 val:&City{})
- DistrictCityMap map[string]*City //区或县对应的city
- DistrictSimAndAll map[string]string //区或县(key:简称 val:全称)
- StreetDistrictMap map[string]*District //街道对应的区或县
- ProvinceAllGet *ju.DFA //省全称
- ProvinceSimGet *ju.DFA //省简称
- CityAllGet *ju.DFA //市全称
- CitySimGet *ju.DFA //市简称
- DistrictAllGet *ju.DFA //区或县全称
- DistrictSimGet *ju.DFA //区或县简称
- StreetGet *ju.DFA //街道
- PostCodeMap map[string]*PostCode //邮编
- AreaCodeMap map[string]*AreaCode //区号
- InfoType []map[string]interface{}
- }
- type ClearTaskInfo struct {
- Name, Version, VersionId string //名称、版本、版本id
- FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
- FDB *db.Pool //数据库连接池
- TDB *db.Pool //数据库连接池
- IsCltLog bool //是否开启清理日志
- ProcessPool chan bool //任务进程池
- }
// ClearLua describes one Lua cleanup script.
type ClearLua struct {
	Field   string // field
	Code    string // code
	Name    string // name
	LuaText string
	// LuaLogic string // entry logic (commented out)
	// ExtFrom  string // which field to extract from (commented out)
	LFields map[string]string // attribute group available to Lua extraction scripts
}
- type ClearTask struct {
- Id string //任务id
- Content string //信息内容
- ClearTaskInfo *ClearTaskInfo //任务信息
- ClearLuas map[string][]*ClearLua //清理脚本
- UpdateResult [][]map[string]interface{} //清理后结果
- ClearChannel chan bool
- }
- func init() {
- TaskList = make(map[string]*ExtractTask)
- ClearTaskList = make(map[string]*ClearTask)
- go SaveExtLog()
- go SaveCltLog() //保存清理日志
- }
- //加载任务信息
- func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- TrackColl: trackcoll,
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- TestColl: resultcoll,
- IsEtxLog: true,
- ProcessPool: make(chan bool, 1),
- }
- if (*v)["isextractcity"] != nil {
- e.IsExtractCity = (*v)["isextractcity"].(bool)
- }
- } else {
- return
- }
- }
- //加载任务信息
- func (e *ExtractTask) InitTaskInfo() {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- log.Debug("task", task)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
- strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
- log.Debug("s_mgosavecoll", strs)
- if len(strs) < 3 {
- return
- } else {
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- //TrackColl: (*task)["s_trackcoll"].(string),
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- ToDbAddr: strs[0],
- ToDB: strs[1],
- ToColl: strs[2],
- IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
- LastExtId: qu.ObjToString((*task)["s_extlastid"]),
- ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
- }
- if (*v)["isextractcity"] != nil {
- e.IsExtractCity = (*v)["isextractcity"].(bool)
- }
- }
- log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
- } else {
- return
- }
- }
- //加载通用前置规则
- func (e *ExtractTask) InitRulePres() {
- defer qu.Catch()
- e.RulePres = []*RegLuaInfo{}
- list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- e.RulePres = append(e.RulePres, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- e.RulePres = append(e.RulePres, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- }
- //加载通用后置规则
- func (e *ExtractTask) InitRuleBacks() {
- defer qu.Catch()
- e.RuleBacks = []*RegLuaInfo{}
- list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- e.RuleBacks = append(e.RuleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- e.RuleBacks = append(e.RuleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- }
- func (e *ExtractTask) InfoTypeList() {
- infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
- infolist := *infolist1
- for _, v := range infolist {
- e.InfoType = append(e.InfoType, v)
- }
- }
- //加载抽取规则
- func (e *ExtractTask) InitRuleCore() {
- defer qu.Catch()
- e.Fields = map[string]int{}
- e.RuleCores = make(map[string]map[string][]*RuleCore)
- fieldrules := map[string][]*RuleCore{}
- vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vinfo := range *vinfos {
- if b, _ := vinfo["isuse"].(bool); !b {
- continue
- }
- s_field := qu.ObjToString(vinfo["s_field"])
- pid := qu.BsonIdToSId(vinfo["_id"])
- list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *list {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- rcore := &RuleCore{}
- rcore.Field = s_field
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
- //前置规则
- rulePres := []*RegLuaInfo{}
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *plist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- rulePres = append(rulePres, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- rulePres = append(rulePres, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RulePres = rulePres
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- ruleBacks = append(ruleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- ruleBacks = append(ruleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleBacks = ruleBacks
- //抽取规则
- ruleCores := []*RegLuaInfo{}
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *clist {
- if b, _ := v["isuse"].(bool); !b {
- continue
- }
- field := qu.ObjToString(v["s_field"])
- e.Fields[field] = 1 //加入抽取属性组备用
- rinfo := &RegLuaInfo{
- Field: field,
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- //提取全部属性
- rinfo.LFields = getALLFields()
- ruleCores = append(ruleCores, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else { //(.*)招标公告__2
- posm[rinfo.Field] = qu.IntAll(ks[0])
- }
- }
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
- } else {
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
- }
- ruleCores = append(ruleCores, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleCores = ruleCores
- //
- if fieldrules[s_field] == nil {
- fieldrules[s_field] = []*RuleCore{}
- }
- fieldrules[s_field] = append(fieldrules[s_field], rcore)
- }
- }
- //属性配置
- infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
- for _, v := range *infolist {
- topclass := qu.ObjToString(v["topclass"])
- if v["subclass"] == nil {
- e.RuleCores[topclass] = make(map[string][]*RuleCore)
- for attr, _ := range v["fields"].(map[string]interface{}) {
- e.RuleCores[topclass][attr] = fieldrules[attr]
- }
- } else {
- for ca, fs := range v["subclass"].(map[string]interface{}) {
- e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
- for field, _ := range fs.(map[string]interface{}) {
- e.RuleCores[topclass+"_"+ca][field] = fieldrules[field]
- }
- }
- }
- }
- }
- //加载抽取规则
- func (e *ExtractTask) InitRuleCore2() {
- defer qu.Catch()
- e.Fields = map[string]int{}
- infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
- e.RuleCores = make(map[string]map[string][]*RuleCore)
- for _, v := range *infolist {
- topclass := qu.ObjToString(v["topclass"])
- if v["subclass"] == nil {
- e.RuleCores[topclass] = make(map[string][]*RuleCore)
- for attr, _ := range v["fields"].(map[string]interface{}) {
- vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+attr+`"}`, `{}`)
- e.RuleCores[topclass][attr] = append(e.RuleCores[topclass][attr], e.InfoRole(*vinfo)...)
- }
- } else {
- for ca, fs := range v["subclass"].(map[string]interface{}) {
- e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
- for field, _ := range fs.(map[string]interface{}) {
- vinfo, _ := db.Mgo.FindOneByField("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false,"s_field":"`+field+`"}`, `{}`)
- e.RuleCores[topclass+"_"+ca][field] = append(e.RuleCores[topclass+"_"+ca][field], e.InfoRole(*vinfo)...)
- }
- }
- }
- }
- }
- func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
- maps := []*RuleCore{}
- if b, _ := vinfo["isuse"].(bool); !b {
- return nil
- }
- s_field := qu.ObjToString(vinfo["s_field"])
- pid := qu.BsonIdToSId(vinfo["_id"])
- list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *list {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- rcore := &RuleCore{}
- rcore.Field = s_field
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
- //前置规则
- rulePres := []*RegLuaInfo{}
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *plist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- rulePres = append(rulePres, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- rulePres = append(rulePres, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RulePres = rulePres
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- ruleBacks = append(ruleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- ruleBacks = append(ruleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleBacks = ruleBacks
- //抽取规则
- ruleCores := []*RegLuaInfo{}
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *clist {
- if b, _ := v["isuse"].(bool); !b {
- continue
- }
- field := qu.ObjToString(v["s_field"])
- e.Fields[field] = 1 //加入抽取属性组备用
- rinfo := &RegLuaInfo{
- Field: field,
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- //提取全部属性
- rinfo.LFields = getALLFields()
- ruleCores = append(ruleCores, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- ptmp := strings.Split(rinfo.RuleText, "#")
- sign := 0
- if len(ptmp) == 2 {
- if ptmp[1] == "正" {
- sign = 1
- } else if ptmp[1] == "负" {
- sign = -1
- }
- }
- tmp := strings.Split(ptmp[0], "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else { //(.*)招标公告__2
- posm[rinfo.Field] = qu.IntAll(ks[0])
- }
- }
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm, NumSign: sign}
- } else {
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
- }
- ruleCores = append(ruleCores, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleCores = ruleCores
- //
- maps = append(maps, rcore)
- }
- return maps
- }
- //加载分包抽取规则
- func (e *ExtractTask) InitPkgCore() {
- defer qu.Catch()
- e.PkgRuleCores = []*RuleCore{}
- pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, pkginfo := range *pkginfos {
- if b, _ := pkginfo["isuse"].(bool); !b {
- continue
- }
- s_field := qu.ObjToString(pkginfo["s_field"])
- pid := qu.BsonIdToSId(pkginfo["_id"])
- logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *logicList {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- rcore := &RuleCore{}
- rcore.Field = s_field
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- ruleBacks = append(ruleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- ruleBacks = append(ruleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleBacks = ruleBacks
- e.PkgRuleCores = append(e.PkgRuleCores, rcore)
- }
- }
- }
- //加载标签库
- func (e *ExtractTask) InitTag() {
- defer qu.Catch()
- e.Tag = map[string][]*Tag{}
- //字符串标签库
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- fname := qu.ObjToString(v["s_name"])
- tab := ju.TagFile{Name: fname} //用于表格kv
- tab.Items = make([]*ju.Tag, len(tmp))
- for k, key := range tmp {
- tag := &Tag{Type: "string", Key: key.(string)}
- e.Tag[field] = append(e.Tag[field], tag)
- tab.Items[k] = &ju.Tag{key.(string), 0 - k, nil}
- }
- sort.Sort(tab.Items)
- ju.TagdbTable[fname] = &tab
- }
- }
- //正则标签库
- list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- fname := qu.ObjToString(v["s_name"])
- tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
- tab.Items = make([]*ju.Tag, len(tmp))
- for k, key := range tmp {
- tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
- e.Tag[field] = append(e.Tag[field], tag)
- tab.Items[k] = &ju.Tag{key.(string), 0 - k, regexp.MustCompile(key.(string))}
- }
- sort.Sort(tab.Items)
- ju.TagdbTable[fname+"_reg"] = &tab
- }
- }
- }
- //获取fields
- func getALLFields() map[string]string {
- fields := map[string]string{}
- list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
- for _, v := range *list {
- fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
- }
- return fields
- }
- //加载clear函数
- func (e *ExtractTask) InitClearFn() {
- defer qu.Catch()
- list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string][]string{}
- for _, tmp := range *list {
- field := tmp["s_field"].(string)
- fns := tmp["clear"].([]interface{})
- if fn[field] == nil {
- fn[field] = []string{}
- }
- for _, v := range fns {
- fn[field] = append(fn[field], v.(string))
- }
- }
- e.ClearFn = fn
- }
- //加载省份
- func InitProvince(version string) map[string]interface{} {
- defer qu.Catch()
- fn := map[string]interface{}{}
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- content := v["content"]
- switch content.(type) {
- case string:
- fn[name] = []interface{}{content.(string)}
- case []interface{}:
- fn[name] = content
- }
- }
- return fn
- }
- //加载城市简称
- func InitCitySim(version string) map[string]map[string]interface{} {
- defer qu.Catch()
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string]map[string]interface{}{}
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- tmp := v["content"].(map[string]interface{})
- fn[name] = tmp
- }
- return fn
- }
- //加载城市全称
- func InitCityAll(version string) map[string]map[string]interface{} {
- defer qu.Catch()
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string]map[string]interface{}{}
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- tmp := v["content"].(map[string]interface{})
- fn[name] = tmp
- }
- return fn
- }
- //初始化城市省份敏感词
- func (e *ExtractTask) InitCityDFA() {
- defer qu.Catch()
- e.CityAllGet = &ju.DFA{}
- e.CitySimGet = &ju.DFA{}
- e.DistrictAllGet = &ju.DFA{}
- e.DistrictSimGet = &ju.DFA{}
- e.ProvinceAllGet = &ju.DFA{}
- e.ProvinceSimGet = &ju.DFA{}
- e.StreetGet = &ju.DFA{}
- //初始化map
- if e.ProvinceMap == nil {
- e.ProvinceMap = make(map[string]string)
- }
- if e.CityMap == nil {
- e.CityMap = make(map[string]string)
- }
- if e.DistrictSimAndAll == nil {
- e.DistrictSimAndAll = make(map[string]string)
- }
- if e.CityBriefMap == nil {
- e.CityBriefMap = make(map[string]*City)
- }
- if e.CityFullMap == nil {
- e.CityFullMap = make(map[string]*City)
- }
- if e.ProvinceBriefMap == nil {
- e.ProvinceBriefMap = make(map[string]*Province)
- }
- if e.DistrictCityMap == nil {
- e.DistrictCityMap = make(map[string]*City)
- }
- if e.StreetDistrictMap == nil {
- e.StreetDistrictMap = make(map[string]*District)
- }
- //初始化省
- fn1 := InitProvince(e.TaskInfo.Version)
- for k, v := range fn1 {
- for _, p := range v.([]interface{}) {
- p1, _ := p.(string)
- e.ProvinceAllGet.AddWord(p1) //华中科技大学
- e.ProvinceMap[p1] = k //华中科技大学:湖北
- }
- }
- //初始化城市全称
- fn2 := InitCityAll(e.TaskInfo.Version)
- for k, v := range fn2 {
- //加载省信息
- e.ProvinceAllGet.AddWord(k) //加入省全称dfa(k:浙江省)
- p := &Province{}
- p.Name = k //省全称:浙江省
- p.Brief = v["brief"].(string) //省简称:浙江
- e.ProvinceSimGet.AddWord(p.Brief) //加入省简称dfa(k:浙江)
- e.ProvinceMap[k] = p.Brief //浙江省:浙江
- e.ProvinceBriefMap[p.Brief] = p //浙江:省信息{}
- p.Cap = v["captial"].(string) //省会(杭州)
- //加载市信息
- city, _ := v["city"].(map[string]interface{})
- for k1, v1 := range city {
- e.CityAllGet.AddWord(k1) //加入市全称dfa(k:杭州市)
- v1m, _ := v1.(map[string]interface{})
- c := &City{}
- c.Name = k1 //市全称:杭州市
- c.Brief = v1m["brief"].(string) //市简称:杭州
- e.CitySimGet.AddWord(c.Brief) //加入市简称dfa(k:杭州)
- e.CityMap[k1] = c.Brief //杭州市:杭州
- e.CityBriefMap[c.Brief] = c //杭州:市信息{}
- e.CityFullMap[k1] = c //杭州市:市信息{}
- c.P = p
- if c.Name == p.Cap {
- p.Captial = c //加载province中的省会市信息{}
- }
- //区县
- districtmap := v1m["area"].(map[string]interface{}) //区或县
- for district, streetarr := range districtmap {
- d := &District{}
- d.Name = district
- d.C = c
- //省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
- //匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级?
- e.DistrictAllGet.AddWord(district) //加入区或县全称dfa
- ctmp := e.DistrictCityMap[district]
- if ctmp == nil {
- e.DistrictCityMap[district] = c
- }
- //街道
- for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
- e.StreetGet.AddWord(s) //加入街道敏感词
- dtmp := e.StreetDistrictMap[s]
- if dtmp == nil {
- e.StreetDistrictMap[s] = d
- }
- }
- }
- }
- }
- //初始化城市简称
- fn3 := InitCitySim(e.TaskInfo.Version)
- for _, v := range fn3 {
- city, _ := v["city"].(map[string]interface{})
- for _, v1 := range city {
- v1m, _ := v1.(map[string]interface{})
- cb := v1m["brief"].(string) //市简称
- arr := v1m["area"].(map[string]interface{}) //区或县简称
- for districtsim, districtall := range arr {
- e.DistrictSimAndAll[districtsim] = districtall.(string)
- d := &District{}
- d.Name = districtsim
- d.C = e.CityBriefMap[cb]
- e.DistrictSimGet.AddWord(districtsim) //加入区或县简称敏感词
- ctmp := e.DistrictCityMap[districtsim]
- if ctmp == nil {
- e.DistrictCityMap[districtsim] = e.CityBriefMap[cb]
- }
- }
- }
- }
- }
- //初始化邮编库
- func (e *ExtractTask) InitPostCode() {
- defer qu.Catch()
- if e.PostCodeMap == nil {
- e.PostCodeMap = make(map[string]*PostCode)
- }
- list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
- for _, l := range *list {
- pc := &PostCode{}
- pc.Code = qu.ObjToString(l["code"])
- pc.P = qu.ObjToString(l["province"])
- pc.C = qu.ObjToString(l["city"])
- pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
- e.PostCodeMap[pc.Code] = pc
- }
- }
- //初始化区号库
- func (e *ExtractTask) InitAreaCode() {
- defer qu.Catch()
- if e.AreaCodeMap == nil {
- e.AreaCodeMap = make(map[string]*AreaCode)
- }
- list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
- for _, l := range *list {
- ac := &AreaCode{}
- ac.Code = qu.ObjToString(l["code"])
- ac.P = qu.ObjToString(l["province"])
- ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
- e.AreaCodeMap[ac.Code] = ac
- }
- }
- //初始化城市省份敏感词
- //func (e *ExtractTask) InitCityDFA() {
- // defer qu.Catch()
- // e.CityAllGet = &ju.DFA{}
- // e.DistrictGet = &ju.DFA{}
- // e.AreaProvinceGet = &ju.DFA{}
- // e.StreetGet = &ju.DFA{}
- // //初始化map
- // if e.ProvinceMap == nil {
- // e.ProvinceMap = make(map[string]string)
- // }
- // if e.CityBriefMap == nil {
- // e.CityBriefMap = make(map[string]*City)
- // }
- // if e.ProvinceBriefMap == nil {
- // e.ProvinceBriefMap = make(map[string]*Province)
- // }
- // if e.AreaToCityMap == nil {
- // e.AreaToCityMap = make(map[string][]*City)
- // }
- // if e.DistrictCityMap == nil {
- // e.DistrictCityMap = make(map[string]*City)
- // }
- // if e.StreetDistrictMap == nil {
- // e.StreetDistrictMap = make(map[string]*District)
- // }
- // //初始化省
- // fn1 := InitProvince(e.TaskInfo.Version)
- // for k, v := range fn1 {
- // for _, p := range v.([]interface{}) {
- // p1, _ := p.(string)
- // e.AreaProvinceGet.AddWord(p1) //华中科技大学
- // e.ProvinceMap[p1] = k //华中科技大学:湖北
- // }
- // }
- // //初始化城市全称
- // fn2 := InitCityAll(e.TaskInfo.Version)
- // for k, v := range fn2 {
- // e.AreaProvinceGet.AddWord(k) //加入省全称dfa(k:浙江省)
- // p := &Province{}
- // p.Name = k //省全称
- // p.Brief = v["brief"].(string) //省简称
- // e.ProvinceMap[k] = p.Brief //浙江省:浙江
- // e.ProvinceBriefMap[p.Brief] = p //浙江:省信息
- // p.Cap = v["captial"].(string) //省会(杭州)
- // city, _ := v["city"].(map[string]interface{})
- // //
- // for k1, v1 := range city {
- // v1m, _ := v1.(map[string]interface{})
- // c := &City{}
- // c.Name = k1
- // c.Brief = v1m["brief"].(string)
- // e.CityBriefMap[c.Brief] = c
- // c.P = p
- // if c.Brief == p.Cap {
- // p.Captial = c
- // }
- // //加入到城市map中
- // //
- // cs := e.AreaToCityMap[k1]
- // e.CityAllGet.AddWord(k1) //市全称
- // if cs != nil {
- // cs = append(cs, c)
- // } else {
- // cs = []*City{c}
- // }
- // e.AreaToCityMap[k1] = cs
- // //区县
- // districtmap := v1m["area"].(map[string]interface{}) //区或县
- // for district, streetarr := range districtmap {
- // d := &District{}
- // d.Name = district
- // d.C = c
- // e.DistrictGet.AddWord(district) //加入区或县敏感词
- // ctmp := e.DistrictCityMap[district]
- // if ctmp == nil {
- // e.DistrictCityMap[district] = c
- // }
- // //街道
- // for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
- // e.StreetGet.AddWord(s) //加入街道敏感词
- // dtmp := e.StreetDistrictMap[s]
- // if dtmp == nil {
- // e.StreetDistrictMap[s] = d
- // }
- // }
- // }
- // }
- // }
- // //初始化城市简称
- // fn3 := InitCitySim(e.TaskInfo.Version)
- // e.CitySimGet = &ju.DFA{}
- // for k, v := range fn3 {
- // pb := v["brief"].(string)
- // p := e.ProvinceBriefMap[pb]
- // //加载
- // for _, ss := range []string{k, pb} { //省全称和省简称
- // cs := e.AreaToCityMap[ss]
- // if cs != nil {
- // cs = append(cs, p.Captial)
- // } else {
- // cs = []*City{p.Captial}
- // }
- // e.AreaToCityMap[ss] = cs
- // e.CitySimGet.AddWord(ss)
- // }
- // city, _ := v["city"].(map[string]interface{})
- // for k1, v1 := range city {
- // v1m, _ := v1.(map[string]interface{})
- // if v1m["brief"] == nil {
- // }
- // cb := v1m["brief"].(string)
- // c := e.AreaToCityMap[k1][0]
- // //加入到城市map中
- // for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
- // e.CitySimGet.AddWord(ss)
- // cs := e.AreaToCityMap[ss]
- // if cs != nil {
- // cs = append(cs, c)
- // } else {
- // cs = []*City{c}
- // }
- // e.AreaToCityMap[ss] = cs
- // }
- // arr := v1m["area"].([]interface{})
- // for _, k2 := range arr {
- // s := k2.(string)
- // for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
- // cs := e.AreaToCityMap[ss]
- // e.CitySimGet.AddWord(ss)
- // if cs != nil {
- // cs = append(cs, c)
- // } else {
- // cs = []*City{c}
- // }
- // e.AreaToCityMap[ss] = cs
- // //只加入简称
- // if n == 0 {
- // d := &District{}
- // d.Name = ss
- // d.C = c
- // e.DistrictGet.AddWord(ss) //加入区或县简称敏感词
- // ctmp := e.DistrictCityMap[ss]
- // if ctmp == nil {
- // e.DistrictCityMap[ss] = c
- // }
- // }
- // }
- // }
- // }
- // }
- //}
- //保存抽取详情数据
- func (e *ExtractTask) ResultSave(init bool) {
- defer qu.Catch()
- if e.ResultArr == nil {
- e.ResultArr = [][]map[string]interface{}{}
- }
- if init {
- go func() {
- for {
- if len(e.ResultArr) > 500 {
- arr := e.ResultArr[:500]
- e.ResultArr = e.ResultArr[500:]
- qu.Try(func() {
- db.Mgo.UpSertBulk("extract_result", arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- } else {
- arr := e.ResultArr
- e.ResultArr = [][]map[string]interface{}{}
- qu.Try(func() {
- db.Mgo.UpSertBulk("extract_result", arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- }
- time.Sleep(10 * time.Second)
- }
- }()
- } else {
- arr := e.ResultArr
- e.ResultArr = [][]map[string]interface{}{}
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- }
- }
- //保存抽取数据
- func (e *ExtractTask) BidSave(init bool) {
- defer qu.Catch()
- if e.BidArr == nil {
- e.BidArr = [][]map[string]interface{}{}
- }
- if init {
- go func() {
- for {
- if len(e.BidArr) > 500 {
- arr := e.BidArr[:500]
- e.BidArr = e.BidArr[500:]
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- } else {
- arr := e.BidArr
- e.BidArr = [][]map[string]interface{}{}
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- }
- time.Sleep(10 * time.Second)
- }
- }()
- } else {
- arr := e.BidArr
- e.BidArr = [][]map[string]interface{}{}
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- time.Sleep(1 * time.Second)
- }
- }
- func (e *ExtractTask) InitAuditRecogField() {
- defer qu.Catch()
- e.RecogFieldMap = make(map[string]map[string]interface{})
- recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
- for _, f := range *recogFieldList {
- field := qu.ObjToString(f["s_recogfield"])
- e.RecogFieldMap[field] = f
- }
- }
- func (e *ExtractTask) InitAuditClass() {
- defer qu.Catch()
- e.FidClassMap = make(map[string][]map[string]interface{})
- class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
- for _, c := range *class {
- classList := []map[string]interface{}{}
- fid := qu.ObjToString(c["s_fid"])
- if len(e.FidClassMap[fid]) > 0 { //追加
- classList = e.FidClassMap[fid]
- }
- classList = append(classList, c)
- e.FidClassMap[fid] = classList
- }
- }
- //加载规则
- func (e *ExtractTask) InitAuditRule() {
- defer qu.Catch()
- var rureg *regexp.Regexp
- var rs []rune
- var ru string
- var err error
- e.CidRuleMap = make(map[string][]map[string]interface{})
- rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
- for _, v := range *rule {
- i_rule := []interface{}{}
- ss, _ := (v["s_rule"].([]interface{}))
- for _, r := range qu.ObjArrToStringArr(ss) {
- if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
- rs = []rune(r)
- ru = string(rs[1 : len(rs)-1])
- rureg, err = regexp.Compile(ru)
- if err != nil {
- log.Debug("error---rule:", r)
- continue
- }
- i_rule = append(i_rule, []interface{}{rureg}...)
- } else { //规则
- i_rule = append(i_rule, r)
- }
- }
- v["rule"] = i_rule
- ruleList := []map[string]interface{}{}
- classid := qu.ObjToString(v["s_classid"])
- if len(e.CidRuleMap[classid]) > 0 { //追加
- ruleList = e.CidRuleMap[classid]
- }
- ruleList = append(ruleList, v)
- e.CidRuleMap[classid] = ruleList
- }
- }
- //
- func (e *ExtractTask) InitAuditFields() {
- if len(e.AuditFields) == 0 {
- v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
- if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
- vid := qu.BsonIdToSId((*v)["_id"])
- query := map[string]interface{}{
- "isaudit": true,
- "delete": false,
- "vid": vid,
- }
- data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
- for _, d := range *data {
- field := qu.ObjToString(d["s_field"])
- e.AuditFields = append(e.AuditFields, field)
- }
- }
- }
- }
- //加载附件抽取
- func (e *ExtractTask) InitFile() {
- defer qu.Catch()
- //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
- ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
- //ve, _ := db.Mgo.FindOne("version", query)
- if ve == nil {
- return
- }
- if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
- e.IsFileField = true
- }
- syscefiled := new(sync.Map)
- if (*ve)["s_filefileds"] != nil {
- for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
- syscefiled.Store(vff.(string), 1)
- }
- }
- e.FileFields = syscefiled
- }
- //加载清理任务信息
- func (c *ClearTask) InitClearTaskInfo() {
- cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
- if len(*cleartask) > 1 {
- v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
- c.ClearTaskInfo = &ClearTaskInfo{
- Name: (*cleartask)["s_taskname"].(string),
- Version: (*cleartask)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
- FromDB: (*cleartask)["s_mgodb"].(string),
- FromColl: (*cleartask)["s_mgocoll"].(string),
- IsCltLog: ju.Config["iscltlog"].(bool),
- ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
- }
- log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
- } else {
- return
- }
- }
- //加载清理脚本
- func (c *ClearTask) InitClearLuas() {
- defer qu.Catch()
- c.ClearLuas = make(map[string][]*ClearLua)
- list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, l := range *list {
- if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
- continue
- }
- s_field := qu.ObjToString(l["s_field"])
- pid := qu.BsonIdToSId(l["_id"])
- luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *luas {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- clearLua := &ClearLua{
- Field: s_field,
- Code: vv["s_code"].(string),
- Name: vv["s_name"].(string),
- LuaText: vv["s_luascript"].(string),
- LFields: getALLFields(),
- }
- c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
- }
- }
- }
- //加载分块规则
- func (e *ExtractTask) InitBlockRule() {
- datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
- "vid": e.TaskInfo.VersionId,
- "delete": false,
- }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
- brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
- for _, v := range *datas {
- block_reg, _ := v["block_reg"].(string)
- block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
- title_reg, _ := v["title_reg"].(string)
- title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
- if block_reg == "" || title_reg == "" {
- continue
- }
- b_reg, b_err := regexp.Compile(block_reg)
- t_reg, t_err := regexp.Compile(title_reg)
- if b_err != nil || t_err != nil {
- continue
- }
- brs = append(brs, b_reg)
- trs = append(trs, t_reg)
- }
- e.RuleBlock = &ju.RuleBlock{
- BlockRegs: brs,
- TitleRegs: trs,
- Classify: e.InitBlockClassify(),
- }
- }
- //加载分块规则
- func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
- classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
- "vid": e.TaskInfo.VersionId,
- "delete": false,
- }, nil, `{"name":1}`, false, -1, -1)
- classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
- "vid": e.TaskInfo.VersionId,
- "delete": false,
- }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
- classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
- "vid": e.TaskInfo.VersionId,
- "delete": false,
- }, nil, `{"name":1,"pid":1}`, false, -1, -1)
- tag_map := map[string]ju.Tags{}
- for _, v := range *classify_tag {
- pid := qu.ObjToString(v["pid"])
- tag_map[pid] = append(tag_map[pid], &ju.Tag{Value: qu.ObjToString(v["name"])})
- }
- //
- info_map := map[string][]*ju.NameCode{}
- info_tag := map[string]*ju.TagFile{}
- for _, v := range *classify_info {
- pid := qu.ObjToString(v["pid"])
- _id := qu.BsonIdToSId(v["_id"])
- name := qu.ObjToString(v["name"])
- info_tag[name] = &ju.TagFile{
- Name: name,
- Items: tag_map[_id],
- }
- info_map[pid] = append(info_map[pid], &ju.NameCode{
- Name: name,
- Code: qu.ObjToString(v["code"]),
- })
- }
- classify_map := map[string][]*ju.NameCode{}
- for _, v := range *classify {
- _id := qu.BsonIdToSId(v["_id"])
- if info_map[_id] == nil {
- continue
- }
- for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
- classify_map[vv] = append(classify_map[vv], info_map[_id]...)
- }
- }
- return &ju.BlockClassify{
- Type: classify_map,
- Classify: info_tag,
- }
- }
|