|
- // extractInit
- package extract
- import (
- db "jy/mongodbutil"
- ju "jy/util"
- qu "qfw/util"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "sync"
- "time"
- log "github.com/donnie4w/go-logger/logger"
- )
- type RegLuaInfo struct { //正则或脚本信息
- Code, Name, Field string //
- RuleText string //
- IsLua bool //
- RegPreBac *ExtReg //
- RegCore *ExtReg //
- LFields map[string]string //lua抽取字段属性组
- }
// ExtReg bundles a compiled regular expression with the settings that say how
// its matches are used.
type ExtReg struct {
	Reg        *regexp.Regexp // compiled pattern
	Replace    string         // replacement text, taken from the part after "__" in pre/post rules
	Bextract   bool           // true when the rule text carries extraction positions
	ExtractPos map[string]int // field name -> position parsed from the rule text
}
- type RuleCore struct {
- Field string //逻辑字段
- LuaLogic string //进入逻辑
- ExtFrom string //从哪个字段抽取
- RulePres []*RegLuaInfo //抽取前置规则
- RuleBacks []*RegLuaInfo //抽取后置规则
- RuleCores []*RegLuaInfo //抽取规则
- }
// Tag is one entry of the tag library.
type Tag struct {
	Type string         // tag type: "string" for literal tags, "regexp" for regex tags
	Key  string         // tag text (the pattern source for regex tags)
	Reg  *regexp.Regexp // compiled pattern; set for regex tags
}
- type TaskInfo struct {
- Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
- FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
- ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
- TestColl, LastExtId string //测试结果表、上次抽取信息id
- FDB *db.Pool //数据库连接池
- TDB *db.Pool //数据库连接池
- IsEtxLog bool //是否开启抽取日志
- ProcessPool chan bool //任务进程池
- TestLua bool //检查测试用
- }
- type ExtractTask struct {
- Id string //任务id
- IsRun bool //是否启动
- Content string //信息内容
- TaskInfo *TaskInfo //任务信息
- RulePres []*RegLuaInfo //通用前置规则
- RuleBacks []*RegLuaInfo //通用后置规则
- RuleCores []*RuleCore //抽取规则
- PkgRuleCores []*RuleCore //分包抽取规则
- Tag map[string][]*Tag //标签库
- ClearFn map[string][]string //清理函数
- IsExtractCity bool //是否开启城市抽取
- Fields map[string]int //抽取属性组
- IsFileField bool //是否开启附件抽取
- FileFields *sync.Map //抽取附件属性组
- ResultChanel chan bool //抽取结果详情
- ResultArr [][]map[string]interface{} //抽取结果详情
- BidChanel chan bool //抽取结果
- BidArr [][]map[string]interface{} //抽取结果
- RecogFieldMap map[string]map[string]interface{} //识别字段
- FidClassMap map[string][]map[string]interface{} //分类
- CidRuleMap map[string][]map[string]interface{} //规则
- AuditFields []string //需要审核的字段名称
- ProvinceMap map[string]string
- CityBrief map[string]*City //只加载一次即可
- ProvinceBrief map[string]*Province //只加载一次
- AreaToCity map[string][]*City //两个文件共用
- DistrictCityMap map[string]*City
- StreetDistrictMap map[string]*District
- AreaGet *ju.DFA //市全称
- AreaDistrict *ju.DFA //区或县
- AreaProvinceGet *ju.DFA //省
- AreaSimGet *ju.DFA //市简称
- AreaStreet *ju.DFA //街道
- }
- type ClearTaskInfo struct {
- Name, Version, VersionId string //名称、版本、版本id
- FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
- FDB *db.Pool //数据库连接池
- TDB *db.Pool //数据库连接池
- IsCltLog bool //是否开启清理日志
- ProcessPool chan bool //任务进程池
- }
// ClearLua is one Lua cleanup script attached to a field.
type ClearLua struct {
	Field   string // field name
	Code    string // code
	Name    string // name
	LuaText string // Lua script text
	//LuaLogic string // entry logic
	//ExtFrom  string // which field to extract from

	// LFields is the field attribute set for Lua extraction.
	LFields map[string]string
}
- type ClearTask struct {
- Id string //任务id
- Content string //信息内容
- ClearTaskInfo *ClearTaskInfo //任务信息
- ClearLuas map[string][]*ClearLua //清理脚本
- UpdateResult [][]map[string]interface{} //清理后结果
- ClearChannel chan bool
- }
- func init() {
- TaskList = make(map[string]*ExtractTask)
- ClearTaskList = make(map[string]*ClearTask)
- go SaveExtLog()
- go SaveCltLog() //保存清理日志
- }
- //加载任务信息
- func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- TrackColl: trackcoll,
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- TestColl: resultcoll,
- IsEtxLog: true,
- ProcessPool: make(chan bool, 1),
- }
- if (*v)["isextractcity"] != nil {
- e.IsExtractCity = (*v)["isextractcity"].(bool)
- }
- } else {
- return
- }
- }
- //加载任务信息
- func (e *ExtractTask) InitTaskInfo() {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- log.Debug("task", task)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
- strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
- log.Debug("s_mgosavecoll", strs)
- if len(strs) < 3 {
- return
- } else {
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- //TrackColl: (*task)["s_trackcoll"].(string),
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- ToDbAddr: strs[0],
- ToDB: strs[1],
- ToColl: strs[2],
- IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
- LastExtId: qu.ObjToString((*task)["s_extlastid"]),
- ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
- }
- if (*v)["isextractcity"] != nil {
- e.IsExtractCity = (*v)["isextractcity"].(bool)
- }
- }
- log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
- } else {
- return
- }
- }
- //加载通用前置规则
- func (e *ExtractTask) InitRulePres() {
- defer qu.Catch()
- e.RulePres = []*RegLuaInfo{}
- list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- e.RulePres = append(e.RulePres, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- e.RulePres = append(e.RulePres, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- }
- //加载通用后置规则
- func (e *ExtractTask) InitRuleBacks() {
- defer qu.Catch()
- e.RuleBacks = []*RegLuaInfo{}
- list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- e.RuleBacks = append(e.RuleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- e.RuleBacks = append(e.RuleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- }
- //加载抽取规则
- func (e *ExtractTask) InitRuleCore() {
- defer qu.Catch()
- e.Fields = map[string]int{}
- e.RuleCores = []*RuleCore{}
- vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vinfo := range *vinfos {
- if b, _ := vinfo["isuse"].(bool); !b {
- continue
- }
- s_field := qu.ObjToString(vinfo["s_field"])
- pid := qu.BsonIdToSId(vinfo["_id"])
- list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *list {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- rcore := &RuleCore{}
- rcore.Field = s_field
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
- //前置规则
- rulePres := []*RegLuaInfo{}
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *plist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- rulePres = append(rulePres, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- rulePres = append(rulePres, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RulePres = rulePres
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- ruleBacks = append(ruleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- ruleBacks = append(ruleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleBacks = ruleBacks
- //抽取规则
- ruleCores := []*RegLuaInfo{}
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *clist {
- if b, _ := v["isuse"].(bool); !b {
- continue
- }
- field := qu.ObjToString(v["s_field"])
- e.Fields[field] = 1 //加入抽取属性组备用
- rinfo := &RegLuaInfo{
- Field: field,
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- //提取全部属性
- rinfo.LFields = getALLFields()
- ruleCores = append(ruleCores, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else { //(.*)招标公告__2
- posm[rinfo.Field] = qu.IntAll(ks[0])
- }
- }
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
- } else {
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
- }
- ruleCores = append(ruleCores, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleCores = ruleCores
- //
- e.RuleCores = append(e.RuleCores, rcore)
- }
- }
- }
- //加载分包抽取规则
- func (e *ExtractTask) InitPkgCore() {
- defer qu.Catch()
- e.PkgRuleCores = []*RuleCore{}
- pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, pkginfo := range *pkginfos {
- if b, _ := pkginfo["isuse"].(bool); !b {
- continue
- }
- s_field := qu.ObjToString(pkginfo["s_field"])
- pid := qu.BsonIdToSId(pkginfo["_id"])
- logicList, _ := db.Mgo.Find("pkg_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *logicList {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- rcore := &RuleCore{}
- rcore.Field = s_field
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- ruleBacks = append(ruleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- ruleBacks = append(ruleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleBacks = ruleBacks
- e.PkgRuleCores = append(e.PkgRuleCores, rcore)
- }
- }
- }
- //加载标签库
- func (e *ExtractTask) InitTag() {
- defer qu.Catch()
- e.Tag = map[string][]*Tag{}
- //字符串标签库
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- fname := qu.ObjToString(v["s_name"])
- tab := ju.TagFile{Name: fname} //用于表格kv
- tab.Items = make([]*ju.Tag, len(tmp))
- for k, key := range tmp {
- tag := &Tag{Type: "string", Key: key.(string)}
- e.Tag[field] = append(e.Tag[field], tag)
- tab.Items[k] = &ju.Tag{key.(string), 0 - k, nil}
- }
- sort.Sort(tab.Items)
- ju.TagdbTable[fname] = &tab
- }
- }
- //正则标签库
- list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- fname := qu.ObjToString(v["s_name"])
- tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
- tab.Items = make([]*ju.Tag, len(tmp))
- for k, key := range tmp {
- tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
- e.Tag[field] = append(e.Tag[field], tag)
- tab.Items[k] = &ju.Tag{key.(string), 0 - k, regexp.MustCompile(key.(string))}
- }
- sort.Sort(tab.Items)
- ju.TagdbTable[fname+"_reg"] = &tab
- }
- }
- }
- //获取fields
- func getALLFields() map[string]string {
- fields := map[string]string{}
- list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
- for _, v := range *list {
- fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
- }
- return fields
- }
- //加载clear函数
- func (e *ExtractTask) InitClearFn() {
- defer qu.Catch()
- list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string][]string{}
- for _, tmp := range *list {
- field := tmp["s_field"].(string)
- fns := tmp["clear"].([]interface{})
- if fn[field] == nil {
- fn[field] = []string{}
- }
- for _, v := range fns {
- fn[field] = append(fn[field], v.(string))
- }
- }
- e.ClearFn = fn
- }
- //加载省份
- func InitProvince(version string) map[string]interface{} {
- defer qu.Catch()
- fn := map[string]interface{}{}
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- content := v["content"]
- switch content.(type) {
- case string:
- fn[name] = []interface{}{content.(string)}
- case []interface{}:
- fn[name] = content
- }
- }
- return fn
- }
- //加载城市简称
- func InitCitySim(version string) map[string]map[string]interface{} {
- defer qu.Catch()
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string]map[string]interface{}{}
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- tmp := v["content"].(map[string]interface{})
- fn[name] = tmp
- }
- return fn
- }
- //加载城市全称
- func InitCityAll(version string) map[string]map[string]interface{} {
- defer qu.Catch()
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string]map[string]interface{}{}
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- tmp := v["content"].(map[string]interface{})
- fn[name] = tmp
- }
- return fn
- }
- //初始化城市省份敏感词
- func (e *ExtractTask) InitDFA() {
- defer qu.Catch()
- e.AreaGet = &ju.DFA{}
- e.AreaDistrict = &ju.DFA{}
- e.AreaProvinceGet = &ju.DFA{}
- e.AreaStreet = &ju.DFA{}
- //初始化map
- if e.ProvinceMap == nil {
- e.ProvinceMap = make(map[string]string)
- }
- if e.CityBrief == nil {
- e.CityBrief = make(map[string]*City)
- }
- if e.ProvinceBrief == nil {
- e.ProvinceBrief = make(map[string]*Province)
- }
- if e.AreaToCity == nil {
- e.AreaToCity = make(map[string][]*City)
- }
- if e.DistrictCityMap == nil {
- e.DistrictCityMap = make(map[string]*City)
- }
- if e.StreetDistrictMap == nil {
- e.StreetDistrictMap = make(map[string]*District)
- }
- //初始化省
- fn1 := InitProvince(e.TaskInfo.Version)
- for k, v := range fn1 {
- for _, p := range v.([]interface{}) {
- p1, _ := p.(string)
- e.AreaProvinceGet.AddWord(p1)
- e.ProvinceMap[p1] = k
- }
- }
- //初始化城市全称
- fn2 := InitCityAll(e.TaskInfo.Version)
- for k, v := range fn2 {
- e.AreaProvinceGet.AddWord(k) //省全称
- p := &Province{}
- p.Name = k
- p.Brief = v["brief"].(string)
- e.ProvinceMap[k] = p.Brief
- //
- e.ProvinceBrief[p.Brief] = p
- p.Cap = v["captial"].(string)
- city, _ := v["city"].(map[string]interface{})
- for k1, v1 := range city {
- v1m, _ := v1.(map[string]interface{})
- c := &City{}
- c.Name = k1
- // if v1m["brief"] == nil {
- // }
- c.Brief = v1m["brief"].(string)
- //
- e.CityBrief[c.Brief] = c
- c.P = p
- if c.Brief == p.Cap {
- p.Captial = c
- }
- //加入到城市map中
- //
- cs := e.AreaToCity[k1]
- e.AreaGet.AddWord(k1) //市全称
- if cs != nil {
- cs = append(cs, c)
- } else {
- cs = []*City{c}
- }
- e.AreaToCity[k1] = cs
- //区县
- districtmap := v1m["area"].(map[string]interface{}) //区或县
- for district, streetarr := range districtmap {
- d := &District{}
- d.Name = district
- d.C = c
- e.AreaDistrict.AddWord(district) //加入区或县敏感词
- ctmp := e.DistrictCityMap[district]
- if ctmp == nil {
- e.DistrictCityMap[district] = c
- }
- //街道
- for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
- e.AreaStreet.AddWord(s) //加入街道敏感词
- dtmp := e.StreetDistrictMap[s]
- if dtmp == nil {
- e.StreetDistrictMap[s] = d
- }
- }
- }
- }
- }
- //初始化城市简称
- fn3 := InitCitySim(e.TaskInfo.Version)
- e.AreaSimGet = &ju.DFA{}
- for k, v := range fn3 {
- pb := v["brief"].(string)
- p := e.ProvinceBrief[pb]
- //加载
- for _, ss := range []string{k, pb} {
- cs := e.AreaToCity[ss]
- if cs != nil {
- cs = append(cs, p.Captial)
- } else {
- cs = []*City{p.Captial}
- }
- e.AreaToCity[ss] = cs
- e.AreaSimGet.AddWord(ss) //省全称和省简称
- }
- city, _ := v["city"].(map[string]interface{})
- for k1, v1 := range city {
- v1m, _ := v1.(map[string]interface{})
- if v1m["brief"] == nil {
- }
- cb := v1m["brief"].(string)
- c := e.AreaToCity[k1][0]
- //加入到城市map中
- for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
- e.AreaSimGet.AddWord(ss)
- cs := e.AreaToCity[ss]
- if cs != nil {
- cs = append(cs, c)
- } else {
- cs = []*City{c}
- }
- e.AreaToCity[ss] = cs
- }
- arr := v1m["area"].([]interface{})
- for _, k2 := range arr {
- s := k2.(string)
- for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
- cs := e.AreaToCity[ss]
- e.AreaSimGet.AddWord(ss)
- if cs != nil {
- cs = append(cs, c)
- } else {
- cs = []*City{c}
- }
- e.AreaToCity[ss] = cs
- //只加入简称
- if n == 0 {
- d := &District{}
- d.Name = ss
- d.C = c
- e.AreaDistrict.AddWord(ss) //加入区或县简称敏感词
- ctmp := e.DistrictCityMap[ss]
- if ctmp == nil {
- e.DistrictCityMap[ss] = c
- }
- }
- }
- }
- }
- }
- }
- //保存抽取详情数据
- func (e *ExtractTask) ResultSave(init bool) {
- defer qu.Catch()
- if e.ResultArr == nil {
- e.ResultArr = [][]map[string]interface{}{}
- }
- if init {
- go func() {
- for {
- if len(e.ResultArr) > 500 {
- arr := e.ResultArr[:500]
- qu.Try(func() {
- db.Mgo.UpSertBulk("extract_result", arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- e.ResultArr = e.ResultArr[500:]
- } else {
- arr := e.ResultArr
- qu.Try(func() {
- db.Mgo.UpSertBulk("extract_result", arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- e.ResultArr = [][]map[string]interface{}{}
- }
- time.Sleep(10 * time.Second)
- }
- }()
- } else {
- arr := e.ResultArr
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- e.ResultArr = [][]map[string]interface{}{}
- }
- }
- //保存抽取数据
- func (e *ExtractTask) BidSave(init bool) {
- defer qu.Catch()
- if e.BidArr == nil {
- e.BidArr = [][]map[string]interface{}{}
- }
- if init {
- go func() {
- for {
- if len(e.BidArr) > 500 {
- arr := e.BidArr[:500]
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- e.BidArr = e.BidArr[500:]
- } else {
- arr := e.BidArr
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- e.BidArr = [][]map[string]interface{}{}
- }
- time.Sleep(10 * time.Second)
- }
- }()
- } else {
- arr := e.BidArr
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- e.BidArr = [][]map[string]interface{}{}
- time.Sleep(1 * time.Second)
- }
- }
- func (e *ExtractTask) InitAuditRecogField() {
- defer qu.Catch()
- e.RecogFieldMap = make(map[string]map[string]interface{})
- recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
- for _, f := range *recogFieldList {
- field := qu.ObjToString(f["s_recogfield"])
- e.RecogFieldMap[field] = f
- }
- }
- func (e *ExtractTask) InitAuditClass() {
- defer qu.Catch()
- e.FidClassMap = make(map[string][]map[string]interface{})
- class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
- for _, c := range *class {
- classList := []map[string]interface{}{}
- fid := qu.ObjToString(c["s_fid"])
- if len(e.FidClassMap[fid]) > 0 { //追加
- classList = e.FidClassMap[fid]
- }
- classList = append(classList, c)
- e.FidClassMap[fid] = classList
- }
- }
- //加载规则
- func (e *ExtractTask) InitAuditRule() {
- defer qu.Catch()
- var rureg *regexp.Regexp
- var rs []rune
- var ru string
- var err error
- e.CidRuleMap = make(map[string][]map[string]interface{})
- rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
- for _, v := range *rule {
- i_rule := []interface{}{}
- ss, _ := (v["s_rule"].([]interface{}))
- for _, r := range qu.ObjArrToStringArr(ss) {
- if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
- rs = []rune(r)
- ru = string(rs[1 : len(rs)-1])
- rureg, err = regexp.Compile(ru)
- if err != nil {
- log.Debug("error---rule:", r)
- continue
- }
- i_rule = append(i_rule, []interface{}{rureg}...)
- } else { //规则
- i_rule = append(i_rule, r)
- }
- }
- v["rule"] = i_rule
- ruleList := []map[string]interface{}{}
- classid := qu.ObjToString(v["s_classid"])
- if len(e.CidRuleMap[classid]) > 0 { //追加
- ruleList = e.CidRuleMap[classid]
- }
- ruleList = append(ruleList, v)
- e.CidRuleMap[classid] = ruleList
- }
- }
- //
- func (e *ExtractTask) InitAuditFields() {
- if len(e.AuditFields) == 0 {
- v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
- if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
- vid := qu.BsonIdToSId((*v)["_id"])
- query := map[string]interface{}{
- "isaudit": true,
- "delete": false,
- "vid": vid,
- }
- data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
- for _, d := range *data {
- field := qu.ObjToString(d["s_field"])
- e.AuditFields = append(e.AuditFields, field)
- }
- }
- }
- }
- //加载附件抽取
- func (e *ExtractTask) InitFile() {
- defer qu.Catch()
- //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
- ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
- //ve, _ := db.Mgo.FindOne("version", query)
- if ve == nil {
- return
- }
- if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
- e.IsFileField = true
- }
- syscefiled := new(sync.Map)
- if (*ve)["s_filefileds"] != nil {
- for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
- syscefiled.Store(vff.(string),1)
- }
- }
- e.FileFields = syscefiled
- }
- //加载清理任务信息
- func (c *ClearTask) InitClearTaskInfo() {
- cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
- if len(*cleartask) > 1 {
- v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
- c.ClearTaskInfo = &ClearTaskInfo{
- Name: (*cleartask)["s_taskname"].(string),
- Version: (*cleartask)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
- FromDB: (*cleartask)["s_mgodb"].(string),
- FromColl: (*cleartask)["s_mgocoll"].(string),
- IsCltLog: ju.Config["iscltlog"].(bool),
- ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
- }
- log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
- } else {
- return
- }
- }
- //加载清理脚本
- func (c *ClearTask) InitClearLuas() {
- defer qu.Catch()
- c.ClearLuas = make(map[string][]*ClearLua)
- list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, l := range *list {
- if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
- continue
- }
- s_field := qu.ObjToString(l["s_field"])
- pid := qu.BsonIdToSId(l["_id"])
- luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *luas {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- clearLua := &ClearLua{
- Field: s_field,
- Code: vv["s_code"].(string),
- Name: vv["s_name"].(string),
- LuaText: vv["s_luascript"].(string),
- LFields: getALLFields(),
- }
- c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
- }
- }
- }
|