|
- // extractInit
- package extract
- import (
- "github.com/sensitive"
- db "jy/mongodbutil"
- ju "jy/util"
- qu "qfw/util"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "sync"
- "time"
- "gopkg.in/mgo.v2/bson"
- log "github.com/donnie4w/go-logger/logger"
- "github.com/go-ego/gse"
- )
- type RegLuaInfo struct {
- //正则或脚本信息
- Code, Name, Field string //
- Score float64
- RuleText string //
- IsLua bool //
- RegPreBac *ExtReg //
- RegCore *ExtReg //
- }
// ExtReg bundles a compiled regular expression with the data needed to apply
// it, either as a replacement or as an extraction.
type ExtReg struct {
	// Reg is the compiled expression.
	Reg *regexp.Regexp
	// Replace is the replacement text.
	Replace string
	// Bextract reports whether ExtractPos is populated.
	Bextract bool
	// ExtractPos maps a field name to a capture position.
	ExtractPos map[string]int
	// NumSign is a sign-correction marker, e.g. for a floating rate:
	// +1 for an upward float, -1 for a downward float.
	NumSign int
}
- type RuleCore struct {
- Id string //id
- Field string //逻辑字段
- LuaLogic string //进入逻辑
- ExtFrom string //从哪个字段抽取
- RulePres []*RegLuaInfo //抽取前置规则
- RuleBacks []*RegLuaInfo //抽取后置规则
- RuleCores []*RegLuaInfo //抽取规则
- KVRuleCores []*RegLuaInfo //KV抽取清理规则
- LFields map[string]string //所有字段属性组
- }
// Tag is one entry of the tag library.
type Tag struct {
	// Type is the tag kind: "string" for a literal, "regexp" for a pattern.
	Type string
	// Key is the literal text or the pattern source.
	Key string
	// Reg is the compiled pattern; it is only set for "regexp" tags.
	Reg *regexp.Regexp
}
- type TaskInfo struct {
- Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
- FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
- ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名
- TestColl, LastExtId string //测试结果表、上次抽取信息id
- FDB *db.Pool //数据库连接池
- TDB *db.Pool //数据库连接池
- IsEtxLog bool //是否开启抽取日志
- ProcessPool chan bool //任务进程池
- TestLua bool //检查测试用
- }
- type ExtractTask struct {
- Id string //任务id
- IsRun bool //是否启动
- Content string //信息内容
- TaskInfo *TaskInfo //任务信息
- RulePres []*RegLuaInfo //通用前置规则
- RuleBacks []*RegLuaInfo //通用后置规则
- SiteRuleBacks []*RegLuaInfo //站点通用后置规则
- RuleBlock *ju.RuleBlock
- RuleCores map[string]map[string][]*RuleCore //分类抽取规则
- SiteRuleCores map[string]map[string][]*RuleCore //站点分类抽取规则
- PkgRuleCores []*RuleCore //分包抽取规则
- Tag map[string][]*Tag //标签库
- SiteTag map[string][]*Tag //站点标签库
- ClearFn map[string][]string //清理函数
- SiteClearFn map[string][]string //站点清理函数
- IsExtractCity bool //是否开启城市抽取
- Fields map[string]int //抽取属性组
- SiteFields map[string]int //抽取站点属性组
- IsFileField bool //是否开启附件抽取
- FileFields *sync.Map //抽取附件属性组
- ResultChanel chan bool //抽取结果详情
- sync.RWMutex
- ResultArr [][]map[string]interface{} //抽取结果详情
- BidChanel chan bool //抽取结果
- BidArr [][]map[string]interface{} //抽取结果
- BidTotal int //结果数量
- RecogFieldMap map[string]map[string]interface{} //识别字段
- FidClassMap map[string][]map[string]interface{} //分类
- CidRuleMap map[string][]map[string]interface{} //规则
- AuditFields []string //需要审核的字段名称
- SiteCityMap map[string]*SiteCity //站点对应的省市区
- ProvinceMap map[string]string //省全称简称(key:浙江省 val:浙江)
- ProvinceBriefMap map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
- CityMap map[string]string //市全称简称(key:杭州市 val:杭州)
- CityBriefMap map[string]*City //市简称对应的市信息(key:杭州 val:&City{})
- CityFullMap map[string]*City //市全称对应的市信息(key:杭州市 val:&City{})
- DistrictCityMap map[string][]*City //区或县全称对应的city(全国有相同名称的区或县,这里对应的city用slice)
- DistrictSimAndAll map[string][]map[string]*City //区或县简称对应的city(全国有相同名称的区或县,这里对应的city用slice)
- StreetDistrictMap map[string][]*District //街道全称对应的区或县
- ProvinceAllGet *ju.DFA //省全称
- ProvinceSimGet *ju.DFA //省简称
- CityAllGet *ju.DFA //市全称
- CitySimGet *ju.DFA //市简称
- DistrictAllGet *ju.DFA //区或县全称
- DistrictSimGet *ju.DFA //区或县简称
- StreetGet *ju.DFA //街道
- PostCodeMap map[string]*PostCode //邮编
- AreaCodeMap map[string]*AreaCode //区号
- XjbtCityArr []map[string]interface{} //新疆兵团相关数据
- SensitiveFullCity *sensitive.Filter
- SensitiveSimCity *sensitive.Filter
- InfoType []map[string]interface{}
- Trie_Full_Province *ju.Trie //省全称 省、直辖市、自治区
- Trie_Full_City *ju.Trie //市全称 地级市
- Trie_Full_District *ju.Trie //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
- Trie_Full_Street *ju.Trie //街道、乡镇全称 镇、乡、民族乡、县辖区、街道
- Trie_Full_Community *ju.Trie //村/委员会全称 村、居委会
- Trie_Sim_Province *ju.Trie //省简称
- Trie_Sim_City *ju.Trie //市简称
- Trie_Sim_District *ju.Trie //县简称
- Trie_Fulls []*ju.Trie //所有全称
- Trie_Sims []*ju.Trie //所有简称
- Seg_PCD *gse.Segmenter //分词
- Seg_SV *gse.Segmenter //分词
- Luacodes *sync.Map //站点规则
- SiteMerge *sync.Map //抽取合并
- }
// SiteCity is the region a site belongs to.
type SiteCity struct {
	// P is the province short name.
	P string
	// C is the city full name.
	C string
	// D is the district full name.
	D string
}
- type ClearTaskInfo struct {
- Name, Version, VersionId string //名称、版本、版本id
- FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名
- FDB *db.Pool //数据库连接池
- TDB *db.Pool //数据库连接池
- IsCltLog bool //是否开启清理日志
- ProcessPool chan bool //任务进程池
- }
// ClearLua describes one Lua cleanup script.
type ClearLua struct {
	// Field is the field the script belongs to.
	Field string
	// Code is the script code.
	Code string
	// Name is the script name.
	Name string
	// LuaText is the Lua source.
	LuaText string
	// LFields is the attribute group of the fields extracted by Lua.
	LFields map[string]string
	// Two further fields are currently disabled:
	//   LuaLogic string // entry logic
	//   ExtFrom  string // source field to extract from
}
- type ClearTask struct {
- sync.RWMutex
- Id string //任务id
- Content string //信息内容
- ClearTaskInfo *ClearTaskInfo //任务信息
- ClearLuas map[string][]*ClearLua //清理脚本
- UpdateResult [][]map[string]interface{} //清理后结果
- //ClearChannel chan bool
- }
- func init() {
- TaskList = make(map[string]*ExtractTask)
- ClearTaskList = make(map[string]*ClearTask)
- go SaveExtLog()
- go SaveCltLog() //保存清理日志
- }
- //加载任务信息
- func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- TrackColl: trackcoll,
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- TestColl: resultcoll,
- IsEtxLog: true,
- ProcessPool: make(chan bool, 1),
- }
- if (*v)["isextractcity"] != nil {
- e.IsExtractCity = (*v)["isextractcity"].(bool)
- }
- } else {
- return
- }
- }
- //加载任务信息
- func (e *ExtractTask) InitTaskInfo() {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- log.Debug("task", task, "~", e.Id)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
- strs := strings.Split((*task)["s_mgosavecoll"].(string), "/")
- log.Debug("s_mgosavecoll", strs)
- if len(strs) < 3 {
- return
- } else {
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- //TrackColl: (*task)["s_trackcoll"].(string),
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- ToDbAddr: strs[0],
- ToDB: strs[1],
- ToColl: strs[2],
- IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
- LastExtId: qu.ObjToString((*task)["s_extlastid"]),
- ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
- }
- if (*v)["isextractcity"] != nil {
- e.IsExtractCity = (*v)["isextractcity"].(bool)
- }
- }
- log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1))
- } else {
- return
- }
- }
- func (e *ExtractTask) InitSite() {
- e.Luacodes = &sync.Map{}
- e.SiteMerge = &sync.Map{}
- sites, _ := db.Mgo.Find("site_management", bson.M{"version": e.TaskInfo.Version}, nil, bson.M{"site_script": 1, "ismerge": 1}, false, -1, -1)
- for _, v := range *sites {
- if vv, ok := v["site_script"].([]interface{}); ok {
- for _, vvv := range vv {
- e.Luacodes.Store(vvv, map[string]interface{}{})
- e.SiteMerge.Store(vvv, v["ismerge"].(bool))
- }
- } else if vv, ok := v["site_script"].(interface{}); ok {
- e.Luacodes.Store(vv, map[string]interface{}{})
- e.SiteMerge.Store(vv, v["ismerge"].(bool))
- }
- }
- }
- //加载通用前置规则
- func (e *ExtractTask) InitRulePres() {
- defer qu.Catch()
- e.RulePres = []*RegLuaInfo{}
- list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- e.RulePres = append(e.RulePres, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- e.RulePres = append(e.RulePres, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- }
- //加载通用后置规则
- func (e *ExtractTask) InitRuleBacks(isSite bool) {
- defer qu.Catch()
- cDB := ""
- eSiteRuleBacks := []*RegLuaInfo{}
- if isSite {
- cDB = "site_rule_back"
- e.SiteRuleBacks = []*RegLuaInfo{}
- } else {
- cDB = "rule_back"
- e.RuleBacks = []*RegLuaInfo{}
- }
- list, _ := db.Mgo.Find(cDB, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- if isSite {
- eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
- //e.SiteRuleBacks = append(e.SiteRuleBacks, rinfo)
- } else {
- e.RuleBacks = append(e.RuleBacks, rinfo)
- }
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- if isSite {
- eSiteRuleBacks = append(eSiteRuleBacks, rinfo)
- } else {
- e.RuleBacks = append(e.RuleBacks, rinfo)
- }
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- if isSite {
- sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
- if (*sm) == nil || len(*sm) <= 0 {
- eSiteRuleBacks = []*RegLuaInfo{}
- continue
- }
- for _, v2 := range (*sm)["site_script"].([]interface{}) {
- if mdpvalue, ok := e.Luacodes.Load(v2); ok {
- if mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] == nil {
- mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = eSiteRuleBacks
- } else {
- if tmplist, ok3 := mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo); ok3 {
- tmplist = append(tmplist, eSiteRuleBacks...)
- mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = tmplist
- }
- //mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo) = append(mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo), eSiteRuleBacks...)
- }
- e.Luacodes.Store(v2, mdpvalue)
- }
- }
- eSiteRuleBacks = []*RegLuaInfo{}
- }
- }
- }
- func (e *ExtractTask) InfoTypeList() {
- infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
- infolist := *infolist1
- for _, v := range infolist {
- e.InfoType = append(e.InfoType, v)
- }
- }
- //加载抽取规则
- func (e *ExtractTask) InitRuleCore(isSite bool) {
- defer qu.Catch()
- allFields := getALLFields()
- var versioninfodb, rule_logicdb, rule_logicpredb, rule_logicbackdb, rule_logicoredb, rule_logickvdb string
- eSiteRuleCores := make(map[string]map[string][]*RuleCore)
- if isSite {
- versioninfodb = "site_versioninfo"
- rule_logicdb = "site_rule_logic"
- rule_logicpredb = "site_rule_logicpre"
- rule_logicbackdb = "site_rule_logicback"
- rule_logicoredb = "site_rule_logicore"
- rule_logickvdb = "site_rule_logickv"
- e.SiteFields = map[string]int{}
- e.SiteRuleCores = make(map[string]map[string][]*RuleCore)
- } else {
- versioninfodb = "versioninfo"
- rule_logicdb = "rule_logic"
- rule_logicpredb = "rule_logicpre"
- rule_logicbackdb = "rule_logicback"
- rule_logicoredb = "rule_logicore"
- rule_logickvdb = "rule_logickv"
- e.Fields = map[string]int{}
- e.RuleCores = make(map[string]map[string][]*RuleCore)
- }
- fieldrules := map[string][]*RuleCore{}
- vinfos, _ := db.Mgo.Find(versioninfodb, `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vinfo := range *vinfos {
- //fmt.Println("总计",len(*vinfos),"当前第N个",kkkk)
- if b, _ := vinfo["isuse"].(bool); !b {
- continue
- }
- s_field := qu.ObjToString(vinfo["s_field"])
- pid := qu.BsonIdToSId(vinfo["_id"])
- list, _ := db.Mgo.Find(rule_logicdb, `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *list {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- rcore := &RuleCore{Id: qu.BsonIdToSId(vv["_id"])}
- rcore.Field = s_field
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
- rcore.LFields = allFields
- //前置规则
- rulePres := []*RegLuaInfo{}
- plist, _ := db.Mgo.Find(rule_logicpredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *plist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- rulePres = append(rulePres, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- rulePres = append(rulePres, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RulePres = rulePres
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find(rule_logicbackdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- ruleBacks = append(ruleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- ruleBacks = append(ruleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleBacks = ruleBacks
- //抽取规则
- ruleCores := []*RegLuaInfo{}
- clist, _ := db.Mgo.Find(rule_logicoredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *clist {
- if b, _ := v["isuse"].(bool); !b {
- continue
- }
- field := qu.ObjToString(v["s_field"])
- if isSite {
- e.SiteFields[field] = 1
- } else {
- e.Fields[field] = 1 //加入抽取属性组备用
- }
- rinfo := &RegLuaInfo{
- Field: field,
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- Score: qu.Float64All(v["s_default_score"]),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- //提取全部属性
- ruleCores = append(ruleCores, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else { //(.*)招标公告__2
- posm[rinfo.Field] = qu.IntAll(ks[0])
- }
- }
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
- } else {
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
- }
- ruleCores = append(ruleCores, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleCores = ruleCores
- //kv规则
- kvRuleCores := []*RegLuaInfo{}
- kvlist, _ := db.Mgo.Find(rule_logickvdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *kvlist {
- if b, _ := v["isuse"].(bool); !b {
- continue
- }
- field := qu.ObjToString(v["s_field"])
- if isSite {
- e.SiteFields[field] = 1
- } else {
- e.Fields[field] = 1 //加入抽取属性组备用
- }
- rinfo := &RegLuaInfo{
- Field: field,
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- kvRuleCores = append(kvRuleCores, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- rcore.KVRuleCores = kvRuleCores
- if fieldrules[s_field] == nil {
- fieldrules[s_field] = []*RuleCore{}
- }
- fieldrules[s_field] = append(fieldrules[s_field], rcore)
- }
- infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
- for _, v := range *infolist {
- topclass := qu.ObjToString(v["topclass"])
- if v["subclass"] == nil {
- eSiteRuleCores[topclass] = make(map[string][]*RuleCore)
- for attr, _ := range v["fields"].(map[string]interface{}) {
- if fieldrules[attr] != nil {
- eSiteRuleCores[topclass][attr] = fieldrules[attr]
- }
- }
- } else {
- for ca, fs := range v["subclass"].(map[string]interface{}) {
- eSiteRuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
- for field, _ := range fs.(map[string]interface{}) {
- if fieldrules[field] != nil {
- eSiteRuleCores[topclass+"_"+ca][field] = fieldrules[field]
- }
- }
- }
- }
- }
- if isSite {
- sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(vinfo["pid"]), bson.M{"site_script": 1})
- if (*sm) == nil || len(*sm) <= 0 {
- eSiteRuleCores = make(map[string]map[string][]*RuleCore)
- fieldrules = map[string][]*RuleCore{}
- continue
- }
- for _, v2 := range (*sm)["site_script"].([]interface{}) {
- if mdpvalue, ok := e.Luacodes.Load(v2); ok {
- //属性配置
- if mdpvalue.(map[string]interface{})["e.SiteRuleCores"] == nil {
- mdpvalue.(map[string]interface{})["e.SiteRuleCores"] = eSiteRuleCores
- } else {
- for k2, v2 := range eSiteRuleCores {
- tmpv := mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2]
- for kkkk, vvv := range v2 {
- tmpv[kkkk] = vvv
- }
- mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2] = tmpv
- }
- }
- e.Luacodes.Store(v2, mdpvalue)
- }
- }
- eSiteRuleCores = make(map[string]map[string][]*RuleCore)
- fieldrules = map[string][]*RuleCore{}
- }
- }
- if !isSite {
- //属性配置
- infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1)
- for _, v := range *infolist {
- topclass := qu.ObjToString(v["topclass"])
- if v["subclass"] == nil {
- e.RuleCores[topclass] = make(map[string][]*RuleCore)
- for attr, _ := range v["fields"].(map[string]interface{}) {
- if fieldrules[attr] != nil {
- e.RuleCores[topclass][attr] = fieldrules[attr]
- }
- }
- } else {
- for ca, fs := range v["subclass"].(map[string]interface{}) {
- e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore)
- for field, _ := range fs.(map[string]interface{}) {
- if fieldrules[field] != nil {
- e.RuleCores[topclass+"_"+ca][field] = fieldrules[field]
- }
- }
- }
- }
- }
- }
- }
- //加载分包抽取规则
- func (e *ExtractTask) InitPkgCore() {
- defer qu.Catch()
- e.PkgRuleCores = []*RuleCore{}
- pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, pkginfo := range *pkginfos {
- if b, _ := pkginfo["isuse"].(bool); !b {
- continue
- }
- s_field := qu.ObjToString(pkginfo["s_field"])
- sid := qu.BsonIdToSId(pkginfo["_id"])
- rcore := &RuleCore{}
- rcore.Field = s_field
- rcore.ExtFrom = "detail"
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Field: qu.ObjToString(v["s_field"]),
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- ruleBacks = append(ruleBacks, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""}
- }
- ruleBacks = append(ruleBacks, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleBacks = ruleBacks
- //抽取规则
- ruleCores := []*RegLuaInfo{}
- clist, _ := db.Mgo.Find("pkg_logicore", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *clist {
- if b, _ := v["isuse"].(bool); !b {
- continue
- }
- field := qu.ObjToString(v["s_field"])
- e.Fields[field] = 1 //加入抽取属性组备用
- rinfo := &RegLuaInfo{
- Field: field,
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- //提取全部属性
- ruleCores = append(ruleCores, rinfo)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- var pattern string
- if strings.Contains(tmp[0], "\\u") {
- tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
- tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
- pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
- } else {
- pattern = tmp[0]
- }
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else { //(.*)招标公告__2
- posm[rinfo.Field] = qu.IntAll(ks[0])
- }
- }
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm}
- } else {
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false}
- }
- ruleCores = append(ruleCores, rinfo)
- }, func(err interface{}) {
- log.Debug(rinfo.Code, rinfo.Field, err)
- })
- }
- }
- rcore.RuleCores = ruleCores
- e.PkgRuleCores = append(e.PkgRuleCores, rcore)
- }
- }
- //加载标签库
- func (e *ExtractTask) InitTag(isSite bool) {
- defer qu.Catch()
- var tagdetailinfodb string
- eSiteTag := map[string][]*Tag{}
- if isSite {
- tagdetailinfodb = "site_tagdetailinfo"
- e.SiteTag = map[string][]*Tag{}
- } else {
- tagdetailinfodb = "tagdetailinfo"
- e.Tag = map[string][]*Tag{}
- }
- //字符串标签库
- list, _ := db.Mgo.Find(tagdetailinfodb, `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- var tmpMap sync.Map
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- fname := qu.ObjToString(v["s_name"])
- tab := ju.TagFile{Name: fname} //用于表格kv
- tab.Items = make([]*ju.Tag, len(tmp))
- for k, key := range tmp {
- tag := &Tag{Type: "string", Key: key.(string)}
- if isSite {
- eSiteTag[field] = append(eSiteTag[field], tag)
- //e.SiteTag[field] = append(e.SiteTag[field], tag)
- } else {
- e.Tag[field] = append(e.Tag[field], tag)
- }
- tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, nil, false}
- }
- sort.Sort(tab.Items)
- //ju.TagdbTable[fname] = &tab
- if isSite {
- sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
- if (*sm) == nil || len(*sm) <= 0 {
- eSiteTag = map[string][]*Tag{}
- continue
- }
- for _, v2 := range (*sm)["site_script"].([]interface{}) {
- if v2 == nil || v2 == "" {
- continue
- }
- if mdpvalue, ok := e.Luacodes.Load(v2); ok {
- if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
- mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
- } else {
- for k2, v2 := range eSiteTag {
- mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
- }
- }
- e.Luacodes.Store(v2, mdpvalue)
- }
- tmpMap.Store(fname, &tab)
- ju.SiteTagdbTable.Store(v2, tmpMap)
- }
- //ju.SiteTagdbTable.Store(fname, &tab)
- eSiteTag = map[string][]*Tag{}
- } else {
- ju.TagdbTable.Store(fname, &tab)
- }
- }
- //if isSite {
- // sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
- // for _, v2 := range (*sm)["site_script"].([]interface{}) {
- // if mdpvalue, ok := Luacodes.Load(v2); ok {
- // if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil{
- // mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
- // }else {
- // for k2,v2 := range eSiteTag{
- // mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
- // }
- // }
- // Luacodes.Store(v2, mdpvalue)
- // }
- // }
- // eSiteTag = map[string][]*Tag{}
- //}
- }
- //正则标签库
- list, _ = db.Mgo.Find(tagdetailinfodb, `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- fname := qu.ObjToString(v["s_name"])
- tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv
- tab.Items = make([]*ju.Tag, len(tmp))
- for k, key := range tmp {
- tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
- if isSite {
- eSiteTag[field] = append(eSiteTag[field], tag)
- //e.SiteTag[field] = append(e.SiteTag[field], tag)
- } else {
- e.Tag[field] = append(e.Tag[field], tag)
- }
- tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, regexp.MustCompile(key.(string)), false}
- }
- sort.Sort(tab.Items)
- //ju.TagdbTable[fname+"_reg"] = &tab
- if isSite {
- ju.SiteTagdbTable.Store(fname+"_reg", &tab)
- } else {
- ju.TagdbTable.Store(fname+"_reg", &tab)
- }
- }
- if isSite {
- sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1})
- if (*sm) == nil || len(*sm) <= 0 {
- eSiteTag = map[string][]*Tag{}
- continue
- }
- for _, v2 := range (*sm)["site_script"].([]interface{}) {
- if mdpvalue, ok := e.Luacodes.Load(v2); ok {
- if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil {
- mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag
- } else {
- for k2, v2 := range eSiteTag {
- mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2
- }
- }
- e.Luacodes.Store(v2, mdpvalue)
- }
- }
- eSiteTag = map[string][]*Tag{}
- }
- }
- }
- //获取fields
- func getALLFields() map[string]string {
- fields := map[string]string{}
- list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1)
- for _, v := range *list {
- fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"])
- }
- return fields
- }
- //加载clear函数
- func (e *ExtractTask) InitClearFn(isSite bool) {
- defer qu.Catch()
- var cleanupdb string
- if isSite {
- cleanupdb = "site_cleanup"
- e.SiteClearFn = map[string][]string{}
- } else {
- cleanupdb = "cleanup"
- }
- list, _ := db.Mgo.Find(cleanupdb, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string][]string{}
- for _, tmp := range *list {
- field := tmp["s_field"].(string)
- fns := tmp["clear"].([]interface{})
- if fn[field] == nil {
- fn[field] = []string{}
- }
- for _, v := range fns {
- fn[field] = append(fn[field], v.(string))
- }
- if isSite {
- sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(tmp["pid"]), bson.M{"site_script": 1})
- if (*sm) == nil || len(*sm) <= 0 {
- fn = map[string][]string{}
- continue
- }
- for _, v2 := range (*sm)["site_script"].([]interface{}) {
- if mdpvalue, ok := e.Luacodes.Load(v2); ok {
- if mdpvalue.(map[string]interface{})["e.SiteClearFn"] == nil {
- mdpvalue.(map[string]interface{})["e.SiteClearFn"] = fn
- } else {
- for k2, v2 := range fn {
- mdpvalue.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)[k2] = v2
- }
- }
- e.Luacodes.Store(v2, mdpvalue)
- }
- }
- fn = map[string][]string{}
- }
- }
- if !isSite {
- e.ClearFn = fn
- }
- }
- //加载省份
- func InitProvince(version string) map[string]interface{} {
- defer qu.Catch()
- fn := map[string]interface{}{}
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- content := v["content"]
- switch content.(type) {
- case string:
- fn[name] = []interface{}{content.(string)}
- case []interface{}:
- fn[name] = content
- }
- }
- return fn
- }
- //加载所有
- func InitProvincesx() []map[string]interface{} {
- defer qu.Catch()
- provinces := make([]map[string]interface{}, 0)
- ju.AddrsSess.Find(map[string]interface{}{
- "Remarks": nil,
- }).All(&provinces)
- return provinces
- }
- //加载站点库site城市信息
- func InitSite() []map[string]interface{} {
- defer qu.Catch()
- query := map[string]interface{}{
- "site_type": map[string]interface{}{
- "$ne": "代理机构",
- },
- }
- list, _ := ju.Site_Mgo.Find("site", query, nil, map[string]interface{}{
- "site": 1,
- "area": 1,
- "city": 1,
- "district": 1,
- })
- return list
- //list, _ := db.Mgo.Find("site", query, nil, `{"site":1,"area":1,"city":1,"district":1}`, false, -1, -1)
- //return *list
- }
- //加载新疆兵团映射关系
- func (e *ExtractTask) InitXjbtCityInfo() {
- defer qu.Catch()
- //加载数据
- query := map[string]interface{}{}
- list, _ := db.Mgo.Find("area_xjbt", query, nil, nil, false, -1, -1)
- arr := []map[string]interface{}{}
- for _, v := range *list {
- delete(v, "_id")
- arr = append(arr, v)
- }
- e.XjbtCityArr = arr
- }
- //站点加载...
- func (e *ExtractTask) InitUpdateSite() {
- defer qu.Catch()
- e.SiteCityMap = make(map[string]*SiteCity)
- for _, v := range InitSite() {
- site := qu.ObjToString(v["site"])
- area := qu.ObjToString(v["area"])
- city := qu.ObjToString(v["city"])
- district := qu.ObjToString(v["district"])
- if area != "" && area != "全国" && site != "" {
- s := &SiteCity{
- P: area,
- C: city,
- D: district,
- }
- e.SiteCityMap[site] = s
- }
- }
- log.Debug("有效站点数量:", len(e.SiteCityMap))
- }
- func (e *ExtractTask) InitCityInfo() {
- defer qu.Catch()
- e.InitVar() //初始化变量
- //新疆兵团数据
- e.InitXjbtCityInfo()
- //site站点信息
- e.InitUpdateSite()
- //初始化省信息
- fn1 := InitProvince(e.TaskInfo.Version)
- for k, v := range fn1 {
- for _, p := range v.([]interface{}) {
- p1, _ := p.(string)
- e.Trie_Full_Province.AddWords(p1) //华中科技大学
- e.ProvinceMap[p1] = k //华中科技大学:湖北
- }
- }
- alldata := InitProvincesx()
- fnx := make([]map[string]interface{}, 0)
- citys_maps := make(map[string][]map[string]interface{}, 0)
- districts_maps := make(map[string]map[string][]map[string]interface{}, 0)
- towns_maps := make(map[string]map[string]map[string][]map[string]interface{}, 0)
- jwhs_maps := make(map[string]map[string]map[string]map[string][]map[string]interface{}, 0)
- for _, v := range alldata {
- codenum := len(v["code"].(string))
- province := qu.ObjToString(v["province"])
- city := qu.ObjToString(v["city"])
- district := qu.ObjToString(v["district"])
- town := qu.ObjToString(v["town"])
- if codenum == 2 {
- fnx = append(fnx, v)
- } else if codenum == 4 {
- citys_maps[province] = append(citys_maps[province], v)
- } else if codenum == 6 {
- if districts_maps[province] == nil {
- districts_maps[province] = make(map[string][]map[string]interface{}, 0)
- }
- districts_maps[province][city] = append(districts_maps[province][city], v)
- } else if codenum == 9 {
- if towns_maps[province] == nil {
- towns_maps[province] = make(map[string]map[string][]map[string]interface{}, 0)
- }
- if towns_maps[province][city] == nil {
- towns_maps[province][city] = make(map[string][]map[string]interface{}, 0)
- }
- towns_maps[province][city][district] = append(towns_maps[province][city][district], v)
- } else if codenum == 12 {
- if jwhs_maps[province] == nil {
- jwhs_maps[province] = make(map[string]map[string]map[string][]map[string]interface{}, 0)
- }
- if jwhs_maps[province][city] == nil {
- jwhs_maps[province][city] = make(map[string]map[string][]map[string]interface{}, 0)
- }
- if jwhs_maps[province][city][district] == nil {
- jwhs_maps[province][city][district] = make(map[string][]map[string]interface{}, 0)
- }
- jwhs_maps[province][city][district][town] = append(jwhs_maps[province][city][district][town], v)
- }
- }
- //初始化城市全称
- for _, provinces := range fnx {
- all_province := qu.ObjToString(provinces["all_province"]) //省全称
- jc_province := qu.ObjToString(provinces["province"]) //省简称
- //加载省信息
- e.Trie_Full_Province.AddWords(all_province) //加入省全称Trie(k:浙江省)
- p := &Province{}
- p.Name = all_province //省全称:浙江省
- p.Brief = jc_province //省简称:浙江
- e.Trie_Sim_Province.AddWords(jc_province) //加入省简称Trie(k:浙江)
- e.ProvinceMap[all_province] = jc_province //浙江省:浙江
- e.ProvinceBriefMap[jc_province] = p //浙江:省信息{}
- if province_alias, ok := provinces["province_alias"].([]interface{}); ok {
- for _, vprovince_alias := range province_alias {
- e.ProvinceBriefMap[qu.ObjToString(vprovince_alias)] = p
- }
- }
- //加载市信息
- citys := citys_maps[jc_province]
- isok := make(map[string]bool)
- for _, vcity := range citys {
- qc_city := qu.ObjToString(vcity["city"])
- jc_city := qu.ObjToString(vcity["brief_city"])
- e.Trie_Full_City.AddWords(qc_city) //加入市全称Trie(k:杭州市)
- e.SensitiveFullCity.AddWord(qc_city)
- c := &City{}
- c.Name = qc_city //市全称:杭州市
- if jc_city != "" {
- c.Brief = jc_city //市简称:杭州
- e.Trie_Sim_City.AddWords(c.Brief) //加入市简称Trie(k:杭州)
- e.SensitiveSimCity.AddWord(c.Brief)
- e.CityMap[qc_city] = c.Brief //杭州市:杭州
- e.CityBriefMap[c.Brief] = c //杭州:市信息{}
- e.CityFullMap[qc_city] = c //杭州市:市信息{}
- }
- c.P = p
- if city_alias, ok := vcity["city_alias"].([]interface{}); ok {
- for _, vcity_alias := range city_alias {
- strvcity_alias := qu.ObjToString(vcity_alias)
- if isok[jc_province+"_"+strvcity_alias] {
- continue
- }
- e.CityBriefMap[strvcity_alias] = c
- e.initDistricts(jc_province, strvcity_alias, c, jc_city, districts_maps, towns_maps, jwhs_maps)
- isok[jc_province+"_"+strvcity_alias] = true
- }
- }
- if isok[jc_province+"_"+qc_city] {
- continue
- }
- e.initDistricts(jc_province, qc_city, c, jc_city, districts_maps, towns_maps, jwhs_maps)
- }
- }
- e.Trie_Fulls = []*ju.Trie{e.Trie_Full_Province, e.Trie_Full_City, e.Trie_Full_District, e.Trie_Full_Street, e.Trie_Full_Community}
- e.Trie_Sims = []*ju.Trie{e.Trie_Sim_Province, e.Trie_Sim_City, e.Trie_Sim_District}
- }
- //加载区县
- func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
- jc_city string, districts_maps map[string]map[string][]map[string]interface{},
- towns_maps map[string]map[string]map[string][]map[string]interface{},
- jwhs_maps map[string]map[string]map[string]map[string][]map[string]interface{}) {
- districts := districts_maps[jc_province][qc_city]
- for _, vdistricts := range districts {
- qc_district := qu.ObjToString(vdistricts["district"])
- jc_district := qu.ObjToString(vdistricts["brief_district"])
- d := &District{}
- d.Name = qc_district
- d.C = c
- e.Trie_Full_District.AddWords(qc_district) //加入区或县全称Trie
- if jc_district != "" {
- e.Trie_Sim_District.AddWords(jc_district) //加入区或县简称Trie
- //初始化城市简称
- c := e.CityBriefMap[jc_city]
- dfullarr := e.DistrictSimAndAll[jc_district]
- dfullcity := map[string]*City{qc_district: c}
- if len(dfullarr) == 0 {
- tmparr := []map[string]*City{dfullcity}
- e.DistrictSimAndAll[jc_district] = tmparr
- } else {
- e.DistrictSimAndAll[jc_district] = append(e.DistrictSimAndAll[jc_district], dfullcity)
- }
- }
- ctmp := e.DistrictCityMap[qc_district]
- if len(ctmp) == 0 {
- tmpcarr := []*City{c}
- e.DistrictCityMap[qc_district] = tmpcarr
- } else {
- e.DistrictCityMap[qc_district] = append(e.DistrictCityMap[qc_district], c)
- }
- if district_alias, ok := vdistricts["district_alias"].([]interface{}); ok {
- for _, vdistrict_alias := range district_alias {
- strvdistrict_alias := qu.ObjToString(vdistrict_alias)
- e.Trie_Full_District.AddWords(strvdistrict_alias) //加入区或县全称Trie
- c_tmp := e.DistrictCityMap[strvdistrict_alias]
- if len(c_tmp) == 0 {
- tmpcarr := []*City{c}
- e.DistrictCityMap[strvdistrict_alias] = tmpcarr
- } else {
- e.DistrictCityMap[strvdistrict_alias] = append(e.DistrictCityMap[strvdistrict_alias], c)
- }
- }
- }
- //街道
- towns := towns_maps[jc_province][qc_city][qc_district]
- for _, vtown := range towns {
- strvtown := qu.ObjToString(vtown["town"])
- s := &Street{}
- s.Name = strvtown
- s.D = d
- e.Trie_Full_Street.AddWords(strvtown) //加入街道全称Trie
- dtmp := e.StreetDistrictMap[strvtown]
- if len(dtmp) == 0 {
- tmpdarr := []*District{d}
- e.StreetDistrictMap[strvtown] = tmpdarr
- } else {
- e.StreetDistrictMap[strvtown] = append(e.StreetDistrictMap[strvtown], d)
- }
- //村、居委会
- //jwhs := jwhs_maps[jc_province][qc_city][qc_district][strvtown]
- //for _, vjwh := range jwhs {
- // strvillage := qu.ObjToString(vjwh["village"])
- // e.Trie_Full_Community.AddWords(strvillage) //加入居委会、村全称Trie
- // cttmp := e.CommunityDistrictMap[strvillage]
- // if len(cttmp) == 0 {
- // tmpdarr := []*District{d}
- // e.CommunityDistrictMap[strvillage] = tmpdarr
- // } else {
- // e.CommunityDistrictMap[strvillage] = append(e.CommunityDistrictMap[strvillage], d)
- // }
- //}
- }
- }
- }
- func (e *ExtractTask) InitVar() {
- defer qu.Catch()
- //初始化Trie
- //全称
- e.Trie_Full_Province = &ju.Trie{}
- e.Trie_Full_City = &ju.Trie{}
- e.Trie_Full_District = &ju.Trie{}
- e.Trie_Full_Street = &ju.Trie{}
- e.Trie_Full_Community = &ju.Trie{}
- //简称
- e.Trie_Sim_Province = &ju.Trie{}
- e.Trie_Sim_City = &ju.Trie{}
- e.Trie_Sim_District = &ju.Trie{}
- //初始化分词
- e.Seg_PCD = &gse.Segmenter{}
- e.Seg_SV = &gse.Segmenter{}
- e.Seg_PCD.LoadDict("./res/pcd.txt")
- e.Seg_SV.LoadDict("./res/sv.txt")
- //初始化城市相关
- e.SiteCityMap = make(map[string]*SiteCity)
- e.ProvinceMap = make(map[string]string)
- e.CityMap = make(map[string]string)
- e.DistrictSimAndAll = make(map[string][]map[string]*City)
- e.CityBriefMap = make(map[string]*City)
- e.CityFullMap = make(map[string]*City)
- e.ProvinceBriefMap = make(map[string]*Province)
- e.DistrictCityMap = make(map[string][]*City)
- e.StreetDistrictMap = make(map[string][]*District)
- //新疆兵团-数组
- e.XjbtCityArr = make([]map[string]interface{}, 0)
- //敏感词-筛选
- e.SensitiveFullCity = sensitive.New()
- e.SensitiveSimCity = sensitive.New()
- }
- //初始化邮编库
- func (e *ExtractTask) InitPostCode() {
- defer qu.Catch()
- e.PostCodeMap = make(map[string]*PostCode)
- list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
- for _, l := range *list {
- pc := &PostCode{}
- pc.Code = qu.ObjToString(l["code"])
- pc.P = qu.ObjToString(l["province"])
- pc.C = qu.ObjToString(l["city"])
- pc.D = qu.ObjArrToStringArr(l["district"].([]interface{}))
- e.PostCodeMap[pc.Code] = pc
- }
- }
- //初始化区号库
- func (e *ExtractTask) InitAreaCode() {
- defer qu.Catch()
- e.AreaCodeMap = make(map[string]*AreaCode)
- list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
- for _, l := range *list {
- ac := &AreaCode{}
- ac.Code = qu.ObjToString(l["code"])
- ac.P = qu.ObjToString(l["province"])
- ac.C = qu.ObjArrToStringArr(l["city"].([]interface{}))
- e.AreaCodeMap[ac.Code] = ac
- }
- }
- //保存抽取详情数据
- func (e *ExtractTask) ResultSave(init bool) {
- defer qu.Catch()
- e.RWMutex.Lock()
- if e.ResultArr == nil {
- e.ResultArr = [][]map[string]interface{}{}
- }
- e.RWMutex.Unlock()
- if init {
- go func() {
- for {
- e.RWMutex.Lock()
- if len(e.ResultArr) > saveLimit {
- arr := e.ResultArr[:saveLimit]
- e.ResultArr = e.ResultArr[saveLimit:]
- e.RWMutex.Unlock()
- qu.Try(func() {
- db.Mgo.UpSertBulk("extract_result", arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- } else {
- arr := e.ResultArr
- e.ResultArr = [][]map[string]interface{}{}
- e.RWMutex.Unlock()
- qu.Try(func() {
- db.Mgo.UpSertBulk("extract_result", arr...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- }
- time.Sleep(2 * time.Second)
- }
- }()
- } else {
- e.RWMutex.Lock()
- arr := e.ResultArr
- e.ResultArr = [][]map[string]interface{}{}
- e.RWMutex.Unlock()
- qu.Try(func() {
- lenarr := len(arr)
- for {
- if lenarr > saveLimit {
- arr2 := arr[:saveLimit]
- arr = arr[saveLimit:]
- lenarr = len(arr)
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
- } else {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- break
- }
- }
- }, func(err interface{}) {
- defer e.RWMutex.Unlock()
- log.Debug(err)
- })
- }
- }
- //保存抽取数据
- func (e *ExtractTask) BidSave(init bool) {
- defer qu.Catch()
- e.RWMutex.Lock()
- if e.BidArr == nil {
- e.BidArr = [][]map[string]interface{}{}
- }
- e.RWMutex.Unlock()
- if init {
- go func() {
- for {
- e.RWMutex.Lock()
- if len(e.BidArr) > saveLimit {
- arr := e.BidArr[:saveLimit]
- e.BidArr = e.BidArr[saveLimit:]
- e.RWMutex.Unlock()
- //arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr)
- arr, _, _, _ = getFieldAllAndBlocks(arr)
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
- //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
- //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- } else {
- arr := e.BidArr
- e.BidArr = [][]map[string]interface{}{}
- e.RWMutex.Unlock()
- arr, _, _, _ = getFieldAllAndBlocks(arr)
- qu.Try(func() {
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
- //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
- //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
- }, func(err interface{}) {
- log.Debug(err)
- })
- }
- time.Sleep(2 * time.Second)
- }
- }()
- } else {
- e.RWMutex.Lock()
- arr := e.BidArr
- e.BidArr = [][]map[string]interface{}{}
- e.RWMutex.Unlock()
- qu.Try(func() {
- lenarr := len(arr)
- for {
- if lenarr > saveLimit {
- arr2 := arr[:saveLimit]
- arr = arr[saveLimit:]
- lenarr = len(arr)
- arr2, _, _, _ = getFieldAllAndBlocks(arr2)
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...)
- //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
- //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
- //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
- } else {
- arr, _, _, _ := getFieldAllAndBlocks(arr)
- e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...)
- //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...)
- //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...)
- //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...)
- break
- }
- }
- }, func(err interface{}) {
- log.Debug(err)
- })
- time.Sleep(1 * time.Second)
- }
- }
- func getFieldAllAndBlocks(a [][]map[string]interface{}) (arr [][]map[string]interface{}, blocks, fieldalls, fieldallsf []map[string]interface{}) {
- arr = [][]map[string]interface{}{}
- blocks = []map[string]interface{}{}
- fieldalls = []map[string]interface{}{}
- fieldallsf = []map[string]interface{}{}
- for _, v := range a {
- _id, _ := v[0]["_id"]
- if tmp, ok := v[1]["$set"].(map[string]interface{}); ok {
- if ju.SaveBlock {
- if tmp["blocks"] != nil {
- block := map[string]interface{}{
- "_id": _id,
- "blocks": tmp["blocks"],
- }
- blocks = append(blocks, block)
- }
- }
- delete(tmp, "blocks")
- if ju.FieldsFind {
- if f, ok := tmp["fieldall"].(map[string][]map[string]interface{}); ok {
- fieldall := map[string]interface{}{
- "_id": _id,
- }
- for k, v := range f {
- fieldall[k] = v
- }
- fieldalls = append(fieldalls, fieldall)
- }
- if ff, ok := tmp["fieldallf"].(map[string][]map[string]interface{}); ok {
- fieldallf := map[string]interface{}{
- "_id": _id,
- }
- for k, v := range ff {
- fieldallf[k] = v
- }
- fieldallsf = append(fieldalls, fieldallf)
- }
- }
- delete(tmp, "fieldall")
- delete(tmp, "fieldallf")
- v[1] = tmp //全部更新
- //v[1]["$set"] = tmp //指定更新~针对指定projectname
- }
- arr = append(arr, v)
- }
- return arr, blocks, fieldalls, fieldallsf
- }
- func (e *ExtractTask) InitAuditRecogField() {
- defer qu.Catch()
- e.RecogFieldMap = make(map[string]map[string]interface{})
- recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1)
- for _, f := range *recogFieldList {
- field := qu.ObjToString(f["s_recogfield"])
- e.RecogFieldMap[field] = f
- }
- }
- func (e *ExtractTask) InitAuditClass() {
- defer qu.Catch()
- e.FidClassMap = make(map[string][]map[string]interface{})
- class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
- for _, c := range *class {
- classList := []map[string]interface{}{}
- fid := qu.ObjToString(c["s_fid"])
- if len(e.FidClassMap[fid]) > 0 { //追加
- classList = e.FidClassMap[fid]
- }
- classList = append(classList, c)
- e.FidClassMap[fid] = classList
- }
- }
- //加载规则
- func (e *ExtractTask) InitAuditRule() {
- defer qu.Catch()
- var rureg *regexp.Regexp
- var rs []rune
- var ru string
- var err error
- e.CidRuleMap = make(map[string][]map[string]interface{})
- rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1)
- for _, v := range *rule {
- i_rule := []interface{}{}
- ss, _ := (v["s_rule"].([]interface{}))
- for _, r := range qu.ObjArrToStringArr(ss) {
- if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
- rs = []rune(r)
- ru = string(rs[1 : len(rs)-1])
- rureg, err = regexp.Compile(ru)
- if err != nil {
- log.Debug("error---rule:", r)
- continue
- }
- i_rule = append(i_rule, []interface{}{rureg}...)
- } else { //规则
- i_rule = append(i_rule, r)
- }
- }
- v["rule"] = i_rule
- ruleList := []map[string]interface{}{}
- classid := qu.ObjToString(v["s_classid"])
- if len(e.CidRuleMap[classid]) > 0 { //追加
- ruleList = e.CidRuleMap[classid]
- }
- ruleList = append(ruleList, v)
- e.CidRuleMap[classid] = ruleList
- }
- }
- //
- func (e *ExtractTask) InitAuditFields() {
- if len(e.AuditFields) == 0 {
- v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
- if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
- vid := qu.BsonIdToSId((*v)["_id"])
- query := map[string]interface{}{
- "isaudit": true,
- "delete": false,
- "vid": vid,
- }
- data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1)
- for _, d := range *data {
- field := qu.ObjToString(d["s_field"])
- e.AuditFields = append(e.AuditFields, field)
- }
- }
- }
- }
- //加载附件抽取
- func (e *ExtractTask) InitFile() {
- defer qu.Catch()
- //query:=bson.M{"version":e.TaskInfo.Version,"delete":false}
- ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`)
- //ve, _ := db.Mgo.FindOne("version", query)
- if ve == nil {
- return
- }
- if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) {
- e.IsFileField = true
- }
- syscefiled := new(sync.Map)
- if (*ve)["s_filefileds"] != nil {
- for _, vff := range (*ve)["s_filefileds"].([]interface{}) {
- syscefiled.Store(vff.(string), 1)
- }
- }
- e.FileFields = syscefiled
- ju.InitOss(ju.Config["istest"].(bool))
- }
- //加载清理任务信息
- func (c *ClearTask) InitClearTaskInfo() {
- cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil)
- if len(*cleartask) > 1 {
- v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`)
- c.ClearTaskInfo = &ClearTaskInfo{
- Name: (*cleartask)["s_taskname"].(string),
- Version: (*cleartask)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- FromDbAddr: (*cleartask)["s_mgoaddr"].(string),
- FromDB: (*cleartask)["s_mgodb"].(string),
- FromColl: (*cleartask)["s_mgocoll"].(string),
- IsCltLog: ju.Config["iscltlog"].(bool),
- ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)),
- }
- log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1))
- } else {
- return
- }
- }
- //加载清理脚本
- func (c *ClearTask) InitClearLuas() {
- defer qu.Catch()
- c.ClearLuas = make(map[string][]*ClearLua)
- list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, l := range *list {
- if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性
- continue
- }
- s_field := qu.ObjToString(l["s_field"])
- pid := qu.BsonIdToSId(l["_id"])
- luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *luas {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- clearLua := &ClearLua{
- Field: s_field,
- Code: vv["s_code"].(string),
- Name: vv["s_name"].(string),
- LuaText: vv["s_luascript"].(string),
- LFields: getALLFields(),
- }
- c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua)
- }
- }
- }
- //加载分块规则
- func (e *ExtractTask) InitBlockRule() {
- datas, _ := db.Mgo.Find("block_info", map[string]interface{}{
- "vid": e.TaskInfo.VersionId,
- "delete": false,
- }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1)
- brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{}
- for _, v := range *datas {
- block_reg, _ := v["block_reg"].(string)
- block_reg, _ = strconv.Unquote(`"` + block_reg + `"`)
- title_reg, _ := v["title_reg"].(string)
- title_reg, _ = strconv.Unquote(`"` + title_reg + `"`)
- if block_reg == "" || title_reg == "" {
- continue
- }
- b_reg, b_err := regexp.Compile(block_reg)
- t_reg, t_err := regexp.Compile(title_reg)
- if b_err != nil || t_err != nil {
- continue
- }
- brs = append(brs, b_reg)
- trs = append(trs, t_reg)
- }
- e.RuleBlock = &ju.RuleBlock{
- BlockRegs: brs,
- TitleRegs: trs,
- Classify: e.InitBlockClassify(),
- }
- }
- //加载分块规则
- func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
- classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{
- "vid": e.TaskInfo.VersionId,
- "delete": false,
- }, nil, `{"name":1}`, false, -1, -1)
- classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{
- "vid": e.TaskInfo.VersionId,
- "delete": false,
- }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1)
- classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{
- "vid": e.TaskInfo.VersionId,
- "delete": false,
- }, nil, `{"name":1,"pid":1}`, false, -1, -1)
- tag_map := map[string]ju.Tags{}
- for _, v := range *classify_tag {
- pid := qu.ObjToString(v["pid"])
- name := qu.ObjToString(v["name"])
- tag := &ju.Tag{Value: name}
- if strings.HasPrefix(name, "reg__") {
- tag.TagReg = regexp.MustCompile(strings.TrimLeft(name, "reg__"))
- }
- tag_map[pid] = append(tag_map[pid], tag)
- }
- //
- info_map := map[string][]*ju.NameCode{}
- info_tag := map[string]*ju.TagFile{}
- for _, v := range *classify_info {
- pid := qu.ObjToString(v["pid"])
- _id := qu.BsonIdToSId(v["_id"])
- name := qu.ObjToString(v["name"])
- info_tag[name] = &ju.TagFile{Name: name, Items: tag_map[_id]}
- info_map[pid] = append(info_map[pid], &ju.NameCode{Name: name, Code: qu.ObjToString(v["code"])})
- }
- classify_map := map[string][]*ju.NameCode{}
- for _, v := range *classify {
- _id := qu.BsonIdToSId(v["_id"])
- if info_map[_id] == nil {
- continue
- }
- for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") {
- classify_map[vv] = append(classify_map[vv], info_map[_id]...)
- }
- }
- return &ju.BlockClassify{Type: classify_map, Classify: info_tag}
- }
|