// extractInit package extract import ( "github.com/sensitive" db "jy/mongodbutil" ju "jy/util" qu "qfw/util" "regexp" "sort" "strconv" "strings" "sync" "time" "gopkg.in/mgo.v2/bson" log "github.com/donnie4w/go-logger/logger" "github.com/go-ego/gse" ) type RegLuaInfo struct { //正则或脚本信息 Code, Name, Field string // Score float64 RuleText string // IsLua bool // RegPreBac *ExtReg // RegCore *ExtReg // } type ExtReg struct { Reg *regexp.Regexp Replace string Bextract bool ExtractPos map[string]int NumSign int //正负修正标记,例如浮动率(上浮正1、下浮负-1) } type RuleCore struct { Id string //id Field string //逻辑字段 LuaLogic string //进入逻辑 ExtFrom string //从哪个字段抽取 RulePres []*RegLuaInfo //抽取前置规则 RuleBacks []*RegLuaInfo //抽取后置规则 RuleCores []*RegLuaInfo //抽取规则 KVRuleCores []*RegLuaInfo //KV抽取清理规则 LFields map[string]string //所有字段属性组 } type Tag struct { Type string //标签类型 string 字符串、regexp 正则 Key string // Reg *regexp.Regexp // } type TaskInfo struct { Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表 FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名 ToDbAddr, ToDB, ToColl string //结果数据库地址、库名、表名 TestColl, LastExtId string //测试结果表、上次抽取信息id FDB *db.Pool //数据库连接池 TDB *db.Pool //数据库连接池 IsEtxLog bool //是否开启抽取日志 ProcessPool chan bool //任务进程池 TestLua bool //检查测试用 } type ExtractTask struct { Id string //任务id IsRun bool //是否启动 Content string //信息内容 TaskInfo *TaskInfo //任务信息 RulePres []*RegLuaInfo //通用前置规则 RuleBacks []*RegLuaInfo //通用后置规则 SiteRuleBacks []*RegLuaInfo //站点通用后置规则 RuleBlock *ju.RuleBlock RuleCores map[string]map[string][]*RuleCore //分类抽取规则 SiteRuleCores map[string]map[string][]*RuleCore //站点分类抽取规则 PkgRuleCores []*RuleCore //分包抽取规则 Tag map[string][]*Tag //标签库 SiteTag map[string][]*Tag //站点标签库 ClearFn map[string][]string //清理函数 SiteClearFn map[string][]string //站点清理函数 IsExtractCity bool //是否开启城市抽取 Fields map[string]int //抽取属性组 SiteFields map[string]int //抽取站点属性组 IsFileField bool //是否开启附件抽取 FileFields *sync.Map //抽取附件属性组 ResultChanel chan bool //抽取结果详情 sync.RWMutex ResultArr [][]map[string]interface{} //抽取结果详情 BidChanel chan bool //抽取结果 BidArr [][]map[string]interface{} //抽取结果 BidTotal int //结果数量 RecogFieldMap map[string]map[string]interface{} //识别字段 FidClassMap map[string][]map[string]interface{} //分类 CidRuleMap map[string][]map[string]interface{} //规则 AuditFields []string //需要审核的字段名称 SiteCityMap map[string]*SiteCity //站点对应的省市区 ProvinceMap map[string]string //省全称简称(key:浙江省 val:浙江) ProvinceBriefMap map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{}) CityMap map[string]string //市全称简称(key:杭州市 val:杭州) CityBriefMap map[string]*City //市简称对应的市信息(key:杭州 val:&City{}) CityFullMap map[string]*City //市全称对应的市信息(key:杭州市 val:&City{}) DistrictCityMap map[string][]*City //区或县全称对应的city(全国有相同名称的区或县,这里对应的city用slice) DistrictSimAndAll map[string][]map[string]*City //区或县简称对应的city(全国有相同名称的区或县,这里对应的city用slice) StreetDistrictMap map[string][]*District //街道全称对应的区或县 ProvinceAllGet *ju.DFA //省全称 ProvinceSimGet *ju.DFA //省简称 CityAllGet *ju.DFA //市全称 CitySimGet *ju.DFA //市简称 DistrictAllGet *ju.DFA //区或县全称 DistrictSimGet *ju.DFA //区或县简称 StreetGet *ju.DFA //街道 PostCodeMap map[string]*PostCode //邮编 AreaCodeMap map[string]*AreaCode //区号 XjbtCityArr []map[string]interface{} //新疆兵团相关数据 SensitiveFullCity *sensitive.Filter SensitiveSimCity *sensitive.Filter InfoType []map[string]interface{} Trie_Full_Province *ju.Trie //省全称 省、直辖市、自治区 Trie_Full_City *ju.Trie //市全称 地级市 Trie_Full_District *ju.Trie //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区 Trie_Full_Street *ju.Trie //街道、乡镇全称 镇、乡、民族乡、县辖区、街道 Trie_Full_Community *ju.Trie //村/委员会全称 村、居委会 Trie_Sim_Province *ju.Trie //省简称 Trie_Sim_City *ju.Trie //市简称 Trie_Sim_District *ju.Trie //县简称 Trie_Fulls []*ju.Trie //所有全称 Trie_Sims []*ju.Trie //所有简称 Seg_PCD *gse.Segmenter //分词 Seg_SV *gse.Segmenter //分词 Luacodes *sync.Map //站点规则 SiteMerge *sync.Map //抽取合并 } type SiteCity struct { P string //省简称 C string //市全称 D string //区全称 } type ClearTaskInfo struct { Name, Version, VersionId string //名称、版本、版本id FromDbAddr, FromDB, FromColl string //清理数据库地址、库名、表名 FDB *db.Pool //数据库连接池 TDB *db.Pool //数据库连接池 IsCltLog bool //是否开启清理日志 ProcessPool chan bool //任务进程池 } type ClearLua struct { Field string //字段字段 Code string //代码 Name string //名称 LuaText string LFields map[string]string //lua抽取字段属性组 //LuaLogic string //进入逻辑 //ExtFrom string //从哪个字段抽取 } type ClearTask struct { sync.RWMutex Id string //任务id Content string //信息内容 ClearTaskInfo *ClearTaskInfo //任务信息 ClearLuas map[string][]*ClearLua //清理脚本 UpdateResult [][]map[string]interface{} //清理后结果 //ClearChannel chan bool } func init() { TaskList = make(map[string]*ExtractTask) ClearTaskList = make(map[string]*ClearTask) go SaveExtLog() go SaveCltLog() //保存清理日志 } //加载任务信息 func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) { task, _ := db.Mgo.FindById("task", e.Id, nil) if len(*task) > 1 { v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`) e.TaskInfo = &TaskInfo{ Name: (*task)["s_taskname"].(string), Version: (*task)["s_version"].(string), VersionId: qu.BsonIdToSId((*v)["_id"]), TrackColl: trackcoll, FromDbAddr: (*task)["s_mgoaddr"].(string), FromDB: (*task)["s_mgodb"].(string), FromColl: (*task)["s_mgocoll"].(string), TestColl: resultcoll, IsEtxLog: true, ProcessPool: make(chan bool, 1), } if (*v)["isextractcity"] != nil { e.IsExtractCity = (*v)["isextractcity"].(bool) } } else { return } } //加载任务信息 func (e *ExtractTask) InitTaskInfo() { task, _ := db.Mgo.FindById("task", e.Id, nil) log.Debug("task", task, "~", e.Id) if len(*task) > 1 { v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`) strs := strings.Split((*task)["s_mgosavecoll"].(string), "/") log.Debug("s_mgosavecoll", strs) if len(strs) < 3 { return } else { e.TaskInfo = &TaskInfo{ Name: (*task)["s_taskname"].(string), Version: (*task)["s_version"].(string), VersionId: qu.BsonIdToSId((*v)["_id"]), //TrackColl: (*task)["s_trackcoll"].(string), FromDbAddr: (*task)["s_mgoaddr"].(string), FromDB: (*task)["s_mgodb"].(string), FromColl: (*task)["s_mgocoll"].(string), ToDbAddr: strs[0], ToDB: strs[1], ToColl: strs[2], IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool), LastExtId: qu.ObjToString((*task)["s_extlastid"]), ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)), } if (*v)["isextractcity"] != nil { e.IsExtractCity = (*v)["isextractcity"].(bool) } } log.Debug(e.TaskInfo.Name, "thread:", qu.IntAllDef((*task)["i_process"], 1)) } else { return } } func (e *ExtractTask) InitSite() { e.Luacodes = &sync.Map{} e.SiteMerge = &sync.Map{} sites, _ := db.Mgo.Find("site_management", bson.M{"version": e.TaskInfo.Version}, nil, bson.M{"site_script": 1, "ismerge": 1}, false, -1, -1) for _, v := range *sites { if vv, ok := v["site_script"].([]interface{}); ok { for _, vvv := range vv { e.Luacodes.Store(vvv, map[string]interface{}{}) e.SiteMerge.Store(vvv, v["ismerge"].(bool)) } } else if vv, ok := v["site_script"].(interface{}); ok { e.Luacodes.Store(vv, map[string]interface{}{}) e.SiteMerge.Store(vv, v["ismerge"].(bool)) } } } //加载通用前置规则 func (e *ExtractTask) InitRulePres() { defer qu.Catch() e.RulePres = []*RegLuaInfo{} list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *list { rinfo := &RegLuaInfo{ Code: v["s_code"].(string), Name: v["s_name"].(string), IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), } if rinfo.IsLua { rinfo.RuleText = v["s_luascript"].(string) e.RulePres = append(e.RulePres, rinfo) } else { qu.Try(func() { rinfo.RuleText = v["s_rule"].(string) tmp := strings.Split(rinfo.RuleText, "__") var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } if len(tmp) == 2 { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]} } else { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""} } e.RulePres = append(e.RulePres, rinfo) }, func(err interface{}) { log.Debug(rinfo.Code, rinfo.Field, err) }) } } } //加载通用后置规则 func (e *ExtractTask) InitRuleBacks(isSite bool) { defer qu.Catch() cDB := "" eSiteRuleBacks := []*RegLuaInfo{} if isSite { cDB = "site_rule_back" e.SiteRuleBacks = []*RegLuaInfo{} } else { cDB = "rule_back" e.RuleBacks = []*RegLuaInfo{} } list, _ := db.Mgo.Find(cDB, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *list { rinfo := &RegLuaInfo{ Code: v["s_code"].(string), Name: v["s_name"].(string), IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), } if rinfo.IsLua { rinfo.RuleText = v["s_luascript"].(string) if isSite { eSiteRuleBacks = append(eSiteRuleBacks, rinfo) //e.SiteRuleBacks = append(e.SiteRuleBacks, rinfo) } else { e.RuleBacks = append(e.RuleBacks, rinfo) } } else { qu.Try(func() { rinfo.RuleText = v["s_rule"].(string) tmp := strings.Split(rinfo.RuleText, "__") var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } if len(tmp) == 2 { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]} } else { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""} } if isSite { eSiteRuleBacks = append(eSiteRuleBacks, rinfo) } else { e.RuleBacks = append(e.RuleBacks, rinfo) } }, func(err interface{}) { log.Debug(rinfo.Code, rinfo.Field, err) }) } if isSite { sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1}) if (*sm) == nil || len(*sm) <= 0 { eSiteRuleBacks = []*RegLuaInfo{} continue } for _, v2 := range (*sm)["site_script"].([]interface{}) { if mdpvalue, ok := e.Luacodes.Load(v2); ok { if mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] == nil { mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = eSiteRuleBacks } else { if tmplist, ok3 := mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo); ok3 { tmplist = append(tmplist, eSiteRuleBacks...) mdpvalue.(map[string]interface{})["e.SiteRuleBacks"] = tmplist } //mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo) = append(mdpvalue.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo), eSiteRuleBacks...) } e.Luacodes.Store(v2, mdpvalue) } } eSiteRuleBacks = []*RegLuaInfo{} } } } func (e *ExtractTask) InfoTypeList() { infolist1, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1) infolist := *infolist1 for _, v := range infolist { e.InfoType = append(e.InfoType, v) } } //加载抽取规则 func (e *ExtractTask) InitRuleCore(isSite bool) { defer qu.Catch() allFields := getALLFields() var versioninfodb, rule_logicdb, rule_logicpredb, rule_logicbackdb, rule_logicoredb, rule_logickvdb string eSiteRuleCores := make(map[string]map[string][]*RuleCore) if isSite { versioninfodb = "site_versioninfo" rule_logicdb = "site_rule_logic" rule_logicpredb = "site_rule_logicpre" rule_logicbackdb = "site_rule_logicback" rule_logicoredb = "site_rule_logicore" rule_logickvdb = "site_rule_logickv" e.SiteFields = map[string]int{} e.SiteRuleCores = make(map[string]map[string][]*RuleCore) } else { versioninfodb = "versioninfo" rule_logicdb = "rule_logic" rule_logicpredb = "rule_logicpre" rule_logicbackdb = "rule_logicback" rule_logicoredb = "rule_logicore" rule_logickvdb = "rule_logickv" e.Fields = map[string]int{} e.RuleCores = make(map[string]map[string][]*RuleCore) } fieldrules := map[string][]*RuleCore{} vinfos, _ := db.Mgo.Find(versioninfodb, `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1) for _, vinfo := range *vinfos { //fmt.Println("总计",len(*vinfos),"当前第N个",kkkk) if b, _ := vinfo["isuse"].(bool); !b { continue } s_field := qu.ObjToString(vinfo["s_field"]) pid := qu.BsonIdToSId(vinfo["_id"]) list, _ := db.Mgo.Find(rule_logicdb, `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1) for _, vv := range *list { if b, _ := vv["isuse"].(bool); !b { continue } rcore := &RuleCore{Id: qu.BsonIdToSId(vv["_id"])} rcore.Field = s_field rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本 rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string) rcore.LFields = allFields //前置规则 rulePres := []*RegLuaInfo{} plist, _ := db.Mgo.Find(rule_logicpredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *plist { rinfo := &RegLuaInfo{ Field: qu.ObjToString(v["s_field"]), Code: v["s_code"].(string), Name: v["s_name"].(string), IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), } if rinfo.IsLua { rinfo.RuleText = v["s_luascript"].(string) rulePres = append(rulePres, rinfo) } else { qu.Try(func() { rinfo.RuleText = v["s_rule"].(string) tmp := strings.Split(rinfo.RuleText, "__") var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } if len(tmp) == 2 { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]} } else { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""} } rulePres = append(rulePres, rinfo) }, func(err interface{}) { log.Debug(rinfo.Code, rinfo.Field, err) }) } } rcore.RulePres = rulePres //后置规则 ruleBacks := []*RegLuaInfo{} blist, _ := db.Mgo.Find(rule_logicbackdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *blist { rinfo := &RegLuaInfo{ Field: qu.ObjToString(v["s_field"]), Code: v["s_code"].(string), Name: v["s_name"].(string), IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), } if rinfo.IsLua { rinfo.RuleText = v["s_luascript"].(string) ruleBacks = append(ruleBacks, rinfo) } else { qu.Try(func() { rinfo.RuleText = v["s_rule"].(string) tmp := strings.Split(rinfo.RuleText, "__") var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } if len(tmp) == 2 { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]} } else { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""} } ruleBacks = append(ruleBacks, rinfo) }, func(err interface{}) { log.Debug(rinfo.Code, rinfo.Field, err) }) } } rcore.RuleBacks = ruleBacks //抽取规则 ruleCores := []*RegLuaInfo{} clist, _ := db.Mgo.Find(rule_logicoredb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *clist { if b, _ := v["isuse"].(bool); !b { continue } field := qu.ObjToString(v["s_field"]) if isSite { e.SiteFields[field] = 1 } else { e.Fields[field] = 1 //加入抽取属性组备用 } rinfo := &RegLuaInfo{ Field: field, Code: v["s_code"].(string), Name: v["s_name"].(string), Score: qu.Float64All(v["s_default_score"]), IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), } if rinfo.IsLua { rinfo.RuleText = v["s_luascript"].(string) //提取全部属性 ruleCores = append(ruleCores, rinfo) } else { qu.Try(func() { rinfo.RuleText = v["s_rule"].(string) tmp := strings.Split(rinfo.RuleText, "__") var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } if len(tmp) == 2 { epos := strings.Split(tmp[1], ",") posm := map[string]int{} for _, v := range epos { ks := strings.Split(v, ":") if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area posm[ks[1]] = qu.IntAll(ks[0]) } else { //(.*)招标公告__2 posm[rinfo.Field] = qu.IntAll(ks[0]) } } rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm} } else { rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false} } ruleCores = append(ruleCores, rinfo) }, func(err interface{}) { log.Debug(rinfo.Code, rinfo.Field, err) }) } } rcore.RuleCores = ruleCores //kv规则 kvRuleCores := []*RegLuaInfo{} kvlist, _ := db.Mgo.Find(rule_logickvdb, `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *kvlist { if b, _ := v["isuse"].(bool); !b { continue } field := qu.ObjToString(v["s_field"]) if isSite { e.SiteFields[field] = 1 } else { e.Fields[field] = 1 //加入抽取属性组备用 } rinfo := &RegLuaInfo{ Field: field, Code: v["s_code"].(string), Name: v["s_name"].(string), IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), } qu.Try(func() { rinfo.RuleText = v["s_rule"].(string) tmp := strings.Split(rinfo.RuleText, "__") var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } if len(tmp) == 2 { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]} } else { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""} } kvRuleCores = append(kvRuleCores, rinfo) }, func(err interface{}) { log.Debug(rinfo.Code, rinfo.Field, err) }) } rcore.KVRuleCores = kvRuleCores if fieldrules[s_field] == nil { fieldrules[s_field] = []*RuleCore{} } fieldrules[s_field] = append(fieldrules[s_field], rcore) } infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1) for _, v := range *infolist { topclass := qu.ObjToString(v["topclass"]) if v["subclass"] == nil { eSiteRuleCores[topclass] = make(map[string][]*RuleCore) for attr, _ := range v["fields"].(map[string]interface{}) { if fieldrules[attr] != nil { eSiteRuleCores[topclass][attr] = fieldrules[attr] } } } else { for ca, fs := range v["subclass"].(map[string]interface{}) { eSiteRuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore) for field, _ := range fs.(map[string]interface{}) { if fieldrules[field] != nil { eSiteRuleCores[topclass+"_"+ca][field] = fieldrules[field] } } } } } if isSite { sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(vinfo["pid"]), bson.M{"site_script": 1}) if (*sm) == nil || len(*sm) <= 0 { eSiteRuleCores = make(map[string]map[string][]*RuleCore) fieldrules = map[string][]*RuleCore{} continue } for _, v2 := range (*sm)["site_script"].([]interface{}) { if mdpvalue, ok := e.Luacodes.Load(v2); ok { //属性配置 if mdpvalue.(map[string]interface{})["e.SiteRuleCores"] == nil { mdpvalue.(map[string]interface{})["e.SiteRuleCores"] = eSiteRuleCores } else { for k2, v2 := range eSiteRuleCores { tmpv := mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2] for kkkk, vvv := range v2 { tmpv[kkkk] = vvv } mdpvalue.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)[k2] = tmpv } } e.Luacodes.Store(v2, mdpvalue) } } eSiteRuleCores = make(map[string]map[string][]*RuleCore) fieldrules = map[string][]*RuleCore{} } } if !isSite { //属性配置 infolist, _ := db.Mgo.Find("infotype", `{}`, `{}`, `{}`, false, -1, -1) for _, v := range *infolist { topclass := qu.ObjToString(v["topclass"]) if v["subclass"] == nil { e.RuleCores[topclass] = make(map[string][]*RuleCore) for attr, _ := range v["fields"].(map[string]interface{}) { if fieldrules[attr] != nil { e.RuleCores[topclass][attr] = fieldrules[attr] } } } else { for ca, fs := range v["subclass"].(map[string]interface{}) { e.RuleCores[topclass+"_"+ca] = make(map[string][]*RuleCore) for field, _ := range fs.(map[string]interface{}) { if fieldrules[field] != nil { e.RuleCores[topclass+"_"+ca][field] = fieldrules[field] } } } } } } } //加载分包抽取规则 func (e *ExtractTask) InitPkgCore() { defer qu.Catch() e.PkgRuleCores = []*RuleCore{} pkginfos, _ := db.Mgo.Find("pkg_info", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1) for _, pkginfo := range *pkginfos { if b, _ := pkginfo["isuse"].(bool); !b { continue } s_field := qu.ObjToString(pkginfo["s_field"]) sid := qu.BsonIdToSId(pkginfo["_id"]) rcore := &RuleCore{} rcore.Field = s_field rcore.ExtFrom = "detail" //后置规则 ruleBacks := []*RegLuaInfo{} blist, _ := db.Mgo.Find("pkg_logicback", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *blist { rinfo := &RegLuaInfo{ Field: qu.ObjToString(v["s_field"]), Code: v["s_code"].(string), Name: v["s_name"].(string), IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), } if rinfo.IsLua { rinfo.RuleText = v["s_luascript"].(string) ruleBacks = append(ruleBacks, rinfo) } else { qu.Try(func() { rinfo.RuleText = v["s_rule"].(string) tmp := strings.Split(rinfo.RuleText, "__") var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } if len(tmp) == 2 { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: tmp[1]} } else { rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(pattern), Replace: ""} } ruleBacks = append(ruleBacks, rinfo) }, func(err interface{}) { log.Debug(rinfo.Code, rinfo.Field, err) }) } } rcore.RuleBacks = ruleBacks //抽取规则 ruleCores := []*RegLuaInfo{} clist, _ := db.Mgo.Find("pkg_logicore", `{"sid":"`+sid+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *clist { if b, _ := v["isuse"].(bool); !b { continue } field := qu.ObjToString(v["s_field"]) e.Fields[field] = 1 //加入抽取属性组备用 rinfo := &RegLuaInfo{ Field: field, Code: v["s_code"].(string), Name: v["s_name"].(string), IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool), } if rinfo.IsLua { rinfo.RuleText = v["s_luascript"].(string) //提取全部属性 ruleCores = append(ruleCores, rinfo) } else { qu.Try(func() { rinfo.RuleText = v["s_rule"].(string) tmp := strings.Split(rinfo.RuleText, "__") var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } if len(tmp) == 2 { epos := strings.Split(tmp[1], ",") posm := map[string]int{} for _, v := range epos { ks := strings.Split(v, ":") if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area posm[ks[1]] = qu.IntAll(ks[0]) } else { //(.*)招标公告__2 posm[rinfo.Field] = qu.IntAll(ks[0]) } } rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: true, ExtractPos: posm} } else { rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(pattern), Bextract: false} } ruleCores = append(ruleCores, rinfo) }, func(err interface{}) { log.Debug(rinfo.Code, rinfo.Field, err) }) } } rcore.RuleCores = ruleCores e.PkgRuleCores = append(e.PkgRuleCores, rcore) } } //加载标签库 func (e *ExtractTask) InitTag(isSite bool) { defer qu.Catch() var tagdetailinfodb string eSiteTag := map[string][]*Tag{} if isSite { tagdetailinfodb = "site_tagdetailinfo" e.SiteTag = map[string][]*Tag{} } else { tagdetailinfodb = "tagdetailinfo" e.Tag = map[string][]*Tag{} } //字符串标签库 list, _ := db.Mgo.Find(tagdetailinfodb, `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1) var tmpMap sync.Map for _, v := range *list { field := qu.ObjToString(v["s_field"]) if tmp, ok := v["content"].([]interface{}); ok { fname := qu.ObjToString(v["s_name"]) tab := ju.TagFile{Name: fname} //用于表格kv tab.Items = make([]*ju.Tag, len(tmp)) for k, key := range tmp { tag := &Tag{Type: "string", Key: key.(string)} if isSite { eSiteTag[field] = append(eSiteTag[field], tag) //e.SiteTag[field] = append(e.SiteTag[field], tag) } else { e.Tag[field] = append(e.Tag[field], tag) } tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, nil, false} } sort.Sort(tab.Items) //ju.TagdbTable[fname] = &tab if isSite { sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1}) if (*sm) == nil || len(*sm) <= 0 { eSiteTag = map[string][]*Tag{} continue } for _, v2 := range (*sm)["site_script"].([]interface{}) { if v2 == nil || v2 == "" { continue } if mdpvalue, ok := e.Luacodes.Load(v2); ok { if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil { mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag } else { for k2, v2 := range eSiteTag { mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2 } } e.Luacodes.Store(v2, mdpvalue) } tmpMap.Store(fname, &tab) ju.SiteTagdbTable.Store(v2, tmpMap) } //ju.SiteTagdbTable.Store(fname, &tab) eSiteTag = map[string][]*Tag{} } else { ju.TagdbTable.Store(fname, &tab) } } //if isSite { // sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1}) // for _, v2 := range (*sm)["site_script"].([]interface{}) { // if mdpvalue, ok := Luacodes.Load(v2); ok { // if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil{ // mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag // }else { // for k2,v2 := range eSiteTag{ // mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2 // } // } // Luacodes.Store(v2, mdpvalue) // } // } // eSiteTag = map[string][]*Tag{} //} } //正则标签库 list, _ = db.Mgo.Find(tagdetailinfodb, `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *list { field := qu.ObjToString(v["s_field"]) if tmp, ok := v["content"].([]interface{}); ok { fname := qu.ObjToString(v["s_name"]) tab := ju.TagFile{Name: fname, Type: "reg"} //用于表格kv tab.Items = make([]*ju.Tag, len(tmp)) for k, key := range tmp { tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))} if isSite { eSiteTag[field] = append(eSiteTag[field], tag) //e.SiteTag[field] = append(e.SiteTag[field], tag) } else { e.Tag[field] = append(e.Tag[field], tag) } tab.Items[k] = &ju.Tag{"", key.(string), 0 - k, regexp.MustCompile(key.(string)), false} } sort.Sort(tab.Items) //ju.TagdbTable[fname+"_reg"] = &tab if isSite { ju.SiteTagdbTable.Store(fname+"_reg", &tab) } else { ju.TagdbTable.Store(fname+"_reg", &tab) } } if isSite { sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(v["pid"]), bson.M{"site_script": 1}) if (*sm) == nil || len(*sm) <= 0 { eSiteTag = map[string][]*Tag{} continue } for _, v2 := range (*sm)["site_script"].([]interface{}) { if mdpvalue, ok := e.Luacodes.Load(v2); ok { if mdpvalue.(map[string]interface{})["e.SiteTag"] == nil { mdpvalue.(map[string]interface{})["e.SiteTag"] = eSiteTag } else { for k2, v2 := range eSiteTag { mdpvalue.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)[k2] = v2 } } e.Luacodes.Store(v2, mdpvalue) } } eSiteTag = map[string][]*Tag{} } } } //获取fields func getALLFields() map[string]string { fields := map[string]string{} list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1,"s_name"}`, false, -1, -1) for _, v := range *list { fields[qu.ObjToString(v["s_name"])] = qu.ObjToString(v["s_field"]) } return fields } //加载clear函数 func (e *ExtractTask) InitClearFn(isSite bool) { defer qu.Catch() var cleanupdb string if isSite { cleanupdb = "site_cleanup" e.SiteClearFn = map[string][]string{} } else { cleanupdb = "cleanup" } list, _ := db.Mgo.Find(cleanupdb, `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1) fn := map[string][]string{} for _, tmp := range *list { field := tmp["s_field"].(string) fns := tmp["clear"].([]interface{}) if fn[field] == nil { fn[field] = []string{} } for _, v := range fns { fn[field] = append(fn[field], v.(string)) } if isSite { sm, _ := db.Mgo.FindById("site_management", qu.ObjToString(tmp["pid"]), bson.M{"site_script": 1}) if (*sm) == nil || len(*sm) <= 0 { fn = map[string][]string{} continue } for _, v2 := range (*sm)["site_script"].([]interface{}) { if mdpvalue, ok := e.Luacodes.Load(v2); ok { if mdpvalue.(map[string]interface{})["e.SiteClearFn"] == nil { mdpvalue.(map[string]interface{})["e.SiteClearFn"] = fn } else { for k2, v2 := range fn { mdpvalue.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)[k2] = v2 } } e.Luacodes.Store(v2, mdpvalue) } } fn = map[string][]string{} } } if !isSite { e.ClearFn = fn } } //加载省份 func InitProvince(version string) map[string]interface{} { defer qu.Catch() fn := map[string]interface{}{} list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+version+`","delete":false}`, nil, nil, false, -1, -1) for _, v := range *list { name := qu.ObjToString(v["s_name"]) content := v["content"] switch content.(type) { case string: fn[name] = []interface{}{content.(string)} case []interface{}: fn[name] = content } } return fn } //加载所有 func InitProvincesx() []map[string]interface{} { defer qu.Catch() provinces := make([]map[string]interface{}, 0) ju.AddrsSess.Find(map[string]interface{}{ "Remarks": nil, }).All(&provinces) return provinces } //加载站点库site城市信息 func InitSite() []map[string]interface{} { defer qu.Catch() query := map[string]interface{}{ "site_type": map[string]interface{}{ "$ne": "代理机构", }, } list, _ := ju.Site_Mgo.Find("site", query, nil, map[string]interface{}{ "site": 1, "area": 1, "city": 1, "district": 1, }) return list //list, _ := db.Mgo.Find("site", query, nil, `{"site":1,"area":1,"city":1,"district":1}`, false, -1, -1) //return *list } //加载新疆兵团映射关系 func (e *ExtractTask) InitXjbtCityInfo() { defer qu.Catch() //加载数据 query := map[string]interface{}{} list, _ := db.Mgo.Find("area_xjbt", query, nil, nil, false, -1, -1) arr := []map[string]interface{}{} for _, v := range *list { delete(v, "_id") arr = append(arr, v) } e.XjbtCityArr = arr } //站点加载... func (e *ExtractTask) InitUpdateSite() { defer qu.Catch() e.SiteCityMap = make(map[string]*SiteCity) for _, v := range InitSite() { site := qu.ObjToString(v["site"]) area := qu.ObjToString(v["area"]) city := qu.ObjToString(v["city"]) district := qu.ObjToString(v["district"]) if area != "" && area != "全国" && site != "" { s := &SiteCity{ P: area, C: city, D: district, } e.SiteCityMap[site] = s } } log.Debug("有效站点数量:", len(e.SiteCityMap)) } func (e *ExtractTask) InitCityInfo() { defer qu.Catch() e.InitVar() //初始化变量 //新疆兵团数据 e.InitXjbtCityInfo() //site站点信息 e.InitUpdateSite() //初始化省信息 fn1 := InitProvince(e.TaskInfo.Version) for k, v := range fn1 { for _, p := range v.([]interface{}) { p1, _ := p.(string) e.Trie_Full_Province.AddWords(p1) //华中科技大学 e.ProvinceMap[p1] = k //华中科技大学:湖北 } } alldata := InitProvincesx() fnx := make([]map[string]interface{}, 0) citys_maps := make(map[string][]map[string]interface{}, 0) districts_maps := make(map[string]map[string][]map[string]interface{}, 0) towns_maps := make(map[string]map[string]map[string][]map[string]interface{}, 0) jwhs_maps := make(map[string]map[string]map[string]map[string][]map[string]interface{}, 0) for _, v := range alldata { codenum := len(v["code"].(string)) province := qu.ObjToString(v["province"]) city := qu.ObjToString(v["city"]) district := qu.ObjToString(v["district"]) town := qu.ObjToString(v["town"]) if codenum == 2 { fnx = append(fnx, v) } else if codenum == 4 { citys_maps[province] = append(citys_maps[province], v) } else if codenum == 6 { if districts_maps[province] == nil { districts_maps[province] = make(map[string][]map[string]interface{}, 0) } districts_maps[province][city] = append(districts_maps[province][city], v) } else if codenum == 9 { if towns_maps[province] == nil { towns_maps[province] = make(map[string]map[string][]map[string]interface{}, 0) } if towns_maps[province][city] == nil { towns_maps[province][city] = make(map[string][]map[string]interface{}, 0) } towns_maps[province][city][district] = append(towns_maps[province][city][district], v) } else if codenum == 12 { if jwhs_maps[province] == nil { jwhs_maps[province] = make(map[string]map[string]map[string][]map[string]interface{}, 0) } if jwhs_maps[province][city] == nil { jwhs_maps[province][city] = make(map[string]map[string][]map[string]interface{}, 0) } if jwhs_maps[province][city][district] == nil { jwhs_maps[province][city][district] = make(map[string][]map[string]interface{}, 0) } jwhs_maps[province][city][district][town] = append(jwhs_maps[province][city][district][town], v) } } //初始化城市全称 for _, provinces := range fnx { all_province := qu.ObjToString(provinces["all_province"]) //省全称 jc_province := qu.ObjToString(provinces["province"]) //省简称 //加载省信息 e.Trie_Full_Province.AddWords(all_province) //加入省全称Trie(k:浙江省) p := &Province{} p.Name = all_province //省全称:浙江省 p.Brief = jc_province //省简称:浙江 e.Trie_Sim_Province.AddWords(jc_province) //加入省简称Trie(k:浙江) e.ProvinceMap[all_province] = jc_province //浙江省:浙江 e.ProvinceBriefMap[jc_province] = p //浙江:省信息{} if province_alias, ok := provinces["province_alias"].([]interface{}); ok { for _, vprovince_alias := range province_alias { e.ProvinceBriefMap[qu.ObjToString(vprovince_alias)] = p } } //加载市信息 citys := citys_maps[jc_province] isok := make(map[string]bool) for _, vcity := range citys { qc_city := qu.ObjToString(vcity["city"]) jc_city := qu.ObjToString(vcity["brief_city"]) e.Trie_Full_City.AddWords(qc_city) //加入市全称Trie(k:杭州市) e.SensitiveFullCity.AddWord(qc_city) c := &City{} c.Name = qc_city //市全称:杭州市 if jc_city != "" { c.Brief = jc_city //市简称:杭州 e.Trie_Sim_City.AddWords(c.Brief) //加入市简称Trie(k:杭州) e.SensitiveSimCity.AddWord(c.Brief) e.CityMap[qc_city] = c.Brief //杭州市:杭州 e.CityBriefMap[c.Brief] = c //杭州:市信息{} e.CityFullMap[qc_city] = c //杭州市:市信息{} } c.P = p if city_alias, ok := vcity["city_alias"].([]interface{}); ok { for _, vcity_alias := range city_alias { strvcity_alias := qu.ObjToString(vcity_alias) if isok[jc_province+"_"+strvcity_alias] { continue } e.CityBriefMap[strvcity_alias] = c e.initDistricts(jc_province, strvcity_alias, c, jc_city, districts_maps, towns_maps, jwhs_maps) isok[jc_province+"_"+strvcity_alias] = true } } if isok[jc_province+"_"+qc_city] { continue } e.initDistricts(jc_province, qc_city, c, jc_city, districts_maps, towns_maps, jwhs_maps) } } e.Trie_Fulls = []*ju.Trie{e.Trie_Full_Province, e.Trie_Full_City, e.Trie_Full_District, e.Trie_Full_Street, e.Trie_Full_Community} e.Trie_Sims = []*ju.Trie{e.Trie_Sim_Province, e.Trie_Sim_City, e.Trie_Sim_District} } //加载区县 func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City, jc_city string, districts_maps map[string]map[string][]map[string]interface{}, towns_maps map[string]map[string]map[string][]map[string]interface{}, jwhs_maps map[string]map[string]map[string]map[string][]map[string]interface{}) { districts := districts_maps[jc_province][qc_city] for _, vdistricts := range districts { qc_district := qu.ObjToString(vdistricts["district"]) jc_district := qu.ObjToString(vdistricts["brief_district"]) d := &District{} d.Name = qc_district d.C = c e.Trie_Full_District.AddWords(qc_district) //加入区或县全称Trie if jc_district != "" { e.Trie_Sim_District.AddWords(jc_district) //加入区或县简称Trie //初始化城市简称 c := e.CityBriefMap[jc_city] dfullarr := e.DistrictSimAndAll[jc_district] dfullcity := map[string]*City{qc_district: c} if len(dfullarr) == 0 { tmparr := []map[string]*City{dfullcity} e.DistrictSimAndAll[jc_district] = tmparr } else { e.DistrictSimAndAll[jc_district] = append(e.DistrictSimAndAll[jc_district], dfullcity) } } ctmp := e.DistrictCityMap[qc_district] if len(ctmp) == 0 { tmpcarr := []*City{c} e.DistrictCityMap[qc_district] = tmpcarr } else { e.DistrictCityMap[qc_district] = append(e.DistrictCityMap[qc_district], c) } if district_alias, ok := vdistricts["district_alias"].([]interface{}); ok { for _, vdistrict_alias := range district_alias { strvdistrict_alias := qu.ObjToString(vdistrict_alias) e.Trie_Full_District.AddWords(strvdistrict_alias) //加入区或县全称Trie c_tmp := e.DistrictCityMap[strvdistrict_alias] if len(c_tmp) == 0 { tmpcarr := []*City{c} e.DistrictCityMap[strvdistrict_alias] = tmpcarr } else { e.DistrictCityMap[strvdistrict_alias] = append(e.DistrictCityMap[strvdistrict_alias], c) } } } //街道 towns := towns_maps[jc_province][qc_city][qc_district] for _, vtown := range towns { strvtown := qu.ObjToString(vtown["town"]) s := &Street{} s.Name = strvtown s.D = d e.Trie_Full_Street.AddWords(strvtown) //加入街道全称Trie dtmp := e.StreetDistrictMap[strvtown] if len(dtmp) == 0 { tmpdarr := []*District{d} e.StreetDistrictMap[strvtown] = tmpdarr } else { e.StreetDistrictMap[strvtown] = append(e.StreetDistrictMap[strvtown], d) } //村、居委会 //jwhs := jwhs_maps[jc_province][qc_city][qc_district][strvtown] //for _, vjwh := range jwhs { // strvillage := qu.ObjToString(vjwh["village"]) // e.Trie_Full_Community.AddWords(strvillage) //加入居委会、村全称Trie // cttmp := e.CommunityDistrictMap[strvillage] // if len(cttmp) == 0 { // tmpdarr := []*District{d} // e.CommunityDistrictMap[strvillage] = tmpdarr // } else { // e.CommunityDistrictMap[strvillage] = append(e.CommunityDistrictMap[strvillage], d) // } //} } } } func (e *ExtractTask) InitVar() { defer qu.Catch() //初始化Trie //全称 e.Trie_Full_Province = &ju.Trie{} e.Trie_Full_City = &ju.Trie{} e.Trie_Full_District = &ju.Trie{} e.Trie_Full_Street = &ju.Trie{} e.Trie_Full_Community = &ju.Trie{} //简称 e.Trie_Sim_Province = &ju.Trie{} e.Trie_Sim_City = &ju.Trie{} e.Trie_Sim_District = &ju.Trie{} //初始化分词 e.Seg_PCD = &gse.Segmenter{} e.Seg_SV = &gse.Segmenter{} e.Seg_PCD.LoadDict("./res/pcd.txt") e.Seg_SV.LoadDict("./res/sv.txt") //初始化城市相关 e.SiteCityMap = make(map[string]*SiteCity) e.ProvinceMap = make(map[string]string) e.CityMap = make(map[string]string) e.DistrictSimAndAll = make(map[string][]map[string]*City) e.CityBriefMap = make(map[string]*City) e.CityFullMap = make(map[string]*City) e.ProvinceBriefMap = make(map[string]*Province) e.DistrictCityMap = make(map[string][]*City) e.StreetDistrictMap = make(map[string][]*District) //新疆兵团-数组 e.XjbtCityArr = make([]map[string]interface{}, 0) //敏感词-筛选 e.SensitiveFullCity = sensitive.New() e.SensitiveSimCity = sensitive.New() } //初始化邮编库 func (e *ExtractTask) InitPostCode() { defer qu.Catch() e.PostCodeMap = make(map[string]*PostCode) list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1) for _, l := range *list { pc := &PostCode{} pc.Code = qu.ObjToString(l["code"]) pc.P = qu.ObjToString(l["province"]) pc.C = qu.ObjToString(l["city"]) pc.D = qu.ObjArrToStringArr(l["district"].([]interface{})) e.PostCodeMap[pc.Code] = pc } } //初始化区号库 func (e *ExtractTask) InitAreaCode() { defer qu.Catch() e.AreaCodeMap = make(map[string]*AreaCode) list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1) for _, l := range *list { ac := &AreaCode{} ac.Code = qu.ObjToString(l["code"]) ac.P = qu.ObjToString(l["province"]) ac.C = qu.ObjArrToStringArr(l["city"].([]interface{})) e.AreaCodeMap[ac.Code] = ac } } //保存抽取详情数据 func (e *ExtractTask) ResultSave(init bool) { defer qu.Catch() e.RWMutex.Lock() if e.ResultArr == nil { e.ResultArr = [][]map[string]interface{}{} } e.RWMutex.Unlock() if init { go func() { for { e.RWMutex.Lock() if len(e.ResultArr) > saveLimit { arr := e.ResultArr[:saveLimit] e.ResultArr = e.ResultArr[saveLimit:] e.RWMutex.Unlock() qu.Try(func() { db.Mgo.UpSertBulk("extract_result", arr...) }, func(err interface{}) { log.Debug(err) }) } else { arr := e.ResultArr e.ResultArr = [][]map[string]interface{}{} e.RWMutex.Unlock() qu.Try(func() { db.Mgo.UpSertBulk("extract_result", arr...) }, func(err interface{}) { log.Debug(err) }) } time.Sleep(2 * time.Second) } }() } else { e.RWMutex.Lock() arr := e.ResultArr e.ResultArr = [][]map[string]interface{}{} e.RWMutex.Unlock() qu.Try(func() { lenarr := len(arr) for { if lenarr > saveLimit { arr2 := arr[:saveLimit] arr = arr[saveLimit:] lenarr = len(arr) e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...) } else { e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...) break } } }, func(err interface{}) { defer e.RWMutex.Unlock() log.Debug(err) }) } } //保存抽取数据 func (e *ExtractTask) BidSave(init bool) { defer qu.Catch() e.RWMutex.Lock() if e.BidArr == nil { e.BidArr = [][]map[string]interface{}{} } e.RWMutex.Unlock() if init { go func() { for { e.RWMutex.Lock() if len(e.BidArr) > saveLimit { arr := e.BidArr[:saveLimit] e.BidArr = e.BidArr[saveLimit:] e.RWMutex.Unlock() //arr, blocks, fieldalls, fieldallsf := getFieldAllAndBlocks(arr) arr, _, _, _ = getFieldAllAndBlocks(arr) qu.Try(func() { e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...) //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...) //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...) //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...) }, func(err interface{}) { log.Debug(err) }) } else { arr := e.BidArr e.BidArr = [][]map[string]interface{}{} e.RWMutex.Unlock() arr, _, _, _ = getFieldAllAndBlocks(arr) qu.Try(func() { e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...) //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...) //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...) //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...) }, func(err interface{}) { log.Debug(err) }) } time.Sleep(2 * time.Second) } }() } else { e.RWMutex.Lock() arr := e.BidArr e.BidArr = [][]map[string]interface{}{} e.RWMutex.Unlock() qu.Try(func() { lenarr := len(arr) for { if lenarr > saveLimit { arr2 := arr[:saveLimit] arr = arr[saveLimit:] lenarr = len(arr) arr2, _, _, _ = getFieldAllAndBlocks(arr2) e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr2...) //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...) //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...) //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...) } else { arr, _, _, _ := getFieldAllAndBlocks(arr) e.TaskInfo.TDB.UpSertBulk(e.TaskInfo.ToColl, arr...) //e.TaskInfo.TDB.SaveBulk("ext_blocks", blocks...) //e.TaskInfo.TDB.SaveBulk("ext_fieldall", fieldalls...) //e.TaskInfo.TDB.SaveBulk("ext_fieldallf", fieldallsf...) break } } }, func(err interface{}) { log.Debug(err) }) time.Sleep(1 * time.Second) } } func getFieldAllAndBlocks(a [][]map[string]interface{}) (arr [][]map[string]interface{}, blocks, fieldalls, fieldallsf []map[string]interface{}) { arr = [][]map[string]interface{}{} blocks = []map[string]interface{}{} fieldalls = []map[string]interface{}{} fieldallsf = []map[string]interface{}{} for _, v := range a { _id, _ := v[0]["_id"] if tmp, ok := v[1]["$set"].(map[string]interface{}); ok { if ju.SaveBlock { if tmp["blocks"] != nil { block := map[string]interface{}{ "_id": _id, "blocks": tmp["blocks"], } blocks = append(blocks, block) } } delete(tmp, "blocks") if ju.FieldsFind { if f, ok := tmp["fieldall"].(map[string][]map[string]interface{}); ok { fieldall := map[string]interface{}{ "_id": _id, } for k, v := range f { fieldall[k] = v } fieldalls = append(fieldalls, fieldall) } if ff, ok := tmp["fieldallf"].(map[string][]map[string]interface{}); ok { fieldallf := map[string]interface{}{ "_id": _id, } for k, v := range ff { fieldallf[k] = v } fieldallsf = append(fieldalls, fieldallf) } } delete(tmp, "fieldall") delete(tmp, "fieldallf") v[1] = tmp //全部更新 //v[1]["$set"] = tmp //指定更新~针对指定projectname } arr = append(arr, v) } return arr, blocks, fieldalls, fieldallsf } func (e *ExtractTask) InitAuditRecogField() { defer qu.Catch() e.RecogFieldMap = make(map[string]map[string]interface{}) recogFieldList, _ := db.Mgo.Find("rc_field", `{"delete":false}`, `{"_id":1}`, `{"s_recogfield":1,"s_recogfield_prerule":1}`, false, -1, -1) for _, f := range *recogFieldList { field := qu.ObjToString(f["s_recogfield"]) e.RecogFieldMap[field] = f } } func (e *ExtractTask) InitAuditClass() { defer qu.Catch() e.FidClassMap = make(map[string][]map[string]interface{}) class, _ := db.Mgo.Find("rc_class", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1) for _, c := range *class { classList := []map[string]interface{}{} fid := qu.ObjToString(c["s_fid"]) if len(e.FidClassMap[fid]) > 0 { //追加 classList = e.FidClassMap[fid] } classList = append(classList, c) e.FidClassMap[fid] = classList } } //加载规则 func (e *ExtractTask) InitAuditRule() { defer qu.Catch() var rureg *regexp.Regexp var rs []rune var ru string var err error e.CidRuleMap = make(map[string][]map[string]interface{}) rule, _ := db.Mgo.Find("rc_rule", `{"delete":false}`, `{"i_order":1}`, nil, false, -1, -1) for _, v := range *rule { i_rule := []interface{}{} ss, _ := (v["s_rule"].([]interface{})) for _, r := range qu.ObjArrToStringArr(ss) { if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则 rs = []rune(r) ru = string(rs[1 : len(rs)-1]) rureg, err = regexp.Compile(ru) if err != nil { log.Debug("error---rule:", r) continue } i_rule = append(i_rule, []interface{}{rureg}...) } else { //规则 i_rule = append(i_rule, r) } } v["rule"] = i_rule ruleList := []map[string]interface{}{} classid := qu.ObjToString(v["s_classid"]) if len(e.CidRuleMap[classid]) > 0 { //追加 ruleList = e.CidRuleMap[classid] } ruleList = append(ruleList, v) e.CidRuleMap[classid] = ruleList } } // func (e *ExtractTask) InitAuditFields() { if len(e.AuditFields) == 0 { v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本 if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段 vid := qu.BsonIdToSId((*v)["_id"]) query := map[string]interface{}{ "isaudit": true, "delete": false, "vid": vid, } data, _ := db.Mgo.Find("versioninfo", query, `{"_id":-1}`, `{"s_field":1}`, false, -1, -1) for _, d := range *data { field := qu.ObjToString(d["s_field"]) e.AuditFields = append(e.AuditFields, field) } } } } //加载附件抽取 func (e *ExtractTask) InitFile() { defer qu.Catch() //query:=bson.M{"version":e.TaskInfo.Version,"delete":false} ve, _ := db.Mgo.FindOne("version", `{"version":"`+e.TaskInfo.Version+`","delete":false}`) //ve, _ := db.Mgo.FindOne("version", query) if ve == nil { return } if (*ve)["isfiles"] != nil && (*ve)["isfiles"].(bool) { e.IsFileField = true } syscefiled := new(sync.Map) if (*ve)["s_filefileds"] != nil { for _, vff := range (*ve)["s_filefileds"].([]interface{}) { syscefiled.Store(vff.(string), 1) } } e.FileFields = syscefiled ju.InitOss(ju.Config["istest"].(bool)) } //加载清理任务信息 func (c *ClearTask) InitClearTaskInfo() { cleartask, _ := db.Mgo.FindById("cleartask", c.Id, nil) if len(*cleartask) > 1 { v, _ := db.Mgo.FindOne("clearversion", `{"clearversion":"`+(*cleartask)["s_version"].(string)+`","delete":false}`) c.ClearTaskInfo = &ClearTaskInfo{ Name: (*cleartask)["s_taskname"].(string), Version: (*cleartask)["s_version"].(string), VersionId: qu.BsonIdToSId((*v)["_id"]), FromDbAddr: (*cleartask)["s_mgoaddr"].(string), FromDB: (*cleartask)["s_mgodb"].(string), FromColl: (*cleartask)["s_mgocoll"].(string), IsCltLog: ju.Config["iscltlog"].(bool), ProcessPool: make(chan bool, qu.IntAllDef((*cleartask)["i_process"], 1)), } log.Debug(c.ClearTaskInfo.Name, "thread:", qu.IntAllDef((*cleartask)["i_process"], 1)) } else { return } } //加载清理脚本 func (c *ClearTask) InitClearLuas() { defer qu.Catch() c.ClearLuas = make(map[string][]*ClearLua) list, _ := db.Mgo.Find("clearversioninfo", `{"vid":"`+c.ClearTaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1) for _, l := range *list { if b, _ := l["isuse"].(bool); !b { //仅使用启用的属性 continue } s_field := qu.ObjToString(l["s_field"]) pid := qu.BsonIdToSId(l["_id"]) luas, _ := db.Mgo.Find("clearulelogic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1) for _, vv := range *luas { if b, _ := vv["isuse"].(bool); !b { continue } clearLua := &ClearLua{ Field: s_field, Code: vv["s_code"].(string), Name: vv["s_name"].(string), LuaText: vv["s_luascript"].(string), LFields: getALLFields(), } c.ClearLuas[s_field] = append(c.ClearLuas[s_field], clearLua) } } } //加载分块规则 func (e *ExtractTask) InitBlockRule() { datas, _ := db.Mgo.Find("block_info", map[string]interface{}{ "vid": e.TaskInfo.VersionId, "delete": false, }, `{"index":-1}`, `{"block_reg":1,"title_reg":1}`, false, -1, -1) brs, trs := []*regexp.Regexp{}, []*regexp.Regexp{} for _, v := range *datas { block_reg, _ := v["block_reg"].(string) block_reg, _ = strconv.Unquote(`"` + block_reg + `"`) title_reg, _ := v["title_reg"].(string) title_reg, _ = strconv.Unquote(`"` + title_reg + `"`) if block_reg == "" || title_reg == "" { continue } b_reg, b_err := regexp.Compile(block_reg) t_reg, t_err := regexp.Compile(title_reg) if b_err != nil || t_err != nil { continue } brs = append(brs, b_reg) trs = append(trs, t_reg) } e.RuleBlock = &ju.RuleBlock{ BlockRegs: brs, TitleRegs: trs, Classify: e.InitBlockClassify(), } } //加载分块规则 func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify { classify, _ := db.Mgo.Find("block_classify", map[string]interface{}{ "vid": e.TaskInfo.VersionId, "delete": false, }, nil, `{"name":1}`, false, -1, -1) classify_info, _ := db.Mgo.Find("block_classify_info", map[string]interface{}{ "vid": e.TaskInfo.VersionId, "delete": false, }, nil, `{"name":1,"code":1,"pid":1}`, false, -1, -1) classify_tag, _ := db.Mgo.Find("block_classify_tag", map[string]interface{}{ "vid": e.TaskInfo.VersionId, "delete": false, }, nil, `{"name":1,"pid":1}`, false, -1, -1) tag_map := map[string]ju.Tags{} for _, v := range *classify_tag { pid := qu.ObjToString(v["pid"]) name := qu.ObjToString(v["name"]) tag := &ju.Tag{Value: name} if strings.HasPrefix(name, "reg__") { tag.TagReg = regexp.MustCompile(strings.TrimLeft(name, "reg__")) } tag_map[pid] = append(tag_map[pid], tag) } // info_map := map[string][]*ju.NameCode{} info_tag := map[string]*ju.TagFile{} for _, v := range *classify_info { pid := qu.ObjToString(v["pid"]) _id := qu.BsonIdToSId(v["_id"]) name := qu.ObjToString(v["name"]) info_tag[name] = &ju.TagFile{Name: name, Items: tag_map[_id]} info_map[pid] = append(info_map[pid], &ju.NameCode{Name: name, Code: qu.ObjToString(v["code"])}) } classify_map := map[string][]*ju.NameCode{} for _, v := range *classify { _id := qu.BsonIdToSId(v["_id"]) if info_map[_id] == nil { continue } for _, vv := range strings.Split(qu.ObjToString(v["name"]), ",") { classify_map[vv] = append(classify_map[vv], info_map[_id]...) } } return &ju.BlockClassify{Type: classify_map, Classify: info_tag} }