123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620 |
- // extractInit
- package extract
- import (
- db "jy/mongodbutil"
- "log"
- qu "qfw/util"
- "regexp"
- "strings"
- )
- type RegLuaInfo struct { //正则或脚本信息
- Code, Name, Field string //
- RuleText string //
- IsLua, IsHasFields bool //IsHasFields正则配置有属性字段
- RegPreBac *ExtReg //
- RegCore *ExtReg //
- LFields []interface{} //lua抽取字段属性组
- }
// ExtReg pairs a compiled regular expression with the settings that were
// parsed from the same rule string.
type ExtReg struct {
	Reg        *regexp.Regexp // compiled pattern (the part of the rule before "__")
	Replace    string         // replacement text of a pre/post rule (the part after "__", or "")
	Bextract   bool           // true when the core rule carried a position list after "__"
	ExtractPos map[string]int // field name -> position number parsed from that list
}
- type RuleCore struct {
- Field string //逻辑字段
- LuaLogic string //进入逻辑
- ExtFrom string //从哪个字段抽取
- RulePres []*RegLuaInfo //抽取前置规则
- RuleBacks []*RegLuaInfo //抽取后置规则
- RuleCores []*RegLuaInfo //抽取规则
- }
- type TaskInfo struct {
- Name, Version, VersionId, TrackColl string //名称、版本、版本id、追踪记录表
- FromDbAddr, FromDB, FromColl string //抽取数据库地址、库名、表名
- SaveColl, TestColl, LastExtId string //抽取结果表、测试结果表、上次抽取信息id
- DB *db.Pool //数据库连接池
- IsEtxLog bool //是否开启抽取日志
- ProcessPool chan bool //任务进程池
- TestLua bool //检查测试用
- }
// Tag is one entry of the tag library.
type Tag struct {
	Type string         // tag kind: "string" for a literal, "regexp" for a pattern
	Key  string         // the literal text or the pattern source
	Reg  *regexp.Regexp // compiled pattern; only set for "regexp" tags
}
- type City struct {
- Name string
- Brief string
- P *Province
- }
- type Province struct {
- Name string
- Brief string
- Cap string
- Captial *City
- }
- type ExtractTask struct {
- Id string //任务id
- IsRun bool //是否启动
- Content string //信息内容
- TaskInfo *TaskInfo //任务信息
- RulePres []*RegLuaInfo //通用前置规则
- RuleBacks []*RegLuaInfo //通用后置规则
- RuleCores []*RuleCore //抽取规则
- Tag map[string][]*Tag //标签库
- ClearFn map[string][]string //清理函数
- }
// DFA is a keyword matcher (the original note calls it a sensitive-word
// matcher). Link is the root node of the trie built by AddWordAll.
type DFA struct {
	Link map[string]interface{}
}
- func init() {
- TaskList = make(map[string]*ExtractTask)
- go SaveExtLog()
- }
- //加载任务信息
- func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- TrackColl: trackcoll,
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- TestColl: resultcoll,
- IsEtxLog: true,
- ProcessPool: make(chan bool, 1),
- }
- } else {
- return
- }
- }
- //加载任务信息
- func (e *ExtractTask) InitTaskInfo() {
- task, _ := db.Mgo.FindById("task", e.Id, nil)
- log.Println("task", task)
- if len(*task) > 1 {
- v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`","delete":false}`)
- e.TaskInfo = &TaskInfo{
- Name: (*task)["s_taskname"].(string),
- Version: (*task)["s_version"].(string),
- VersionId: qu.BsonIdToSId((*v)["_id"]),
- //TrackColl: (*task)["s_trackcoll"].(string),
- FromDbAddr: (*task)["s_mgoaddr"].(string),
- FromDB: (*task)["s_mgodb"].(string),
- FromColl: (*task)["s_mgocoll"].(string),
- SaveColl: (*task)["s_mgosavecoll"].(string),
- IsEtxLog: false, //qu.If(qu.IntAll((*task)["i_track"]) == 1, true, false).(bool),
- LastExtId: qu.ObjToString((*task)["s_extlastid"]),
- ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
- }
- log.Println(e.TaskInfo.Name, e.TaskInfo.ProcessPool)
- } else {
- return
- }
- }
- //加载通用前置规则
- func (e *ExtractTask) InitRulePres() {
- defer qu.Catch()
- list, _ := db.Mgo.Find("rule_pre", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- e.RulePres = append(e.RulePres, rinfo)
- }
- }
- //加载通用后置规则
- func (e *ExtractTask) InitRuleBacks() {
- defer qu.Catch()
- list, _ := db.Mgo.Find("rule_back", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- e.RuleBacks = append(e.RuleBacks, rinfo)
- }
- }
- //加载抽取规则
- func (e *ExtractTask) InitRuleCore() {
- defer qu.Catch()
- vinfos, _ := db.Mgo.Find("versioninfo", `{"vid":"`+e.TaskInfo.VersionId+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vinfo := range *vinfos {
- if b, _ := vinfo["isuse"].(bool); !b {
- continue
- }
- pid := qu.BsonIdToSId(vinfo["_id"])
- list, _ := db.Mgo.Find("rule_logic", `{"pid":"`+pid+`","delete":false}`, nil, nil, false, -1, -1)
- for _, vv := range *list {
- if b, _ := vv["isuse"].(bool); !b {
- continue
- }
- rcore := &RuleCore{}
- rcore.Field = vinfo["s_field"].(string)
- rcore.LuaLogic = qu.ObjToString(vv["s_luascript"]) //是否进入逻辑脚本
- rcore.ExtFrom = qu.If(vv["extfrom"].(bool), "title", "detail").(string)
- //前置规则
- rulePres := []*RegLuaInfo{}
- plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *plist {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- rinfo.Field = v["s_field"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- rulePres = append(rulePres, rinfo)
- }
- rcore.RulePres = rulePres
- //后置规则
- ruleBacks := []*RegLuaInfo{}
- blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *blist {
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- rinfo.Field = v["s_field"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
- } else {
- rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- ruleBacks = append(ruleBacks, rinfo)
- }
- rcore.RuleBacks = ruleBacks
- //抽取规则
- ruleCores := []*RegLuaInfo{}
- clist, _ := db.Mgo.Find("rule_logicore", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *clist {
- if b, _ := v["isuse"].(bool); !b {
- continue
- }
- rinfo := &RegLuaInfo{
- Code: v["s_code"].(string),
- Name: v["s_name"].(string),
- IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
- }
- if rinfo.IsLua {
- rinfo.RuleText = v["s_luascript"].(string)
- //暂时提取全部属性
- rinfo.LFields = getALLFields()
- rinfo.IsHasFields = true
- /*rinfo.LFields, _ = v["s_fields"].([]interface{})
- if len(rinfo.LFields) > 0 {
- rinfo.IsHasFields = true
- }*/
- } else {
- qu.Try(func() {
- rinfo.RuleText = v["s_rule"].(string)
- rinfo.Field = v["s_field"].(string)
- tmp := strings.Split(rinfo.RuleText, "__")
- if len(tmp) == 2 {
- epos := strings.Split(tmp[1], ",")
- posm := map[string]int{}
- for _, v := range epos {
- ks := strings.Split(v, ":")
- if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
- posm[ks[1]] = qu.IntAll(ks[0])
- } else { //(.*)招标公告__2
- posm[rinfo.Field] = qu.IntAll(ks[0])
- }
- }
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
- } else {
- rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
- }
- }, func(err interface{}) {
- log.Println(rinfo.Code, rinfo.Field, err)
- })
- }
- ruleCores = append(ruleCores, rinfo)
- }
- rcore.RuleCores = ruleCores
- //
- e.RuleCores = append(e.RuleCores, rcore)
- }
- }
- }
- //加载标签库
- func (e *ExtractTask) InitTag() {
- defer qu.Catch()
- e.Tag = map[string][]*Tag{}
- //字符串标签库
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- for _, key := range tmp {
- tag := &Tag{Type: "string", Key: key.(string)}
- e.Tag[field] = append(e.Tag[field], tag)
- }
- }
- }
- //正则标签库
- list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- field := qu.ObjToString(v["s_field"])
- if tmp, ok := v["content"].([]interface{}); ok {
- for _, key := range tmp {
- tag := &Tag{Type: "regexp", Key: key.(string), Reg: regexp.MustCompile(key.(string))}
- e.Tag[field] = append(e.Tag[field], tag)
- }
- }
- }
- }
- //获取fields
- func getALLFields() []interface{} {
- fields := []interface{}{}
- list, _ := db.Mgo.Find("fields", `{}`, nil, `{"s_field":1}`, false, -1, -1)
- for _, v := range *list {
- fields = append(fields, v["s_field"])
- }
- return fields
- }
- //加载clear函数
- func (e *ExtractTask) InitClearFn() {
- list, _ := db.Mgo.Find("cleanup", `{"s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string][]string{}
- for _, tmp := range *list {
- field := tmp["s_field"].(string)
- fns := tmp["clear"].([]interface{})
- if fn[field] == nil {
- fn[field] = []string{}
- }
- for _, v := range fns {
- fn[field] = append(fn[field], v.(string))
- }
- }
- e.ClearFn = fn
- }
- //加载省份
- func (e *ExtractTask) InitProvince() {
- defer qu.Catch()
- fn := map[string]interface{}{}
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- content := v["content"]
- switch content.(type) {
- case string:
- fn[name] = []interface{}{content.(string)}
- case []interface{}:
- fn[name] = content
- }
- }
- ProviceConfig = fn
- }
- //加载城市简称
- func (e *ExtractTask) InitCitySim() {
- defer qu.Catch()
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- fn := map[string]map[string]interface{}{}
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- tmp := v["content"].(map[string]interface{})
- fn[name] = tmp
- }
- CitySimConfig = fn
- }
- //加载城市全称
- func (e *ExtractTask) InitCityAll() {
- defer qu.Catch()
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+e.TaskInfo.Version+`","delete":false}`, nil, nil, false, -1, -1)
- // if len(*list) != 34 {
- // fmt.Println("加载城市配置文件出错", len(*list))
- // }
- fn := map[string]map[string]interface{}{}
- for _, v := range *list {
- name := qu.ObjToString(v["s_name"])
- tmp := v["content"].(map[string]interface{})
- fn[name] = tmp
- }
- CityAllConfig = fn
- }
- //初始化城市省份敏感词
- func InitDFA() {
- AreaGet = DFA{}
- AreaProvinceGet = DFA{}
- for k, v := range ProviceConfig {
- log.Println(k, "----------", v)
- for _, p := range v.([]interface{}) {
- log.Println("ppppp", p)
- p1, _ := p.(string)
- AreaProvinceGet.AddWord(p1)
- ProvinceMap[p1] = k
- }
- }
- log.Println("ProvinceMap11----", ProvinceMap)
- for k, v := range CityAllConfig {
- AreaProvinceGet.AddWord(k) //省全称
- p := &Province{}
- p.Name = k
- p.Brief = v["brief"].(string)
- ProvinceMap[k] = p.Brief
- log.Println("ProvinceMap22----", ProvinceMap)
- ProvinceBrief[p.Brief] = p
- p.Cap = v["captial"].(string)
- log.Println("ProvinceBrief11====", p.Brief, ProvinceBrief[p.Brief].Name, ProvinceBrief[p.Brief].Brief, "==", ProvinceBrief[p.Brief].Cap)
- city, _ := v["city"].(map[string]interface{})
- log.Println("======================================================")
- for k1, v1 := range city {
- v1m, _ := v1.(map[string]interface{})
- c := &City{}
- c.Name = k1
- if v1m["brief"] == nil {
- log.Println(k, k1)
- }
- c.Brief = v1m["brief"].(string)
- //cityAll[k1] = c
- CityBrief[c.Brief] = c
- c.P = p
- if c.Brief == p.Cap {
- p.Captial = c
- }
- log.Println("CityBrief11+++", k1, "---", CityBrief[c.Brief].Name, CityBrief[c.Brief].Brief, "===", CityBrief[c.Brief].P.Captial, "===", CityBrief[c.Brief].P.Name)
- //加入到城市map中
- cs := AreaToCity[k1]
- AreaGet.AddWord(k1) //市全称
- if cs != nil {
- cs = append(cs, c)
- } else {
- cs = []*City{c}
- }
- AreaToCity[k1] = cs
- log.Println("市---", k1, AreaToCity[k1][0].Brief, AreaToCity[k1][0].Name, AreaToCity[k1][0].P.Name)
- /*
- AreaToCity["衢州市"] = []interface{}{
- &City{
- c.Name = 衢州市,
- c.Brief = 衢州,
- c.P = xxx
- },
- }
- */
- arr := v1m["area"].([]interface{})
- for _, k2 := range arr {
- s := k2.(string)
- cs := AreaToCity[s]
- AreaGet.AddWord(s) //街道全称
- if cs != nil {
- cs = append(cs, c)
- } else {
- cs = []*City{c}
- }
- AreaToCity[s] = cs
- log.Println("街道===", k2, AreaToCity)
- }
- }
- }
- log.Println("======================================================")
- //加载简称
- AreaSimGet = DFA{}
- //util.ReadConfig("./city_sim.json", &CitySimConfig)
- // if len(CitySimConfig) != 34 {
- // log.Println("加载简称配置文件出错", len(CitySimConfig))
- // }
- for k, v := range CitySimConfig {
- pb := v["brief"].(string)
- p := ProvinceBrief[pb]
- log.Println("++++++++++++++++++", p)
- //加载
- for _, ss := range []string{k, pb} {
- cs := AreaToCity[ss]
- if cs != nil {
- cs = append(cs, p.Captial)
- } else {
- cs = []*City{p.Captial}
- }
- AreaToCity[ss] = cs
- log.Println("+++", ss, AreaToCity)
- AreaSimGet.AddWord(ss) //省全称和省简称
- }
- city, _ := v["city"].(map[string]interface{})
- for k1, v1 := range city {
- v1m, _ := v1.(map[string]interface{})
- if v1m["brief"] == nil {
- log.Println(k, k1)
- }
- cb := v1m["brief"].(string)
- c := AreaToCity[k1][0]
- //加入到城市map中
- for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
- AreaSimGet.AddWord(ss)
- cs := AreaToCity[ss]
- if cs != nil {
- cs = append(cs, c)
- } else {
- cs = []*City{c}
- }
- AreaToCity[ss] = cs
- log.Println("+-+-", ss, AreaToCity)
- }
- arr := v1m["area"].([]interface{})
- for _, k2 := range arr {
- s := k2.(string)
- for _, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
- cs := AreaToCity[ss]
- AreaSimGet.AddWord(ss)
- if cs != nil {
- cs = append(cs, c)
- } else {
- cs = []*City{c}
- }
- AreaToCity[ss] = cs
- log.Println("-+-+", ss, AreaToCity)
- }
- }
- }
- }
- log.Println(AreaToCity)
- }
- func (d *DFA) AddWord(keys ...string) {
- d.AddWordAll(true, keys...)
- }
- func (d *DFA) AddWordAll(haskey bool, keys ...string) {
- if d.Link == nil {
- d.Link = make(map[string]interface{})
- }
- for _, key := range keys {
- nowMap := &d.Link
- for i := 0; i < len(key); i++ {
- kc := key[i : i+1]
- if v, ok := (*nowMap)[kc]; ok {
- nowMap, _ = v.(*map[string]interface{})
- } else {
- newMap := map[string]interface{}{}
- newMap["YN"] = "0"
- (*nowMap)[kc] = &newMap
- nowMap = &newMap
- }
- if i == len(key)-1 {
- (*nowMap)["YN"] = "1"
- if haskey {
- (*nowMap)["K"] = key
- }
- }
- }
- }
- }
- func (d *DFA) CheckSensitiveWord(src string) string {
- pos := 0
- nowMap := &d.Link
- res := ""
- for i := 0; i < len(src); i++ {
- word := src[i : i+1]
- nowMap, _ = (*nowMap)[word].(*map[string]interface{})
- if nowMap != nil { // 存在,则判断是否为最后一个
- if pos == 0 {
- pos = i
- }
- if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
- res = qu.ObjToString((*nowMap)["K"])
- //pos = 0
- //break
- }
- } else {
- if res != "" {
- break
- } else {
- nowMap = &d.Link
- if pos > 0 {
- i = pos
- pos = 0
- }
- }
- }
- }
- return res
- }
|