|
@@ -2,6 +2,7 @@
|
|
package extract
|
|
package extract
|
|
|
|
|
|
import (
|
|
import (
|
|
|
|
+ "fmt"
|
|
db "jy/mongodbutil"
|
|
db "jy/mongodbutil"
|
|
"log"
|
|
"log"
|
|
qu "qfw/util"
|
|
qu "qfw/util"
|
|
@@ -45,6 +46,20 @@ type Tag struct {
|
|
Key string //
|
|
Key string //
|
|
Reg *regexp.Regexp //
|
|
Reg *regexp.Regexp //
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+type City struct {
|
|
|
|
+ Name string
|
|
|
|
+ Brief string
|
|
|
|
+ P *Province
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+type Province struct {
|
|
|
|
+ Name string
|
|
|
|
+ Brief string
|
|
|
|
+ Cap string
|
|
|
|
+ Captial *City
|
|
|
|
+}
|
|
|
|
+
|
|
type ExtractTask struct {
|
|
type ExtractTask struct {
|
|
Id string //任务id
|
|
Id string //任务id
|
|
IsRun bool //是否启动
|
|
IsRun bool //是否启动
|
|
@@ -57,6 +72,11 @@ type ExtractTask struct {
|
|
ClearFn map[string][]string //清理函数
|
|
ClearFn map[string][]string //清理函数
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// DFA is a keyword ("sensitive word") matcher backed by a trie of nested maps.
type DFA struct {
	// Link is the root node of the trie. AddWordAll fills it: every edge is
	// keyed by a one-byte substring of a word and points to a child node of
	// type *map[string]interface{}; nodes additionally carry the marker
	// entries "YN" ("1" when a word ends at the node, otherwise "0") and,
	// optionally, "K" (the complete word).
	Link map[string]interface{}
}
|
|
|
|
+
|
|
func init() {
|
|
func init() {
|
|
TaskList = make(map[string]*ExtractTask)
|
|
TaskList = make(map[string]*ExtractTask)
|
|
go SaveExtLog()
|
|
go SaveExtLog()
|
|
@@ -302,7 +322,7 @@ func (e *ExtractTask) InitTag() {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
e.Tag = map[string][]*Tag{}
|
|
e.Tag = map[string][]*Tag{}
|
|
//字符串标签库
|
|
//字符串标签库
|
|
- list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"字符串","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
|
|
|
+ list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
for _, v := range *list {
|
|
for _, v := range *list {
|
|
field := qu.ObjToString(v["s_field"])
|
|
field := qu.ObjToString(v["s_field"])
|
|
if tmp, ok := v["content"].([]interface{}); ok {
|
|
if tmp, ok := v["content"].([]interface{}); ok {
|
|
@@ -313,7 +333,7 @@ func (e *ExtractTask) InitTag() {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//正则标签库
|
|
//正则标签库
|
|
- list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"正则","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
|
|
|
+ list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
for _, v := range *list {
|
|
for _, v := range *list {
|
|
field := qu.ObjToString(v["s_field"])
|
|
field := qu.ObjToString(v["s_field"])
|
|
if tmp, ok := v["content"].([]interface{}); ok {
|
|
if tmp, ok := v["content"].([]interface{}); ok {
|
|
@@ -351,3 +371,231 @@ func (e *ExtractTask) InitClearFn() {
|
|
}
|
|
}
|
|
e.ClearFn = fn
|
|
e.ClearFn = fn
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+//加载省份
|
|
|
|
+func (e *ExtractTask) InitProvince() {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ fn := map[string]interface{}{}
|
|
|
|
+ list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
|
+ for _, v := range *list {
|
|
|
|
+ name := qu.ObjToString(v["s_name"])
|
|
|
|
+ fn[name] = qu.ObjArrToStringArr(v["content"].([]interface{}))
|
|
|
|
+ }
|
|
|
|
+ ProviceConfig = fn
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+//加载城市简称
|
|
|
|
+func (e *ExtractTask) InitCitySim() {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
|
+ fn := map[string]map[string]interface{}{}
|
|
|
|
+ for _, v := range *list {
|
|
|
|
+ name := qu.ObjToString(v["s_name"])
|
|
|
|
+ tmp := v["content"].(map[string]interface{})
|
|
|
|
+ fn[name] = tmp
|
|
|
|
+ }
|
|
|
|
+ CitySimConfig = fn
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+//加载城市全称
|
|
|
|
+func (e *ExtractTask) InitCityAll() {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
|
|
|
|
+ if len(*list) != 34 {
|
|
|
|
+ fmt.Println("加载城市配置文件出错", len(*list))
|
|
|
|
+ }
|
|
|
|
+ fn := map[string]map[string]interface{}{}
|
|
|
|
+ for _, v := range *list {
|
|
|
|
+ name := qu.ObjToString(v["s_name"])
|
|
|
|
+ tmp := v["content"].(map[string]interface{})
|
|
|
|
+ fn[name] = tmp
|
|
|
|
+ }
|
|
|
|
+ CityAllConfig = fn
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+//初始化城市省份敏感词
|
|
|
|
+func InitDfa() {
|
|
|
|
+ AreaGet = DFA{}
|
|
|
|
+ AreaProvinceGet = DFA{}
|
|
|
|
+ for k, v := range ProviceConfig {
|
|
|
|
+ for _, p := range v.([]interface{}) {
|
|
|
|
+ p1, _ := p.(string)
|
|
|
|
+ AreaProvinceGet.AddWord(p1)
|
|
|
|
+ ProvinceMap[p1] = k
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ for k, v := range CityAllConfig {
|
|
|
|
+ AreaProvinceGet.AddWord(k)
|
|
|
|
+ p := &Province{}
|
|
|
|
+ p.Name = k
|
|
|
|
+ p.Brief = v["brief"].(string)
|
|
|
|
+ ProvinceMap[k] = p.Brief
|
|
|
|
+ ProvinceBrief[p.Brief] = p
|
|
|
|
+ p.Cap = v["captial"].(string)
|
|
|
|
+ city, _ := v["city"].(map[string]interface{})
|
|
|
|
+ for k1, v1 := range city {
|
|
|
|
+ v1m, _ := v1.(map[string]interface{})
|
|
|
|
+ c := &City{}
|
|
|
|
+ c.Name = k1
|
|
|
|
+ if v1m["brief"] == nil {
|
|
|
|
+ log.Println(k, k1)
|
|
|
|
+ }
|
|
|
|
+ c.Brief = v1m["brief"].(string)
|
|
|
|
+ //cityAll[k1] = c
|
|
|
|
+ CityBrief[c.Brief] = c
|
|
|
|
+ c.P = p
|
|
|
|
+ if c.Brief == p.Cap {
|
|
|
|
+ p.Captial = c
|
|
|
|
+ }
|
|
|
|
+ //加入到城市map中
|
|
|
|
+ cs := AreaToCity[k1]
|
|
|
|
+ AreaGet.AddWord(k1)
|
|
|
|
+ if cs != nil {
|
|
|
|
+ cs = append(cs, c)
|
|
|
|
+ } else {
|
|
|
|
+ cs = []*City{c}
|
|
|
|
+ }
|
|
|
|
+ AreaToCity[k1] = cs
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ AreaToCity["衢州市"] = []interface{}{
|
|
|
|
+ &City{
|
|
|
|
+ c.Name = 衢州市,
|
|
|
|
+ c.Brief = 衢州,
|
|
|
|
+ c.P = xxx
|
|
|
|
+ },
|
|
|
|
+ }
|
|
|
|
+ */
|
|
|
|
+
|
|
|
|
+ arr := v1m["area"].([]interface{})
|
|
|
|
+ for _, k2 := range arr {
|
|
|
|
+ s := k2.(string)
|
|
|
|
+ cs := AreaToCity[s]
|
|
|
|
+ AreaGet.AddWord(s)
|
|
|
|
+ if cs != nil {
|
|
|
|
+ cs = append(cs, c)
|
|
|
|
+ } else {
|
|
|
|
+ cs = []*City{c}
|
|
|
|
+ }
|
|
|
|
+ AreaToCity[s] = cs
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ //加载简称
|
|
|
|
+ AreaSimGet = DFA{}
|
|
|
|
+ //util.ReadConfig("./city_sim.json", &CitySimConfig)
|
|
|
|
+ if len(CitySimConfig) != 34 {
|
|
|
|
+ log.Println("加载简称配置文件出错", len(CitySimConfig))
|
|
|
|
+ }
|
|
|
|
+ for k, v := range CitySimConfig {
|
|
|
|
+ pb := v["brief"].(string)
|
|
|
|
+ p := ProvinceBrief[pb]
|
|
|
|
+ //加载
|
|
|
|
+ for _, ss := range []string{k, pb} {
|
|
|
|
+ cs := AreaToCity[ss]
|
|
|
|
+ if cs != nil {
|
|
|
|
+ cs = append(cs, p.Captial)
|
|
|
|
+ } else {
|
|
|
|
+ cs = []*City{p.Captial}
|
|
|
|
+ }
|
|
|
|
+ AreaToCity[ss] = cs
|
|
|
|
+ AreaSimGet.AddWord(ss)
|
|
|
|
+ }
|
|
|
|
+ city, _ := v["city"].(map[string]interface{})
|
|
|
|
+ for k1, v1 := range city {
|
|
|
|
+ v1m, _ := v1.(map[string]interface{})
|
|
|
|
+ if v1m["brief"] == nil {
|
|
|
|
+ log.Println(k, k1)
|
|
|
|
+ }
|
|
|
|
+ cb := v1m["brief"].(string)
|
|
|
|
+ c := AreaToCity[k1][0]
|
|
|
|
+ //加入到城市map中
|
|
|
|
+ for _, ss := range []string{cb, k + cb, pb + cb} {
|
|
|
|
+ AreaSimGet.AddWord(ss)
|
|
|
|
+ cs := AreaToCity[ss]
|
|
|
|
+ if cs != nil {
|
|
|
|
+ cs = append(cs, c)
|
|
|
|
+ } else {
|
|
|
|
+ cs = []*City{c}
|
|
|
|
+ }
|
|
|
|
+ AreaToCity[ss] = cs
|
|
|
|
+ }
|
|
|
|
+ arr := v1m["area"].([]interface{})
|
|
|
|
+ for _, k2 := range arr {
|
|
|
|
+ s := k2.(string)
|
|
|
|
+ for _, ss := range []string{s, cb + s, pb + s, k + s} {
|
|
|
|
+ cs := AreaToCity[ss]
|
|
|
|
+ AreaSimGet.AddWord(ss)
|
|
|
|
+ if cs != nil {
|
|
|
|
+ cs = append(cs, c)
|
|
|
|
+ } else {
|
|
|
|
+ cs = []*City{c}
|
|
|
|
+ }
|
|
|
|
+ AreaToCity[ss] = cs
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func (d *DFA) AddWord(keys ...string) {
|
|
|
|
+ d.AddWordAll(true, keys...)
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func (d *DFA) AddWordAll(haskey bool, keys ...string) {
|
|
|
|
+ if d.Link == nil {
|
|
|
|
+ d.Link = make(map[string]interface{})
|
|
|
|
+ }
|
|
|
|
+ for _, key := range keys {
|
|
|
|
+ nowMap := &d.Link
|
|
|
|
+ for i := 0; i < len(key); i++ {
|
|
|
|
+ kc := key[i : i+1]
|
|
|
|
+ if v, ok := (*nowMap)[kc]; ok {
|
|
|
|
+ nowMap, _ = v.(*map[string]interface{})
|
|
|
|
+ } else {
|
|
|
|
+ newMap := map[string]interface{}{}
|
|
|
|
+ newMap["YN"] = "0"
|
|
|
|
+ (*nowMap)[kc] = &newMap
|
|
|
|
+ nowMap = &newMap
|
|
|
|
+ }
|
|
|
|
+ if i == len(key)-1 {
|
|
|
|
+ (*nowMap)["YN"] = "1"
|
|
|
|
+ if haskey {
|
|
|
|
+ (*nowMap)["K"] = key
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+//匹配最长
|
|
|
|
+func (d *DFA) CheckSensitiveWord(src string) string {
|
|
|
|
+ pos := 0
|
|
|
|
+ nowMap := &d.Link
|
|
|
|
+ res := ""
|
|
|
|
+ for i := 0; i < len(src); i++ {
|
|
|
|
+ word := src[i : i+1]
|
|
|
|
+ nowMap, _ = (*nowMap)[word].(*map[string]interface{})
|
|
|
|
+ if nowMap != nil { // 存在,则判断是否为最后一个
|
|
|
|
+ if pos == 0 {
|
|
|
|
+ pos = i
|
|
|
|
+ }
|
|
|
|
+ if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
|
|
|
|
+ res = qu.ObjToString((*nowMap)["K"])
|
|
|
|
+ //pos = 0
|
|
|
|
+ //break
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ if res != "" {
|
|
|
|
+ break
|
|
|
|
+ } else {
|
|
|
|
+ nowMap = &d.Link
|
|
|
|
+ if pos > 0 {
|
|
|
|
+ i = pos
|
|
|
|
+ pos = 0
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return res
|
|
|
|
+}
|