123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322 |
- package extract
- import (
- "fmt"
- //ju "jy/util"
- "log"
- qu "qfw/util"
- "strings"
- )
// Administrative hierarchy used by the extractor. Each level keeps a pointer
// to its parent so that a match at a lower level (street, district) can be
// walked up to the owning city and province.
type (
	// Province is a province-level region.
	Province struct {
		// Name is the province name.
		Name string

		// Brief is the short name; extraction results report provinces by
		// this value.
		Brief string

		// Cap is compared against city briefs and returned as the default
		// city of the province, i.e. it holds the capital city's brief.
		Cap string

		// Captial (sic — exported name kept for compatibility) presumably
		// points at the capital city's record; it is not read in this file.
		Captial *City
	}

	// City is a city-level region.
	City struct {
		// Name is the city name.
		Name string

		// Brief is the short name; extraction results report cities by this
		// value.
		Brief string

		// P is the province the city belongs to.
		P *Province
	}

	// District is a district or county.
	District struct {
		// Name is the district or county name.
		Name string

		// C is the city the district belongs to.
		C *City
	}

	// Street is a street-level (sub-district) unit.
	Street struct {
		// Name is the street name.
		Name string

		// D is the district the street belongs to.
		D *District
	}
)
// DFA is a keyword matcher (originally labelled "sensitive words") backed by
// a trie of nested maps.
//
// Link is the root node. Within a node, every one-byte string key maps to a
// *map[string]interface{} child node, "YN" holds "1" when a keyword ends at
// that node and "0" otherwise, and "K" optionally holds the full keyword.
type DFA struct {
	Link map[string]interface{}
}
- var SortField []string
- var (
- AreaGet DFA //市全称
- AreaDistrict DFA //区或县
- AreaProvinceGet DFA //省
- AreaSimGet DFA //市简称
- AreaStreet DFA //街道
- )
- var CitySimConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市简称
- var CityAllConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市全称
- var ProviceConfig map[string]interface{} = make(map[string]interface{}) //省份
- var ProvinceMap map[string]string = make(map[string]string)
- var CityBrief map[string]*City = make(map[string]*City) //只加载一次即可
- var ProvinceBrief map[string]*Province = make(map[string]*Province) //只加载一次
- var AreaToCity map[string][]*City = make(map[string][]*City) //两个文件共用
- var DistrictCityMap map[string]*City = make(map[string]*City)
- var StreetDistrictMap map[string]*District = make(map[string]*District)
- func init() {
- qu.ReadConfig("./extractcity.json", &SortField)
- }
- func TransmitData(resulttmp map[string]interface{}, id string) (bres bool, p, c, d string) {
- province := fmt.Sprint(resulttmp["area"])
- city := fmt.Sprint(resulttmp["city"])
- field := make([]string, 0)
- for _, f := range SortField { //
- val := resulttmp[f]
- if val == nil {
- field = append(field, "")
- } else {
- field = append(field, fmt.Sprint(val))
- }
- }
- bres, c, p = ExtractProvinceCity(province, city, id, field) //抽取省和市
- bres, p, c, d = ExtractDistrict(field, bres, c, p, id) //抽取区或县
- return
- }
- //抽取区或县(从配置的字段信息中抽取区或县)
- func ExtractDistrict(field []string, bres bool, c, p, id string) (bool, string, string, string) {
- d := ""
- for _, str := range field {
- for pos, GET := range []DFA{AreaDistrict, AreaStreet} { //先匹配区或县再匹配街道
- word := GET.CheckSensitiveWord(str)
- if word != "" {
- if pos == 0 { //区或县匹配
- //log.Println("县直接匹配到====", word)
- city := DistrictCityMap[word]
- if city != nil {
- d = word
- ctmp := city.Brief
- ptmp := city.P.Brief
- if !bres { //城市省份没有抽到,通过区或县定位市和省
- c = ctmp
- p = ptmp
- bres = true
- } else { //对比抽到的城市省份是否一致
- if c != ctmp || p != ptmp {
- log.Println("str---", str, "====", word)
- log.Println("district: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
- c = ctmp
- p = ptmp
- }
- }
- }
- } else { //街道匹配
- //log.Println("匹配到街道====", word)
- district := StreetDistrictMap[word]
- if district != nil {
- d = district.Name
- ctmp := district.C.Brief
- ptmp := district.C.P.Brief
- if !bres { //城市省份没有抽到,通过区或县定位市和省
- c = ctmp
- p = ptmp
- bres = true
- } else { //对比抽到的城市省份是否一致
- if c != ctmp || p != ptmp {
- log.Println("street: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
- c = ctmp
- p = ptmp
- }
- }
- }
- }
- return bres, p, c, d
- }
- }
- }
- return bres, p, c, d
- }
- //抽取城市、省份
- func ExtractProvinceCity(province, city, id string, field []string) (bres bool, c, p string) {
- defer qu.Catch()
- bc := true //是否继续抽取
- if city != "" {
- if CityBrief[city] == nil { //简称不存在
- //log.Println("city err:", city, id)
- } else { //简称存在
- if province != CityBrief[city].P.Brief { //省份不配对
- //log.Println("province err:", city, province, id)
- } else {
- bc = false
- //城市省份都正确
- }
- }
- }
- //有省份
- bp := false
- if ProvinceBrief[province] != nil { //省份简称正确
- bp = true
- } else { //没有省份,先识别省份
- for _, str := range field { //没有省的简称,从配置的字段信息中抽取省
- word := AreaProvinceGet.CheckSensitiveWord(str) //省全称DFA中匹配
- if word != "" {
- province = ProvinceMap[word] //
- bp = true
- break
- }
- }
- }
- //匹配城市
- if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
- //目前是全匹配模式,如果再加上精简匹配,加一层循环
- for pos, GET := range []DFA{AreaGet, AreaSimGet} { //AreaGet市全称,AreaSimGet省全称和简称
- ws := make([]string, 5)
- for n, str := range field {
- if str != "" {
- word := GET.CheckSensitiveWord(str)
- if pos == 1 { //用简称 后辍为路、集团替换
- str1 := strings.Replace(str, word+"路", "", 1)
- if str1 != str {
- word = GET.CheckSensitiveWord(str1)
- }
- }
- ws[n] = word
- if word != "" {
- res := AreaToCity[word]
- if len(res) == 1 {
- //判断省份
- if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回(!bp:省的简称)
- bres = true
- c = res[0].Brief
- p = res[0].P.Brief
- break
- } else { //不一致时。。暂时不处理
- }
- } else { //多个时(出现这种情况是多个省中的市,市名相同)
- }
- }
- }
- }
- if !bres { //没有匹配到
- mc := map[string]int{}
- for _, w := range ws {
- res := AreaToCity[w]
- for _, ct := range res {
- if ct == nil {
- continue
- }
- if bp { //有省份
- if ct.P != nil && ct.P.Brief == province {
- mc[ct.Brief]++
- }
- } else { //没有省份
- mc[ct.Brief]++
- }
- }
- }
- //计算mc中最大值且大于1
- max := 1
- v := ""
- for mk, mv := range mc {
- if mv > max {
- v = mk
- }
- }
- if v != "" {
- bres = true
- c = CityBrief[v].Brief
- p = CityBrief[v].P.Brief
- } else if len(mc) > 0 {
- //取级别更大的
- v := ""
- for mk, _ := range mc {
- if CityBrief[mk].P.Cap == mk {
- bres = true
- c = CityBrief[mk].Brief
- p = CityBrief[mk].P.Brief
- break
- } else {
- v = mk
- }
- }
- if !bres {
- bres = true
- c = CityBrief[v].Brief
- p = CityBrief[v].P.Brief
- }
- }
- }
- if bres {
- break
- }
- }
- } else {
- return
- }
- if !bres {
- //取默认省会
- if ProvinceBrief[province] != nil {
- bres = true
- c = ProvinceBrief[province].Cap
- p = province
- }
- }
- return
- }
- func (d *DFA) AddWord(keys ...string) {
- d.AddWordAll(true, keys...)
- }
- func (d *DFA) AddWordAll(haskey bool, keys ...string) {
- if d.Link == nil {
- d.Link = make(map[string]interface{})
- }
- for _, key := range keys {
- nowMap := &d.Link
- for i := 0; i < len(key); i++ {
- kc := key[i : i+1]
- if v, ok := (*nowMap)[kc]; ok {
- nowMap, _ = v.(*map[string]interface{})
- } else {
- newMap := map[string]interface{}{}
- newMap["YN"] = "0"
- (*nowMap)[kc] = &newMap
- nowMap = &newMap
- }
- if i == len(key)-1 {
- (*nowMap)["YN"] = "1"
- if haskey {
- (*nowMap)["K"] = key
- }
- }
- }
- }
- }
- func (d *DFA) CheckSensitiveWord(src string) string {
- pos := 0
- nowMap := &d.Link
- res := ""
- for i := 0; i < len(src); i++ {
- word := src[i : i+1]
- nowMap, _ = (*nowMap)[word].(*map[string]interface{})
- if nowMap != nil { // 存在,则判断是否为最后一个
- if pos == 0 {
- pos = i
- }
- if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
- res = qu.ObjToString((*nowMap)["K"])
- //pos = 0
- //break
- }
- } else {
- if res != "" {
- break
- } else {
- nowMap = &d.Link
- if pos > 0 {
- i = pos
- pos = 0
- }
- }
- }
- }
- return res
- }
|