123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348 |
- package extract
- import (
- "fmt"
- "log"
- qu "qfw/util"
- "strings"
- )
- //省
- type Province struct {
- Name string
- Brief string
- Cap string
- Captial *City
- }
- //市
- type City struct {
- Name string
- Brief string
- P *Province
- }
- //区或县
- type District struct {
- Name string
- C *City
- }
- //街道
- type Street struct {
- Name string
- D *District
- }
// DFA is a word matcher backed by a trie of nested maps.
//
// Link is the root node. AddWordAll fills it so that every node maps a
// one-byte string to a *map[string]interface{} child, and additionally
// carries "YN" ("1" when a stored word ends on the node, otherwise "0") and,
// optionally, "K" (the stored word itself).
type DFA struct {
	Link map[string]interface{}
}
- var SortField []string
- func init() {
- qu.ReadConfig("./extractcity.json", &SortField)
- }
- func (e *ExtractTask) TransmitData(resulttmp map[string]interface{}, id string) (bres bool, p, c, d string) {
- defer qu.Catch()
- province := fmt.Sprint(resulttmp["area"])
- city := fmt.Sprint(resulttmp["city"])
- fieldval := make([]string, 0)
- for _, f := range SortField { //
- val := resulttmp[f]
- if val == nil {
- fieldval = append(fieldval, "")
- } else {
- fieldval = append(fieldval, fmt.Sprint(val))
- }
- }
- //log.Println("field========", fieldval)
- bres, c, p = e.ExtractProvinceCity(province, city, id, fieldval) //抽取省和市
- //log.Println("b--------", bres, "p---------", p, "c-------------", c)
- bres, p, c, d = e.ExtractDistrict(fieldval, bres, c, p, id) //抽取区或县
- //log.Println("bres========", bres, "p===========", p, "c=========", c, "d=============", d)
- return
- }
- //抽取区或县(从配置的字段信息中抽取区或县)
- func (e *ExtractTask) ExtractDistrict(field []string, bres bool, c, p, id string) (bool, string, string, string) {
- d := ""
- for _, str := range field {
- //log.Println("field===========", str)
- for pos, GET := range []DFA{e.AreaDistrict, e.AreaStreet} { //先匹配区或县再匹配街道
- word := GET.CheckSensitiveWord(str)
- //log.Println("word================", word)
- if word != "" {
- if pos == 0 { //区或县匹配
- //log.Println("县直接匹配到====", word)
- lock.Lock()
- city := e.DistrictCityMap[word]
- lock.Unlock()
- //log.Println("city================", city)
- if city != nil {
- d = word
- ctmp := city.Brief
- ptmp := city.P.Brief
- //log.Println("ctmpptmp================", ptmp, ctmp)
- if !bres { //城市省份没有抽到,通过区或县定位市和省
- c = ctmp
- p = ptmp
- bres = true
- } else { //对比抽到的城市省份是否一致
- if c != ctmp || p != ptmp {
- //log.Println("str---", str, "====", word)
- //log.Println("district: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
- c = ctmp
- p = ptmp
- }
- }
- }
- } else { //街道匹配
- //log.Println("匹配到街道====", word)
- lock.Lock()
- district := e.StreetDistrictMap[word]
- lock.Unlock()
- //log.Println("district================", district)
- if district != nil {
- d = district.Name
- ctmp := district.C.Brief
- ptmp := district.C.P.Brief
- //log.Println("districtptmp================", ctmp, ptmp)
- if !bres { //城市省份没有抽到,通过区或县定位市和省
- c = ctmp
- p = ptmp
- bres = true
- } else { //对比抽到的城市省份是否一致
- if c != ctmp || p != ptmp {
- //log.Println("street: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
- c = ctmp
- p = ptmp
- }
- }
- }
- }
- return bres, p, c, d
- }
- }
- }
- return bres, p, c, d
- }
- //抽取城市、省份
- func (e *ExtractTask) ExtractProvinceCity(province, city, id string, text []string) (bres bool, c, p string) {
- defer qu.Catch()
- bc := true //是否继续抽取
- if city != "" {
- lock.Lock()
- citybrief := e.CityBrief[city]
- //log.Println("citybrief========", citybrief)
- lock.Unlock()
- if citybrief == nil { //简称不存在
- log.Println("city err:", city, id)
- } else { //简称存在
- lock.Lock()
- pbrief := e.CityBrief[city].P.Brief
- //log.Println("pbrief========", pbrief)
- lock.Unlock()
- if province != pbrief { //省份不配对
- log.Println("province err:", city, province, id)
- } else {
- bc = false
- //城市省份都正确
- }
- }
- }
- //有省份
- bp := false
- lock.Lock()
- provincebrief := e.ProvinceBrief[province]
- //log.Println("provincebrief========", provincebrief)
- lock.Unlock()
- if provincebrief != nil { //省份简称正确
- bp = true
- } else { //没有省份,先识别省份
- for _, str := range text { //没有省的简称,从配置的字段信息中抽取省
- word := e.AreaProvinceGet.CheckSensitiveWord(str) //省全称DFA中匹配
- if word != "" {
- lock.Lock()
- province = e.ProvinceMap[word]
- lock.Unlock()
- bp = true
- break
- }
- }
- }
- //匹配城市
- if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
- for pos, GET := range []DFA{e.AreaGet, e.AreaSimGet} { //AreaGet市全称,AreaSimGet省全称和简称
- ws := make([]string, 5)
- for n, str := range text {
- if str != "" {
- word := GET.CheckSensitiveWord(str)
- if pos == 1 { //用简称 后辍为路、集团替换
- str1 := strings.Replace(str, word+"路", "", 1)
- if str1 != str {
- word = GET.CheckSensitiveWord(str1)
- }
- }
- ws[n] = word
- if word != "" {
- lock.Lock()
- res := e.AreaToCity[word]
- lock.Unlock()
- if len(res) == 1 {
- //判断省份
- if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回(!bp:省的简称)
- bres = true
- c = res[0].Brief
- p = res[0].P.Brief
- break
- } else { //不一致时。。暂时不处理
- }
- } else { //多个时(出现这种情况是多个省中的市,市名相同。现在的配置文件中已经将市名,县名重复的全部去掉)
- }
- }
- }
- }
- if !bres { //没有匹配到
- mc := map[string]int{}
- for _, w := range ws {
- lock.Lock()
- res := e.AreaToCity[w]
- lock.Unlock()
- for _, ct := range res {
- if ct == nil {
- continue
- }
- if bp { //有省份
- if ct.P != nil && ct.P.Brief == province {
- mc[ct.Brief]++
- }
- } else { //没有省份
- mc[ct.Brief]++
- }
- }
- }
- //计算mc中最大值且大于1
- max := 1
- v := ""
- for mk, mv := range mc {
- if mv > max {
- v = mk
- }
- }
- if v != "" {
- bres = true
- lock.Lock()
- ctb := e.CityBrief[v]
- lock.Unlock()
- c = ctb.Brief
- p = ctb.P.Brief
- } else if len(mc) > 0 {
- //取级别更大的
- v := ""
- for mk, _ := range mc {
- lock.Lock()
- cb := e.CityBrief[mk]
- lock.Unlock()
- if cb.P.Cap == mk {
- bres = true
- c = cb.Brief
- p = cb.P.Brief
- break
- } else {
- v = mk
- }
- }
- if !bres {
- bres = true
- lock.Lock()
- cbb := e.CityBrief[v]
- c = cbb.Brief
- p = cbb.P.Brief
- lock.Unlock()
- }
- }
- }
- if bres {
- break
- }
- }
- } else {
- return
- }
- if !bres {
- //取默认省会
- lock.Lock()
- pbp := e.ProvinceBrief[province]
- lock.Unlock()
- if pbp != nil {
- bres = true
- c = pbp.Cap
- p = province
- }
- }
- return
- }
- func (d *DFA) AddWord(keys ...string) {
- d.AddWordAll(true, keys...)
- }
- func (d *DFA) AddWordAll(haskey bool, keys ...string) {
- if d.Link == nil {
- d.Link = make(map[string]interface{})
- }
- for _, key := range keys {
- nowMap := &d.Link
- for i := 0; i < len(key); i++ {
- kc := key[i : i+1]
- if v, ok := (*nowMap)[kc]; ok {
- nowMap, _ = v.(*map[string]interface{})
- } else {
- newMap := map[string]interface{}{}
- newMap["YN"] = "0"
- (*nowMap)[kc] = &newMap
- nowMap = &newMap
- }
- if i == len(key)-1 {
- (*nowMap)["YN"] = "1"
- if haskey {
- (*nowMap)["K"] = key
- }
- }
- }
- }
- }
- func (d *DFA) CheckSensitiveWord(src string) string {
- pos := 0
- nowMap := &d.Link
- res := ""
- for i := 0; i < len(src); i++ {
- word := src[i : i+1]
- nowMap, _ = (*nowMap)[word].(*map[string]interface{})
- if nowMap != nil { // 存在,则判断是否为最后一个
- if pos == 0 {
- pos = i
- }
- if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
- res = qu.ObjToString((*nowMap)["K"])
- //pos = 0
- //break
- }
- } else {
- if res != "" {
- break
- } else {
- nowMap = &d.Link
- if pos > 0 {
- i = pos
- pos = 0
- }
- }
- }
- }
- return res
- }
|