extractcity.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322
  1. package extract
  2. import (
  3. "fmt"
  4. //ju "jy/util"
  5. "log"
  6. qu "qfw/util"
  7. "strings"
  8. )
  9. //省
  10. type Province struct {
  11. Name string
  12. Brief string
  13. Cap string
  14. Captial *City
  15. }
  16. //市
  17. type City struct {
  18. Name string
  19. Brief string
  20. P *Province
  21. }
  22. //区或县
  23. type District struct {
  24. Name string
  25. C *City
  26. }
  27. //街道
  28. type Street struct {
  29. Name string
  30. D *District
  31. }
  // DFA is a keyword ("sensitive word") matcher stored as nested maps.
  type DFA struct {
  	// Link is the root node. Its methods in this file store child nodes as
  	// *map[string]interface{} under one-byte keys, next to the bookkeeping
  	// entries "YN" and "K".
  	Link map[string]interface{}
  }
  36. var SortField []string
  37. var (
  38. AreaGet DFA //市全称
  39. AreaDistrict DFA //区或县
  40. AreaProvinceGet DFA //省
  41. AreaSimGet DFA //市简称
  42. AreaStreet DFA //街道
  43. )
  44. var CitySimConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市简称
  45. var CityAllConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市全称
  46. var ProviceConfig map[string]interface{} = make(map[string]interface{}) //省份
  47. var ProvinceMap map[string]string = make(map[string]string)
  48. var CityBrief map[string]*City = make(map[string]*City) //只加载一次即可
  49. var ProvinceBrief map[string]*Province = make(map[string]*Province) //只加载一次
  50. var AreaToCity map[string][]*City = make(map[string][]*City) //两个文件共用
  51. var DistrictCityMap map[string]*City = make(map[string]*City)
  52. var StreetDistrictMap map[string]*District = make(map[string]*District)
  53. func init() {
  54. qu.ReadConfig("./extractcity.json", &SortField)
  55. }
  56. func TransmitData(resulttmp map[string]interface{}, id string) (bres bool, p, c, d string) {
  57. province := fmt.Sprint(resulttmp["area"])
  58. city := fmt.Sprint(resulttmp["city"])
  59. field := make([]string, 0)
  60. for _, f := range SortField { //
  61. val := resulttmp[f]
  62. if val == nil {
  63. field = append(field, "")
  64. } else {
  65. field = append(field, fmt.Sprint(val))
  66. }
  67. }
  68. bres, c, p = ExtractProvinceCity(province, city, id, field) //抽取省和市
  69. bres, p, c, d = ExtractDistrict(field, bres, c, p, id) //抽取区或县
  70. return
  71. }
  72. //抽取区或县(从配置的字段信息中抽取区或县)
  73. func ExtractDistrict(field []string, bres bool, c, p, id string) (bool, string, string, string) {
  74. d := ""
  75. for _, str := range field {
  76. for pos, GET := range []DFA{AreaDistrict, AreaStreet} { //先匹配区或县再匹配街道
  77. word := GET.CheckSensitiveWord(str)
  78. if word != "" {
  79. if pos == 0 { //区或县匹配
  80. //log.Println("县直接匹配到====", word)
  81. city := DistrictCityMap[word]
  82. if city != nil {
  83. d = word
  84. ctmp := city.Brief
  85. ptmp := city.P.Brief
  86. if !bres { //城市省份没有抽到,通过区或县定位市和省
  87. c = ctmp
  88. p = ptmp
  89. bres = true
  90. } else { //对比抽到的城市省份是否一致
  91. if c != ctmp || p != ptmp {
  92. log.Println("str---", str, "====", word)
  93. log.Println("district: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
  94. c = ctmp
  95. p = ptmp
  96. }
  97. }
  98. }
  99. } else { //街道匹配
  100. //log.Println("匹配到街道====", word)
  101. district := StreetDistrictMap[word]
  102. if district != nil {
  103. d = district.Name
  104. ctmp := district.C.Brief
  105. ptmp := district.C.P.Brief
  106. if !bres { //城市省份没有抽到,通过区或县定位市和省
  107. c = ctmp
  108. p = ptmp
  109. bres = true
  110. } else { //对比抽到的城市省份是否一致
  111. if c != ctmp || p != ptmp {
  112. log.Println("street: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
  113. c = ctmp
  114. p = ptmp
  115. }
  116. }
  117. }
  118. }
  119. return bres, p, c, d
  120. }
  121. }
  122. }
  123. return bres, p, c, d
  124. }
  125. //抽取城市、省份
  126. func ExtractProvinceCity(province, city, id string, field []string) (bres bool, c, p string) {
  127. defer qu.Catch()
  128. bc := true //是否继续抽取
  129. if city != "" {
  130. if CityBrief[city] == nil { //简称不存在
  131. //log.Println("city err:", city, id)
  132. } else { //简称存在
  133. if province != CityBrief[city].P.Brief { //省份不配对
  134. //log.Println("province err:", city, province, id)
  135. } else {
  136. bc = false
  137. //城市省份都正确
  138. }
  139. }
  140. }
  141. //有省份
  142. bp := false
  143. if ProvinceBrief[province] != nil { //省份简称正确
  144. bp = true
  145. } else { //没有省份,先识别省份
  146. for _, str := range field { //没有省的简称,从配置的字段信息中抽取省
  147. word := AreaProvinceGet.CheckSensitiveWord(str) //省全称DFA中匹配
  148. if word != "" {
  149. province = ProvinceMap[word] //
  150. bp = true
  151. break
  152. }
  153. }
  154. }
  155. //匹配城市
  156. if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
  157. //目前是全匹配模式,如果再加上精简匹配,加一层循环
  158. for pos, GET := range []DFA{AreaGet, AreaSimGet} { //AreaGet市全称,AreaSimGet省全称和简称
  159. ws := make([]string, 5)
  160. for n, str := range field {
  161. if str != "" {
  162. word := GET.CheckSensitiveWord(str)
  163. if pos == 1 { //用简称 后辍为路、集团替换
  164. str1 := strings.Replace(str, word+"路", "", 1)
  165. if str1 != str {
  166. word = GET.CheckSensitiveWord(str1)
  167. }
  168. }
  169. ws[n] = word
  170. if word != "" {
  171. res := AreaToCity[word]
  172. if len(res) == 1 {
  173. //判断省份
  174. if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回(!bp:省的简称)
  175. bres = true
  176. c = res[0].Brief
  177. p = res[0].P.Brief
  178. break
  179. } else { //不一致时。。暂时不处理
  180. }
  181. } else { //多个时(出现这种情况是多个省中的市,市名相同)
  182. }
  183. }
  184. }
  185. }
  186. if !bres { //没有匹配到
  187. mc := map[string]int{}
  188. for _, w := range ws {
  189. res := AreaToCity[w]
  190. for _, ct := range res {
  191. if ct == nil {
  192. continue
  193. }
  194. if bp { //有省份
  195. if ct.P != nil && ct.P.Brief == province {
  196. mc[ct.Brief]++
  197. }
  198. } else { //没有省份
  199. mc[ct.Brief]++
  200. }
  201. }
  202. }
  203. //计算mc中最大值且大于1
  204. max := 1
  205. v := ""
  206. for mk, mv := range mc {
  207. if mv > max {
  208. v = mk
  209. }
  210. }
  211. if v != "" {
  212. bres = true
  213. c = CityBrief[v].Brief
  214. p = CityBrief[v].P.Brief
  215. } else if len(mc) > 0 {
  216. //取级别更大的
  217. v := ""
  218. for mk, _ := range mc {
  219. if CityBrief[mk].P.Cap == mk {
  220. bres = true
  221. c = CityBrief[mk].Brief
  222. p = CityBrief[mk].P.Brief
  223. break
  224. } else {
  225. v = mk
  226. }
  227. }
  228. if !bres {
  229. bres = true
  230. c = CityBrief[v].Brief
  231. p = CityBrief[v].P.Brief
  232. }
  233. }
  234. }
  235. if bres {
  236. break
  237. }
  238. }
  239. } else {
  240. return
  241. }
  242. if !bres {
  243. //取默认省会
  244. if ProvinceBrief[province] != nil {
  245. bres = true
  246. c = ProvinceBrief[province].Cap
  247. p = province
  248. }
  249. }
  250. return
  251. }
  252. func (d *DFA) AddWord(keys ...string) {
  253. d.AddWordAll(true, keys...)
  254. }
  255. func (d *DFA) AddWordAll(haskey bool, keys ...string) {
  256. if d.Link == nil {
  257. d.Link = make(map[string]interface{})
  258. }
  259. for _, key := range keys {
  260. nowMap := &d.Link
  261. for i := 0; i < len(key); i++ {
  262. kc := key[i : i+1]
  263. if v, ok := (*nowMap)[kc]; ok {
  264. nowMap, _ = v.(*map[string]interface{})
  265. } else {
  266. newMap := map[string]interface{}{}
  267. newMap["YN"] = "0"
  268. (*nowMap)[kc] = &newMap
  269. nowMap = &newMap
  270. }
  271. if i == len(key)-1 {
  272. (*nowMap)["YN"] = "1"
  273. if haskey {
  274. (*nowMap)["K"] = key
  275. }
  276. }
  277. }
  278. }
  279. }
  280. func (d *DFA) CheckSensitiveWord(src string) string {
  281. pos := 0
  282. nowMap := &d.Link
  283. res := ""
  284. for i := 0; i < len(src); i++ {
  285. word := src[i : i+1]
  286. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  287. if nowMap != nil { // 存在,则判断是否为最后一个
  288. if pos == 0 {
  289. pos = i
  290. }
  291. if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
  292. res = qu.ObjToString((*nowMap)["K"])
  293. //pos = 0
  294. //break
  295. }
  296. } else {
  297. if res != "" {
  298. break
  299. } else {
  300. nowMap = &d.Link
  301. if pos > 0 {
  302. i = pos
  303. pos = 0
  304. }
  305. }
  306. }
  307. }
  308. return res
  309. }