extractcity.go 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. package extract
  2. import (
  3. "fmt"
  4. "log"
  5. qu "qfw/util"
  6. "strings"
  7. )
  8. //省
  9. type Province struct {
  10. Name string
  11. Brief string
  12. Cap string
  13. Captial *City
  14. }
  15. //市
  16. type City struct {
  17. Name string
  18. Brief string
  19. P *Province
  20. }
  21. //区或县
  22. type District struct {
  23. Name string
  24. C *City
  25. }
  26. //街道
  27. type Street struct {
  28. Name string
  29. D *District
  30. }
  31. //敏感词
  32. type DFA struct {
  33. Link map[string]interface{}
  34. }
  35. var SortField []string
  36. func init() {
  37. qu.ReadConfig("./extractcity.json", &SortField)
  38. }
  39. func (e *ExtractTask) TransmitData(resulttmp map[string]interface{}, id string) (bres bool, p, c, d string) {
  40. defer qu.Catch()
  41. province := fmt.Sprint(resulttmp["area"])
  42. city := fmt.Sprint(resulttmp["city"])
  43. fieldval := make([]string, 0)
  44. for _, f := range SortField { //
  45. val := resulttmp[f]
  46. if val == nil {
  47. fieldval = append(fieldval, "")
  48. } else {
  49. fieldval = append(fieldval, fmt.Sprint(val))
  50. }
  51. }
  52. //log.Println("field========", fieldval)
  53. bres, c, p = e.ExtractProvinceCity(province, city, id, fieldval) //抽取省和市
  54. //log.Println("b--------", bres, "p---------", p, "c-------------", c)
  55. bres, p, c, d = e.ExtractDistrict(fieldval, bres, c, p, id) //抽取区或县
  56. //log.Println("bres========", bres, "p===========", p, "c=========", c, "d=============", d)
  57. return
  58. }
  59. //抽取区或县(从配置的字段信息中抽取区或县)
  60. func (e *ExtractTask) ExtractDistrict(field []string, bres bool, c, p, id string) (bool, string, string, string) {
  61. d := ""
  62. for _, str := range field {
  63. //log.Println("field===========", str)
  64. for pos, GET := range []DFA{e.AreaDistrict, e.AreaStreet} { //先匹配区或县再匹配街道
  65. word := GET.CheckSensitiveWord(str)
  66. //log.Println("word================", word)
  67. if word != "" {
  68. if pos == 0 { //区或县匹配
  69. //log.Println("县直接匹配到====", word)
  70. lock.Lock()
  71. city := e.DistrictCityMap[word]
  72. lock.Unlock()
  73. //log.Println("city================", city)
  74. if city != nil {
  75. d = word
  76. ctmp := city.Brief
  77. ptmp := city.P.Brief
  78. //log.Println("ctmpptmp================", ptmp, ctmp)
  79. if !bres { //城市省份没有抽到,通过区或县定位市和省
  80. c = ctmp
  81. p = ptmp
  82. bres = true
  83. } else { //对比抽到的城市省份是否一致
  84. if c != ctmp || p != ptmp {
  85. //log.Println("str---", str, "====", word)
  86. //log.Println("district: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
  87. c = ctmp
  88. p = ptmp
  89. }
  90. }
  91. }
  92. } else { //街道匹配
  93. //log.Println("匹配到街道====", word)
  94. lock.Lock()
  95. district := e.StreetDistrictMap[word]
  96. lock.Unlock()
  97. //log.Println("district================", district)
  98. if district != nil {
  99. d = district.Name
  100. ctmp := district.C.Brief
  101. ptmp := district.C.P.Brief
  102. //log.Println("districtptmp================", ctmp, ptmp)
  103. if !bres { //城市省份没有抽到,通过区或县定位市和省
  104. c = ctmp
  105. p = ptmp
  106. bres = true
  107. } else { //对比抽到的城市省份是否一致
  108. if c != ctmp || p != ptmp {
  109. //log.Println("street: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
  110. c = ctmp
  111. p = ptmp
  112. }
  113. }
  114. }
  115. }
  116. return bres, p, c, d
  117. }
  118. }
  119. }
  120. return bres, p, c, d
  121. }
  122. //抽取城市、省份
  123. func (e *ExtractTask) ExtractProvinceCity(province, city, id string, text []string) (bres bool, c, p string) {
  124. defer qu.Catch()
  125. bc := true //是否继续抽取
  126. if city != "" {
  127. lock.Lock()
  128. citybrief := e.CityBrief[city]
  129. //log.Println("citybrief========", citybrief)
  130. lock.Unlock()
  131. if citybrief == nil { //简称不存在
  132. log.Println("city err:", city, id)
  133. } else { //简称存在
  134. lock.Lock()
  135. pbrief := e.CityBrief[city].P.Brief
  136. //log.Println("pbrief========", pbrief)
  137. lock.Unlock()
  138. if province != pbrief { //省份不配对
  139. log.Println("province err:", city, province, id)
  140. } else {
  141. bc = false
  142. //城市省份都正确
  143. }
  144. }
  145. }
  146. //有省份
  147. bp := false
  148. lock.Lock()
  149. provincebrief := e.ProvinceBrief[province]
  150. //log.Println("provincebrief========", provincebrief)
  151. lock.Unlock()
  152. if provincebrief != nil { //省份简称正确
  153. bp = true
  154. } else { //没有省份,先识别省份
  155. for _, str := range text { //没有省的简称,从配置的字段信息中抽取省
  156. word := e.AreaProvinceGet.CheckSensitiveWord(str) //省全称DFA中匹配
  157. if word != "" {
  158. lock.Lock()
  159. province = e.ProvinceMap[word]
  160. lock.Unlock()
  161. bp = true
  162. break
  163. }
  164. }
  165. }
  166. //匹配城市
  167. if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
  168. for pos, GET := range []DFA{e.AreaGet, e.AreaSimGet} { //AreaGet市全称,AreaSimGet省全称和简称
  169. ws := make([]string, 5)
  170. for n, str := range text {
  171. if str != "" {
  172. word := GET.CheckSensitiveWord(str)
  173. if pos == 1 { //用简称 后辍为路、集团替换
  174. str1 := strings.Replace(str, word+"路", "", 1)
  175. if str1 != str {
  176. word = GET.CheckSensitiveWord(str1)
  177. }
  178. }
  179. ws[n] = word
  180. if word != "" {
  181. lock.Lock()
  182. res := e.AreaToCity[word]
  183. lock.Unlock()
  184. if len(res) == 1 {
  185. //判断省份
  186. if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回(!bp:省的简称)
  187. bres = true
  188. c = res[0].Brief
  189. p = res[0].P.Brief
  190. break
  191. } else { //不一致时。。暂时不处理
  192. }
  193. } else { //多个时(出现这种情况是多个省中的市,市名相同。现在的配置文件中已经将市名,县名重复的全部去掉)
  194. }
  195. }
  196. }
  197. }
  198. if !bres { //没有匹配到
  199. mc := map[string]int{}
  200. for _, w := range ws {
  201. lock.Lock()
  202. res := e.AreaToCity[w]
  203. lock.Unlock()
  204. for _, ct := range res {
  205. if ct == nil {
  206. continue
  207. }
  208. if bp { //有省份
  209. if ct.P != nil && ct.P.Brief == province {
  210. mc[ct.Brief]++
  211. }
  212. } else { //没有省份
  213. mc[ct.Brief]++
  214. }
  215. }
  216. }
  217. //计算mc中最大值且大于1
  218. max := 1
  219. v := ""
  220. for mk, mv := range mc {
  221. if mv > max {
  222. v = mk
  223. }
  224. }
  225. if v != "" {
  226. bres = true
  227. lock.Lock()
  228. ctb := e.CityBrief[v]
  229. lock.Unlock()
  230. c = ctb.Brief
  231. p = ctb.P.Brief
  232. } else if len(mc) > 0 {
  233. //取级别更大的
  234. v := ""
  235. for mk, _ := range mc {
  236. lock.Lock()
  237. cb := e.CityBrief[mk]
  238. lock.Unlock()
  239. if cb.P.Cap == mk {
  240. bres = true
  241. c = cb.Brief
  242. p = cb.P.Brief
  243. break
  244. } else {
  245. v = mk
  246. }
  247. }
  248. if !bres {
  249. bres = true
  250. lock.Lock()
  251. cbb := e.CityBrief[v]
  252. c = cbb.Brief
  253. p = cbb.P.Brief
  254. lock.Unlock()
  255. }
  256. }
  257. }
  258. if bres {
  259. break
  260. }
  261. }
  262. } else {
  263. return
  264. }
  265. if !bres {
  266. //取默认省会
  267. lock.Lock()
  268. pbp := e.ProvinceBrief[province]
  269. lock.Unlock()
  270. if pbp != nil {
  271. bres = true
  272. c = pbp.Cap
  273. p = province
  274. }
  275. }
  276. return
  277. }
  278. func (d *DFA) AddWord(keys ...string) {
  279. d.AddWordAll(true, keys...)
  280. }
  281. func (d *DFA) AddWordAll(haskey bool, keys ...string) {
  282. if d.Link == nil {
  283. d.Link = make(map[string]interface{})
  284. }
  285. for _, key := range keys {
  286. nowMap := &d.Link
  287. for i := 0; i < len(key); i++ {
  288. kc := key[i : i+1]
  289. if v, ok := (*nowMap)[kc]; ok {
  290. nowMap, _ = v.(*map[string]interface{})
  291. } else {
  292. newMap := map[string]interface{}{}
  293. newMap["YN"] = "0"
  294. (*nowMap)[kc] = &newMap
  295. nowMap = &newMap
  296. }
  297. if i == len(key)-1 {
  298. (*nowMap)["YN"] = "1"
  299. if haskey {
  300. (*nowMap)["K"] = key
  301. }
  302. }
  303. }
  304. }
  305. }
  306. func (d *DFA) CheckSensitiveWord(src string) string {
  307. pos := 0
  308. nowMap := &d.Link
  309. res := ""
  310. for i := 0; i < len(src); i++ {
  311. word := src[i : i+1]
  312. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  313. if nowMap != nil { // 存在,则判断是否为最后一个
  314. if pos == 0 {
  315. pos = i
  316. }
  317. if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
  318. res = qu.ObjToString((*nowMap)["K"])
  319. //pos = 0
  320. //break
  321. }
  322. } else {
  323. if res != "" {
  324. break
  325. } else {
  326. nowMap = &d.Link
  327. if pos > 0 {
  328. i = pos
  329. pos = 0
  330. }
  331. }
  332. }
  333. }
  334. return res
  335. }