// Source file: newextractcity.go (25 KB)

  1. package extract
  2. import (
  3. . "jy/pretreated"
  4. ju "jy/util"
  5. qu "qfw/util"
  6. "strings"
  7. )
  8. //抽取city
  9. func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
  10. /*
  11. 高准确率:
  12. 1.爬虫数据jsondata
  13. 2.采购单位库
  14. 3.邮编
  15. 4.固话
  16. 5.site(todo)
  17. 低准确率:(全称库匹配到不走简称库)
  18. 1.city全称库(buyeraddr;title,projectname)
  19. 2.city简称库(buyeraddr;title,projectname)
  20. */
  21. defer qu.Catch()
  22. //初始化
  23. if j.AreaScore == nil {
  24. j.AreaScore = make(map[string]int)
  25. }
  26. if j.CityScore == nil {
  27. j.CityScore = make(map[string]int)
  28. }
  29. if j.DistrictScore == nil {
  30. j.DistrictScore = make(map[string]int)
  31. }
  32. //记录区或县简称匹配的p、c、d的得分;如果全称匹配和p、c简称匹配的有结果,再将得分合并,否则舍弃
  33. pscore := make(map[string]int)
  34. cscore := make(map[string]int)
  35. dscore := make(map[string]int)
  36. sm := NewSortMap()
  37. //1.jsondata抽取
  38. e.NewGetCityByJsonData(j)
  39. //qu.Debug("jsondata打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
  40. //2.site库抽取
  41. e.NewGetCityBySite(j)
  42. //qu.Debug("site打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
  43. //3.采购单位库抽取(暂时没有采购单位库)
  44. //buyer, _ := resulttmp["buyer"].(string)
  45. //4.postcode邮编抽取
  46. buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
  47. e.NewGetCityByPostCode(j, buyerzipcode)
  48. //qu.Debug("邮编打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
  49. //5.areacode固话区号抽取
  50. buyertel, _ := resulttmp["buyertel"].(string)
  51. e.NewGetCityByAreaCode(j, buyertel)
  52. //qu.Debug("固话打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
  53. //6.buyeraddr,title,projectname抽取
  54. buyeraddr, _ := resulttmp["buyeraddr"].(string)
  55. title, _ := resulttmp["title"].(string)
  56. projectname, _ := resulttmp["projectname"].(string)
  57. buyer, _ := resulttmp["buyer"].(string)
  58. //qu.Debug("buyeraddr--", buyeraddr, "--buyer--", buyer, "--title--", title, "--projectname--", projectname)
  59. sm.AddKey("buyeraddr", buyeraddr)
  60. sm.AddKey("title", title)
  61. sm.AddKey("projectname", projectname)
  62. sm.AddKey("buyer", buyer)
  63. e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
  64. //qu.Debug("打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
  65. //7.detail抽取
  66. if len(j.AreaScore) > 0 {
  67. e.NewGetCityByDetail(j)
  68. }
  69. //qu.Debug("detail打分后---", j.AreaScore, j.CityScore, j.DistrictScore)
  70. //合并得分
  71. //qu.Debug("pcd=====", pscore, cscore, dscore)
  72. MergeScores(j, &pscore, &cscore, &dscore)
  73. //qu.Debug("合并打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
  74. finishP := HighestScoreArr(j.AreaScore)
  75. finishC := HighestScoreArr(j.CityScore)
  76. finishD := HighestScoreArr(j.DistrictScore)
  77. arearesult := ""
  78. cityresult := ""
  79. districtresult := ""
  80. tmpcity := []string{}
  81. if len(finishP) == 1 { //最高分一个
  82. arearesult = finishP[0] //抽取结果直接赋值
  83. cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
  84. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  85. } else if len(finishP) > 1 { //province最高分多个
  86. if len(finishC) == 1 {
  87. cityresult = finishC[0]
  88. if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
  89. arearesult = cfMap.P.Brief
  90. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  91. }
  92. } else { //对应的city有多个(多个province和city)
  93. arearesult = finishP[0] //抽取结果直接赋值
  94. cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
  95. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  96. }
  97. }
  98. //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
  99. if arearesult == "" {
  100. arearesult = "全国"
  101. } else if cityresult == "" {
  102. if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
  103. cityresult = pbMap.Cap
  104. resulttmp["defaultpcap"] = true
  105. }
  106. }
  107. //qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
  108. resulttmp["area"] = arearesult
  109. resulttmp["city"] = cityresult
  110. resulttmp["district"] = districtresult
  111. }
  112. //jsondata中抽取城市
  113. func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
  114. defer qu.Catch()
  115. jsondata := *j.Jsondata
  116. if jsondata != nil { //jsondata中获取province和city
  117. if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
  118. p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) //全称匹配
  119. GetByACDSimJb(p, c, d, a_c_d, e, j) //简称匹配
  120. }
  121. city, _ = jsondata["city"].(string) //city全称或者简称
  122. province, _ = jsondata["area"].(string) //province简称
  123. district, _ = jsondata["district"].(string) //district全称
  124. }
  125. PCDScore(j, "district", district, 5) //district打分
  126. bp := false
  127. if province != "" {
  128. if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
  129. bp = true //省份正确
  130. }
  131. }
  132. pbrief := ""
  133. if city != "" {
  134. cityfullmap := e.CityFullMap[city] //判断city全称是否正确
  135. if cityfullmap != nil {
  136. pbrief = cityfullmap.P.Brief //province简称
  137. } else {
  138. citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
  139. if citybriefmap != nil {
  140. city = citybriefmap.Name //city简称替换为全称
  141. pbrief = citybriefmap.P.Brief
  142. }
  143. }
  144. }
  145. if bp {
  146. if pbrief == province { //爬虫的province和city匹配
  147. PCDScore(j, "city", city, 5)
  148. } else { //pbrief不匹配province(此时city为空或者错误)
  149. city = ""
  150. }
  151. PCDScore(j, "province", province, 5)
  152. } else { //省份错误或为空,取city的对应的pbrief为province
  153. if pbrief != "" {
  154. province = pbrief
  155. PCDScore(j, "province", province, 5)
  156. PCDScore(j, "city", city, 5)
  157. } else {
  158. province = ""
  159. city = ""
  160. }
  161. }
  162. return
  163. }
  164. //全称从area_city_district中抽城市
  165. func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
  166. text := e.JB_PCD.Cut(a_c_d, true)
  167. //qu.Debug("Full----", text)
  168. repeatPb := map[string]bool{}
  169. for _, full := range text {
  170. if e.Trie_Full_Province.Get(full) { //a_c_d有province全称
  171. if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
  172. pbrief = tmpPbrief //省简称
  173. PCDScore(j, "province", pbrief, 5)
  174. }
  175. } else if e.Trie_Full_City.Get(full) { //a_c_d有city全称
  176. if cfMap := e.CityFullMap[full]; cfMap != nil {
  177. tmpcity := cfMap.Name //城市全称
  178. tmpPbrief := cfMap.P.Brief //省简称
  179. if pbrief != "" && pbrief == tmpPbrief { //已获取省简称
  180. city = tmpcity
  181. PCDScore(j, "city", city, 5)
  182. } else if pbrief == "" {
  183. city = tmpcity
  184. pbrief = tmpPbrief
  185. PCDScore(j, "city", city, 5)
  186. PCDScore(j, "province", pbrief, 5)
  187. }
  188. }
  189. } else if e.Trie_Full_District.Get(full) { //a_c_d有district全称(district可能对应多个城市)
  190. carr := e.NewDistrictCityMap[full]
  191. if len(carr) > 0 {
  192. district = full
  193. PCDScore(j, "district", district, 5)
  194. for _, c := range carr {
  195. tmpcity := c.Name //城市全称
  196. tmpPbrief := c.P.Brief //省简称
  197. if pbrief == "" { //之前没有匹配到省份
  198. PCDScore(j, "city", tmpcity, 5)
  199. if !repeatPb[tmpPbrief] {
  200. PCDScore(j, "province", tmpPbrief, 5)
  201. repeatPb[tmpPbrief] = true
  202. }
  203. } else { //已有省份
  204. if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
  205. PCDScore(j, "city", tmpcity, -5)
  206. PCDScore(j, "province", tmpPbrief, -5)
  207. } else { //与之前匹配结果一致
  208. if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
  209. PCDScore(j, "city", tmpcity, 5)
  210. }
  211. }
  212. }
  213. }
  214. }
  215. }
  216. }
  217. return pbrief, city, district
  218. }
  219. //简称从area_city_district中抽城市
  220. func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
  221. text := e.JB_PCD.Cut(a_c_d, true)
  222. repeatPb := map[string]bool{}
  223. for _, sim := range text {
  224. if pbrief == "" && e.Trie_Sim_Province.Get(sim) { //全称未匹配到确定的province
  225. if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
  226. pbrief = pbMap.Brief
  227. PCDScore(j, "province", pbrief, 5) //打分
  228. //PCDSubtractScore(e, j, "province", pbrief, -5) //减分(area_city_district:河南鼓楼区)
  229. }
  230. } else if city == "" && e.Trie_Sim_City.Get(sim) { //全称未匹配到确定的city
  231. if cbMap := e.CityBriefMap[sim]; cbMap != nil {
  232. tmpcity := cbMap.Name
  233. tmpPbrief := cbMap.P.Brief
  234. if pbrief != "" && pbrief == tmpPbrief {
  235. city = tmpcity
  236. PCDScore(j, "city", city, 5)
  237. } else if pbrief == "" {
  238. city = tmpcity
  239. pbrief = tmpPbrief
  240. PCDScore(j, "city", city, 5)
  241. PCDScore(j, "province", pbrief, 5)
  242. //PCDSubtractScore(e, j, "city", tmpcity, -5) //减分(area_city_district:开封鼓楼区)
  243. }
  244. }
  245. } else if district == "" && e.Trie_Sim_District.Get(sim) { //全称未匹配到确定的district
  246. dfullarr := e.NewDistrictSimAndAll[sim]
  247. if len(dfullarr) > 0 {
  248. PCDScore(j, "district", sim, 5)
  249. for _, dfullAndCity := range dfullarr { //district简称对应的所有全称
  250. for _, c := range dfullAndCity {
  251. tmpcity := c.Name //城市全称
  252. tmpPbrief := c.P.Brief //省简称
  253. if pbrief == "" { //之前没有匹配到省份
  254. PCDScore(j, "city", tmpcity, 5)
  255. if !repeatPb[tmpPbrief] {
  256. PCDScore(j, "province", tmpPbrief, 5)
  257. repeatPb[tmpPbrief] = true
  258. }
  259. } else { //已有省份
  260. if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
  261. PCDScore(j, "city", tmpcity, -5)
  262. PCDScore(j, "province", tmpPbrief, -5)
  263. } else { //与之前匹配结果一致
  264. if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
  265. PCDScore(j, "city", tmpcity, 5)
  266. }
  267. }
  268. }
  269. }
  270. }
  271. }
  272. }
  273. }
  274. }
  275. //通过site提取城市
  276. func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
  277. site, _ := (*j.Data)["site"].(string)
  278. //qu.Debug("site--------", site)
  279. if scMap := e.SiteCityMap[site]; scMap != nil {
  280. if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
  281. PCDScore(j, "province", scMap.P, 5)
  282. }
  283. if scMap.C != "" && scMap.C != "null" {
  284. PCDScore(j, "city", scMap.C, 5)
  285. }
  286. if scMap.D != "" && scMap.D != "null" {
  287. PCDScore(j, "district", scMap.D, 5)
  288. }
  289. }
  290. }
  291. //通过邮编提取城市
  292. func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
  293. defer qu.Catch()
  294. pc := e.PostCodeMap[postcode]
  295. if pc != nil {
  296. province = pc.P
  297. city = pc.C
  298. districtTmp := pc.D //邮编可能对应多个区
  299. score := 3
  300. if len(districtTmp) == 1 && districtTmp[0] != "" {
  301. score = 5
  302. }
  303. for _, district := range districtTmp {
  304. PCDScore(j, "district", district, score)
  305. }
  306. PCDScore(j, "province", province, 5)
  307. PCDScore(j, "city", city, 5)
  308. }
  309. return
  310. }
  311. //固话区号提取城市
  312. func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
  313. defer qu.Catch()
  314. if len(buyertel) >= 11 {
  315. if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
  316. n := 4
  317. L:
  318. areacode := buyertel[:n]
  319. ac := e.AreaCodeMap[areacode]
  320. if ac != nil {
  321. province = ac.P
  322. citytmp := ac.C
  323. if len(citytmp) == 1 { //对应多个city舍去
  324. city = citytmp[0]
  325. PCDScore(j, "city", city, 5)
  326. }
  327. PCDScore(j, "province", province, 5)
  328. } else {
  329. n = n - 1
  330. if n >= 3 {
  331. goto L
  332. }
  333. }
  334. } else if buyertel[:3] == "853" { //澳门
  335. province = "澳门"
  336. city = "澳门"
  337. PCDScore(j, "province", province, 5)
  338. PCDScore(j, "city", city, 5)
  339. }
  340. }
  341. return
  342. }
  // NewGetCityByOthers scores province/city/district names found in the text
  // fields held by sm (the caller visible in this file adds buyeraddr, title,
  // projectname and buyer, in that order).
  //
  // For every field the text is segmented with e.JB_SV.Cut and each word longer
  // than one rune is looked up first in the full-name tries (e.Trie_Fulls) and
  // then in the brief-name tries (e.Trie_Sims). Per the inline notes the trie
  // index means: 0 province, 1 city, 2 district, 3 street, 4 community; street
  // and community are handled for full names only.
  //
  // Most hits are scored straight into j via PCDScore. District brief-name hits
  // seen while no province brief name is known for the field, and provinces and
  // cities of a district brief name that conflict with the known province, are
  // accumulated in the side maps pscore, cscore and dscore through
  // PCDScoreByDistrictSim instead.
  //
  // A break after a hit stops trying the remaining tries for that word.
  343. func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]int) {
  344. /*
  345. 1. Segment each field into words.
  346. 2. Match and score full names of province, city, district, street and community.
  347. 3. Match and score brief names of province, city and district.
  348. */
  349. for _, from := range sm.Keys { //buyeraddr;title;projectname
  350. p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" // reset the province, city and district extracted for each field
  351. str, _ := sm.Map[from].(string)
  352. //qu.Debug(str, "---segmentation result---", e.JB_SV.Cut(str, true), p_full, c_full, d_full, p_sim, c_sim, d_sim)
  353. jbText := e.JB_SV.Cut(str, true)
  354. for _, text := range jbText { // jieba word segmentation
  355. if len([]rune(text)) == 1 {
  356. continue
  357. }
  358. // full-name matching
  359. //qu.Debug("text------", text)
  360. for pos_full, trie_full := range e.Trie_Fulls {
  361. if trie_full.Get(text) {
  362. if pos_full == 0 && p_full == "" { // province full name
  363. if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { // look up the brief name
  364. p_full = tmpPbrief
  365. PCDScore(j, "province", p_full, 4)
  366. break
  367. }
  368. } else if pos_full == 1 && c_full == "" { // city full name
  369. if cfMap := e.CityFullMap[text]; cfMap != nil {
  370. tmpPbrief := cfMap.P.Brief
  371. //qu.Debug("city--------", text, tmpPbrief, p_full)
  372. if p_full == "" {
  373. p_full = tmpPbrief
  374. c_full = cfMap.Name
  375. PCDScore(j, "province", p_full, 4)
  376. PCDScore(j, "city", c_full, 4)
  377. break
  378. } else if p_full == tmpPbrief {
  379. c_full = cfMap.Name
  380. PCDScore(j, "city", c_full, 4)
  381. break
  382. } else if p_full != "" && p_full != tmpPbrief {
  383. // city belongs to a different province: leave it untouched
  384. }
  385. }
  386. } else if pos_full == 2 && d_full == "" { // district full name
  387. //qu.Debug("district full name===========")
  // repeatPb: provinces already scored for this word.
  // isOk: province/city/district were pinned down, so stop trying further tries.
  // districtOk: the district is compatible with the province known so far (or none is known).
  388. repeatPb := map[string]bool{}
  389. isOk := false
  390. districtOk := false
  391. citys := e.NewDistrictCityMap[text]
  392. for _, c := range citys {
  393. tmpPbrief := c.P.Brief
  394. if p_full == tmpPbrief { // province matches
  395. d_full = text
  396. if c_full == "" {
  397. c_full = c.Name
  398. PCDScore(j, "city", c_full, 4)
  399. }
  400. isOk = true
  401. districtOk = true
  402. } else if p_full == "" { // no province known yet
  403. districtOk = true
  404. if len(citys) == 1 { // district maps to exactly one city
  405. p_full = tmpPbrief
  406. c_full = c.Name
  407. d_full = text
  408. PCDScore(j, "province", p_full, 4)
  409. PCDScore(j, "city", c_full, 4)
  410. isOk = true
  411. } else { // several cities: score only, do not assign
  412. if !repeatPb[tmpPbrief] {
  413. PCDScore(j, "province", tmpPbrief, 2)
  414. repeatPb[tmpPbrief] = true
  415. }
  416. //PCDScore(j, "province", tmpPbrief, 2)
  417. PCDScore(j, "city", c.Name, 2)
  418. }
  419. } else if p_full != "" && p_full != tmpPbrief { // conflicting candidate: deduct points
  420. if !repeatPb[tmpPbrief] {
  421. PCDScore(j, "province", tmpPbrief, -5)
  422. repeatPb[tmpPbrief] = true
  423. }
  424. //PCDScore(j, "province", tmpPbrief, -5)
  425. PCDScore(j, "city", c.Name, -5)
  426. }
  427. }
  428. if districtOk {
  429. PCDScore(j, "district", text, 4)
  430. } else {
  431. PCDScore(j, "district", text, -5)
  432. }
  433. if isOk {
  434. break
  435. }
  436. } else if pos_full == 3 { // street full name
  437. districts := e.NewStreetDistrictMap[text]
  438. DealMultipleDistrict(e, j, districts, 2)
  439. } else if pos_full == 4 { // community (neighbourhood committee) full name
  440. districts := e.CommunityDistrictMap[text]
  441. DealMultipleDistrict(e, j, districts, 2)
  442. }
  443. }
  444. }
  445. //qu.Debug("after full names--", j.AreaScore, j.CityScore, j.DistrictScore)
  446. // brief-name matching
  447. for pos_sim, trie_sim := range e.Trie_Sims {
  448. if trie_sim.Get(text) {
  449. if pos_sim == 0 && p_sim == "" { // province brief name
  450. p_sim = text
  451. PCDScore(j, "province", p_sim, 3)
  452. break
  453. } else if pos_sim == 1 && c_sim == "" { // city brief name
  454. if cbMap := e.CityBriefMap[text]; cbMap != nil {
  455. tmpPbrief := cbMap.P.Brief
  456. if p_sim == "" {
  457. p_sim = tmpPbrief
  458. c_sim = cbMap.Brief
  459. PCDScore(j, "province", p_sim, 2)
  460. PCDScore(j, "city", cbMap.Name, 2)
  461. break
  462. } else if p_sim == tmpPbrief {
  463. c_sim = cbMap.Brief
  464. PCDScore(j, "city", cbMap.Name, 3)
  465. break
  466. } else if p_sim != "" && p_sim != tmpPbrief {
  467. // city belongs to a different province: leave it untouched
  468. }
  469. }
  470. } else if pos_sim == 2 && d_sim == "" { // district brief name
  // repeatPb / repeatDb: provinces / district full names already scored for this word.
  471. repeatPb := map[string]bool{}
  472. repeatDb := map[string]bool{}
  473. dfull_citys := e.NewDistrictSimAndAll[text]
  474. //qu.Debug(text, dfull_citys, p_sim)
  475. for _, dfull_city := range dfull_citys {
  476. for dfull, c := range dfull_city { // dfull: a full name this brief name stands for
  477. tmpPbrief := c.P.Brief
  478. if p_sim == tmpPbrief { // province matches: score directly on the job
  479. d_sim = text
  480. //PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
  481. PCDScore(j, "district", dfull, 2)
  482. if c_sim == "" {
  483. c_sim = c.Brief
  484. //PCDScoreByDistrictSim("c", c.Name, 2, pscore, cscore, dscore)
  485. PCDScore(j, "city", c.Name, 2)
  486. }
  487. } else if p_sim == "" { // no province known: score into the side maps only
  488. if !repeatDb[dfull] {
  489. PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
  490. //PCDScore(j, "district", dfull, 1)
  491. repeatDb[dfull] = true
  492. }
  493. if len(dfull_citys) == 1 {
  494. //p_sim = tmpPbrief
  495. //c_sim = c.Brief
  496. //d_sim = text
  497. PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
  498. PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
  499. //PCDScore(j, "province", p_sim, 2)
  500. //PCDScore(j, "city", c.Name, 2)
  501. } else {
  502. if !repeatPb[tmpPbrief] {
  503. PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
  504. //PCDScore(j, "province", tmpPbrief, 1)
  505. repeatPb[tmpPbrief] = true
  506. }
  507. //PCDScore(j, "city", c.Name, 1)
  508. PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
  509. }
  510. } else if p_sim != "" && p_sim != tmpPbrief { // conflicting candidate: deduct points in the side maps
  511. if !repeatPb[tmpPbrief] {
  512. PCDScoreByDistrictSim("p", tmpPbrief, -5, pscore, cscore, dscore)
  513. //PCDScore(j, "province", tmpPbrief, -5)
  514. repeatPb[tmpPbrief] = true
  515. }
  516. PCDScoreByDistrictSim("c", c.Name, -5, pscore, cscore, dscore)
  517. //PCDScore(j, "city", c.Name, -5)
  518. }
  519. }
  520. }
  521. }
  522. }
  523. }
  524. //qu.Debug("after brief names--", j.AreaScore, j.CityScore, j.DistrictScore)
  525. }
  526. }
  527. }
  528. func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
  529. detailRune := []rune(j.Content)
  530. detail := j.Content
  531. if len(detailRune) > 600 {
  532. start := detailRune[:300]
  533. end := detailRune[len(detailRune)-300:]
  534. detail = string(start) + string(end)
  535. }
  536. for _, text := range e.JB_SV.Cut(detail, true) {
  537. if len([]rune(text)) > 1 {
  538. //qu.Debug("text---", text)
  539. //全称匹配
  540. for pos_full, trie_full := range e.Trie_Fulls {
  541. if trie_full.Get(text) {
  542. if pos_full == 0 { //省全称
  543. if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
  544. PCDScore(j, "province", tmpPbrief, 1)
  545. break
  546. }
  547. } else if pos_full == 1 { //市全称
  548. if cfMap := e.CityFullMap[text]; cfMap != nil {
  549. PCDScore(j, "province", cfMap.P.Brief, 1)
  550. PCDScore(j, "city", cfMap.Name, 1)
  551. break
  552. }
  553. } else if pos_full == 2 { //区全称
  554. citys := e.NewDistrictCityMap[text]
  555. if len(citys) > 0 {
  556. repeatPb := map[string]bool{}
  557. PCDScore(j, "district", text, 1)
  558. for _, c := range citys {
  559. PCDScore(j, "city", c.Name, 1)
  560. if !repeatPb[text] {
  561. PCDScore(j, "province", c.P.Brief, 1)
  562. repeatPb[text] = true
  563. }
  564. }
  565. break
  566. }
  567. } else if pos_full == 3 { //街道全称
  568. districts := e.NewStreetDistrictMap[text]
  569. DealMultipleDistrict(e, j, districts, 1)
  570. } else if pos_full == 4 { //居委会全称
  571. districts := e.CommunityDistrictMap[text]
  572. DealMultipleDistrict(e, j, districts, 1)
  573. }
  574. }
  575. }
  576. //qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
  577. //简称匹配
  578. for pos_sim, trie_sim := range e.Trie_Sims {
  579. if trie_sim.Get(text) {
  580. if pos_sim == 0 { //省简称
  581. PCDScore(j, "province", text, 1)
  582. break
  583. } else if pos_sim == 1 { //市简称
  584. if cbMap := e.CityBriefMap[text]; cbMap != nil {
  585. PCDScore(j, "city", cbMap.Name, 1)
  586. PCDScore(j, "province", cbMap.P.Brief, 1)
  587. break
  588. }
  589. } /* else if pos_sim == 2 { //区简称
  590. repeatDb := map[string]bool{}
  591. dfull_citys := e.NewDistrictSimAndAll[text]
  592. for _, dfull_city := range dfull_citys {
  593. for dfull, _ := range dfull_city { //dfull:简称对应的全称
  594. if !repeatDb[dfull] {
  595. PCDScore(j, "district", dfull, 1)
  596. repeatDb[dfull] = true
  597. }
  598. }
  599. }
  600. }*/
  601. }
  602. }
  603. //qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
  604. }
  605. }
  606. }
  607. //街道、居委会对应多地市处理
  608. func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score int) {
  609. repeatPb := map[string]bool{}
  610. repeatCb := map[string]bool{}
  611. repeatDb := map[string]bool{}
  612. for _, district := range districts {
  613. tmpDistrict := district.Name
  614. if !repeatDb[tmpDistrict] {
  615. PCDScore(j, "district", tmpDistrict, score)
  616. repeatDb[tmpDistrict] = true
  617. }
  618. citys := e.NewDistrictCityMap[tmpDistrict]
  619. for _, c := range citys {
  620. tmpCity := c.Name
  621. tmpPbrief := c.P.Brief
  622. if !repeatPb[tmpPbrief] {
  623. PCDScore(j, "province", tmpPbrief, score)
  624. repeatPb[tmpPbrief] = true
  625. }
  626. if !repeatCb[tmpCity] {
  627. PCDScore(j, "city", tmpCity, score)
  628. repeatCb[tmpCity] = true
  629. }
  630. }
  631. }
  632. }
  633. func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
  634. for _, c := range finishC { //取最高分与province匹配的city
  635. if cfMap := e.CityFullMap[c]; cfMap != nil {
  636. if cfMap.P.Brief == area {
  637. // city = c
  638. // break
  639. tmpcity = append(tmpcity, c)
  640. }
  641. }
  642. }
  643. if len(tmpcity) == 1 {
  644. city = tmpcity[0]
  645. }
  646. return city, tmpcity
  647. }
  648. func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
  649. for _, d := range finishD { //取最高分与province匹配的district
  650. citys := e.NewDistrictCityMap[d]
  651. for _, c := range citys {
  652. if len(tmpcity) == 0 { //没有city
  653. if c.P.Brief == area {
  654. city = c.Name
  655. district = d
  656. return city, district
  657. }
  658. } else if len(tmpcity) == 1 { //一个city
  659. if c.Name == city && c.P.Brief == area {
  660. district = d
  661. return city, district
  662. }
  663. } else { //多个city
  664. for _, tc := range tmpcity {
  665. if tc == c.Name {
  666. city = c.Name
  667. district = d
  668. return city, district
  669. }
  670. }
  671. }
  672. // if len(citys) == 1 { //区对应一个市
  673. // if c.P.Brief == area {
  674. // district = d
  675. // city = c.Name
  676. // return city, district
  677. // }
  678. // } else {
  679. // if c.P.Brief == area && c.Name == city {
  680. // district = d
  681. // return city, district
  682. // }
  683. // }
  684. }
  685. }
  686. return city, district
  687. }
  688. //计算province,city,district区或县匹配的得分
  689. func PCDScoreByDistrictSim(stype, t string, score int, ps, cs, ds *map[string]int) {
  690. defer qu.Catch()
  691. if t != "" {
  692. if stype == "d" {
  693. tmpscore := (*ds)[t]
  694. (*ds)[t] = tmpscore + score
  695. } else if stype == "c" {
  696. tmpscore := (*cs)[t]
  697. (*cs)[t] = tmpscore + score
  698. } else if stype == "p" {
  699. tmpscore := (*ps)[t]
  700. (*ps)[t] = tmpscore + score
  701. }
  702. }
  703. }
  704. func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]int) {
  705. if len(j.AreaScore) > 0 {
  706. for pt, ps := range *pscore {
  707. j.AreaScore[pt] = j.AreaScore[pt] + ps
  708. }
  709. for ct, cs := range *cscore {
  710. j.CityScore[ct] = j.CityScore[ct] + cs
  711. }
  712. for dt, ds := range *dscore {
  713. j.DistrictScore[dt] = j.DistrictScore[dt] + ds
  714. }
  715. }
  716. }
  717. //province,city,district干扰项减分
  718. //func PCDSubtractScore(e *ExtractTask, j *ju.Job, stype, text string, score int) {
  719. // defer qu.Catch()
  720. // if text != "" {
  721. // if stype == "city" {
  722. // for cn, cscore := range j.CityScore {
  723. // if cn != text {
  724. // j.CityScore[cn] = cscore + score
  725. // //错误的city减分后对应的province也减分
  726. // for pb, pscore := range j.AreaScore {
  727. // if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
  728. // j.AreaScore[pb] = pscore + score
  729. // }
  730. // }
  731. // }
  732. // }
  733. // } else if stype == "province" {
  734. // for pb, pscore := range j.AreaScore {
  735. // if pb != text {
  736. // j.AreaScore[pb] = pscore + score
  737. // //错误的province减分后对应的city也要减分
  738. // for cn, cscore := range j.CityScore {
  739. // if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
  740. // j.CityScore[cn] = cscore + score
  741. // }
  742. // }
  743. // }
  744. // }
  745. // }
  746. // // for name, tmpscore := range *whichMap {
  747. // // if name != text {
  748. // // (*whichMap)[name] = tmpscore + score
  749. // // }
  750. // // }
  751. // }
  752. //}