extractcity_old.go 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994
  1. package extract
  2. import (
  3. . "jy/pretreated"
  4. ju "jy/util"
  5. qu "qfw/util"
  6. "strings"
  7. log "github.com/donnie4w/go-logger/logger"
  8. )
  9. //抽取city
  10. func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}) {
  11. /*
  12. 高准确率:
  13. 1.爬虫数据jsondata
  14. 2.采购单位库
  15. 3.邮编
  16. 4.固话
  17. 5.site(todo)
  18. 低准确率:(全称库匹配到不走简称库)
  19. 1.city全称库(buyeraddr;title,projectname)
  20. 2.city简称库(buyeraddr;title,projectname)
  21. */
  22. defer qu.Catch()
  23. //初始化
  24. if j.FullAreaScore == nil {
  25. j.FullAreaScore = make(map[string]float64)
  26. }
  27. if j.FullCityScore == nil {
  28. j.FullCityScore = make(map[string]float64)
  29. }
  30. if j.FullDistrictScore == nil {
  31. j.FullDistrictScore = make(map[string]float64)
  32. }
  33. if j.SimAreaScore == nil {
  34. j.SimAreaScore = make(map[string]float64)
  35. }
  36. if j.SimCityScore == nil {
  37. j.SimCityScore = make(map[string]float64)
  38. }
  39. if j.SimDistrictScore == nil {
  40. j.SimDistrictScore = make(map[string]float64)
  41. }
  42. //记录区或县简称匹配的p、c、d的得分;如果全称匹配和p、c简称匹配的有结果,再将得分合并,否则舍弃
  43. pscore := make(map[string]float64)
  44. cscore := make(map[string]float64)
  45. dscore := make(map[string]float64)
  46. sm := NewSortMap()
  47. //1.jsondata抽取
  48. e.NewGetCityByJsonData(j)
  49. //2.site库抽取
  50. e.NewGetCityBySite(j)
  51. //3.采购单位库抽取(暂时没有采购单位库)
  52. //4.postcode邮编抽取
  53. buyerzipcode := qu.ObjToString((*resulttmp)["buyerzipcode"])
  54. e.NewGetCityByPostCode(j, buyerzipcode)
  55. //5.areacode固话区号抽取
  56. buyertel := qu.ObjToString((*resulttmp)["buyertel"])
  57. e.NewGetCityByAreaCode(j, buyertel)
  58. //6.buyeraddr,title,projectname抽取
  59. buyeraddr := qu.ObjToString((*resulttmp)["buyeraddr"])
  60. title := qu.ObjToString((*resulttmp)["title"])
  61. projectname := qu.ObjToString((*resulttmp)["projectname"])
  62. buyer := qu.ObjToString((*resulttmp)["buyer"])
  63. addressing := qu.ObjToString((*resulttmp)["addressing"])
  64. sm.AddKey("buyeraddr", buyeraddr)
  65. sm.AddKey("buyer", buyer)
  66. sm.AddKey("title", title)
  67. sm.AddKey("projectname", projectname)
  68. sm.AddKey("addressing", addressing) //新增地址辅助字段
  69. if projectaddr, isok := (*resulttmp)["projectaddr"].(string); isok {
  70. sm.AddKey("projectaddr", projectaddr)
  71. }
  72. if bidopenaddress, isok := (*resulttmp)["bidopenaddress"].(string); isok {
  73. sm.AddKey("bidopenaddress", bidopenaddress)
  74. }
  75. //7.buyeraddr buyer title projectname抽取
  76. e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
  77. //qu.Debug("全称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  78. //qu.Debug("简称打分后结果---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  79. //全称简称得分合并
  80. MergeFullSimScore(j) //合并buyer buyeraddr title projectname全称简称
  81. //qu.Debug("全称简称合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  82. //合并区简称得分
  83. //qu.Debug("pcd=====", pscore, cscore, dscore)
  84. MergeScores(j, &pscore, &cscore, &dscore) //合并区简称匹配的pcd
  85. //qu.Debug("合并区简称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  86. j.SimAreaScore = map[string]float64{}
  87. j.SimCityScore = map[string]float64{}
  88. j.SimDistrictScore = map[string]float64{}
  89. //8.detail抽取
  90. if len(j.FullAreaScore) > 0 && len(j.FullCityScore) > 0 { //以上抽取有省有市再从detail中抽取进行判断
  91. e.NewGetCityByDetail(j)
  92. }
  93. //qu.Debug("detail打分后全称---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  94. //qu.Debug("detail打分后简称---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  95. MergeFullSimScore(j) //合并detail的全简称
  96. //qu.Debug("detail合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  97. finishP := HighestScoreArr(j.FullAreaScore) //获取最高分的省
  98. e.RemoveCD(finishP, j) //将city中所属干扰项省的city去除,同时去除district.5d2bd4aba5cb26b9b769d18e
  99. //qu.Debug("去除干扰项后的city和district得分---", finishP, j.FullCityScore, j.FullDistrictScore)
  100. //获取结果
  101. finishC := HighestScoreArr(j.FullCityScore)
  102. finishD := HighestScoreArr(j.FullDistrictScore)
  103. arearesult := ""
  104. cityresult := ""
  105. districtresult := ""
  106. tmpcity := []string{}
  107. if len(finishP) == 1 { //最高分一个
  108. arearesult = finishP[0] //抽取结果直接赋值
  109. cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
  110. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  111. } else if len(finishP) > 1 { //province最高分多个
  112. if len(finishC) == 1 {
  113. cityresult = finishC[0]
  114. if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
  115. arearesult = cfMap.P.Brief
  116. tmpcity = append(tmpcity, cityresult)
  117. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  118. }
  119. } else { //对应的city有多个(多个province和city)
  120. //arearesult = finishP[0] //抽取结果直接赋值
  121. //cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
  122. //cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  123. arearesult = "全国"
  124. }
  125. }
  126. if cityresult != "" && cityresult == districtresult {
  127. districtresult = ""
  128. }
  129. //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
  130. //直辖市
  131. if arearesult == "北京" {
  132. cityresult = "北京市"
  133. if districtresult == "北京朝阳" { //特殊情况(北京朝阳中西医结合急诊抢救中心:5a84079740d2d9bbe88bad90)
  134. districtresult = "朝阳区"
  135. }
  136. } else if arearesult == "天津" {
  137. cityresult = "天津市"
  138. } else if arearesult == "上海" {
  139. cityresult = "上海市"
  140. } else if arearesult == "重庆" {
  141. cityresult = "重庆市"
  142. }
  143. if arearesult == "" {
  144. arearesult = "全国"
  145. } /* else if cityresult == "" {
  146. if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
  147. cityresult = pbMap.Cap
  148. resulttmp["defaultpcap"] = true
  149. }
  150. }*/
  151. //qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
  152. (*resulttmp)["area"] = arearesult
  153. (*resulttmp)["city"] = cityresult
  154. (*resulttmp)["district"] = districtresult
  155. //校验-映射新疆兵团
  156. if xjbtReg.MatchString(buyer) && cityresult == "" {
  157. a, c, d, ok := e.CheckingXjbtCity(buyer)
  158. if ok {
  159. (*resulttmp)["area"] = a
  160. (*resulttmp)["city"] = c
  161. (*resulttmp)["district"] = d
  162. }
  163. }
  164. //如果-仅有省份-敏感词-校验核对方法
  165. if arearesult != "全国" && cityresult == "" {
  166. sensitive_city := e.SensitiveCityData(qu.ObjToString((*j.Data)["detail"]), arearesult)
  167. if sensitive_city != "" {
  168. (*resulttmp)["city"] = sensitive_city
  169. (*resulttmp)["is_sensitive"] = 1
  170. }
  171. }
  172. }
  173. //jsondata中抽取城市
  174. func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
  175. defer qu.Catch()
  176. if j.Jsondata != nil {
  177. jsondata := *j.Jsondata
  178. //jsondata中获取province和city
  179. if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
  180. p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) //全称匹配
  181. GetByACDSimJb(p, c, d, a_c_d, e, j) //简称匹配
  182. }
  183. city, _ = jsondata["city"].(string) //city全称或者简称
  184. province, _ = jsondata["area"].(string) //province简称
  185. district, _ = jsondata["district"].(string) //district全称
  186. }
  187. PCDScore(j, "district", district, 5, true) //district打分
  188. bp := false
  189. if province != "" {
  190. if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
  191. bp = true //省份正确
  192. }
  193. }
  194. pbrief := ""
  195. if city != "" {
  196. cityfullmap := e.CityFullMap[city] //判断city全称是否正确
  197. if cityfullmap != nil {
  198. pbrief = cityfullmap.P.Brief //province简称
  199. } else {
  200. citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
  201. if citybriefmap != nil {
  202. city = citybriefmap.Name //city简称替换为全称
  203. pbrief = citybriefmap.P.Brief
  204. }
  205. }
  206. }
  207. if bp {
  208. if pbrief == province { //爬虫的province和city匹配
  209. PCDScore(j, "city", city, 5, true)
  210. } else { //pbrief不匹配province(此时city为空或者错误)
  211. city = ""
  212. }
  213. PCDScore(j, "province", province, 5, true)
  214. } else { //省份错误或为空,取city的对应的pbrief为province
  215. if pbrief != "" {
  216. province = pbrief
  217. PCDScore(j, "province", province, 5, true)
  218. PCDScore(j, "city", city, 5, true)
  219. } else {
  220. province = ""
  221. city = ""
  222. }
  223. }
  224. return
  225. }
  226. //全称从area_city_district中抽城市
  227. func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
  228. text := e.Seg_PCD.Cut(a_c_d, true)
  229. repeatPb := map[string]bool{}
  230. for _, full := range text {
  231. if e.Trie_Full_Province.Get(full) { //a_c_d有province全称
  232. if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
  233. pbrief = tmpPbrief //省简称
  234. PCDScore(j, "province", pbrief, 5, true)
  235. }
  236. } else if e.Trie_Full_City.Get(full) { //a_c_d有city全称
  237. if cfMap := e.CityFullMap[full]; cfMap != nil {
  238. tmpcity := cfMap.Name //城市全称
  239. tmpPbrief := cfMap.P.Brief //省简称
  240. if pbrief != "" && pbrief == tmpPbrief { //已获取省简称
  241. city = tmpcity
  242. PCDScore(j, "city", city, 5, true)
  243. } else if pbrief == "" {
  244. city = tmpcity
  245. pbrief = tmpPbrief
  246. PCDScore(j, "city", city, 5, true)
  247. PCDScore(j, "province", pbrief, 5, true)
  248. }
  249. }
  250. } else if e.Trie_Full_District.Get(full) { //a_c_d有district全称(district可能对应多个城市)
  251. carr := e.DistrictCityMap[full]
  252. if len(carr) > 0 {
  253. district = full
  254. PCDScore(j, "district", district, 5, true)
  255. for _, c := range carr {
  256. tmpcity := c.Name //城市全称
  257. tmpPbrief := c.P.Brief //省简称
  258. if pbrief == "" { //之前没有匹配到省份
  259. PCDScore(j, "city", tmpcity, 5, true)
  260. if !repeatPb[tmpPbrief] {
  261. PCDScore(j, "province", tmpPbrief, 5, true)
  262. repeatPb[tmpPbrief] = true
  263. }
  264. } else { //已有省份
  265. if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
  266. PCDScore(j, "city", tmpcity, -5, true)
  267. PCDScore(j, "province", tmpPbrief, -5, true)
  268. } else { //与之前匹配结果一致
  269. if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
  270. PCDScore(j, "city", tmpcity, 5, true)
  271. }
  272. }
  273. }
  274. }
  275. }
  276. }
  277. }
  278. return pbrief, city, district
  279. }
  280. //简称从area_city_district中抽城市
  281. func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
  282. text := e.Seg_PCD.Cut(a_c_d, true)
  283. repeatPb := map[string]bool{}
  284. for _, sim := range text {
  285. if pbrief == "" && e.Trie_Sim_Province.Get(sim) { //全称未匹配到确定的province
  286. if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
  287. pbrief = pbMap.Brief
  288. PCDScore(j, "province", pbrief, 5, true) //打分
  289. //PCDSubtractScore(e, j, "province", pbrief, -5) //减分(area_city_district:河南鼓楼区)
  290. }
  291. } else if city == "" && e.Trie_Sim_City.Get(sim) { //全称未匹配到确定的city
  292. if cbMap := e.CityBriefMap[sim]; cbMap != nil {
  293. tmpcity := cbMap.Name
  294. tmpPbrief := cbMap.P.Brief
  295. if pbrief != "" && pbrief == tmpPbrief {
  296. city = tmpcity
  297. PCDScore(j, "city", city, 5, true)
  298. } else if pbrief == "" {
  299. city = tmpcity
  300. pbrief = tmpPbrief
  301. PCDScore(j, "city", city, 5, true)
  302. PCDScore(j, "province", pbrief, 5, true)
  303. //PCDSubtractScore(e, j, "city", tmpcity, -5) //减分(area_city_district:开封鼓楼区)
  304. }
  305. }
  306. } else if district == "" && e.Trie_Sim_District.Get(sim) { //全称未匹配到确定的district
  307. dfullarr := e.DistrictSimAndAll[sim]
  308. if len(dfullarr) > 0 {
  309. PCDScore(j, "district", sim, 5, true)
  310. for _, dfullAndCity := range dfullarr { //district简称对应的所有全称
  311. for _, c := range dfullAndCity {
  312. if c == nil {
  313. continue
  314. }
  315. tmpcity := c.Name //城市全称
  316. tmpPbrief := c.P.Brief //省简称
  317. if pbrief == "" { //之前没有匹配到省份
  318. PCDScore(j, "city", tmpcity, 5, true)
  319. if !repeatPb[tmpPbrief] {
  320. PCDScore(j, "province", tmpPbrief, 5, true)
  321. repeatPb[tmpPbrief] = true
  322. }
  323. } else { //已有省份
  324. if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
  325. PCDScore(j, "city", tmpcity, -5, true)
  326. PCDScore(j, "province", tmpPbrief, -5, true)
  327. } else { //与之前匹配结果一致
  328. if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
  329. PCDScore(j, "city", tmpcity, 5, true)
  330. }
  331. }
  332. }
  333. }
  334. }
  335. }
  336. }
  337. }
  338. }
  339. //通过site提取城市
  340. func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
  341. site, _ := (*j.Data)["site"].(string)
  342. //qu.Debug("site--------", site)
  343. if scMap := e.SiteCityMap[site]; scMap != nil {
  344. if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
  345. PCDScore(j, "province", scMap.P, 5, true)
  346. }
  347. if scMap.C != "" && scMap.C != "null" {
  348. PCDScore(j, "city", scMap.C, 5, true)
  349. }
  350. if scMap.D != "" && scMap.D != "null" {
  351. PCDScore(j, "district", scMap.D, 5, true)
  352. }
  353. }
  354. }
  355. //通过邮编提取城市
  356. func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
  357. defer qu.Catch()
  358. pc := e.PostCodeMap[postcode]
  359. if pc != nil {
  360. province = pc.P
  361. city = pc.C
  362. districtTmp := pc.D //邮编可能对应多个区
  363. score := 3.0
  364. if len(districtTmp) == 1 && districtTmp[0] != "" {
  365. score = 5.0
  366. }
  367. for _, district := range districtTmp {
  368. PCDScore(j, "district", district, score, true)
  369. }
  370. PCDScore(j, "province", province, 5, true)
  371. PCDScore(j, "city", city, 5, true)
  372. }
  373. return
  374. }
  375. //固话区号提取城市
  376. func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
  377. defer qu.Catch()
  378. if len(buyertel) >= 11 {
  379. if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
  380. n := 4
  381. L:
  382. areacode := buyertel[:n]
  383. ac := e.AreaCodeMap[areacode]
  384. if ac != nil {
  385. province = ac.P
  386. citytmp := ac.C
  387. if len(citytmp) == 1 { //对应多个city舍去
  388. city = citytmp[0]
  389. score := float64(5)
  390. if areacode == "0371" {
  391. score = float64(4)
  392. }
  393. PCDScore(j, "city", city, score, true)
  394. }
  395. PCDScore(j, "province", province, 5, true)
  396. } else {
  397. n = n - 1
  398. if n >= 3 {
  399. goto L
  400. }
  401. }
  402. } /* else if buyertel[:3] == "853" { //澳门
  403. province = "澳门"
  404. city = "澳门"
  405. PCDScore(j, "province", province, 5, true)
  406. PCDScore(j, "city", city, 5, true)
  407. }*/
  408. }
  409. return
  410. }
  411. func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
  412. /*
  413. 1.对字段进行分词
  414. 2.省、市、区、街道、居委会全称进行匹配打分
  415. 3.省、市、区简称进行匹配打分
  416. */
  417. ts := 0.5
  418. for i, from := range sm.Keys { //buyer;buyeraddr;title;projectname
  419. if i > 1 {
  420. ts = 0.2
  421. }
  422. p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" //每个字段抽取的时候重新定义该字段抽取的province,city,district
  423. str, _ := sm.Map[from].(string)
  424. jbText := e.Seg_SV.Cut(str, true)
  425. for jb_index, text := range jbText {
  426. if len([]rune(text)) == 1 {
  427. continue
  428. }
  429. //全称匹配
  430. //qu.Debug("text------", text)
  431. for pos_full, trie_full := range e.Trie_Fulls {
  432. if trie_full.Get(text) {
  433. if pos_full == 0 && p_full == "" { //省全称
  434. if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
  435. p_full = tmpPbrief
  436. PCDScore(j, "province", p_full, 4+ts, true)
  437. break
  438. }
  439. } else if pos_full == 1 && c_full == "" { //市全称
  440. if cfMap := e.CityFullMap[text]; cfMap != nil {
  441. tmpPbrief := cfMap.P.Brief
  442. if p_full == "" {
  443. p_full = tmpPbrief
  444. c_full = cfMap.Name
  445. PCDScore(j, "province", p_full, 4+ts, true)
  446. PCDScore(j, "city", c_full, 4+ts, true)
  447. break
  448. } else if p_full == tmpPbrief {
  449. c_full = cfMap.Name
  450. PCDScore(j, "province", tmpPbrief, 4+ts, true) //
  451. PCDScore(j, "city", c_full, 4+ts, true)
  452. break
  453. } else if p_full != "" && p_full != tmpPbrief {
  454. //city不做处理
  455. }
  456. }
  457. } else if pos_full == 2 && d_full == "" { //区全称
  458. repeatPb := map[string]bool{}
  459. isOk := false
  460. districtOk := false
  461. citys := e.DistrictCityMap[text]
  462. for _, c := range citys {
  463. tmpPbrief := c.P.Brief
  464. if p_full == tmpPbrief { //省份一致
  465. d_full = text
  466. if c_full == "" {
  467. c_full = c.Name
  468. PCDScore(j, "city", c_full, 4+ts, true)
  469. PCDScore(j, "province", tmpPbrief, 4+ts, true) //
  470. }
  471. isOk = true
  472. districtOk = true
  473. } else if p_full == "" { //省份不存在
  474. districtOk = true
  475. if len(citys) == 1 { //对应一个city
  476. p_full = tmpPbrief
  477. c_full = c.Name
  478. d_full = text
  479. PCDScore(j, "province", p_full, 4+ts, true)
  480. PCDScore(j, "city", c_full, 4+ts, true)
  481. isOk = true
  482. } else { //多个city,只打分,不赋值
  483. if !repeatPb[tmpPbrief] {
  484. PCDScore(j, "province", tmpPbrief, 2+ts, true)
  485. repeatPb[tmpPbrief] = true
  486. }
  487. //PCDScore(j, "province", tmpPbrief, 2, true)
  488. PCDScore(j, "city", c.Name, 2+ts, true)
  489. }
  490. } else if p_full != "" && p_full != tmpPbrief { //干扰项减分
  491. if !repeatPb[tmpPbrief] {
  492. PCDScore(j, "province", tmpPbrief, -5, true)
  493. repeatPb[tmpPbrief] = true
  494. }
  495. //PCDScore(j, "province", tmpPbrief, -5, true)
  496. PCDScore(j, "city", c.Name, -5, true)
  497. }
  498. }
  499. if districtOk {
  500. PCDScore(j, "district", text, 4+ts, true)
  501. } else {
  502. PCDScore(j, "district", text, -5, true)
  503. }
  504. if isOk {
  505. break
  506. }
  507. } else if pos_full == 3 { //街道全称
  508. districts := e.StreetDistrictMap[text]
  509. if len(districts) == 1 { //街道唯一
  510. DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
  511. }
  512. } else if pos_full == 4 { //居委会全称
  513. //districts := e.CommunityDistrictMap[text]
  514. //if len(districts) == 1 { //居委会唯一
  515. // DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
  516. //}
  517. }
  518. }
  519. }
  520. //qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  521. //简称匹配
  522. for pos_sim, trie_sim := range e.Trie_Sims {
  523. if trie_sim.Get(text) {
  524. if pos_sim == 0 && p_sim == "" { //省简称
  525. p_sim = text
  526. PCDScore(j, "province", p_sim, 3+ts, false)
  527. break
  528. } else if pos_sim == 1 { //市简称
  529. if cbMap := e.CityBriefMap[text]; cbMap != nil {
  530. tmpPbrief := cbMap.P.Brief
  531. if p_sim == "" {
  532. score := 2.0 + ts
  533. if tmpPbrief == p_full {
  534. score += 1.0
  535. }
  536. p_sim = tmpPbrief
  537. c_sim = cbMap.Brief
  538. PCDScore(j, "province", p_sim, score, false)
  539. PCDScore(j, "city", cbMap.Name, score, false)
  540. break
  541. } else if p_sim == tmpPbrief {
  542. c_sim = cbMap.Brief
  543. PCDScore(j, "city", cbMap.Name, 3+ts, false)
  544. PCDScore(j, "province", tmpPbrief, 3+ts, false)
  545. break
  546. } else if p_sim != "" && p_sim != tmpPbrief { //上海宝冶集团有限公司南京分公司 北京朝阳中西医结合急诊抢救中心
  547. delete(j.SimAreaScore, p_sim)
  548. c_sim = text //
  549. p_sim = tmpPbrief //
  550. PCDScore(j, "province", tmpPbrief, 3+ts, false)
  551. PCDScore(j, "city", cbMap.Name, 3+ts, false)
  552. }
  553. }
  554. } else if pos_sim == 2 && d_sim == "" { //区简称
  555. repeatPb := map[string]bool{}
  556. repeatDb := map[string]bool{}
  557. dfull_citys := e.DistrictSimAndAll[text]
  558. for _, dfull_city := range dfull_citys {
  559. for dfull, c := range dfull_city { //dfull:简称对应的全称
  560. if c == nil || c.P == nil {
  561. continue
  562. }
  563. tmpPbrief := c.P.Brief
  564. if p_sim == tmpPbrief { //省份一致
  565. d_sim = text
  566. PCDScore(j, "district", dfull, 2+ts, false)
  567. if c_sim == "" {
  568. c_sim = c.Brief
  569. PCDScore(j, "city", c.Name, 2+ts, false)
  570. }
  571. PCDScore(j, "province", tmpPbrief, 2+ts, false) //
  572. } else if p_sim == "" { //暂未匹配到省
  573. if !repeatDb[dfull] {
  574. PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
  575. repeatDb[dfull] = true
  576. }
  577. if len(dfull_citys) == 1 {
  578. PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
  579. PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
  580. } else {
  581. if !repeatPb[tmpPbrief] {
  582. PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
  583. repeatPb[tmpPbrief] = true
  584. }
  585. PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
  586. }
  587. //新增~特殊组情况下~津市高新区管委会~切词首"津市"~均未匹配到情况下
  588. if jb_index == 0 && len(dfull_citys) == 1 && len(j.FullAreaScore) == 0 && len(j.SimAreaScore) == 0 {
  589. PCDScore(j, "district", dfull, 0, false)
  590. PCDScore(j, "city", c.Name, 0, false)
  591. PCDScore(j, "province", tmpPbrief, 0, false) //
  592. }
  593. } else if p_sim != "" && p_sim != tmpPbrief {
  594. if !repeatPb[tmpPbrief] {
  595. PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
  596. repeatPb[tmpPbrief] = true
  597. }
  598. PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
  599. PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
  600. }
  601. }
  602. }
  603. }
  604. }
  605. }
  606. //qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  607. }
  608. }
  609. }
  610. func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
  611. repeatP_full := map[string]bool{}
  612. repeatC_full := map[string]bool{}
  613. repeatD_full := map[string]bool{}
  614. repeatP_sim := map[string]bool{}
  615. repeatC_sim := map[string]bool{}
  616. repeatD_sim := map[string]bool{}
  617. detailRune := []rune(j.Content)
  618. detail := j.Content
  619. if len(detailRune) > 600 {
  620. start := detailRune[:300]
  621. end := detailRune[len(detailRune)-300:]
  622. detail = string(start) + string(end)
  623. }
  624. for _, reg := range AgencyReg {
  625. detail = reg.ReplaceAllString(detail, "")
  626. }
  627. for _, text := range e.Seg_SV.Cut(detail, true) {
  628. if len([]rune(text)) > 1 {
  629. //全称匹配
  630. for pos_full, trie_full := range e.Trie_Fulls {
  631. if trie_full.Get(text) {
  632. if pos_full == 0 { //省全称
  633. if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { //取简称
  634. PCDScore(j, "province", tmpPbrief, 1, true)
  635. repeatP_full[tmpPbrief] = true
  636. break
  637. }
  638. } else if pos_full == 1 { //市全称
  639. if cfMap := e.CityFullMap[text]; cfMap != nil {
  640. if !repeatP_full[cfMap.P.Brief] {
  641. PCDScore(j, "province", cfMap.P.Brief, 1, true)
  642. repeatP_full[cfMap.P.Brief] = true
  643. }
  644. if !repeatC_full[cfMap.Name] {
  645. PCDScore(j, "city", cfMap.Name, 1, true)
  646. repeatC_full[cfMap.Name] = true
  647. }
  648. break
  649. }
  650. } else if pos_full == 2 { //区全称
  651. citys := e.DistrictCityMap[text]
  652. if len(citys) > 0 {
  653. if !repeatD_full[text] {
  654. PCDScore(j, "district", text, 1, true)
  655. repeatD_full[text] = true
  656. }
  657. for _, c := range citys {
  658. if !repeatC_full[c.Name] {
  659. PCDScore(j, "city", c.Name, 1, true)
  660. repeatC_full[c.Name] = true
  661. }
  662. if !repeatP_full[c.P.Brief] {
  663. PCDScore(j, "province", c.P.Brief, 1, true)
  664. repeatP_full[c.P.Brief] = true
  665. }
  666. }
  667. break
  668. }
  669. } else if pos_full == 3 { //街道全称
  670. districts := e.StreetDistrictMap[text]
  671. if len(districts) == 1 {
  672. DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
  673. }
  674. } else if pos_full == 4 { //居委会全称
  675. //districts := e.CommunityDistrictMap[text]
  676. //if len(districts) == 1 {
  677. // DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
  678. //}
  679. }
  680. }
  681. }
  682. //qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
  683. //简称匹配
  684. for pos_sim, trie_sim := range e.Trie_Sims {
  685. if trie_sim.Get(text) {
  686. if pos_sim == 0 && !repeatP_sim[text] { //省简称
  687. PCDScore(j, "province", text, 1, false)
  688. repeatP_sim[text] = true
  689. break
  690. } else if pos_sim == 1 { //市简称
  691. if cbMap := e.CityBriefMap[text]; cbMap != nil {
  692. if !repeatP_sim[cbMap.P.Brief] {
  693. PCDScore(j, "province", cbMap.P.Brief, 1, false)
  694. repeatP_sim[cbMap.P.Brief] = true
  695. }
  696. if !repeatC_sim[cbMap.Name] {
  697. PCDScore(j, "city", cbMap.Name, 1, false)
  698. repeatC_sim[cbMap.Name] = true
  699. }
  700. break
  701. }
  702. } else if pos_sim == 2 { //区简称
  703. dfull_citys := e.DistrictSimAndAll[text]
  704. if len(dfull_citys) == 1 {
  705. for _, dfull_city := range dfull_citys {
  706. for dfull, ctmp := range dfull_city { //dfull:简称对应的全称
  707. if !repeatD_sim[dfull] {
  708. PCDScore(j, "district", dfull, 1, false)
  709. repeatD_sim[dfull] = true
  710. }
  711. if ctmp == nil {
  712. continue
  713. }
  714. if !repeatC_sim[ctmp.Name] {
  715. PCDScore(j, "city", ctmp.Name, 1, false)
  716. repeatC_sim[ctmp.Name] = true
  717. }
  718. if !repeatP_sim[ctmp.P.Brief] {
  719. PCDScore(j, "province", ctmp.P.Brief, 1, false)
  720. repeatP_sim[ctmp.P.Brief] = true
  721. }
  722. }
  723. }
  724. }
  725. }
  726. }
  727. }
  728. //qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
  729. }
  730. }
  731. }
  732. //街道、居委会对应多地市处理
  733. func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
  734. if len(districts) == 1 {
  735. district := districts[0]
  736. city := district.C.Name
  737. tmpPbrief := district.C.P.Brief
  738. if pbrief != "" && tmpPbrief == pbrief {
  739. PCDScore(j, "province", tmpPbrief, score, true)
  740. PCDScore(j, "city", city, score, true)
  741. PCDScore(j, "district", district.Name, score, true)
  742. } else if pbrief == "" {
  743. if repeatP != nil && !(*repeatP)[tmpPbrief] {
  744. PCDScore(j, "province", tmpPbrief, score, true)
  745. (*repeatP)[tmpPbrief] = true
  746. } else if repeatP == nil {
  747. PCDScore(j, "province", tmpPbrief, score, true)
  748. }
  749. if repeatC != nil && !(*repeatC)[city] {
  750. PCDScore(j, "city", city, score, true)
  751. (*repeatC)[city] = true
  752. } else if repeatC == nil {
  753. PCDScore(j, "city", city, score, true)
  754. }
  755. if repeatD != nil && !(*repeatD)[tmpPbrief] {
  756. PCDScore(j, "district", district.Name, score, true)
  757. (*repeatD)[district.Name] = true
  758. } else if repeatD == nil {
  759. PCDScore(j, "district", district.Name, score, true)
  760. }
  761. }
  762. }
  763. }
  764. func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
  765. for _, c := range finishC { //取最高分与province匹配的city
  766. if cfMap := e.CityFullMap[c]; cfMap != nil {
  767. if cfMap.P.Brief == area {
  768. // city = c
  769. // break
  770. tmpcity = append(tmpcity, c)
  771. }
  772. }
  773. }
  774. if len(tmpcity) == 1 {
  775. city = tmpcity[0]
  776. }
  777. return city, tmpcity
  778. }
  779. func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
  780. for _, d := range finishD { //取最高分与province匹配的district
  781. citys := e.DistrictCityMap[d]
  782. for _, c := range citys {
  783. if len(tmpcity) == 0 { //没有city
  784. if c.P.Brief == area {
  785. city = c.Name
  786. district = d
  787. return city, district
  788. }
  789. } else if len(tmpcity) == 1 { //一个city
  790. if c.Name == city && c.P.Brief == area {
  791. district = d
  792. return city, district
  793. }
  794. } else { //多个city
  795. for _, tc := range tmpcity { //多个city根据district最高分取
  796. if tc == c.Name && len(finishD) == 1 {
  797. city = c.Name
  798. district = d
  799. return city, district
  800. }
  801. }
  802. }
  803. }
  804. }
  805. return city, district
  806. }
  807. //计算province,city,district区或县匹配的得分
  808. func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
  809. defer qu.Catch()
  810. if t != "" {
  811. if stype == "d" {
  812. tmpscore := (*ds)[t]
  813. (*ds)[t] = tmpscore + score
  814. } else if stype == "c" {
  815. tmpscore := (*cs)[t]
  816. (*cs)[t] = tmpscore + score
  817. } else if stype == "p" {
  818. tmpscore := (*ps)[t]
  819. (*ps)[t] = tmpscore + score
  820. }
  821. }
  822. }
  823. func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
  824. if len(j.FullAreaScore) > 0 {
  825. for pt, ps := range *pscore {
  826. j.FullAreaScore[pt] = j.FullAreaScore[pt] + ps
  827. }
  828. for ct, cs := range *cscore {
  829. j.FullCityScore[ct] = j.FullCityScore[ct] + cs
  830. }
  831. for dt, ds := range *dscore {
  832. j.FullDistrictScore[dt] = j.FullDistrictScore[dt] + ds
  833. }
  834. }
  835. }
  836. func MergeFullSimScore(j *ju.Job) {
  837. if len(j.FullAreaScore) == 0 {
  838. j.FullAreaScore = j.SimAreaScore
  839. } else {
  840. for p_text, p_score := range j.FullAreaScore {
  841. j.FullAreaScore[p_text] = j.SimAreaScore[p_text] + p_score
  842. }
  843. }
  844. for c_text, c_score := range j.SimCityScore {
  845. j.FullCityScore[c_text] = j.FullCityScore[c_text] + c_score
  846. }
  847. for d_text, d_score := range j.SimDistrictScore {
  848. j.FullDistrictScore[d_text] = j.FullDistrictScore[d_text] + d_score
  849. }
  850. // if len(j.FullCityScore) == 0 {
  851. // j.FullCityScore = j.SimCityScore
  852. // } else {
  853. // for c_text, c_score := range j.FullCityScore {
  854. // j.FullCityScore[c_text] = j.SimCityScore[c_text] + c_score
  855. // }
  856. // }
  857. // if len(j.FullDistrictScore) == 0 {
  858. // j.FullDistrictScore = j.SimDistrictScore
  859. // } else {
  860. // for d_text, d_score := range j.FullDistrictScore {
  861. // j.FullDistrictScore[d_text] = j.SimDistrictScore[d_text] + d_score
  862. // }
  863. // }
  864. }
  865. func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
  866. if len(j.FullDistrictScore) > 0 {
  867. for d, _ := range j.FullDistrictScore {
  868. tmpCitys := e.DistrictCityMap[d]
  869. for _, c := range tmpCitys {
  870. if j.FullCityScore[c.Name] != 0 {
  871. tmpPb := c.P.Brief
  872. //if j.FullAreaScore[tmpPb] != 0 {
  873. flag := false
  874. for _, p := range finishP {
  875. if tmpPb == p {
  876. flag = true
  877. break
  878. }
  879. }
  880. if !flag {
  881. delete(j.FullCityScore, c.Name)
  882. delete(j.FullDistrictScore, d)
  883. }
  884. //}
  885. }
  886. }
  887. }
  888. }
  889. if len(j.FullCityScore) > 0 {
  890. for tmpcity, _ := range j.FullCityScore {
  891. c := e.CityFullMap[tmpcity]
  892. if c == nil {
  893. log.Debug("行政区划错误数据:", tmpcity, j.SourceMid)
  894. continue
  895. }
  896. tmpPb := c.P.Brief
  897. //if j.FullAreaScore[tmpPb] != 0 {
  898. flag := false
  899. for _, p := range finishP {
  900. if tmpPb == p {
  901. flag = true
  902. break
  903. }
  904. }
  905. if !flag {
  906. delete(j.FullCityScore, tmpcity)
  907. }
  908. //}
  909. }
  910. }
  911. }
  912. func HighestScoreArr(m map[string]float64) []string {
  913. result := make(map[float64][]string)
  914. tmpscore := 0.0
  915. for str, score := range m {
  916. if str != "" && tmpscore <= score {
  917. if result[tmpscore] != nil && tmpscore != score {
  918. delete(result, tmpscore)
  919. }
  920. if r := result[score]; r != nil {
  921. r = append(r, str)
  922. result[score] = r
  923. } else {
  924. result[score] = []string{str}
  925. }
  926. tmpscore = score
  927. }
  928. }
  929. return result[tmpscore]
  930. }
  931. //计算province,city,district得分
  932. func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
  933. defer qu.Catch()
  934. if text != "" {
  935. if stype == "district" {
  936. tmpdistrict := make(map[string]float64)
  937. if isfull {
  938. tmpdistrict = j.FullDistrictScore
  939. } else {
  940. tmpdistrict = j.SimDistrictScore
  941. }
  942. scoretmp := tmpdistrict[text]
  943. tmpdistrict[text] = scoretmp + score
  944. } else if stype == "city" {
  945. tmpcity := make(map[string]float64)
  946. if isfull {
  947. tmpcity = j.FullCityScore
  948. } else {
  949. tmpcity = j.SimCityScore
  950. }
  951. scoretmp := tmpcity[text]
  952. tmpcity[text] = scoretmp + score
  953. } else if stype == "province" {
  954. tmpprovince := make(map[string]float64)
  955. if isfull {
  956. tmpprovince = j.FullAreaScore
  957. } else {
  958. tmpprovince = j.SimAreaScore
  959. }
  960. scoretmp := tmpprovince[text]
  961. tmpprovince[text] = scoretmp + score
  962. }
  963. }
  964. }