newextractcity.go 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935
  1. package extract
  2. import (
  3. . "jy/pretreated"
  4. ju "jy/util"
  5. qu "qfw/util"
  6. "regexp"
  7. "strings"
  8. log "github.com/donnie4w/go-logger/logger"
  9. )
  10. var AgencyReg = []*regexp.Regexp{
  11. regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"),
  12. regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"),
  13. }
  14. //抽取city
  15. func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
  16. /*
  17. 高准确率:
  18. 1.爬虫数据jsondata
  19. 2.采购单位库
  20. 3.邮编
  21. 4.固话
  22. 5.site(todo)
  23. 低准确率:(全称库匹配到不走简称库)
  24. 1.city全称库(buyeraddr;title,projectname)
  25. 2.city简称库(buyeraddr;title,projectname)
  26. */
  27. defer qu.Catch()
  28. //初始化
  29. if j.FullAreaScore == nil {
  30. j.FullAreaScore = make(map[string]float64)
  31. }
  32. if j.FullCityScore == nil {
  33. j.FullCityScore = make(map[string]float64)
  34. }
  35. if j.FullDistrictScore == nil {
  36. j.FullDistrictScore = make(map[string]float64)
  37. }
  38. if j.SimAreaScore == nil {
  39. j.SimAreaScore = make(map[string]float64)
  40. }
  41. if j.SimCityScore == nil {
  42. j.SimCityScore = make(map[string]float64)
  43. }
  44. if j.SimDistrictScore == nil {
  45. j.SimDistrictScore = make(map[string]float64)
  46. }
  47. //记录区或县简称匹配的p、c、d的得分;如果全称匹配和p、c简称匹配的有结果,再将得分合并,否则舍弃
  48. pscore := make(map[string]float64)
  49. cscore := make(map[string]float64)
  50. dscore := make(map[string]float64)
  51. sm := NewSortMap()
  52. //1.jsondata抽取
  53. e.NewGetCityByJsonData(j)
  54. //qu.Debug("jsondata打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  55. //2.site库抽取
  56. e.NewGetCityBySite(j)
  57. //qu.Debug("site打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  58. //3.采购单位库抽取(暂时没有采购单位库)
  59. //buyer, _ := resulttmp["buyer"].(string)
  60. //4.postcode邮编抽取
  61. buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
  62. e.NewGetCityByPostCode(j, buyerzipcode)
  63. //qu.Debug("邮编打分后结果---", buyerzipcode, j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  64. //5.areacode固话区号抽取
  65. buyertel, _ := resulttmp["buyertel"].(string)
  66. e.NewGetCityByAreaCode(j, buyertel)
  67. //qu.Debug("固话打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  68. //6.buyeraddr,title,projectname抽取
  69. buyeraddr, _ := resulttmp["buyeraddr"].(string)
  70. title, _ := resulttmp["title"].(string)
  71. projectname, _ := resulttmp["projectname"].(string)
  72. buyer, _ := resulttmp["buyer"].(string)
  73. //qu.Debug("buyeraddr--", buyeraddr, "--buyer--", buyer, "--title--", title, "--projectname--", projectname)
  74. sm.AddKey("buyeraddr", buyeraddr)
  75. sm.AddKey("buyer", buyer)
  76. sm.AddKey("title", title)
  77. sm.AddKey("projectname", projectname)
  78. //7.buyeraddr buyer title projectname抽取
  79. e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
  80. //qu.Debug("全称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  81. //qu.Debug("简称打分后结果---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  82. //全称简称得分合并
  83. MergeFullSimScore(j) //合并buyer buyeraddr title projectname全称简称
  84. //qu.Debug("全称简称合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  85. //合并区简称得分
  86. //qu.Debug("pcd=====", pscore, cscore, dscore)
  87. MergeScores(j, &pscore, &cscore, &dscore) //合并区简称匹配的pcd
  88. //qu.Debug("合并区简称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  89. j.SimAreaScore = map[string]float64{}
  90. j.SimCityScore = map[string]float64{}
  91. j.SimDistrictScore = map[string]float64{}
  92. //8.detail抽取
  93. if len(j.FullAreaScore) > 0 && len(j.FullCityScore) > 0 { //以上抽取有省有市再从detail中抽取进行判断
  94. e.NewGetCityByDetail(j)
  95. }
  96. //qu.Debug("detail打分后全称---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  97. //qu.Debug("detail打分后简称---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  98. MergeFullSimScore(j) //合并detail的全简称
  99. //qu.Debug("detail合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  100. finishP := HighestScoreArr(j.FullAreaScore) //获取最高分的省
  101. e.RemoveCD(finishP, j) //将city中所属干扰项省的city去除,同时去除district.5d2bd4aba5cb26b9b769d18e
  102. //qu.Debug("去除干扰项后的city和district得分---", finishP, j.FullCityScore, j.FullDistrictScore)
  103. //获取结果
  104. finishC := HighestScoreArr(j.FullCityScore)
  105. finishD := HighestScoreArr(j.FullDistrictScore)
  106. arearesult := ""
  107. cityresult := ""
  108. districtresult := ""
  109. tmpcity := []string{}
  110. if len(finishP) == 1 { //最高分一个
  111. arearesult = finishP[0] //抽取结果直接赋值
  112. cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
  113. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  114. } else if len(finishP) > 1 { //province最高分多个
  115. if len(finishC) == 1 {
  116. cityresult = finishC[0]
  117. if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
  118. arearesult = cfMap.P.Brief
  119. tmpcity = append(tmpcity, cityresult)
  120. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  121. }
  122. } else { //对应的city有多个(多个province和city)
  123. //arearesult = finishP[0] //抽取结果直接赋值
  124. //cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
  125. //cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  126. arearesult = "全国"
  127. }
  128. }
  129. if cityresult != "" && cityresult == districtresult {
  130. districtresult = ""
  131. }
  132. //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
  133. //直辖市
  134. if arearesult == "北京" {
  135. cityresult = "北京市"
  136. if districtresult == "北京朝阳" { //特殊情况(北京朝阳中西医结合急诊抢救中心:5a84079740d2d9bbe88bad90)
  137. districtresult = "朝阳区"
  138. }
  139. } else if arearesult == "天津" {
  140. cityresult = "天津市"
  141. } else if arearesult == "上海" {
  142. cityresult = "上海市"
  143. } else if arearesult == "重庆" {
  144. cityresult = "重庆市"
  145. }
  146. if arearesult == "" {
  147. arearesult = "全国"
  148. } /* else if cityresult == "" {
  149. if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
  150. cityresult = pbMap.Cap
  151. resulttmp["defaultpcap"] = true
  152. }
  153. }*/
  154. //qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
  155. resulttmp["area"] = arearesult
  156. resulttmp["city"] = cityresult
  157. resulttmp["district"] = districtresult
  158. }
  159. //jsondata中抽取城市
  160. func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
  161. defer qu.Catch()
  162. jsondata := *j.Jsondata
  163. if jsondata != nil { //jsondata中获取province和city
  164. if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
  165. p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) //全称匹配
  166. GetByACDSimJb(p, c, d, a_c_d, e, j) //简称匹配
  167. }
  168. city, _ = jsondata["city"].(string) //city全称或者简称
  169. province, _ = jsondata["area"].(string) //province简称
  170. district, _ = jsondata["district"].(string) //district全称
  171. }
  172. PCDScore(j, "district", district, 5, true) //district打分
  173. bp := false
  174. if province != "" {
  175. if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
  176. bp = true //省份正确
  177. }
  178. }
  179. pbrief := ""
  180. if city != "" {
  181. cityfullmap := e.CityFullMap[city] //判断city全称是否正确
  182. if cityfullmap != nil {
  183. pbrief = cityfullmap.P.Brief //province简称
  184. } else {
  185. citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
  186. if citybriefmap != nil {
  187. city = citybriefmap.Name //city简称替换为全称
  188. pbrief = citybriefmap.P.Brief
  189. }
  190. }
  191. }
  192. if bp {
  193. if pbrief == province { //爬虫的province和city匹配
  194. PCDScore(j, "city", city, 5, true)
  195. } else { //pbrief不匹配province(此时city为空或者错误)
  196. city = ""
  197. }
  198. PCDScore(j, "province", province, 5, true)
  199. } else { //省份错误或为空,取city的对应的pbrief为province
  200. if pbrief != "" {
  201. province = pbrief
  202. PCDScore(j, "province", province, 5, true)
  203. PCDScore(j, "city", city, 5, true)
  204. } else {
  205. province = ""
  206. city = ""
  207. }
  208. }
  209. return
  210. }
  211. //全称从area_city_district中抽城市
  212. func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
  213. text := e.Seg_PCD.Cut(a_c_d, true)
  214. repeatPb := map[string]bool{}
  215. for _, full := range text {
  216. if e.Trie_Full_Province.Get(full) { //a_c_d有province全称
  217. if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
  218. pbrief = tmpPbrief //省简称
  219. PCDScore(j, "province", pbrief, 5, true)
  220. }
  221. } else if e.Trie_Full_City.Get(full) { //a_c_d有city全称
  222. if cfMap := e.CityFullMap[full]; cfMap != nil {
  223. tmpcity := cfMap.Name //城市全称
  224. tmpPbrief := cfMap.P.Brief //省简称
  225. if pbrief != "" && pbrief == tmpPbrief { //已获取省简称
  226. city = tmpcity
  227. PCDScore(j, "city", city, 5, true)
  228. } else if pbrief == "" {
  229. city = tmpcity
  230. pbrief = tmpPbrief
  231. PCDScore(j, "city", city, 5, true)
  232. PCDScore(j, "province", pbrief, 5, true)
  233. }
  234. }
  235. } else if e.Trie_Full_District.Get(full) { //a_c_d有district全称(district可能对应多个城市)
  236. carr := e.NewDistrictCityMap[full]
  237. if len(carr) > 0 {
  238. district = full
  239. PCDScore(j, "district", district, 5, true)
  240. for _, c := range carr {
  241. tmpcity := c.Name //城市全称
  242. tmpPbrief := c.P.Brief //省简称
  243. if pbrief == "" { //之前没有匹配到省份
  244. PCDScore(j, "city", tmpcity, 5, true)
  245. if !repeatPb[tmpPbrief] {
  246. PCDScore(j, "province", tmpPbrief, 5, true)
  247. repeatPb[tmpPbrief] = true
  248. }
  249. } else { //已有省份
  250. if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
  251. PCDScore(j, "city", tmpcity, -5, true)
  252. PCDScore(j, "province", tmpPbrief, -5, true)
  253. } else { //与之前匹配结果一致
  254. if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
  255. PCDScore(j, "city", tmpcity, 5, true)
  256. }
  257. }
  258. }
  259. }
  260. }
  261. }
  262. }
  263. return pbrief, city, district
  264. }
  265. //简称从area_city_district中抽城市
  266. func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
  267. text := e.Seg_PCD.Cut(a_c_d, true)
  268. repeatPb := map[string]bool{}
  269. for _, sim := range text {
  270. if pbrief == "" && e.Trie_Sim_Province.Get(sim) { //全称未匹配到确定的province
  271. if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
  272. pbrief = pbMap.Brief
  273. PCDScore(j, "province", pbrief, 5, true) //打分
  274. //PCDSubtractScore(e, j, "province", pbrief, -5) //减分(area_city_district:河南鼓楼区)
  275. }
  276. } else if city == "" && e.Trie_Sim_City.Get(sim) { //全称未匹配到确定的city
  277. if cbMap := e.CityBriefMap[sim]; cbMap != nil {
  278. tmpcity := cbMap.Name
  279. tmpPbrief := cbMap.P.Brief
  280. if pbrief != "" && pbrief == tmpPbrief {
  281. city = tmpcity
  282. PCDScore(j, "city", city, 5, true)
  283. } else if pbrief == "" {
  284. city = tmpcity
  285. pbrief = tmpPbrief
  286. PCDScore(j, "city", city, 5, true)
  287. PCDScore(j, "province", pbrief, 5, true)
  288. //PCDSubtractScore(e, j, "city", tmpcity, -5) //减分(area_city_district:开封鼓楼区)
  289. }
  290. }
  291. } else if district == "" && e.Trie_Sim_District.Get(sim) { //全称未匹配到确定的district
  292. dfullarr := e.NewDistrictSimAndAll[sim]
  293. if len(dfullarr) > 0 {
  294. PCDScore(j, "district", sim, 5, true)
  295. for _, dfullAndCity := range dfullarr { //district简称对应的所有全称
  296. for _, c := range dfullAndCity {
  297. tmpcity := c.Name //城市全称
  298. tmpPbrief := c.P.Brief //省简称
  299. if pbrief == "" { //之前没有匹配到省份
  300. PCDScore(j, "city", tmpcity, 5, true)
  301. if !repeatPb[tmpPbrief] {
  302. PCDScore(j, "province", tmpPbrief, 5, true)
  303. repeatPb[tmpPbrief] = true
  304. }
  305. } else { //已有省份
  306. if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
  307. PCDScore(j, "city", tmpcity, -5, true)
  308. PCDScore(j, "province", tmpPbrief, -5, true)
  309. } else { //与之前匹配结果一致
  310. if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
  311. PCDScore(j, "city", tmpcity, 5, true)
  312. }
  313. }
  314. }
  315. }
  316. }
  317. }
  318. }
  319. }
  320. }
  321. //通过site提取城市
  322. func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
  323. site, _ := (*j.Data)["site"].(string)
  324. //qu.Debug("site--------", site)
  325. if scMap := e.SiteCityMap[site]; scMap != nil {
  326. if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
  327. PCDScore(j, "province", scMap.P, 5, true)
  328. }
  329. if scMap.C != "" && scMap.C != "null" {
  330. PCDScore(j, "city", scMap.C, 5, true)
  331. }
  332. if scMap.D != "" && scMap.D != "null" {
  333. PCDScore(j, "district", scMap.D, 5, true)
  334. }
  335. }
  336. }
  337. //通过邮编提取城市
  338. func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
  339. defer qu.Catch()
  340. pc := e.PostCodeMap[postcode]
  341. if pc != nil {
  342. province = pc.P
  343. city = pc.C
  344. districtTmp := pc.D //邮编可能对应多个区
  345. score := 3.0
  346. if len(districtTmp) == 1 && districtTmp[0] != "" {
  347. score = 5.0
  348. }
  349. for _, district := range districtTmp {
  350. PCDScore(j, "district", district, score, true)
  351. }
  352. PCDScore(j, "province", province, 5, true)
  353. PCDScore(j, "city", city, 5, true)
  354. }
  355. return
  356. }
  357. //固话区号提取城市
  358. func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
  359. defer qu.Catch()
  360. if len(buyertel) >= 11 {
  361. if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
  362. n := 4
  363. L:
  364. areacode := buyertel[:n]
  365. ac := e.AreaCodeMap[areacode]
  366. if ac != nil {
  367. province = ac.P
  368. citytmp := ac.C
  369. if len(citytmp) == 1 { //对应多个city舍去
  370. city = citytmp[0]
  371. PCDScore(j, "city", city, 5, true)
  372. }
  373. PCDScore(j, "province", province, 5, true)
  374. } else {
  375. n = n - 1
  376. if n >= 3 {
  377. goto L
  378. }
  379. }
  380. } /* else if buyertel[:3] == "853" { //澳门
  381. province = "澳门"
  382. city = "澳门"
  383. PCDScore(j, "province", province, 5, true)
  384. PCDScore(j, "city", city, 5, true)
  385. }*/
  386. }
  387. return
  388. }
  389. func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
  390. /*
  391. 1.对字段进行分词
  392. 2.省、市、区、街道、居委会全称进行匹配打分
  393. 3.省、市、区简称进行匹配打分
  394. */
  395. ts := 0.5
  396. for i, from := range sm.Keys { //buyer;buyeraddr;title;projectname
  397. if i > 1 {
  398. ts = 0.2
  399. }
  400. p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" //每个字段抽取的时候重新定义该字段抽取的province,city,district
  401. str, _ := sm.Map[from].(string)
  402. jbText := e.Seg_SV.Cut(str, true)
  403. for _, text := range jbText {
  404. if len([]rune(text)) == 1 {
  405. continue
  406. }
  407. //全称匹配
  408. //qu.Debug("text------", text)
  409. for pos_full, trie_full := range e.Trie_Fulls {
  410. if trie_full.Get(text) {
  411. if pos_full == 0 && p_full == "" { //省全称
  412. if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
  413. p_full = tmpPbrief
  414. PCDScore(j, "province", p_full, 4+ts, true)
  415. break
  416. }
  417. } else if pos_full == 1 && c_full == "" { //市全称
  418. if cfMap := e.CityFullMap[text]; cfMap != nil {
  419. tmpPbrief := cfMap.P.Brief
  420. if p_full == "" {
  421. p_full = tmpPbrief
  422. c_full = cfMap.Name
  423. PCDScore(j, "province", p_full, 4+ts, true)
  424. PCDScore(j, "city", c_full, 4+ts, true)
  425. break
  426. } else if p_full == tmpPbrief {
  427. c_full = cfMap.Name
  428. PCDScore(j, "city", c_full, 4+ts, true)
  429. break
  430. } else if p_full != "" && p_full != tmpPbrief {
  431. //city不做处理
  432. }
  433. }
  434. } else if pos_full == 2 && d_full == "" { //区全称
  435. repeatPb := map[string]bool{}
  436. isOk := false
  437. districtOk := false
  438. citys := e.NewDistrictCityMap[text]
  439. for _, c := range citys {
  440. tmpPbrief := c.P.Brief
  441. if p_full == tmpPbrief { //省份一致
  442. d_full = text
  443. if c_full == "" {
  444. c_full = c.Name
  445. PCDScore(j, "city", c_full, 4+ts, true)
  446. }
  447. isOk = true
  448. districtOk = true
  449. } else if p_full == "" { //省份不存在
  450. districtOk = true
  451. if len(citys) == 1 { //对应一个city
  452. p_full = tmpPbrief
  453. c_full = c.Name
  454. d_full = text
  455. PCDScore(j, "province", p_full, 4+ts, true)
  456. PCDScore(j, "city", c_full, 4+ts, true)
  457. isOk = true
  458. } else { //多个city,只打分,不赋值
  459. if !repeatPb[tmpPbrief] {
  460. PCDScore(j, "province", tmpPbrief, 2+ts, true)
  461. repeatPb[tmpPbrief] = true
  462. }
  463. //PCDScore(j, "province", tmpPbrief, 2, true)
  464. PCDScore(j, "city", c.Name, 2+ts, true)
  465. }
  466. } else if p_full != "" && p_full != tmpPbrief { //干扰项减分
  467. if !repeatPb[tmpPbrief] {
  468. PCDScore(j, "province", tmpPbrief, -5, true)
  469. repeatPb[tmpPbrief] = true
  470. }
  471. //PCDScore(j, "province", tmpPbrief, -5, true)
  472. PCDScore(j, "city", c.Name, -5, true)
  473. }
  474. }
  475. if districtOk {
  476. PCDScore(j, "district", text, 4+ts, true)
  477. } else {
  478. PCDScore(j, "district", text, -5, true)
  479. }
  480. if isOk {
  481. break
  482. }
  483. } else if pos_full == 3 { //街道全称
  484. districts := e.NewStreetDistrictMap[text]
  485. if len(districts) == 1 { //街道唯一
  486. DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
  487. }
  488. } else if pos_full == 4 { //居委会全称
  489. districts := e.CommunityDistrictMap[text]
  490. if len(districts) == 1 { //居委会唯一
  491. DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
  492. }
  493. }
  494. }
  495. }
  496. //qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  497. //简称匹配
  498. for pos_sim, trie_sim := range e.Trie_Sims {
  499. if trie_sim.Get(text) {
  500. if pos_sim == 0 && p_sim == "" { //省简称
  501. p_sim = text
  502. PCDScore(j, "province", p_sim, 3+ts, false)
  503. break
  504. } else if pos_sim == 1 { //市简称
  505. if cbMap := e.CityBriefMap[text]; cbMap != nil {
  506. tmpPbrief := cbMap.P.Brief
  507. if p_sim == "" {
  508. score := 2.0 + ts
  509. if tmpPbrief == p_full {
  510. score += 1.0
  511. }
  512. p_sim = tmpPbrief
  513. c_sim = cbMap.Brief
  514. PCDScore(j, "province", p_sim, score, false)
  515. PCDScore(j, "city", cbMap.Name, score, false)
  516. break
  517. } else if p_sim == tmpPbrief {
  518. c_sim = cbMap.Brief
  519. PCDScore(j, "city", cbMap.Name, 3+ts, false)
  520. break
  521. } else if p_sim != "" && p_sim != tmpPbrief { //上海宝冶集团有限公司南京分公司 北京朝阳中西医结合急诊抢救中心
  522. delete(j.SimAreaScore, p_sim)
  523. p_sim = text
  524. PCDScore(j, "province", tmpPbrief, 3+ts, false)
  525. PCDScore(j, "city", cbMap.Name, 3+ts, false)
  526. }
  527. }
  528. } else if pos_sim == 2 && d_sim == "" { //区简称
  529. repeatPb := map[string]bool{}
  530. repeatDb := map[string]bool{}
  531. dfull_citys := e.NewDistrictSimAndAll[text]
  532. for _, dfull_city := range dfull_citys {
  533. for dfull, c := range dfull_city { //dfull:简称对应的全称
  534. tmpPbrief := c.P.Brief
  535. if p_sim == tmpPbrief || p_full == tmpPbrief { //省份一致
  536. d_sim = text
  537. PCDScore(j, "district", dfull, 2+ts, false)
  538. if c_sim == "" {
  539. c_sim = c.Brief
  540. PCDScore(j, "city", c.Name, 2+ts, false)
  541. }
  542. } else if p_sim == "" {
  543. if !repeatDb[dfull] {
  544. PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
  545. repeatDb[dfull] = true
  546. }
  547. if len(dfull_citys) == 1 {
  548. PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
  549. PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
  550. } else {
  551. if !repeatPb[tmpPbrief] {
  552. PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
  553. repeatPb[tmpPbrief] = true
  554. }
  555. PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
  556. }
  557. } else if p_sim != "" && p_sim != tmpPbrief {
  558. if !repeatPb[tmpPbrief] {
  559. PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
  560. repeatPb[tmpPbrief] = true
  561. }
  562. PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
  563. PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
  564. }
  565. }
  566. }
  567. }
  568. }
  569. }
  570. //qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  571. }
  572. }
  573. }
  574. func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
  575. repeatP_full := map[string]bool{}
  576. repeatC_full := map[string]bool{}
  577. repeatD_full := map[string]bool{}
  578. repeatP_sim := map[string]bool{}
  579. repeatC_sim := map[string]bool{}
  580. repeatD_sim := map[string]bool{}
  581. detailRune := []rune(j.Content)
  582. detail := j.Content
  583. if len(detailRune) > 600 {
  584. start := detailRune[:300]
  585. end := detailRune[len(detailRune)-300:]
  586. detail = string(start) + string(end)
  587. }
  588. for _, reg := range AgencyReg {
  589. detail = reg.ReplaceAllString(detail, "")
  590. }
  591. for _, text := range e.Seg_SV.Cut(detail, true) {
  592. if len([]rune(text)) > 1 {
  593. //全称匹配
  594. for pos_full, trie_full := range e.Trie_Fulls {
  595. if trie_full.Get(text) {
  596. if pos_full == 0 { //省全称
  597. if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { //取简称
  598. PCDScore(j, "province", tmpPbrief, 1, true)
  599. repeatP_full[tmpPbrief] = true
  600. break
  601. }
  602. } else if pos_full == 1 { //市全称
  603. if cfMap := e.CityFullMap[text]; cfMap != nil {
  604. if !repeatP_full[cfMap.P.Brief] {
  605. PCDScore(j, "province", cfMap.P.Brief, 1, true)
  606. repeatP_full[cfMap.P.Brief] = true
  607. }
  608. if !repeatC_full[cfMap.Name] {
  609. PCDScore(j, "city", cfMap.Name, 1, true)
  610. repeatC_full[cfMap.Name] = true
  611. }
  612. break
  613. }
  614. } else if pos_full == 2 { //区全称
  615. citys := e.NewDistrictCityMap[text]
  616. if len(citys) > 0 {
  617. if !repeatD_full[text] {
  618. PCDScore(j, "district", text, 1, true)
  619. repeatD_full[text] = true
  620. }
  621. for _, c := range citys {
  622. if !repeatC_full[c.Name] {
  623. PCDScore(j, "city", c.Name, 1, true)
  624. repeatC_full[c.Name] = true
  625. }
  626. if !repeatP_full[c.P.Brief] {
  627. PCDScore(j, "province", c.P.Brief, 1, true)
  628. repeatP_full[c.P.Brief] = true
  629. }
  630. }
  631. break
  632. }
  633. } else if pos_full == 3 { //街道全称
  634. districts := e.NewStreetDistrictMap[text]
  635. if len(districts) == 1 {
  636. DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
  637. }
  638. } else if pos_full == 4 { //居委会全称
  639. districts := e.CommunityDistrictMap[text]
  640. if len(districts) == 1 {
  641. DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
  642. }
  643. }
  644. }
  645. }
  646. //qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
  647. //简称匹配
  648. for pos_sim, trie_sim := range e.Trie_Sims {
  649. if trie_sim.Get(text) {
  650. if pos_sim == 0 && !repeatP_sim[text] { //省简称
  651. PCDScore(j, "province", text, 1, false)
  652. repeatP_sim[text] = true
  653. break
  654. } else if pos_sim == 1 { //市简称
  655. if cbMap := e.CityBriefMap[text]; cbMap != nil {
  656. if !repeatP_sim[cbMap.P.Brief] {
  657. PCDScore(j, "province", cbMap.P.Brief, 1, false)
  658. repeatP_sim[cbMap.P.Brief] = true
  659. }
  660. if !repeatC_sim[cbMap.Name] {
  661. PCDScore(j, "city", cbMap.Name, 1, false)
  662. repeatC_sim[cbMap.Name] = true
  663. }
  664. break
  665. }
  666. } else if pos_sim == 2 { //区简称
  667. dfull_citys := e.NewDistrictSimAndAll[text]
  668. if len(dfull_citys) == 1 {
  669. for _, dfull_city := range dfull_citys {
  670. for dfull, ctmp := range dfull_city { //dfull:简称对应的全称
  671. if !repeatD_sim[dfull] {
  672. PCDScore(j, "district", dfull, 1, false)
  673. repeatD_sim[dfull] = true
  674. }
  675. if !repeatC_sim[ctmp.Name] {
  676. PCDScore(j, "city", ctmp.Name, 1, false)
  677. repeatC_sim[ctmp.Name] = true
  678. }
  679. if !repeatP_sim[ctmp.P.Brief] {
  680. PCDScore(j, "province", ctmp.P.Brief, 1, false)
  681. repeatP_sim[ctmp.P.Brief] = true
  682. }
  683. }
  684. }
  685. }
  686. }
  687. }
  688. }
  689. //qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
  690. }
  691. }
  692. }
  693. //街道、居委会对应多地市处理
  694. func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
  695. if len(districts) == 1 {
  696. district := districts[0]
  697. city := district.C.Name
  698. tmpPbrief := district.C.P.Brief
  699. if pbrief != "" && tmpPbrief == pbrief {
  700. PCDScore(j, "province", tmpPbrief, score, true)
  701. PCDScore(j, "city", city, score, true)
  702. PCDScore(j, "district", district.Name, score, true)
  703. } else if pbrief == "" {
  704. if repeatP != nil && !(*repeatP)[tmpPbrief] {
  705. PCDScore(j, "province", tmpPbrief, score, true)
  706. (*repeatP)[tmpPbrief] = true
  707. } else if repeatP == nil {
  708. PCDScore(j, "province", tmpPbrief, score, true)
  709. }
  710. if repeatC != nil && !(*repeatC)[city] {
  711. PCDScore(j, "city", city, score, true)
  712. (*repeatC)[city] = true
  713. } else if repeatC == nil {
  714. PCDScore(j, "city", city, score, true)
  715. }
  716. if repeatD != nil && !(*repeatD)[tmpPbrief] {
  717. PCDScore(j, "district", district.Name, score, true)
  718. (*repeatD)[district.Name] = true
  719. } else if repeatD == nil {
  720. PCDScore(j, "district", district.Name, score, true)
  721. }
  722. }
  723. }
  724. }
  725. func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
  726. for _, c := range finishC { //取最高分与province匹配的city
  727. if cfMap := e.CityFullMap[c]; cfMap != nil {
  728. if cfMap.P.Brief == area {
  729. // city = c
  730. // break
  731. tmpcity = append(tmpcity, c)
  732. }
  733. }
  734. }
  735. if len(tmpcity) == 1 {
  736. city = tmpcity[0]
  737. }
  738. return city, tmpcity
  739. }
  740. func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
  741. for _, d := range finishD { //取最高分与province匹配的district
  742. citys := e.NewDistrictCityMap[d]
  743. for _, c := range citys {
  744. if len(tmpcity) == 0 { //没有city
  745. if c.P.Brief == area {
  746. city = c.Name
  747. district = d
  748. return city, district
  749. }
  750. } else if len(tmpcity) == 1 { //一个city
  751. if c.Name == city && c.P.Brief == area {
  752. district = d
  753. return city, district
  754. }
  755. } else { //多个city
  756. for _, tc := range tmpcity { //多个city根据district最高分取
  757. if tc == c.Name && len(finishD) == 1 {
  758. city = c.Name
  759. district = d
  760. return city, district
  761. }
  762. }
  763. }
  764. }
  765. }
  766. return city, district
  767. }
  768. //计算province,city,district区或县匹配的得分
  769. func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
  770. defer qu.Catch()
  771. if t != "" {
  772. if stype == "d" {
  773. tmpscore := (*ds)[t]
  774. (*ds)[t] = tmpscore + score
  775. } else if stype == "c" {
  776. tmpscore := (*cs)[t]
  777. (*cs)[t] = tmpscore + score
  778. } else if stype == "p" {
  779. tmpscore := (*ps)[t]
  780. (*ps)[t] = tmpscore + score
  781. }
  782. }
  783. }
  784. func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
  785. if len(j.FullAreaScore) > 0 {
  786. for pt, ps := range *pscore {
  787. j.FullAreaScore[pt] = j.FullAreaScore[pt] + ps
  788. }
  789. for ct, cs := range *cscore {
  790. j.FullCityScore[ct] = j.FullCityScore[ct] + cs
  791. }
  792. for dt, ds := range *dscore {
  793. j.FullDistrictScore[dt] = j.FullDistrictScore[dt] + ds
  794. }
  795. }
  796. }
  797. func MergeFullSimScore(j *ju.Job) {
  798. if len(j.FullAreaScore) == 0 {
  799. j.FullAreaScore = j.SimAreaScore
  800. } else {
  801. for p_text, p_score := range j.FullAreaScore {
  802. j.FullAreaScore[p_text] = j.SimAreaScore[p_text] + p_score
  803. }
  804. }
  805. for c_text, c_score := range j.SimCityScore {
  806. j.FullCityScore[c_text] = j.FullCityScore[c_text] + c_score
  807. }
  808. for d_text, d_score := range j.SimDistrictScore {
  809. j.FullDistrictScore[d_text] = j.FullDistrictScore[d_text] + d_score
  810. }
  811. // if len(j.FullCityScore) == 0 {
  812. // j.FullCityScore = j.SimCityScore
  813. // } else {
  814. // for c_text, c_score := range j.FullCityScore {
  815. // j.FullCityScore[c_text] = j.SimCityScore[c_text] + c_score
  816. // }
  817. // }
  818. // if len(j.FullDistrictScore) == 0 {
  819. // j.FullDistrictScore = j.SimDistrictScore
  820. // } else {
  821. // for d_text, d_score := range j.FullDistrictScore {
  822. // j.FullDistrictScore[d_text] = j.SimDistrictScore[d_text] + d_score
  823. // }
  824. // }
  825. }
  826. func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
  827. if len(j.FullDistrictScore) > 0 {
  828. for d, _ := range j.FullDistrictScore {
  829. tmpCitys := e.NewDistrictCityMap[d]
  830. for _, c := range tmpCitys {
  831. if j.FullCityScore[c.Name] != 0 {
  832. tmpPb := c.P.Brief
  833. //if j.FullAreaScore[tmpPb] != 0 {
  834. flag := false
  835. for _, p := range finishP {
  836. if tmpPb == p {
  837. flag = true
  838. break
  839. }
  840. }
  841. if !flag {
  842. delete(j.FullCityScore, c.Name)
  843. delete(j.FullDistrictScore, d)
  844. }
  845. //}
  846. }
  847. }
  848. }
  849. }
  850. if len(j.FullCityScore) > 0 {
  851. for tmpcity, _ := range j.FullCityScore {
  852. c := e.CityFullMap[tmpcity]
  853. if c == nil {
  854. log.Debug("行政区划错误数据:", tmpcity, j.SourceMid)
  855. continue
  856. }
  857. tmpPb := c.P.Brief
  858. //if j.FullAreaScore[tmpPb] != 0 {
  859. flag := false
  860. for _, p := range finishP {
  861. if tmpPb == p {
  862. flag = true
  863. break
  864. }
  865. }
  866. if !flag {
  867. delete(j.FullCityScore, tmpcity)
  868. }
  869. //}
  870. }
  871. }
  872. }
  873. //province,city,district干扰项减分
  874. //func PCDSubtractScore(e *ExtractTask, j *ju.Job, stype, text string, score int) {
  875. // defer qu.Catch()
  876. // if text != "" {
  877. // if stype == "city" {
  878. // for cn, cscore := range j.CityScore {
  879. // if cn != text {
  880. // j.CityScore[cn] = cscore + score
  881. // //错误的city减分后对应的province也减分
  882. // for pb, pscore := range j.AreaScore {
  883. // if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
  884. // j.AreaScore[pb] = pscore + score
  885. // }
  886. // }
  887. // }
  888. // }
  889. // } else if stype == "province" {
  890. // for pb, pscore := range j.AreaScore {
  891. // if pb != text {
  892. // j.AreaScore[pb] = pscore + score
  893. // //错误的province减分后对应的city也要减分
  894. // for cn, cscore := range j.CityScore {
  895. // if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
  896. // j.CityScore[cn] = cscore + score
  897. // }
  898. // }
  899. // }
  900. // }
  901. // }
  902. // // for name, tmpscore := range *whichMap {
  903. // // if name != text {
  904. // // (*whichMap)[name] = tmpscore + score
  905. // // }
  906. // // }
  907. // }
  908. //}