newextractcity.go 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956
  1. package extract
  2. import (
  3. . "jy/pretreated"
  4. ju "jy/util"
  5. qu "qfw/util"
  6. "regexp"
  7. "strings"
  8. log "github.com/donnie4w/go-logger/logger"
  9. )
  10. var AgencyReg = []*regexp.Regexp{
  11. regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"),
  12. regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"),
  13. }
  14. //抽取city
  15. func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}, id string) {
  16. /*
  17. 高准确率:
  18. 1.爬虫数据jsondata
  19. 2.采购单位库
  20. 3.邮编
  21. 4.固话
  22. 5.site(todo)
  23. 低准确率:(全称库匹配到不走简称库)
  24. 1.city全称库(buyeraddr;title,projectname)
  25. 2.city简称库(buyeraddr;title,projectname)
  26. */
  27. defer qu.Catch()
  28. //初始化
  29. if j.FullAreaScore == nil {
  30. j.FullAreaScore = make(map[string]float64)
  31. }
  32. if j.FullCityScore == nil {
  33. j.FullCityScore = make(map[string]float64)
  34. }
  35. if j.FullDistrictScore == nil {
  36. j.FullDistrictScore = make(map[string]float64)
  37. }
  38. if j.SimAreaScore == nil {
  39. j.SimAreaScore = make(map[string]float64)
  40. }
  41. if j.SimCityScore == nil {
  42. j.SimCityScore = make(map[string]float64)
  43. }
  44. if j.SimDistrictScore == nil {
  45. j.SimDistrictScore = make(map[string]float64)
  46. }
  47. //记录区或县简称匹配的p、c、d的得分;如果全称匹配和p、c简称匹配的有结果,再将得分合并,否则舍弃
  48. pscore := make(map[string]float64)
  49. cscore := make(map[string]float64)
  50. dscore := make(map[string]float64)
  51. sm := NewSortMap()
  52. //1.jsondata抽取
  53. e.NewGetCityByJsonData(j)
  54. //qu.Debug("jsondata打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  55. //2.site库抽取
  56. e.NewGetCityBySite(j)
  57. //qu.Debug("site打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  58. //3.采购单位库抽取(暂时没有采购单位库)
  59. //buyer, _ := resulttmp["buyer"].(string)
  60. //4.postcode邮编抽取
  61. buyerzipcode := qu.ObjToString((*resulttmp)["buyerzipcode"])
  62. e.NewGetCityByPostCode(j, buyerzipcode)
  63. //qu.Debug("邮编打分后结果---", buyerzipcode, j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  64. //5.areacode固话区号抽取
  65. buyertel := qu.ObjToString((*resulttmp)["buyertel"])
  66. e.NewGetCityByAreaCode(j, buyertel)
  67. //qu.Debug("固话打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  68. //6.buyeraddr,title,projectname抽取
  69. buyeraddr := qu.ObjToString((*resulttmp)["buyeraddr"])
  70. title := qu.ObjToString((*resulttmp)["title"])
  71. projectname := qu.ObjToString((*resulttmp)["projectname"])
  72. buyer := qu.ObjToString((*resulttmp)["buyer"])
  73. //qu.Debug("buyeraddr--", buyeraddr, "--buyer--", buyer, "--title--", title, "--projectname--", projectname)
  74. sm.AddKey("buyeraddr", buyeraddr)
  75. sm.AddKey("buyer", buyer)
  76. sm.AddKey("title", title)
  77. sm.AddKey("projectname", projectname)
  78. if projectaddr, isok := (*resulttmp)["projectaddr"].(string); isok {
  79. sm.AddKey("projectaddr", projectaddr)
  80. }
  81. if bidopenaddress, isok := (*resulttmp)["bidopenaddress"].(string); isok {
  82. sm.AddKey("bidopenaddress", bidopenaddress)
  83. }
  84. //7.buyeraddr buyer title projectname抽取
  85. e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
  86. //qu.Debug("全称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  87. //qu.Debug("简称打分后结果---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  88. //全称简称得分合并
  89. MergeFullSimScore(j) //合并buyer buyeraddr title projectname全称简称
  90. //qu.Debug("全称简称合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  91. //合并区简称得分
  92. //qu.Debug("pcd=====", pscore, cscore, dscore)
  93. MergeScores(j, &pscore, &cscore, &dscore) //合并区简称匹配的pcd
  94. //qu.Debug("合并区简称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  95. j.SimAreaScore = map[string]float64{}
  96. j.SimCityScore = map[string]float64{}
  97. j.SimDistrictScore = map[string]float64{}
  98. //8.detail抽取
  99. if len(j.FullAreaScore) > 0 && len(j.FullCityScore) > 0 { //以上抽取有省有市再从detail中抽取进行判断
  100. e.NewGetCityByDetail(j)
  101. }
  102. //qu.Debug("detail打分后全称---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  103. //qu.Debug("detail打分后简称---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  104. MergeFullSimScore(j) //合并detail的全简称
  105. //qu.Debug("detail合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  106. finishP := HighestScoreArr(j.FullAreaScore) //获取最高分的省
  107. e.RemoveCD(finishP, j) //将city中所属干扰项省的city去除,同时去除district.5d2bd4aba5cb26b9b769d18e
  108. //qu.Debug("去除干扰项后的city和district得分---", finishP, j.FullCityScore, j.FullDistrictScore)
  109. //获取结果
  110. finishC := HighestScoreArr(j.FullCityScore)
  111. finishD := HighestScoreArr(j.FullDistrictScore)
  112. arearesult := ""
  113. cityresult := ""
  114. districtresult := ""
  115. tmpcity := []string{}
  116. if len(finishP) == 1 { //最高分一个
  117. arearesult = finishP[0] //抽取结果直接赋值
  118. cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
  119. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  120. } else if len(finishP) > 1 { //province最高分多个
  121. if len(finishC) == 1 {
  122. cityresult = finishC[0]
  123. if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
  124. arearesult = cfMap.P.Brief
  125. tmpcity = append(tmpcity, cityresult)
  126. cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  127. }
  128. } else { //对应的city有多个(多个province和city)
  129. //arearesult = finishP[0] //抽取结果直接赋值
  130. //cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
  131. //cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
  132. arearesult = "全国"
  133. }
  134. }
  135. if cityresult != "" && cityresult == districtresult {
  136. districtresult = ""
  137. }
  138. //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
  139. //直辖市
  140. if arearesult == "北京" {
  141. cityresult = "北京市"
  142. if districtresult == "北京朝阳" { //特殊情况(北京朝阳中西医结合急诊抢救中心:5a84079740d2d9bbe88bad90)
  143. districtresult = "朝阳区"
  144. }
  145. } else if arearesult == "天津" {
  146. cityresult = "天津市"
  147. } else if arearesult == "上海" {
  148. cityresult = "上海市"
  149. } else if arearesult == "重庆" {
  150. cityresult = "重庆市"
  151. }
  152. if arearesult == "" {
  153. arearesult = "全国"
  154. } /* else if cityresult == "" {
  155. if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
  156. cityresult = pbMap.Cap
  157. resulttmp["defaultpcap"] = true
  158. }
  159. }*/
  160. //qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
  161. (*resulttmp)["area"] = arearesult
  162. (*resulttmp)["city"] = cityresult
  163. (*resulttmp)["district"] = districtresult
  164. }
  165. //jsondata中抽取城市
  166. func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
  167. defer qu.Catch()
  168. if j.Jsondata != nil {
  169. jsondata := *j.Jsondata
  170. //jsondata中获取province和city
  171. if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
  172. p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) //全称匹配
  173. GetByACDSimJb(p, c, d, a_c_d, e, j) //简称匹配
  174. }
  175. city, _ = jsondata["city"].(string) //city全称或者简称
  176. province, _ = jsondata["area"].(string) //province简称
  177. district, _ = jsondata["district"].(string) //district全称
  178. }
  179. PCDScore(j, "district", district, 5, true) //district打分
  180. bp := false
  181. if province != "" {
  182. if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
  183. bp = true //省份正确
  184. }
  185. }
  186. pbrief := ""
  187. if city != "" {
  188. cityfullmap := e.CityFullMap[city] //判断city全称是否正确
  189. if cityfullmap != nil {
  190. pbrief = cityfullmap.P.Brief //province简称
  191. } else {
  192. citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
  193. if citybriefmap != nil {
  194. city = citybriefmap.Name //city简称替换为全称
  195. pbrief = citybriefmap.P.Brief
  196. }
  197. }
  198. }
  199. if bp {
  200. if pbrief == province { //爬虫的province和city匹配
  201. PCDScore(j, "city", city, 5, true)
  202. } else { //pbrief不匹配province(此时city为空或者错误)
  203. city = ""
  204. }
  205. PCDScore(j, "province", province, 5, true)
  206. } else { //省份错误或为空,取city的对应的pbrief为province
  207. if pbrief != "" {
  208. province = pbrief
  209. PCDScore(j, "province", province, 5, true)
  210. PCDScore(j, "city", city, 5, true)
  211. } else {
  212. province = ""
  213. city = ""
  214. }
  215. }
  216. return
  217. }
  218. //全称从area_city_district中抽城市
  219. func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
  220. text := e.Seg_PCD.Cut(a_c_d, true)
  221. repeatPb := map[string]bool{}
  222. for _, full := range text {
  223. if e.Trie_Full_Province.Get(full) { //a_c_d有province全称
  224. if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
  225. pbrief = tmpPbrief //省简称
  226. PCDScore(j, "province", pbrief, 5, true)
  227. }
  228. } else if e.Trie_Full_City.Get(full) { //a_c_d有city全称
  229. if cfMap := e.CityFullMap[full]; cfMap != nil {
  230. tmpcity := cfMap.Name //城市全称
  231. tmpPbrief := cfMap.P.Brief //省简称
  232. if pbrief != "" && pbrief == tmpPbrief { //已获取省简称
  233. city = tmpcity
  234. PCDScore(j, "city", city, 5, true)
  235. } else if pbrief == "" {
  236. city = tmpcity
  237. pbrief = tmpPbrief
  238. PCDScore(j, "city", city, 5, true)
  239. PCDScore(j, "province", pbrief, 5, true)
  240. }
  241. }
  242. } else if e.Trie_Full_District.Get(full) { //a_c_d有district全称(district可能对应多个城市)
  243. carr := e.NewDistrictCityMap[full]
  244. if len(carr) > 0 {
  245. district = full
  246. PCDScore(j, "district", district, 5, true)
  247. for _, c := range carr {
  248. tmpcity := c.Name //城市全称
  249. tmpPbrief := c.P.Brief //省简称
  250. if pbrief == "" { //之前没有匹配到省份
  251. PCDScore(j, "city", tmpcity, 5, true)
  252. if !repeatPb[tmpPbrief] {
  253. PCDScore(j, "province", tmpPbrief, 5, true)
  254. repeatPb[tmpPbrief] = true
  255. }
  256. } else { //已有省份
  257. if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
  258. PCDScore(j, "city", tmpcity, -5, true)
  259. PCDScore(j, "province", tmpPbrief, -5, true)
  260. } else { //与之前匹配结果一致
  261. if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
  262. PCDScore(j, "city", tmpcity, 5, true)
  263. }
  264. }
  265. }
  266. }
  267. }
  268. }
  269. }
  270. return pbrief, city, district
  271. }
  272. //简称从area_city_district中抽城市
  273. func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
  274. text := e.Seg_PCD.Cut(a_c_d, true)
  275. repeatPb := map[string]bool{}
  276. for _, sim := range text {
  277. if pbrief == "" && e.Trie_Sim_Province.Get(sim) { //全称未匹配到确定的province
  278. if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
  279. pbrief = pbMap.Brief
  280. PCDScore(j, "province", pbrief, 5, true) //打分
  281. //PCDSubtractScore(e, j, "province", pbrief, -5) //减分(area_city_district:河南鼓楼区)
  282. }
  283. } else if city == "" && e.Trie_Sim_City.Get(sim) { //全称未匹配到确定的city
  284. if cbMap := e.CityBriefMap[sim]; cbMap != nil {
  285. tmpcity := cbMap.Name
  286. tmpPbrief := cbMap.P.Brief
  287. if pbrief != "" && pbrief == tmpPbrief {
  288. city = tmpcity
  289. PCDScore(j, "city", city, 5, true)
  290. } else if pbrief == "" {
  291. city = tmpcity
  292. pbrief = tmpPbrief
  293. PCDScore(j, "city", city, 5, true)
  294. PCDScore(j, "province", pbrief, 5, true)
  295. //PCDSubtractScore(e, j, "city", tmpcity, -5) //减分(area_city_district:开封鼓楼区)
  296. }
  297. }
  298. } else if district == "" && e.Trie_Sim_District.Get(sim) { //全称未匹配到确定的district
  299. dfullarr := e.NewDistrictSimAndAll[sim]
  300. if len(dfullarr) > 0 {
  301. PCDScore(j, "district", sim, 5, true)
  302. for _, dfullAndCity := range dfullarr { //district简称对应的所有全称
  303. for _, c := range dfullAndCity {
  304. if c == nil {
  305. continue
  306. }
  307. tmpcity := c.Name //城市全称
  308. tmpPbrief := c.P.Brief //省简称
  309. if pbrief == "" { //之前没有匹配到省份
  310. PCDScore(j, "city", tmpcity, 5, true)
  311. if !repeatPb[tmpPbrief] {
  312. PCDScore(j, "province", tmpPbrief, 5, true)
  313. repeatPb[tmpPbrief] = true
  314. }
  315. } else { //已有省份
  316. if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
  317. PCDScore(j, "city", tmpcity, -5, true)
  318. PCDScore(j, "province", tmpPbrief, -5, true)
  319. } else { //与之前匹配结果一致
  320. if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
  321. PCDScore(j, "city", tmpcity, 5, true)
  322. }
  323. }
  324. }
  325. }
  326. }
  327. }
  328. }
  329. }
  330. }
  331. //通过site提取城市
  332. func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
  333. site, _ := (*j.Data)["site"].(string)
  334. //qu.Debug("site--------", site)
  335. if scMap := e.SiteCityMap[site]; scMap != nil {
  336. if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
  337. PCDScore(j, "province", scMap.P, 5, true)
  338. }
  339. if scMap.C != "" && scMap.C != "null" {
  340. PCDScore(j, "city", scMap.C, 5, true)
  341. }
  342. if scMap.D != "" && scMap.D != "null" {
  343. PCDScore(j, "district", scMap.D, 5, true)
  344. }
  345. }
  346. }
  347. //通过邮编提取城市
  348. func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
  349. defer qu.Catch()
  350. pc := e.PostCodeMap[postcode]
  351. if pc != nil {
  352. province = pc.P
  353. city = pc.C
  354. districtTmp := pc.D //邮编可能对应多个区
  355. score := 3.0
  356. if len(districtTmp) == 1 && districtTmp[0] != "" {
  357. score = 5.0
  358. }
  359. for _, district := range districtTmp {
  360. PCDScore(j, "district", district, score, true)
  361. }
  362. PCDScore(j, "province", province, 5, true)
  363. PCDScore(j, "city", city, 5, true)
  364. }
  365. return
  366. }
  367. //固话区号提取城市
  368. func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
  369. defer qu.Catch()
  370. if len(buyertel) >= 11 {
  371. if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
  372. n := 4
  373. L:
  374. areacode := buyertel[:n]
  375. ac := e.AreaCodeMap[areacode]
  376. if ac != nil {
  377. province = ac.P
  378. citytmp := ac.C
  379. if len(citytmp) == 1 { //对应多个city舍去
  380. city = citytmp[0]
  381. PCDScore(j, "city", city, 5, true)
  382. }
  383. PCDScore(j, "province", province, 5, true)
  384. } else {
  385. n = n - 1
  386. if n >= 3 {
  387. goto L
  388. }
  389. }
  390. } /* else if buyertel[:3] == "853" { //澳门
  391. province = "澳门"
  392. city = "澳门"
  393. PCDScore(j, "province", province, 5, true)
  394. PCDScore(j, "city", city, 5, true)
  395. }*/
  396. }
  397. return
  398. }
  399. func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
  400. /*
  401. 1.对字段进行分词
  402. 2.省、市、区、街道、居委会全称进行匹配打分
  403. 3.省、市、区简称进行匹配打分
  404. */
  405. ts := 0.5
  406. for i, from := range sm.Keys { //buyer;buyeraddr;title;projectname
  407. if i > 1 {
  408. ts = 0.2
  409. }
  410. p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" //每个字段抽取的时候重新定义该字段抽取的province,city,district
  411. str, _ := sm.Map[from].(string)
  412. jbText := e.Seg_SV.Cut(str, true)
  413. for _, text := range jbText {
  414. if len([]rune(text)) == 1 {
  415. continue
  416. }
  417. //全称匹配
  418. //qu.Debug("text------", text)
  419. for pos_full, trie_full := range e.Trie_Fulls {
  420. if trie_full.Get(text) {
  421. if pos_full == 0 && p_full == "" { //省全称
  422. if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
  423. p_full = tmpPbrief
  424. PCDScore(j, "province", p_full, 4+ts, true)
  425. break
  426. }
  427. } else if pos_full == 1 && c_full == "" { //市全称
  428. if cfMap := e.CityFullMap[text]; cfMap != nil {
  429. tmpPbrief := cfMap.P.Brief
  430. if p_full == "" {
  431. p_full = tmpPbrief
  432. c_full = cfMap.Name
  433. PCDScore(j, "province", p_full, 4+ts, true)
  434. PCDScore(j, "city", c_full, 4+ts, true)
  435. break
  436. } else if p_full == tmpPbrief {
  437. c_full = cfMap.Name
  438. PCDScore(j, "province", tmpPbrief, 4+ts, true) //
  439. PCDScore(j, "city", c_full, 4+ts, true)
  440. break
  441. } else if p_full != "" && p_full != tmpPbrief {
  442. //city不做处理
  443. }
  444. }
  445. } else if pos_full == 2 && d_full == "" { //区全称
  446. repeatPb := map[string]bool{}
  447. isOk := false
  448. districtOk := false
  449. citys := e.NewDistrictCityMap[text]
  450. for _, c := range citys {
  451. tmpPbrief := c.P.Brief
  452. if p_full == tmpPbrief { //省份一致
  453. d_full = text
  454. if c_full == "" {
  455. c_full = c.Name
  456. PCDScore(j, "city", c_full, 4+ts, true)
  457. PCDScore(j, "province", tmpPbrief, 4+ts, true) //
  458. }
  459. isOk = true
  460. districtOk = true
  461. } else if p_full == "" { //省份不存在
  462. districtOk = true
  463. if len(citys) == 1 { //对应一个city
  464. p_full = tmpPbrief
  465. c_full = c.Name
  466. d_full = text
  467. PCDScore(j, "province", p_full, 4+ts, true)
  468. PCDScore(j, "city", c_full, 4+ts, true)
  469. isOk = true
  470. } else { //多个city,只打分,不赋值
  471. if !repeatPb[tmpPbrief] {
  472. PCDScore(j, "province", tmpPbrief, 2+ts, true)
  473. repeatPb[tmpPbrief] = true
  474. }
  475. //PCDScore(j, "province", tmpPbrief, 2, true)
  476. PCDScore(j, "city", c.Name, 2+ts, true)
  477. }
  478. } else if p_full != "" && p_full != tmpPbrief { //干扰项减分
  479. if !repeatPb[tmpPbrief] {
  480. PCDScore(j, "province", tmpPbrief, -5, true)
  481. repeatPb[tmpPbrief] = true
  482. }
  483. //PCDScore(j, "province", tmpPbrief, -5, true)
  484. PCDScore(j, "city", c.Name, -5, true)
  485. }
  486. }
  487. if districtOk {
  488. PCDScore(j, "district", text, 4+ts, true)
  489. } else {
  490. PCDScore(j, "district", text, -5, true)
  491. }
  492. if isOk {
  493. break
  494. }
  495. } else if pos_full == 3 { //街道全称
  496. districts := e.NewStreetDistrictMap[text]
  497. if len(districts) == 1 { //街道唯一
  498. DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
  499. }
  500. } else if pos_full == 4 { //居委会全称
  501. districts := e.CommunityDistrictMap[text]
  502. if len(districts) == 1 { //居委会唯一
  503. DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
  504. }
  505. }
  506. }
  507. }
  508. //qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
  509. //简称匹配
  510. for pos_sim, trie_sim := range e.Trie_Sims {
  511. if trie_sim.Get(text) {
  512. if pos_sim == 0 && p_sim == "" { //省简称
  513. p_sim = text
  514. PCDScore(j, "province", p_sim, 3+ts, false)
  515. break
  516. } else if pos_sim == 1 { //市简称
  517. if cbMap := e.CityBriefMap[text]; cbMap != nil {
  518. tmpPbrief := cbMap.P.Brief
  519. if p_sim == "" {
  520. score := 2.0 + ts
  521. if tmpPbrief == p_full {
  522. score += 1.0
  523. }
  524. p_sim = tmpPbrief
  525. c_sim = cbMap.Brief
  526. PCDScore(j, "province", p_sim, score, false)
  527. PCDScore(j, "city", cbMap.Name, score, false)
  528. break
  529. } else if p_sim == tmpPbrief {
  530. c_sim = cbMap.Brief
  531. PCDScore(j, "city", cbMap.Name, 3+ts, false)
  532. PCDScore(j, "province", tmpPbrief, 3+ts, false)
  533. break
  534. } else if p_sim != "" && p_sim != tmpPbrief { //上海宝冶集团有限公司南京分公司 北京朝阳中西医结合急诊抢救中心
  535. delete(j.SimAreaScore, p_sim)
  536. c_sim = text //
  537. p_sim = tmpPbrief //
  538. PCDScore(j, "province", tmpPbrief, 3+ts, false)
  539. PCDScore(j, "city", cbMap.Name, 3+ts, false)
  540. }
  541. }
  542. } else if pos_sim == 2 && d_sim == "" { //区简称
  543. repeatPb := map[string]bool{}
  544. repeatDb := map[string]bool{}
  545. dfull_citys := e.NewDistrictSimAndAll[text]
  546. for _, dfull_city := range dfull_citys {
  547. for dfull, c := range dfull_city { //dfull:简称对应的全称
  548. if c == nil || c.P == nil {
  549. continue
  550. }
  551. tmpPbrief := c.P.Brief
  552. if p_sim == tmpPbrief { //省份一致
  553. d_sim = text
  554. PCDScore(j, "district", dfull, 2+ts, false)
  555. if c_sim == "" {
  556. c_sim = c.Brief
  557. PCDScore(j, "city", c.Name, 2+ts, false)
  558. }
  559. PCDScore(j, "province", tmpPbrief, 2+ts, false) //
  560. } else if p_sim == "" {
  561. if !repeatDb[dfull] {
  562. PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
  563. repeatDb[dfull] = true
  564. }
  565. if len(dfull_citys) == 1 {
  566. PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
  567. PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
  568. } else {
  569. if !repeatPb[tmpPbrief] {
  570. PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
  571. repeatPb[tmpPbrief] = true
  572. }
  573. PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
  574. }
  575. } else if p_sim != "" && p_sim != tmpPbrief {
  576. if !repeatPb[tmpPbrief] {
  577. PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
  578. repeatPb[tmpPbrief] = true
  579. }
  580. PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
  581. PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
  582. }
  583. }
  584. }
  585. }
  586. }
  587. }
  588. //qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
  589. }
  590. }
  591. }
  592. func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
  593. repeatP_full := map[string]bool{}
  594. repeatC_full := map[string]bool{}
  595. repeatD_full := map[string]bool{}
  596. repeatP_sim := map[string]bool{}
  597. repeatC_sim := map[string]bool{}
  598. repeatD_sim := map[string]bool{}
  599. detailRune := []rune(j.Content)
  600. detail := j.Content
  601. if len(detailRune) > 600 {
  602. start := detailRune[:300]
  603. end := detailRune[len(detailRune)-300:]
  604. detail = string(start) + string(end)
  605. }
  606. for _, reg := range AgencyReg {
  607. detail = reg.ReplaceAllString(detail, "")
  608. }
  609. for _, text := range e.Seg_SV.Cut(detail, true) {
  610. if len([]rune(text)) > 1 {
  611. //全称匹配
  612. for pos_full, trie_full := range e.Trie_Fulls {
  613. if trie_full.Get(text) {
  614. if pos_full == 0 { //省全称
  615. if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { //取简称
  616. PCDScore(j, "province", tmpPbrief, 1, true)
  617. repeatP_full[tmpPbrief] = true
  618. break
  619. }
  620. } else if pos_full == 1 { //市全称
  621. if cfMap := e.CityFullMap[text]; cfMap != nil {
  622. if !repeatP_full[cfMap.P.Brief] {
  623. PCDScore(j, "province", cfMap.P.Brief, 1, true)
  624. repeatP_full[cfMap.P.Brief] = true
  625. }
  626. if !repeatC_full[cfMap.Name] {
  627. PCDScore(j, "city", cfMap.Name, 1, true)
  628. repeatC_full[cfMap.Name] = true
  629. }
  630. break
  631. }
  632. } else if pos_full == 2 { //区全称
  633. citys := e.NewDistrictCityMap[text]
  634. if len(citys) > 0 {
  635. if !repeatD_full[text] {
  636. PCDScore(j, "district", text, 1, true)
  637. repeatD_full[text] = true
  638. }
  639. for _, c := range citys {
  640. if !repeatC_full[c.Name] {
  641. PCDScore(j, "city", c.Name, 1, true)
  642. repeatC_full[c.Name] = true
  643. }
  644. if !repeatP_full[c.P.Brief] {
  645. PCDScore(j, "province", c.P.Brief, 1, true)
  646. repeatP_full[c.P.Brief] = true
  647. }
  648. }
  649. break
  650. }
  651. } else if pos_full == 3 { //街道全称
  652. districts := e.NewStreetDistrictMap[text]
  653. if len(districts) == 1 {
  654. DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
  655. }
  656. } else if pos_full == 4 { //居委会全称
  657. districts := e.CommunityDistrictMap[text]
  658. if len(districts) == 1 {
  659. DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
  660. }
  661. }
  662. }
  663. }
  664. //qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
  665. //简称匹配
  666. for pos_sim, trie_sim := range e.Trie_Sims {
  667. if trie_sim.Get(text) {
  668. if pos_sim == 0 && !repeatP_sim[text] { //省简称
  669. PCDScore(j, "province", text, 1, false)
  670. repeatP_sim[text] = true
  671. break
  672. } else if pos_sim == 1 { //市简称
  673. if cbMap := e.CityBriefMap[text]; cbMap != nil {
  674. if !repeatP_sim[cbMap.P.Brief] {
  675. PCDScore(j, "province", cbMap.P.Brief, 1, false)
  676. repeatP_sim[cbMap.P.Brief] = true
  677. }
  678. if !repeatC_sim[cbMap.Name] {
  679. PCDScore(j, "city", cbMap.Name, 1, false)
  680. repeatC_sim[cbMap.Name] = true
  681. }
  682. break
  683. }
  684. } else if pos_sim == 2 { //区简称
  685. dfull_citys := e.NewDistrictSimAndAll[text]
  686. if len(dfull_citys) == 1 {
  687. for _, dfull_city := range dfull_citys {
  688. for dfull, ctmp := range dfull_city { //dfull:简称对应的全称
  689. if !repeatD_sim[dfull] {
  690. PCDScore(j, "district", dfull, 1, false)
  691. repeatD_sim[dfull] = true
  692. }
  693. if ctmp == nil {
  694. continue
  695. }
  696. if !repeatC_sim[ctmp.Name] {
  697. PCDScore(j, "city", ctmp.Name, 1, false)
  698. repeatC_sim[ctmp.Name] = true
  699. }
  700. if !repeatP_sim[ctmp.P.Brief] {
  701. PCDScore(j, "province", ctmp.P.Brief, 1, false)
  702. repeatP_sim[ctmp.P.Brief] = true
  703. }
  704. }
  705. }
  706. }
  707. }
  708. }
  709. }
  710. //qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
  711. }
  712. }
  713. }
  714. //街道、居委会对应多地市处理
  715. func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
  716. if len(districts) == 1 {
  717. district := districts[0]
  718. city := district.C.Name
  719. tmpPbrief := district.C.P.Brief
  720. if pbrief != "" && tmpPbrief == pbrief {
  721. PCDScore(j, "province", tmpPbrief, score, true)
  722. PCDScore(j, "city", city, score, true)
  723. PCDScore(j, "district", district.Name, score, true)
  724. } else if pbrief == "" {
  725. if repeatP != nil && !(*repeatP)[tmpPbrief] {
  726. PCDScore(j, "province", tmpPbrief, score, true)
  727. (*repeatP)[tmpPbrief] = true
  728. } else if repeatP == nil {
  729. PCDScore(j, "province", tmpPbrief, score, true)
  730. }
  731. if repeatC != nil && !(*repeatC)[city] {
  732. PCDScore(j, "city", city, score, true)
  733. (*repeatC)[city] = true
  734. } else if repeatC == nil {
  735. PCDScore(j, "city", city, score, true)
  736. }
  737. if repeatD != nil && !(*repeatD)[tmpPbrief] {
  738. PCDScore(j, "district", district.Name, score, true)
  739. (*repeatD)[district.Name] = true
  740. } else if repeatD == nil {
  741. PCDScore(j, "district", district.Name, score, true)
  742. }
  743. }
  744. }
  745. }
  746. func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
  747. for _, c := range finishC { //取最高分与province匹配的city
  748. if cfMap := e.CityFullMap[c]; cfMap != nil {
  749. if cfMap.P.Brief == area {
  750. // city = c
  751. // break
  752. tmpcity = append(tmpcity, c)
  753. }
  754. }
  755. }
  756. if len(tmpcity) == 1 {
  757. city = tmpcity[0]
  758. }
  759. return city, tmpcity
  760. }
  761. func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
  762. for _, d := range finishD { //取最高分与province匹配的district
  763. citys := e.NewDistrictCityMap[d]
  764. for _, c := range citys {
  765. if len(tmpcity) == 0 { //没有city
  766. if c.P.Brief == area {
  767. city = c.Name
  768. district = d
  769. return city, district
  770. }
  771. } else if len(tmpcity) == 1 { //一个city
  772. if c.Name == city && c.P.Brief == area {
  773. district = d
  774. return city, district
  775. }
  776. } else { //多个city
  777. for _, tc := range tmpcity { //多个city根据district最高分取
  778. if tc == c.Name && len(finishD) == 1 {
  779. city = c.Name
  780. district = d
  781. return city, district
  782. }
  783. }
  784. }
  785. }
  786. }
  787. return city, district
  788. }
  789. //计算province,city,district区或县匹配的得分
  790. func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
  791. defer qu.Catch()
  792. if t != "" {
  793. if stype == "d" {
  794. tmpscore := (*ds)[t]
  795. (*ds)[t] = tmpscore + score
  796. } else if stype == "c" {
  797. tmpscore := (*cs)[t]
  798. (*cs)[t] = tmpscore + score
  799. } else if stype == "p" {
  800. tmpscore := (*ps)[t]
  801. (*ps)[t] = tmpscore + score
  802. }
  803. }
  804. }
  805. func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
  806. if len(j.FullAreaScore) > 0 {
  807. for pt, ps := range *pscore {
  808. j.FullAreaScore[pt] = j.FullAreaScore[pt] + ps
  809. }
  810. for ct, cs := range *cscore {
  811. j.FullCityScore[ct] = j.FullCityScore[ct] + cs
  812. }
  813. for dt, ds := range *dscore {
  814. j.FullDistrictScore[dt] = j.FullDistrictScore[dt] + ds
  815. }
  816. }
  817. }
  818. func MergeFullSimScore(j *ju.Job) {
  819. if len(j.FullAreaScore) == 0 {
  820. j.FullAreaScore = j.SimAreaScore
  821. } else {
  822. for p_text, p_score := range j.FullAreaScore {
  823. j.FullAreaScore[p_text] = j.SimAreaScore[p_text] + p_score
  824. }
  825. }
  826. for c_text, c_score := range j.SimCityScore {
  827. j.FullCityScore[c_text] = j.FullCityScore[c_text] + c_score
  828. }
  829. for d_text, d_score := range j.SimDistrictScore {
  830. j.FullDistrictScore[d_text] = j.FullDistrictScore[d_text] + d_score
  831. }
  832. // if len(j.FullCityScore) == 0 {
  833. // j.FullCityScore = j.SimCityScore
  834. // } else {
  835. // for c_text, c_score := range j.FullCityScore {
  836. // j.FullCityScore[c_text] = j.SimCityScore[c_text] + c_score
  837. // }
  838. // }
  839. // if len(j.FullDistrictScore) == 0 {
  840. // j.FullDistrictScore = j.SimDistrictScore
  841. // } else {
  842. // for d_text, d_score := range j.FullDistrictScore {
  843. // j.FullDistrictScore[d_text] = j.SimDistrictScore[d_text] + d_score
  844. // }
  845. // }
  846. }
  847. func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
  848. if len(j.FullDistrictScore) > 0 {
  849. for d, _ := range j.FullDistrictScore {
  850. tmpCitys := e.NewDistrictCityMap[d]
  851. for _, c := range tmpCitys {
  852. if j.FullCityScore[c.Name] != 0 {
  853. tmpPb := c.P.Brief
  854. //if j.FullAreaScore[tmpPb] != 0 {
  855. flag := false
  856. for _, p := range finishP {
  857. if tmpPb == p {
  858. flag = true
  859. break
  860. }
  861. }
  862. if !flag {
  863. delete(j.FullCityScore, c.Name)
  864. delete(j.FullDistrictScore, d)
  865. }
  866. //}
  867. }
  868. }
  869. }
  870. }
  871. if len(j.FullCityScore) > 0 {
  872. for tmpcity, _ := range j.FullCityScore {
  873. c := e.CityFullMap[tmpcity]
  874. if c == nil {
  875. log.Debug("行政区划错误数据:", tmpcity, j.SourceMid)
  876. continue
  877. }
  878. tmpPb := c.P.Brief
  879. //if j.FullAreaScore[tmpPb] != 0 {
  880. flag := false
  881. for _, p := range finishP {
  882. if tmpPb == p {
  883. flag = true
  884. break
  885. }
  886. }
  887. if !flag {
  888. delete(j.FullCityScore, tmpcity)
  889. }
  890. //}
  891. }
  892. }
  893. }
  894. //province,city,district干扰项减分
  895. //func PCDSubtractScore(e *ExtractTask, j *ju.Job, stype, text string, score int) {
  896. // defer qu.Catch()
  897. // if text != "" {
  898. // if stype == "city" {
  899. // for cn, cscore := range j.CityScore {
  900. // if cn != text {
  901. // j.CityScore[cn] = cscore + score
  902. // //错误的city减分后对应的province也减分
  903. // for pb, pscore := range j.AreaScore {
  904. // if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
  905. // j.AreaScore[pb] = pscore + score
  906. // }
  907. // }
  908. // }
  909. // }
  910. // } else if stype == "province" {
  911. // for pb, pscore := range j.AreaScore {
  912. // if pb != text {
  913. // j.AreaScore[pb] = pscore + score
  914. // //错误的province减分后对应的city也要减分
  915. // for cn, cscore := range j.CityScore {
  916. // if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
  917. // j.CityScore[cn] = cscore + score
  918. // }
  919. // }
  920. // }
  921. // }
  922. // }
  923. // // for name, tmpscore := range *whichMap {
  924. // // if name != text {
  925. // // (*whichMap)[name] = tmpscore + score
  926. // // }
  927. // // }
  928. // }
  929. //}