package extract

import (
	"strings"

	log "github.com/donnie4w/go-logger/logger"
	. "jy/pretreated"
	ju "jy/util"
	qu "qfw/util"
)

// NewExtractCity scores province/city/district candidates for job j from several
// sources and writes the winning values into (*resulttmp)["area"], ["city"] and
// ["district"].
//
// Sources, in the order they are consulted:
//  1. crawler jsondata
//  2. site table
//  3. buyer library (not available yet, skipped)
//  4. buyer postcode
//  5. buyer landline area code
//  6. buyeraddr, buyer, title, projectname, addressing (+ optional projectaddr,
//     bidopenaddress): full-name and short-name dictionary matching
//  7. detail text, only when a province and a city have already been scored
//
// Panics raised while extracting are handled by qu.Catch.
func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}) {
	defer qu.Catch()

	// Make sure every score table exists before anything writes to it.
	if j.FullAreaScore == nil {
		j.FullAreaScore = make(map[string]float64)
	}
	if j.FullCityScore == nil {
		j.FullCityScore = make(map[string]float64)
	}
	if j.FullDistrictScore == nil {
		j.FullDistrictScore = make(map[string]float64)
	}
	if j.SimAreaScore == nil {
		j.SimAreaScore = make(map[string]float64)
	}
	if j.SimCityScore == nil {
		j.SimCityScore = make(map[string]float64)
	}
	if j.SimDistrictScore == nil {
		j.SimDistrictScore = make(map[string]float64)
	}

	// Scores produced by district short-name matches that have no province
	// context. They are merged into the main tables only if the other matchers
	// produced a province (see MergeScores); otherwise they are discarded.
	pscore := make(map[string]float64)
	cscore := make(map[string]float64)
	dscore := make(map[string]float64)
	sm := NewSortMap()

	// 1. jsondata
	e.NewGetCityByJsonData(j)
	// 2. site table
	e.NewGetCityBySite(j)
	// 3. buyer library: not available yet.
	// 4. postcode
	buyerzipcode := qu.ObjToString((*resulttmp)["buyerzipcode"])
	e.NewGetCityByPostCode(j, buyerzipcode)
	// 5. landline area code
	buyertel := qu.ObjToString((*resulttmp)["buyertel"])
	e.NewGetCityByAreaCode(j, buyertel)

	// 6. Text fields. The insertion order matters: NewGetCityByOthers gives the
	// first two keys a larger bonus than the rest.
	buyeraddr := qu.ObjToString((*resulttmp)["buyeraddr"])
	title := qu.ObjToString((*resulttmp)["title"])
	projectname := qu.ObjToString((*resulttmp)["projectname"])
	buyer := qu.ObjToString((*resulttmp)["buyer"])
	addressing := qu.ObjToString((*resulttmp)["addressing"])
	sm.AddKey("buyeraddr", buyeraddr)
	sm.AddKey("buyer", buyer)
	sm.AddKey("title", title)
	sm.AddKey("projectname", projectname)
	sm.AddKey("addressing", addressing)
	// Additional address helper fields, only when present as strings.
	if projectaddr, isok := (*resulttmp)["projectaddr"].(string); isok {
		sm.AddKey("projectaddr", projectaddr)
	}
	if bidopenaddress, isok := (*resulttmp)["bidopenaddress"].(string); isok {
		sm.AddKey("bidopenaddress", bidopenaddress)
	}
	e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)

	// Fold short-name scores into the full-name tables, then add the deferred
	// district short-name scores.
	MergeFullSimScore(j)
	MergeScores(j, &pscore, &cscore, &dscore)
	// Start the detail pass with fresh short-name tables.
	j.SimAreaScore = map[string]float64{}
	j.SimCityScore = map[string]float64{}
	j.SimDistrictScore = map[string]float64{}

	// 7. detail: only consulted when both a province and a city were scored.
	if len(j.FullAreaScore) > 0 && len(j.FullCityScore) > 0 {
		e.NewGetCityByDetail(j)
	}
	MergeFullSimScore(j)

	// Pick the top province(s), then drop cities/districts that belong to
	// other provinces before picking the top city and district.
	finishP := HighestScoreArr(j.FullAreaScore)
	e.RemoveCD(finishP, j)
	finishC := HighestScoreArr(j.FullCityScore)
	finishD := HighestScoreArr(j.FullDistrictScore)

	arearesult := ""
	cityresult := ""
	districtresult := ""
	tmpcity := []string{}
	if len(finishP) == 1 {
		// A single best province: use it directly.
		arearesult = finishP[0]
		cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
		cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
	} else if len(finishP) > 1 {
		// Several provinces tie for the best score.
		if len(finishC) == 1 {
			// A single best city decides the province.
			cityresult = finishC[0]
			if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
				arearesult = cfMap.P.Brief
				tmpcity = append(tmpcity, cityresult)
				cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
			}
		} else {
			// Several provinces and no single city: fall back to nationwide.
			arearesult = "全国"
		}
	}
	if cityresult != "" && cityresult == districtresult {
		districtresult = ""
	}

	// Municipalities: the city is the municipality itself.
	switch arearesult {
	case "北京":
		cityresult = "北京市"
		if districtresult == "北京朝阳" {
			// Special case seen in data (record 5a84079740d2d9bbe88bad90).
			districtresult = "朝阳区"
		}
	case "天津":
		cityresult = "天津市"
	case "上海":
		cityresult = "上海市"
	case "重庆":
		cityresult = "重庆市"
	}
	if arearesult == "" {
		arearesult = "全国"
	}
	// Note: defaulting an empty city to the provincial capital is currently
	// disabled (it used to set resulttmp["defaultpcap"]).

	(*resulttmp)["area"] = arearesult
	(*resulttmp)["city"] = cityresult
	(*resulttmp)["district"] = districtresult

	// Xinjiang Production and Construction Corps: map the buyer to a location
	// when no city was found.
	if xjbtReg.MatchString(buyer) && cityresult == "" {
		a, c, d, ok := e.CheckingXjbtCity(buyer)
		if ok {
			(*resulttmp)["area"] = a
			(*resulttmp)["city"] = c
			(*resulttmp)["district"] = d
		}
	}
	// Province only: try the sensitive-word lookup on the raw detail text.
	if arearesult != "全国" && cityresult == "" {
		sensitive_city := e.SensitiveCityData(qu.ObjToString((*j.Data)["detail"]), arearesult)
		if sensitive_city != "" {
			(*resulttmp)["city"] = sensitive_city
			(*resulttmp)["is_sensitive"] = 1
		}
	}
}

// NewGetCityByJsonData scores locations found in the crawler's jsondata.
//
// It first matches the "area_city_district" string (full names, then short
// names), then validates the separate "area", "city" and "district" fields
// against the province/city dictionaries and scores the consistent ones.
//
// province, city and district are the validated values of the separate fields
// (district is returned as read); p, c and d are the results of the full-name
// match on "area_city_district".
func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
	defer qu.Catch()
	if j.Jsondata != nil {
		jsondata := *j.Jsondata
		if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
			p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) // full-name match
			GetByACDSimJb(p, c, d, a_c_d, e, j)            // short-name match
		}
		city, _ = jsondata["city"].(string)         // city full or short name
		province, _ = jsondata["area"].(string)     // province short name
		district, _ = jsondata["district"].(string) // district full name
	}
	PCDScore(j, "district", district, 5, true)

	// Is the crawler's province a known province short name?
	provinceOK := province != "" && e.ProvinceBriefMap[province] != nil

	// Resolve the city to its full name and find the province it belongs to.
	pbrief := ""
	if city != "" {
		if cityfullmap := e.CityFullMap[city]; cityfullmap != nil {
			pbrief = cityfullmap.P.Brief
		} else if citybriefmap := e.CityBriefMap[city]; citybriefmap != nil {
			city = citybriefmap.Name // replace the short name with the full name
			pbrief = citybriefmap.P.Brief
		}
	}

	if provinceOK {
		if pbrief == province {
			// Province and city agree.
			PCDScore(j, "city", city, 5, true)
		} else {
			// The city is empty, unknown, or belongs to another province.
			city = ""
		}
		PCDScore(j, "province", province, 5, true)
	} else if pbrief != "" {
		// Province missing or wrong: derive it from the city.
		province = pbrief
		PCDScore(j, "province", province, 5, true)
		PCDScore(j, "city", city, 5, true)
	} else {
		province = ""
		city = ""
	}
	return
}

// GetByACDFullJb segments a_c_d and scores every province, city and district
// FULL name it contains. pbrief, city and district carry what has been matched
// so far and are returned updated.
//
// A district name may belong to several cities: candidates in a province other
// than the one already matched are treated as noise and scored negatively.
func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
	text := e.Seg_PCD.Cut(a_c_d, true)
	repeatPb := map[string]bool{}
	for _, full := range text {
		if e.Trie_Full_Province.Get(full) {
			// Province full name.
			if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
				pbrief = tmpPbrief
				PCDScore(j, "province", pbrief, 5, true)
			}
		} else if e.Trie_Full_City.Get(full) {
			// City full name.
			if cfMap := e.CityFullMap[full]; cfMap != nil {
				tmpcity := cfMap.Name
				tmpPbrief := cfMap.P.Brief
				if pbrief != "" && pbrief == tmpPbrief {
					// Consistent with the province already matched.
					city = tmpcity
					PCDScore(j, "city", city, 5, true)
				} else if pbrief == "" {
					city = tmpcity
					pbrief = tmpPbrief
					PCDScore(j, "city", city, 5, true)
					PCDScore(j, "province", pbrief, 5, true)
				}
			}
		} else if e.Trie_Full_District.Get(full) {
			// District full name (may map to several cities).
			carr := e.DistrictCityMap[full]
			if len(carr) > 0 {
				district = full
				PCDScore(j, "district", district, 5, true)
				for _, c := range carr {
					tmpcity := c.Name
					tmpPbrief := c.P.Brief
					if pbrief == "" {
						// No province yet: score every candidate, each province once.
						PCDScore(j, "city", tmpcity, 5, true)
						if !repeatPb[tmpPbrief] {
							PCDScore(j, "province", tmpPbrief, 5, true)
							repeatPb[tmpPbrief] = true
						}
					} else if pbrief != tmpPbrief {
						// Candidate disagrees with the matched province: noise.
						PCDScore(j, "city", tmpcity, -5, true)
						PCDScore(j, "province", tmpPbrief, -5, true)
					} else if city == "" {
						// e.g. "河南省二七区": no city in the text, fill it in
						// from the district.
						PCDScore(j, "city", tmpcity, 5, true)
					}
				}
			}
		}
	}
	return pbrief, city, district
}

// GetByACDSimJb segments a_c_d and scores province, city and district SHORT
// names, but only for the levels the full-name pass left empty (pbrief, city,
// district are the full-name results). Scores are written to the full-name
// tables. Nothing is returned.
func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
	text := e.Seg_PCD.Cut(a_c_d, true)
	repeatPb := map[string]bool{}
	for _, sim := range text {
		if pbrief == "" && e.Trie_Sim_Province.Get(sim) {
			// Province short name, no province matched so far.
			if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
				pbrief = pbMap.Brief
				PCDScore(j, "province", pbrief, 5, true)
			}
		} else if city == "" && e.Trie_Sim_City.Get(sim) {
			// City short name, no city matched so far.
			if cbMap := e.CityBriefMap[sim]; cbMap != nil {
				tmpcity := cbMap.Name
				tmpPbrief := cbMap.P.Brief
				if pbrief != "" && pbrief == tmpPbrief {
					city = tmpcity
					PCDScore(j, "city", city, 5, true)
				} else if pbrief == "" {
					city = tmpcity
					pbrief = tmpPbrief
					PCDScore(j, "city", city, 5, true)
					PCDScore(j, "province", pbrief, 5, true)
				}
			}
		} else if district == "" && e.Trie_Sim_District.Get(sim) {
			// District short name, no district matched by the full-name pass.
			dfullarr := e.DistrictSimAndAll[sim]
			if len(dfullarr) > 0 {
				PCDScore(j, "district", sim, 5, true)
				for _, dfullAndCity := range dfullarr { // every full name of this short name
					for _, c := range dfullAndCity {
						// Skip incomplete dictionary entries (same guard as
						// NewGetCityByOthers) instead of panicking on c.P.
						if c == nil || c.P == nil {
							continue
						}
						tmpcity := c.Name
						tmpPbrief := c.P.Brief
						if pbrief == "" {
							// No province yet: score every candidate, each province once.
							PCDScore(j, "city", tmpcity, 5, true)
							if !repeatPb[tmpPbrief] {
								PCDScore(j, "province", tmpPbrief, 5, true)
								repeatPb[tmpPbrief] = true
							}
						} else if pbrief != tmpPbrief {
							// Candidate disagrees with the matched province: noise.
							PCDScore(j, "city", tmpcity, -5, true)
							PCDScore(j, "province", tmpPbrief, -5, true)
						} else if city == "" {
							// e.g. "河南省二七区": fill the city in from the district.
							PCDScore(j, "city", tmpcity, 5, true)
						}
					}
				}
			}
		}
	}
}

// NewGetCityBySite scores the location registered for the job's "site" in
// e.SiteCityMap. Empty values, "null", and the nationwide province "全国" are
// ignored.
func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
	site, _ := (*j.Data)["site"].(string)
	scMap := e.SiteCityMap[site]
	if scMap == nil {
		return
	}
	if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
		PCDScore(j, "province", scMap.P, 5, true)
	}
	if scMap.C != "" && scMap.C != "null" {
		PCDScore(j, "city", scMap.C, 5, true)
	}
	if scMap.D != "" && scMap.D != "null" {
		PCDScore(j, "district", scMap.D, 5, true)
	}
}

// NewGetCityByPostCode scores the location registered for postcode in
// e.PostCodeMap and returns its province and city.
//
// A postcode may cover several districts: each is scored 3, or 5 when the
// postcode maps to exactly one non-empty district. The district result is
// always returned empty; districts are only reported through the scores.
func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
	defer qu.Catch()
	pc := e.PostCodeMap[postcode]
	if pc == nil {
		return
	}
	province = pc.P
	city = pc.C
	districts := pc.D
	score := 3.0
	if len(districts) == 1 && districts[0] != "" {
		score = 5.0
	}
	for _, d := range districts {
		PCDScore(j, "district", d, score, true)
	}
	PCDScore(j, "province", province, 5, true)
	PCDScore(j, "city", city, 5, true)
	return
}

// NewGetCityByAreaCode scores the location registered for the landline area
// code at the start of buyertel and returns its province and city (district is
// always empty).
//
// Only numbers of at least 11 bytes starting with "0" are considered. The
// 4-character prefix is looked up first, then the 3-character prefix. Area
// codes that map to several cities only score the province.
func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
	defer qu.Catch()
	// Area codes start with "0" (Macau's 853 is not handled).
	if len(buyertel) < 11 || !strings.HasPrefix(buyertel, "0") {
		return
	}
	for n := 4; n >= 3; n-- {
		areacode := buyertel[:n]
		ac := e.AreaCodeMap[areacode]
		if ac == nil {
			continue // try the shorter prefix
		}
		province = ac.P
		citytmp := ac.C
		if len(citytmp) == 1 { // ambiguous area codes do not score a city
			city = citytmp[0]
			score := float64(5)
			if areacode == "0371" {
				// NOTE(review): 0371 is deliberately scored lower; the reason
				// is not visible in this file.
				score = float64(4)
			}
			PCDScore(j, "city", city, score, true)
		}
		PCDScore(j, "province", province, 5, true)
		break
	}
	return
}

// NewGetCityByOthers segments each text field in sm (in key order) and scores
// the province/city/district names found in it.
//
// For every token of more than one rune:
//   - full-name dictionaries (e.Trie_Fulls: 0 province, 1 city, 2 district,
//     3 street, 4 community) score into the Full* tables;
//   - short-name dictionaries (e.Trie_Sims: 0 province, 1 city, 2 district)
//     score into the Sim* tables.
//
// District short names found before any province short name are scored into
// pscore/cscore/dscore instead, so the caller can decide whether to use them.
//
// The first two fields get a bonus of 0.5 on every score, later fields 0.2.
func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
	ts := 0.5
	for i, from := range sm.Keys {
		if i > 1 {
			ts = 0.2
		}
		// Matches are tracked per field.
		p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", ""
		str, _ := sm.Map[from].(string)
		jbText := e.Seg_SV.Cut(str, true)
		for jb_index, text := range jbText {
			if len([]rune(text)) == 1 {
				continue
			}

			// Full-name matching. "break" leaves the dictionary loop once a
			// token has been attributed to a level.
			for pos_full, trie_full := range e.Trie_Fulls {
				if trie_full.Get(text) {
					if pos_full == 0 && p_full == "" {
						// Province full name -> short name.
						if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" {
							p_full = tmpPbrief
							PCDScore(j, "province", p_full, 4+ts, true)
							break
						}
					} else if pos_full == 1 && c_full == "" {
						// City full name.
						if cfMap := e.CityFullMap[text]; cfMap != nil {
							tmpPbrief := cfMap.P.Brief
							if p_full == "" {
								p_full = tmpPbrief
								c_full = cfMap.Name
								PCDScore(j, "province", p_full, 4+ts, true)
								PCDScore(j, "city", c_full, 4+ts, true)
								break
							} else if p_full == tmpPbrief {
								c_full = cfMap.Name
								PCDScore(j, "province", tmpPbrief, 4+ts, true)
								PCDScore(j, "city", c_full, 4+ts, true)
								break
							}
							// A city from a different province is ignored.
						}
					} else if pos_full == 2 && d_full == "" {
						// District full name (may map to several cities).
						repeatPb := map[string]bool{}
						isOk := false
						districtOk := false
						citys := e.DistrictCityMap[text]
						for _, c := range citys {
							tmpPbrief := c.P.Brief
							if p_full == tmpPbrief {
								// Same province as already matched.
								d_full = text
								if c_full == "" {
									c_full = c.Name
									PCDScore(j, "city", c_full, 4+ts, true)
									PCDScore(j, "province", tmpPbrief, 4+ts, true)
								}
								isOk = true
								districtOk = true
							} else if p_full == "" {
								// No province matched yet.
								districtOk = true
								if len(citys) == 1 {
									// Unambiguous district: adopt its city and province.
									p_full = tmpPbrief
									c_full = c.Name
									d_full = text
									PCDScore(j, "province", p_full, 4+ts, true)
									PCDScore(j, "city", c_full, 4+ts, true)
									isOk = true
								} else {
									// Ambiguous district: score only, adopt nothing.
									if !repeatPb[tmpPbrief] {
										PCDScore(j, "province", tmpPbrief, 2+ts, true)
										repeatPb[tmpPbrief] = true
									}
									PCDScore(j, "city", c.Name, 2+ts, true)
								}
							} else if p_full != "" && p_full != tmpPbrief {
								// Different province: noise, penalise.
								if !repeatPb[tmpPbrief] {
									PCDScore(j, "province", tmpPbrief, -5, true)
									repeatPb[tmpPbrief] = true
								}
								PCDScore(j, "city", c.Name, -5, true)
							}
						}
						if districtOk {
							PCDScore(j, "district", text, 4+ts, true)
						} else {
							PCDScore(j, "district", text, -5, true)
						}
						if isOk {
							break
						}
					} else if pos_full == 3 {
						// Street full name: only used when it identifies one district.
						districts := e.StreetDistrictMap[text]
						if len(districts) == 1 {
							DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
						}
					} else if pos_full == 4 {
						// Community full name: matching is currently disabled.
					}
				}
			}

			// Short-name matching.
			for pos_sim, trie_sim := range e.Trie_Sims {
				if trie_sim.Get(text) {
					if pos_sim == 0 && p_sim == "" {
						// Province short name.
						p_sim = text
						PCDScore(j, "province", p_sim, 3+ts, false)
						break
					} else if pos_sim == 1 {
						// City short name.
						if cbMap := e.CityBriefMap[text]; cbMap != nil {
							tmpPbrief := cbMap.P.Brief
							if p_sim == "" {
								score := 2.0 + ts
								if tmpPbrief == p_full {
									// Agrees with the full-name province.
									score += 1.0
								}
								p_sim = tmpPbrief
								c_sim = cbMap.Brief
								PCDScore(j, "province", p_sim, score, false)
								PCDScore(j, "city", cbMap.Name, score, false)
								break
							} else if p_sim == tmpPbrief {
								c_sim = cbMap.Brief
								PCDScore(j, "city", cbMap.Name, 3+ts, false)
								PCDScore(j, "province", tmpPbrief, 3+ts, false)
								break
							} else if p_sim != "" && p_sim != tmpPbrief {
								// City from another province than the matched short
								// name, e.g. "上海宝冶集团有限公司南京分公司",
								// "北京朝阳中西医结合急诊抢救中心": drop the
								// earlier province short-name score.
								delete(j.SimAreaScore, p_sim)
								c_sim = text
								// p_sim = tmpPbrief
								// PCDScore(j, "province", tmpPbrief, 3+ts, false)
								PCDScore(j, "city", cbMap.Name, 3+ts, false)
							}
						}
					} else if pos_sim == 2 && d_sim == "" {
						// District short name.
						repeatPb := map[string]bool{}
						repeatDb := map[string]bool{}
						dfull_citys := e.DistrictSimAndAll[text]
						for _, dfull_city := range dfull_citys {
							for dfull, c := range dfull_city { // dfull: full name of the short name
								if c == nil || c.P == nil {
									continue
								}
								tmpPbrief := c.P.Brief
								if p_sim == tmpPbrief {
									// Same province as the matched short name.
									d_sim = text
									PCDScore(j, "district", dfull, 2+ts, false)
									if c_sim == "" {
										c_sim = c.Brief
										PCDScore(j, "city", c.Name, 2+ts, false)
									}
									PCDScore(j, "province", tmpPbrief, 2+ts, false)
								} else if p_sim == "" {
									// No province short name yet: keep the scores aside.
									if !repeatDb[dfull] {
										PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
										repeatDb[dfull] = true
									}
									if len(dfull_citys) == 1 {
										PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
										PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
									} else {
										if !repeatPb[tmpPbrief] {
											PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
											repeatPb[tmpPbrief] = true
										}
										PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
									}
									// Special case, e.g. "津市高新区管委会": the
									// first token is an unambiguous district short
									// name and nothing else matched so far. Register
									// the candidates with a zero score so they exist
									// in the Sim tables.
									if jb_index == 0 && len(dfull_citys) == 1 && len(j.FullAreaScore) == 0 && len(j.SimAreaScore) == 0 {
										PCDScore(j, "district", dfull, 0, false)
										PCDScore(j, "city", c.Name, 0, false)
										PCDScore(j, "province", tmpPbrief, 0, false)
									}
								} else if p_sim != "" && p_sim != tmpPbrief {
									// Different province: keep a small score aside.
									if !repeatPb[tmpPbrief] {
										PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
										repeatPb[tmpPbrief] = true
									}
									PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
									PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
								}
							}
						}
					}
				}
			}
		}
	}
}

// NewGetCityByDetail scores province/city/district names found in j.Content.
//
// Contents longer than 600 runes are reduced to their first and last 300
// runes, and text matched by AgencyReg is removed before segmentation. Every
// distinct name scores 1 at most once per table (full-name and short-name
// tables are tracked separately).
func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
	repeatP_full := map[string]bool{}
	repeatC_full := map[string]bool{}
	repeatD_full := map[string]bool{}
	repeatP_sim := map[string]bool{}
	repeatC_sim := map[string]bool{}
	repeatD_sim := map[string]bool{}

	detail := j.Content
	if detailRune := []rune(j.Content); len(detailRune) > 600 {
		detail = string(detailRune[:300]) + string(detailRune[len(detailRune)-300:])
	}
	for _, reg := range AgencyReg {
		detail = reg.ReplaceAllString(detail, "")
	}

	for _, text := range e.Seg_SV.Cut(detail, true) {
		if len([]rune(text)) <= 1 {
			continue
		}

		// Full-name matching.
		for pos_full, trie_full := range e.Trie_Fulls {
			if trie_full.Get(text) {
				if pos_full == 0 {
					// Province full name -> short name.
					if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] {
						PCDScore(j, "province", tmpPbrief, 1, true)
						repeatP_full[tmpPbrief] = true
						break
					}
				} else if pos_full == 1 {
					// City full name.
					if cfMap := e.CityFullMap[text]; cfMap != nil {
						if !repeatP_full[cfMap.P.Brief] {
							PCDScore(j, "province", cfMap.P.Brief, 1, true)
							repeatP_full[cfMap.P.Brief] = true
						}
						if !repeatC_full[cfMap.Name] {
							PCDScore(j, "city", cfMap.Name, 1, true)
							repeatC_full[cfMap.Name] = true
						}
						break
					}
				} else if pos_full == 2 {
					// District full name.
					citys := e.DistrictCityMap[text]
					if len(citys) > 0 {
						if !repeatD_full[text] {
							PCDScore(j, "district", text, 1, true)
							repeatD_full[text] = true
						}
						for _, c := range citys {
							if !repeatC_full[c.Name] {
								PCDScore(j, "city", c.Name, 1, true)
								repeatC_full[c.Name] = true
							}
							if !repeatP_full[c.P.Brief] {
								PCDScore(j, "province", c.P.Brief, 1, true)
								repeatP_full[c.P.Brief] = true
							}
						}
						break
					}
				} else if pos_full == 3 {
					// Street full name: only used when it identifies one district.
					districts := e.StreetDistrictMap[text]
					if len(districts) == 1 {
						DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
					}
				} else if pos_full == 4 {
					// Community full name: matching is currently disabled.
				}
			}
		}

		// Short-name matching.
		for pos_sim, trie_sim := range e.Trie_Sims {
			if trie_sim.Get(text) {
				if pos_sim == 0 && !repeatP_sim[text] {
					// Province short name.
					PCDScore(j, "province", text, 1, false)
					repeatP_sim[text] = true
					break
				} else if pos_sim == 1 {
					// City short name.
					if cbMap := e.CityBriefMap[text]; cbMap != nil {
						if !repeatP_sim[cbMap.P.Brief] {
							PCDScore(j, "province", cbMap.P.Brief, 1, false)
							repeatP_sim[cbMap.P.Brief] = true
						}
						if !repeatC_sim[cbMap.Name] {
							PCDScore(j, "city", cbMap.Name, 1, false)
							repeatC_sim[cbMap.Name] = true
						}
						break
					}
				} else if pos_sim == 2 {
					// District short name: only when it has a single full name.
					dfull_citys := e.DistrictSimAndAll[text]
					if len(dfull_citys) == 1 {
						for _, dfull_city := range dfull_citys {
							for dfull, ctmp := range dfull_city { // dfull: full name of the short name
								if !repeatD_sim[dfull] {
									PCDScore(j, "district", dfull, 1, false)
									repeatD_sim[dfull] = true
								}
								// Skip incomplete dictionary entries (same guard
								// as NewGetCityByOthers) instead of panicking.
								if ctmp == nil || ctmp.P == nil {
									continue
								}
								if !repeatC_sim[ctmp.Name] {
									PCDScore(j, "city", ctmp.Name, 1, false)
									repeatC_sim[ctmp.Name] = true
								}
								if !repeatP_sim[ctmp.P.Brief] {
									PCDScore(j, "province", ctmp.P.Brief, 1, false)
									repeatP_sim[ctmp.P.Brief] = true
								}
							}
						}
					}
				}
			}
		}
	}
}

// DealMultipleDistrict scores the province, city and district identified by a
// street (or community) name. It does nothing unless districts holds exactly
// one entry.
//
// pbrief is the province already matched for the current text, or "":
//   - pbrief non-empty: the district is scored only if it lies in that province;
//   - pbrief empty: the district is scored; when repeatP/repeatC/repeatD are
//     non-nil, each name is scored at most once and recorded in the
//     corresponding set.
//
// e is unused and kept for interface compatibility.
func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
	if len(districts) != 1 {
		return
	}
	district := districts[0]
	city := district.C.Name
	tmpPbrief := district.C.P.Brief

	if pbrief != "" {
		if tmpPbrief == pbrief {
			PCDScore(j, "province", tmpPbrief, score, true)
			PCDScore(j, "city", city, score, true)
			PCDScore(j, "district", district.Name, score, true)
		}
		return
	}

	// scoreOnce scores text, skipping names already recorded in seen (if any).
	scoreOnce := func(stype, text string, seen *map[string]bool) {
		if seen == nil {
			PCDScore(j, stype, text, score, true)
			return
		}
		if !(*seen)[text] {
			PCDScore(j, stype, text, score, true)
			(*seen)[text] = true
		}
	}
	scoreOnce("province", tmpPbrief, repeatP)
	scoreOnce("city", city, repeatC)
	// The district set is keyed by district name. It used to be probed with
	// the province short name, so the district was re-scored on every call.
	scoreOnce("district", district.Name, repeatD)
}

// NewGetCity appends to tmpcity every top-scored city in finishC that belongs
// to province area (short name) and returns the extended slice. The returned
// city is the single match when exactly one city is in tmpcity, otherwise the
// city argument unchanged.
func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
	for _, c := range finishC {
		cfMap := e.CityFullMap[c]
		if cfMap != nil && cfMap.P.Brief == area {
			tmpcity = append(tmpcity, c)
		}
	}
	if len(tmpcity) == 1 {
		city = tmpcity[0]
	}
	return city, tmpcity
}

// NewGetDistrict picks a district among the top-scored districts in finishD
// and returns (city, district).
//
// Depending on how many candidate cities tmpcity holds:
//   - none: the first district whose city lies in province area also
//     determines the city;
//   - one: the first district that belongs to that city and province;
//   - several: a district decides between the cities only when it is the
//     single top-scored district.
//
// When nothing matches, city and district are returned unchanged.
func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
	for _, d := range finishD {
		for _, c := range e.DistrictCityMap[d] {
			if len(tmpcity) == 0 {
				if c.P.Brief == area {
					return c.Name, d
				}
			} else if len(tmpcity) == 1 {
				if c.Name == city && c.P.Brief == area {
					return city, d
				}
			} else {
				for _, tc := range tmpcity {
					if tc == c.Name && len(finishD) == 1 {
						return c.Name, d
					}
				}
			}
		}
	}
	return city, district
}

// PCDScoreByDistrictSim adds score to entry t of the side table selected by
// stype: "p" -> ps, "c" -> cs, "d" -> ds. Empty t and unknown stype are
// ignored.
func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
	defer qu.Catch()
	if t == "" {
		return
	}
	switch stype {
	case "d":
		(*ds)[t] += score
	case "c":
		(*cs)[t] += score
	case "p":
		(*ps)[t] += score
	}
}

// MergeScores adds the side tables filled by district short-name matching to
// the job's full-name tables. Nothing is merged when no province has been
// scored at all.
func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
	if len(j.FullAreaScore) == 0 {
		return
	}
	for pt, ps := range *pscore {
		j.FullAreaScore[pt] += ps
	}
	for ct, cs := range *cscore {
		j.FullCityScore[ct] += cs
	}
	for dt, ds := range *dscore {
		j.FullDistrictScore[dt] += ds
	}
}

// MergeFullSimScore folds the short-name tables into the full-name tables.
//
// Provinces: when no full-name province was scored, the short-name table is
// used as is (the two fields then share one map); otherwise only provinces
// already present in the full-name table receive their short-name score.
// Cities and districts: every short-name score is added.
func MergeFullSimScore(j *ju.Job) {
	if len(j.FullAreaScore) == 0 {
		j.FullAreaScore = j.SimAreaScore
	} else {
		for p_text := range j.FullAreaScore {
			j.FullAreaScore[p_text] += j.SimAreaScore[p_text]
		}
	}
	for c_text, c_score := range j.SimCityScore {
		j.FullCityScore[c_text] += c_score
	}
	for d_text, d_score := range j.SimDistrictScore {
		j.FullDistrictScore[d_text] += d_score
	}
}

// RemoveCD removes from the job's city and district tables the entries that
// belong to a province not listed in finishP (the top-scored provinces).
//
// A district is removed together with its city when that city has a non-zero
// score and lies outside finishP. Cities unknown to e.CityFullMap are logged
// and left in place.
func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
	// isFinishP reports whether pb is one of the top-scored provinces.
	isFinishP := func(pb string) bool {
		for _, p := range finishP {
			if pb == p {
				return true
			}
		}
		return false
	}

	for d := range j.FullDistrictScore {
		for _, c := range e.DistrictCityMap[d] {
			if j.FullCityScore[c.Name] != 0 && !isFinishP(c.P.Brief) {
				delete(j.FullCityScore, c.Name)
				delete(j.FullDistrictScore, d)
			}
		}
	}
	for tmpcity := range j.FullCityScore {
		c := e.CityFullMap[tmpcity]
		if c == nil {
			log.Debug("行政区划错误数据:", tmpcity, j.SourceMid)
			continue
		}
		if !isFinishP(c.P.Brief) {
			delete(j.FullCityScore, tmpcity)
		}
	}
}

// HighestScoreArr returns the non-empty keys of m that share the highest
// score. Scores below zero never qualify, so the result is nil when m is empty
// or holds only negative scores. The order of the result is unspecified.
func HighestScoreArr(m map[string]float64) []string {
	best := 0.0
	var top []string
	for str, score := range m {
		if str == "" || score < best {
			continue
		}
		if score > best {
			// A new maximum invalidates everything collected so far.
			best = score
			top = nil
		}
		top = append(top, str)
	}
	return top
}

// PCDScore adds score to entry text of one of the job's score tables.
//
// stype selects the level ("province", "city" or "district"); isfull selects
// the full-name table (true) or the short-name table (false). Empty text and
// unknown stype are ignored. Panics (e.g. an uninitialised table) are handled
// by qu.Catch.
func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
	defer qu.Catch()
	if text == "" {
		return
	}
	var scores map[string]float64
	switch stype {
	case "district":
		if isfull {
			scores = j.FullDistrictScore
		} else {
			scores = j.SimDistrictScore
		}
	case "city":
		if isfull {
			scores = j.FullCityScore
		} else {
			scores = j.SimCityScore
		}
	case "province":
		if isfull {
			scores = j.FullAreaScore
		} else {
			scores = j.SimAreaScore
		}
	default:
		return
	}
	scores[text] += score
}