package extract

import (
	"sort"
	"strings"

	. "jy/pretreated"
	ju "jy/util"
	qu "qfw/util"
)

// Province describes a province.
type Province struct {
	Name    string // full name
	Brief   string // short name
	Cap     string // used by ExtractCity as the default city when only the province is known
	Captial *City
}

// City describes a city and the province it belongs to.
type City struct {
	Name  string
	Brief string
	P     *Province
}

// District describes a district or county and the city it belongs to.
type District struct {
	Name string
	C    *City
}

// Street describes a street (sub-district) and the district it belongs to.
type Street struct {
	Name string
	D    *District
}

// Community describes a village, community or neighbourhood committee.
type Community struct {
	Name string
	S    *Street
}

// DistrictSimFull maps a district/county short name to its full name and city.
type DistrictSimFull struct {
	SimName  string
	FullName string
	C        *City
}

// PostCode is a postal-code record: province, city and candidate districts.
type PostCode struct {
	Code string
	P    string
	C    string
	D    []string
}

// AreaCode is a telephone area-code record: province and candidate cities.
type AreaCode struct {
	Code string
	P    string
	C    []string
}

// ExtractCity extracts province/city/district for a job and stores the final
// answer in resulttmp under the keys "area", "city" and "district"
// ("defaultpcap" is additionally set to true when the city falls back to the
// province's Cap value).
//
// Sources, each of which adds to j.AreaScore / j.CityScore / j.DistrictScore:
//
//	High accuracy:
//	  1. crawler jsondata
//	  2. buyer (purchasing unit) library
//	  3. postal code
//	  4. landline area code
//	  5. site (todo)
//	Low accuracy (short-name library is skipped when the full-name library matched):
//	  1. full-name library (buyeraddr; title, projectname)
//	  2. short-name library (buyeraddr; title, projectname)
//
// The id parameter is currently unused.
func (e *ExtractTask) ExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
	defer qu.Catch()

	// Initialise the score tables.
	if j.AreaScore == nil {
		j.AreaScore = make(map[string]int)
	}
	if j.CityScore == nil {
		j.CityScore = make(map[string]int)
	}
	if j.DistrictScore == nil {
		j.DistrictScore = make(map[string]int)
	}
	sm := NewSortMap()

	// High-precision extraction. Per-step results are recorded as
	// [jsondata, buyer, postcode, areacode, a_c_d].
	area1 := make([]map[string]string, 5)
	city1 := make([]map[string]string, 5)
	district1 := make([]map[string]string, 5)

	// jsondata
	p0, c0, d0, p, c, d := e.GetCityByJsonData(j)
	area1[0] = map[string]string{"jsondata": p0}
	city1[0] = map[string]string{"jsondata": c0}
	district1[0] = map[string]string{"jsondata": d0}
	area1[4] = map[string]string{"a_c_d": p}
	city1[4] = map[string]string{"a_c_d": c}
	district1[4] = map[string]string{"a_c_d": d}
	//qu.Debug("=====jsondata score---", j.AreaScore, j.CityScore, j.DistrictScore)

	// buyer library
	buyer, _ := resulttmp["buyer"].(string)
	p1, c1, d1 := e.GetCityByBuyer(j, buyer)
	//qu.Debug("buyer p--", p1, "c--", c1, "d--", d1)
	area1[1] = map[string]string{"buyer": p1}
	city1[1] = map[string]string{"buyer": c1}
	district1[1] = map[string]string{"buyer": d1}
	//qu.Debug("=====buyer library score---", j.AreaScore, j.CityScore, j.DistrictScore)

	// postal code
	buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
	p2, c2, d2 := e.GetCityByPostCode(j, buyerzipcode)
	//qu.Debug("postcode p--", p2, "c--", c2, "d--", d2)
	area1[2] = map[string]string{"postcode": p2}
	city1[2] = map[string]string{"postcode": c2}
	district1[2] = map[string]string{"postcode": d2}
	//qu.Debug("=====postcode score---", j.AreaScore, j.CityScore, j.DistrictScore)

	// landline area code
	buyertel, _ := resulttmp["buyertel"].(string)
	p3, c3, d3 := e.GetCityByAreaCode(j, buyertel)
	//qu.Debug("areacode p--", p3, "c--", c3, "d--", d3, buyertel)
	area1[3] = map[string]string{"areacode": p3}
	city1[3] = map[string]string{"areacode": c3}
	district1[3] = map[string]string{"areacode": d3}
	//qu.Debug("=====areacode score---", j.AreaScore, j.CityScore, j.DistrictScore)

	// Only consumed by the commented-out debug output below.
	HighPreCity := make(map[string]interface{})
	HighPreCity["area"] = area1
	HighPreCity["city"] = city1
	HighPreCity["district"] = district1

	// Low-precision extraction from buyeraddr, buyer, title, projectname.
	buyeraddr, _ := resulttmp["buyeraddr"].(string)
	title, _ := resulttmp["title"].(string)
	projectname, _ := resulttmp["projectname"].(string)
	//qu.Debug(buyeraddr, "--", buyer, "--", title, "--", projectname)
	sm.AddKey("buyeraddr", buyeraddr)
	sm.AddKey("buyer", buyer)
	sm.AddKey("title", title)
	sm.AddKey("projectname", projectname)
	area2, city2, district2 := e.GetCityByOthers(j, sm)

	// Only consumed by the commented-out debug output below.
	LowPreCity := make(map[string]interface{})
	LowPreCity["area"] = area2
	LowPreCity["city"] = city2
	LowPreCity["district"] = district2
	// resulttmp["highprecity"] = HighPreCity
	// resulttmp["lowprecity"] = LowPreCity
	//qu.Debug("final score---", j.AreaScore, j.CityScore, j.DistrictScore)

	// Final result: candidates sharing the highest score in each table.
	finishP := HighestScoreArr(j.AreaScore)
	finishC := HighestScoreArr(j.CityScore)
	finishD := HighestScoreArr(j.DistrictScore)
	// area, _ := resulttmp["area"].(string)
	// city, _ := resulttmp["city"].(string)
	// district, _ := resulttmp["district"].(string)
	// qu.Debug("previous result===", area, city, district)

	arearesult := ""
	cityresult := ""
	districtresult := ""
	switch {
	case len(finishP) == 1:
		// Exactly one top-scoring province: take it directly.
		arearesult = finishP[0]
		cityresult = GetCity(arearesult, cityresult, e, finishC)
		cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
	case len(finishP) > 1:
		// Several provinces share the top score.
		if len(finishC) == 1 {
			// A unique top city decides the province.
			cityresult = finishC[0]
			if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
				arearesult = cfMap.P.Brief
				cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
			}
		} else {
			// Several provinces and not exactly one city: take the first province.
			arearesult = finishP[0]
			cityresult = GetCity(arearesult, cityresult, e, finishC)
			cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
		}
	}
	//qu.Debug("result===", arearesult, "--", cityresult, "--", districtresult)

	if arearesult == "" {
		arearesult = "全国"
	} else if cityresult == "" {
		// Province known but no city: fall back to the province's Cap value.
		if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
			cityresult = pbMap.Cap
			resulttmp["defaultpcap"] = true
		}
	}
	//qu.Debug("result===", arearesult, "--", cityresult, "--", districtresult)
	resulttmp["area"] = arearesult
	resulttmp["city"] = cityresult
	resulttmp["district"] = districtresult
}

// GetCityByJsonData reads location hints from the crawler's jsondata and
// scores them on j.
//
// province, city and district come from the "area", "city" and "district"
// keys (after validation against the province/city maps); p, c and d come from
// parsing the "area_city_district" key. A nil j.Jsondata yields empty results.
func (e *ExtractTask) GetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
	defer qu.Catch()
	if j.Jsondata != nil {
		if jsondata := *j.Jsondata; jsondata != nil {
			// Parse province/city/district out of area_city_district.
			if acd, ok := jsondata["area_city_district"].(string); ok && acd != "" {
				flag := false
				p, flag = GetPCDByAreaDFA(p, acd, e, j, flag)
				if !flag {
					p, c, flag = GetPCDByCityDFA(p, c, acd, e, j, flag)
				}
				if !flag {
					// Results are (province, city, district), in that order.
					p, c, d = GetPCDByDistrictDFA(p, c, d, acd, e, j)
				}
			}
			city, _ = jsondata["city"].(string)         // city full or short name
			province, _ = jsondata["area"].(string)     // province short name
			district, _ = jsondata["district"].(string) // district full name
		}
	}
	PCDScore(j, "district", district, 5) // score the district

	// Is the crawler's province a known province short name? ("全国" is not.)
	bp := province != "" && e.ProvinceBriefMap[province] != nil

	// Resolve the city to its full name and owning province short name.
	pbrief := ""
	if city != "" {
		if cityfullmap := e.CityFullMap[city]; cityfullmap != nil {
			pbrief = cityfullmap.P.Brief
		} else if citybriefmap := e.CityBriefMap[city]; citybriefmap != nil {
			city = citybriefmap.Name // replace the short name with the full name
			pbrief = citybriefmap.P.Brief
		}
	}

	if bp {
		if pbrief == province {
			// Crawler province and city agree.
			PCDScore(j, "city", city, 5)
		} else {
			// City is empty or does not belong to the province.
			city = ""
		}
		PCDScore(j, "province", province, 5)
	} else if pbrief != "" {
		// Province wrong or empty: derive it from the city.
		province = pbrief
		PCDScore(j, "province", province, 5)
		PCDScore(j, "city", city, 5)
	} else {
		province = ""
		city = ""
	}
	return
}

// GetCityByBuyer is a placeholder for the buyer-library lookup; it currently
// returns empty strings and scores nothing.
func (e *ExtractTask) GetCityByBuyer(j *ju.Job, buyer string) (province, city, district string) {
	defer qu.Catch()
	return
}

// GetCityByPostCode looks postcode up in e.PostCodeMap and, on a hit, scores
// province and city (and the district when the record lists exactly one).
func (e *ExtractTask) GetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
	defer qu.Catch()
	pc := e.PostCodeMap[postcode]
	if pc != nil {
		province = pc.P
		city = pc.C
		if districtTmp := pc.D; len(districtTmp) == 1 { // ambiguous districts are dropped
			district = districtTmp[0]
			PCDScore(j, "district", district, 5)
		}
		PCDScore(j, "province", province, 5)
		PCDScore(j, "city", city, 5)
	}
	return
}

// GetCityByAreaCode derives province/city from the area code at the start of
// buyertel. Numbers shorter than 11 bytes are ignored. For numbers starting
// with "0" a 4-character prefix is tried first, then a 3-character one; the
// prefix "853" is mapped to 澳门 directly. district is always returned empty.
func (e *ExtractTask) GetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
	defer qu.Catch()
	if len(buyertel) < 11 {
		return
	}
	if strings.HasPrefix(buyertel, "0") { // every area code except 853 starts with 0
		for n := 4; n >= 3; n-- {
			ac := e.AreaCodeMap[buyertel[:n]]
			if ac == nil {
				continue
			}
			province = ac.P
			if citytmp := ac.C; len(citytmp) == 1 { // ambiguous cities are dropped
				city = citytmp[0]
				PCDScore(j, "city", city, 5)
			}
			PCDScore(j, "province", province, 5)
			break
		}
	} else if strings.HasPrefix(buyertel, "853") {
		province = "澳门"
		city = "澳门"
		PCDScore(j, "province", province, 5)
		PCDScore(j, "city", city, 5)
	}
	return
}

// GetCityByOthers runs the low-precision extraction over every key in sm
// (buyeraddr, buyer, title, projectname as filled in by ExtractCity).
//
// Full names are matched first; if no field yields a mutually consistent
// province+city pair, short names are matched as well. It returns the per-field
// best province, city and district (keyed "<field>_all" / "<field>_sim") and
// adds the corresponding scores to j.
func (e *ExtractTask) GetCityByOthers(j *ju.Job, sm *SortMap) ([]map[string]string, []map[string]string, []map[string]string) {
	// Per-step extraction results.
	area2 := []map[string]string{}
	city2 := []map[string]string{}
	district2 := []map[string]string{}
	isExtPC := false

	for _, from := range sm.Keys {
		str, _ := sm.Map[from].(string)
		// Full-name match tallies for this field.
		pscore1 := make(map[string]int)
		cscore1 := make(map[string]int)
		dscore1 := make(map[string]int)
		// Match province, city, district and street full names, in that order.
		for pos, GET := range []*ju.DFA{e.ProvinceAllGet, e.CityAllGet, e.DistrictAllGet, e.StreetGet} {
			word := GET.CheckSensitiveWord(str)
			if word == "" {
				continue
			}
			switch pos {
			case 0: // province
				pbrief := e.ProvinceMap[word] // province short name
				OtherScore("p", []string{pbrief}, &pscore1, &cscore1, &dscore1)
			case 1: // city
				p := ""
				if cityfullmap := e.CityFullMap[word]; cityfullmap != nil {
					p = cityfullmap.P.Brief
				}
				OtherScore("c", []string{p, word}, &pscore1, &cscore1, &dscore1)
			case 2: // district
				p, c := "", ""
				if dcitymap := e.DistrictCityMap[word]; dcitymap != nil {
					c = dcitymap.Name    // city full name
					p = dcitymap.P.Brief // province short name
				}
				tmpArr := []string{p, c, word}
				if word == c { // district name equals city name, e.g. 济源市 in 河南
					tmpArr = []string{p, c}
				}
				OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
			case 3: // street
				p, c, d := "", "", ""
				if sdmap := e.StreetDistrictMap[word]; sdmap != nil {
					d = sdmap.Name
					c = sdmap.C.Name
					p = sdmap.C.P.Brief
				}
				tmpArr := []string{p, c, d}
				if c == d { // district name equals city name, e.g. 济源市 in 河南
					tmpArr = []string{p, c}
				}
				OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
			}
		}
		// Best province, city and district for this field.
		ph1 := HighestScore(pscore1)
		ch1 := HighestScore(cscore1)
		dh1 := HighestScore(dscore1)
		// The best province and best city may not belong together.
		isMatch := IsMatch(ph1, ch1, e)
		if ch1 != "" && ph1 != "" && isMatch {
			isExtPC = true
		}
		area2 = append(area2, map[string]string{from + "_all": ph1})
		city2 = append(city2, map[string]string{from + "_all": ch1})
		district2 = append(district2, map[string]string{from + "_all": dh1})
		// Full-name matches: 3 points from buyeraddr/buyer, 2 from other fields.
		if isMatch {
			weight := 2
			if from == "buyeraddr" || from == "buyer" {
				weight = 3
			}
			PCDScore(j, "province", ph1, weight)
			PCDScore(j, "city", ch1, weight)
			PCDScore(j, "district", dh1, weight)
		}
	}

	// Fall back to short names when full names gave no province+city pair.
	if !isExtPC {
		for _, from := range sm.Keys {
			str, _ := sm.Map[from].(string)
			pscore2 := make(map[string]int)
			cscore2 := make(map[string]int)
			dscore2 := make(map[string]int)
			for pos, GET := range []*ju.DFA{e.ProvinceSimGet, e.CitySimGet, e.DistrictSimGet} {
				word := GET.CheckSensitiveWord(str)
				if word == "" {
					continue
				}
				switch pos {
				case 0: // province
					OtherScore("p", []string{word}, &pscore2, &cscore2, &dscore2)
				case 1: // city
					p, c := "", ""
					if citybriefmap := e.CityBriefMap[word]; citybriefmap != nil {
						p = citybriefmap.P.Brief
						c = citybriefmap.Name
					}
					OtherScore("c", []string{p, c}, &pscore2, &cscore2, &dscore2)
				case 2: // district
					p, c := "", ""
					d := e.DistrictSimAndAll[word]
					if dcitymap := e.DistrictCityMap[word]; dcitymap != nil {
						c = dcitymap.Name
						p = dcitymap.P.Brief
					}
					OtherScore("d", []string{p, c, d}, &pscore2, &cscore2, &dscore2)
				}
			}
			// Best province, city and district for this field.
			ph2 := HighestScore(pscore2)
			ch2 := HighestScore(cscore2)
			dh2 := HighestScore(dscore2)
			area2 = append(area2, map[string]string{from + "_sim": ph2})
			city2 = append(city2, map[string]string{from + "_sim": ch2})
			district2 = append(district2, map[string]string{from + "_sim": dh2})
			// Short-name matches: 2 points from buyeraddr, 1 from other fields.
			weight := 1
			if from == "buyeraddr" {
				weight = 2
			}
			PCDScore(j, "province", ph2, weight)
			PCDScore(j, "city", ch2, weight)
			PCDScore(j, "district", dh2, weight)
		}
	}
	return area2, city2, district2
}

// IsMatch reports whether province short name p and city full name c are
// consistent: either only a province is given, or c is in e.CityFullMap and
// belongs to p.
func IsMatch(p, c string, e *ExtractTask) bool {
	if p != "" && c == "" {
		return true
	}
	if cfMap := e.CityFullMap[c]; cfMap != nil {
		return cfMap.P.Brief == p
	}
	return false
}

// PCDScore adds score to text's entry in the score table selected by stype
// ("province", "city" or "district") on j. Empty text and unknown stype values
// are ignored.
func PCDScore(j *ju.Job, stype, text string, score int) {
	defer qu.Catch()
	if text == "" {
		return
	}
	switch stype {
	case "district":
		j.DistrictScore[text] += score
	case "city":
		j.CityScore[text] += score
	case "province":
		j.AreaScore[text] += score
	}
}

// OtherScore adds one point per non-empty element of text: text[0] is tallied
// in *ps (province), text[1] in *cs (city) and text[2] in *ds (district).
// Further elements are ignored. stype is not used by the implementation.
func OtherScore(stype string, text []string, ps, cs, ds *map[string]int) {
	defer qu.Catch()
	for i, t := range text {
		if t == "" {
			continue
		}
		switch i {
		case 0: // province
			(*ps)[t]++
		case 1: // city
			(*cs)[t]++
		case 2: // district
			(*ds)[t]++
		}
	}
}

// HighestScore returns the non-empty key of m with the highest positive score,
// or "" when there is none. Ties are broken by choosing the lexicographically
// smallest key so the result does not depend on map iteration order.
func HighestScore(m map[string]int) string {
	result := ""
	best := 0
	for str, score := range m {
		if str == "" {
			continue
		}
		if score > best || (score == best && result != "" && str < result) {
			result = str
			best = score
		}
	}
	return result
}

// HighestScoreArr returns every non-empty key of m that shares the highest
// non-negative score, sorted ascending so the order is stable across runs
// (callers index the result with [0]). It returns nil when no key qualifies.
func HighestScoreArr(m map[string]int) []string {
	best := 0
	for str, score := range m {
		if str != "" && score > best {
			best = score
		}
	}
	var result []string
	for str, score := range m {
		if str != "" && score == best {
			result = append(result, str)
		}
	}
	sort.Strings(result)
	return result
}

// GetCity returns the first city in finishC that e.CityFullMap places in
// province area (short name); if none does, the city argument is returned
// unchanged.
func GetCity(area, city string, e *ExtractTask, finishC []string) string {
	for _, c := range finishC {
		if cfMap := e.CityFullMap[c]; cfMap != nil && cfMap.P.Brief == area {
			return c
		}
	}
	return city
}

// GetDistrict picks a district from finishD that e.DistrictCityMap places in
// province area, and returns the (possibly updated) city together with the
// district. When the district's city differs from city and that city is
// confirmed by e.CityFullMap to be in area, the city is replaced and the
// search stops.
//
// NOTE(review): a candidate whose city already equals city does not stop the
// loop, so a later candidate from a different city in the same province can
// replace both district and city. Confirm this is intended.
func GetDistrict(area, city, district string, e *ExtractTask, finishD []string) (string, string) {
	for _, d := range finishD {
		dcMap := e.DistrictCityMap[d]
		if dcMap == nil || dcMap.P.Brief != area {
			continue
		}
		district = d
		tmpcity := dcMap.Name
		if city == tmpcity {
			continue
		}
		if cfMap := e.CityFullMap[tmpcity]; cfMap != nil && cfMap.P.Brief == area {
			city = tmpcity
			break
		}
	}
	return city, district
}

// GetPCDByAreaDFA looks for a province short name in acd. On a hit that is
// present in e.ProvinceBriefMap it scores the province and returns its short
// name. flag is set to true when acd is exactly the province's short or full
// name, i.e. acd carries no city or district information.
func GetPCDByAreaDFA(province, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, bool) {
	if word := e.ProvinceSimGet.CheckSensitiveWord(acd); word != "" {
		if pbMap := e.ProvinceBriefMap[word]; pbMap != nil {
			province = pbMap.Brief
			if province == acd || pbMap.Name == acd {
				flag = true
			}
			PCDScore(j, "province", province, 5)
		}
	}
	return province, flag
}

// GetPCDByCityDFA looks for a city in acd, trying full names first and short
// names second, and stops at the first word found in the corresponding city
// map. If province is already known the city is accepted only when it belongs
// to that province; if province is empty it is derived from the city and
// scored. flag is set to true when acd consists of exactly the matched
// province+city (or just the city), i.e. no district information remains.
func GetPCDByCityDFA(province, city, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, string, bool) {
	for pos, GET := range []*ju.DFA{e.CityAllGet, e.CitySimGet} {
		word := GET.CheckSensitiveWord(acd)
		if word == "" {
			continue
		}
		if pos == 0 { // full name
			if cfMap := e.CityFullMap[word]; cfMap != nil {
				if province != "" && cfMap.P.Brief == province {
					// acd carries province information.
					city = cfMap.Name
					if acd == province+city || acd == cfMap.P.Name+city {
						flag = true
					}
				} else if province == "" {
					// acd carries city, or city and district, information.
					city = cfMap.Name
					province = cfMap.P.Brief
					PCDScore(j, "province", province, 5)
					if acd == city {
						flag = true
					}
				}
				PCDScore(j, "city", city, 5)
				break
			}
		} else { // short name
			if cbMap := e.CityBriefMap[word]; cbMap != nil {
				if province != "" && cbMap.P.Brief == province {
					city = cbMap.Name
					if acd == province+city || acd == cbMap.P.Name+city {
						flag = true
					}
				} else if province == "" {
					city = cbMap.Name
					province = cbMap.P.Brief
					PCDScore(j, "province", province, 5)
					if acd == city {
						flag = true
					}
				}
				PCDScore(j, "city", city, 5)
				break
			}
		}
	}
	return province, city, flag
}

// GetPCDByDistrictDFA looks for a district in acd, trying full names first and
// short names second (short names are converted via e.DistrictSimAndAll), and
// stops at the first word found in e.DistrictCityMap. It scores the district
// and fills in and scores a missing city/province from the district's city.
// It returns (province, city, district).
//
// Per the original author, area_city_district never holds a district alone;
// expected shapes are province+city, province, city, province+district and
// province+city+district.
func GetPCDByDistrictDFA(province, city, district, acd string, e *ExtractTask, j *ju.Job) (string, string, string) {
	for pos, GET := range []*ju.DFA{e.DistrictAllGet, e.DistrictSimGet} {
		word := GET.CheckSensitiveWord(acd)
		if word == "" {
			continue
		}
		dcMap := e.DistrictCityMap[word]
		if dcMap == nil {
			continue
		}
		district = word
		if pos == 1 { // convert the short name to the full name
			district = e.DistrictSimAndAll[district]
		}
		if city == "" && dcMap.P.Brief == province {
			// Only province and district present (e.g. 河南省二七区).
			city = dcMap.Name
			PCDScore(j, "city", city, 5)
		} else if province == "" {
			// Neither province nor city present (e.g. 二七区).
			city = dcMap.Name
			province = dcMap.P.Brief
			PCDScore(j, "city", city, 5)
			PCDScore(j, "province", province, 5)
		}
		PCDScore(j, "district", district, 5)
		break
	}
	return province, city, district
}