|
@@ -8,7 +8,10 @@ import (
|
|
|
"strings"
|
|
|
)
|
|
|
|
|
|
// AgencyReg holds the patterns used to strip agency-related fragments from
// announcement detail text before place names are matched against it.
// The patterns are applied one after another, each match being replaced
// with the empty string.
var AgencyReg = []*regexp.Regexp{
	// An agency keyword (代理机构 / 代理人 / 代理单位 / 代理公司) or
	// 中标供应商, followed by up to 30 characters. (?s) lets "." match
	// newlines, so the trailing span may run across line breaks.
	regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"),
	// 2 to 15 characters (on one line) immediately followed by
	// 招标代理 / 代理 / 咨询 / 政府采购.
	regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"),
}
|
|
|
|
|
|
//抽取city
|
|
|
func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
|
|
@@ -26,37 +29,46 @@ func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}
|
|
|
defer qu.Catch()
|
|
|
|
|
|
//初始化
|
|
|
- if j.AreaScore == nil {
|
|
|
- j.AreaScore = make(map[string]int)
|
|
|
+ if j.FullAreaScore == nil {
|
|
|
+ j.FullAreaScore = make(map[string]float64)
|
|
|
+ }
|
|
|
+ if j.FullCityScore == nil {
|
|
|
+ j.FullCityScore = make(map[string]float64)
|
|
|
+ }
|
|
|
+ if j.FullDistrictScore == nil {
|
|
|
+ j.FullDistrictScore = make(map[string]float64)
|
|
|
+ }
|
|
|
+ if j.SimAreaScore == nil {
|
|
|
+ j.SimAreaScore = make(map[string]float64)
|
|
|
}
|
|
|
- if j.CityScore == nil {
|
|
|
- j.CityScore = make(map[string]int)
|
|
|
+ if j.SimCityScore == nil {
|
|
|
+ j.SimCityScore = make(map[string]float64)
|
|
|
}
|
|
|
- if j.DistrictScore == nil {
|
|
|
- j.DistrictScore = make(map[string]int)
|
|
|
+ if j.SimDistrictScore == nil {
|
|
|
+ j.SimDistrictScore = make(map[string]float64)
|
|
|
}
|
|
|
//记录区或县简称匹配的p、c、d的得分;如果全称匹配和p、c简称匹配的有结果,再将得分合并,否则舍弃
|
|
|
- pscore := make(map[string]int)
|
|
|
- cscore := make(map[string]int)
|
|
|
- dscore := make(map[string]int)
|
|
|
+ pscore := make(map[string]float64)
|
|
|
+ cscore := make(map[string]float64)
|
|
|
+ dscore := make(map[string]float64)
|
|
|
|
|
|
sm := NewSortMap()
|
|
|
//1.jsondata抽取
|
|
|
e.NewGetCityByJsonData(j)
|
|
|
- //qu.Debug("jsondata打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //qu.Debug("jsondata打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
//2.site库抽取
|
|
|
e.NewGetCityBySite(j)
|
|
|
- //qu.Debug("site打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //qu.Debug("site打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
//3.采购单位库抽取(暂时没有采购单位库)
|
|
|
//buyer, _ := resulttmp["buyer"].(string)
|
|
|
//4.postcode邮编抽取
|
|
|
buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
|
|
|
e.NewGetCityByPostCode(j, buyerzipcode)
|
|
|
- //qu.Debug("邮编打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //qu.Debug("邮编打分后结果---", buyerzipcode, j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
//5.areacode固话区号抽取
|
|
|
buyertel, _ := resulttmp["buyertel"].(string)
|
|
|
e.NewGetCityByAreaCode(j, buyertel)
|
|
|
- //qu.Debug("固话打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //qu.Debug("固话打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
//6.buyeraddr,title,projectname抽取
|
|
|
buyeraddr, _ := resulttmp["buyeraddr"].(string)
|
|
|
title, _ := resulttmp["title"].(string)
|
|
@@ -64,24 +76,40 @@ func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}
|
|
|
buyer, _ := resulttmp["buyer"].(string)
|
|
|
//qu.Debug("buyeraddr--", buyeraddr, "--buyer--", buyer, "--title--", title, "--projectname--", projectname)
|
|
|
sm.AddKey("buyeraddr", buyeraddr)
|
|
|
+ sm.AddKey("buyer", buyer)
|
|
|
sm.AddKey("title", title)
|
|
|
sm.AddKey("projectname", projectname)
|
|
|
- sm.AddKey("buyer", buyer)
|
|
|
+ //7.buyeraddr buyer title projectname抽取
|
|
|
e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
|
|
|
- //qu.Debug("打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
- //7.detail抽取
|
|
|
- if len(j.AreaScore) > 0 {
|
|
|
+ //qu.Debug("全称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
+ //qu.Debug("简称打分后结果---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
|
|
|
+ //全称简称得分合并
|
|
|
+ MergeFullSimScore(j) //合并buyer buyeraddr title projectname全称简称
|
|
|
+ //qu.Debug("全称简称合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
+ //合并区简称得分
|
|
|
+ //qu.Debug("pcd=====", pscore, cscore, dscore)
|
|
|
+ MergeScores(j, &pscore, &cscore, &dscore) //合并区简称匹配的pcd
|
|
|
+ //qu.Debug("合并区简称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
+
|
|
|
+ j.SimAreaScore = map[string]float64{}
|
|
|
+ j.SimCityScore = map[string]float64{}
|
|
|
+ j.SimDistrictScore = map[string]float64{}
|
|
|
+
|
|
|
+ //8.detail抽取
|
|
|
+ if len(j.FullAreaScore) > 0 && len(j.FullCityScore) > 0 { //以上抽取有省有市再从detail中抽取进行判断
|
|
|
e.NewGetCityByDetail(j)
|
|
|
}
|
|
|
- //qu.Debug("detail打分后---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
- //合并得分
|
|
|
- //qu.Debug("pcd=====", pscore, cscore, dscore)
|
|
|
- MergeScores(j, &pscore, &cscore, &dscore)
|
|
|
- //qu.Debug("合并打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //qu.Debug("detail打分后全称---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
+ //qu.Debug("detail打分后简称---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
|
|
|
+ MergeFullSimScore(j) //合并detail的全简称
|
|
|
+ //qu.Debug("detail合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
|
|
|
- finishP := HighestScoreArr(j.AreaScore)
|
|
|
- finishC := HighestScoreArr(j.CityScore)
|
|
|
- finishD := HighestScoreArr(j.DistrictScore)
|
|
|
+ finishP := HighestScoreArr(j.FullAreaScore) //获取最高分的省
|
|
|
+ e.RemoveCD(finishP, j) //将city中所属干扰项省的city去除,同时去除district.5d2bd4aba5cb26b9b769d18e
|
|
|
+ //qu.Debug("去除干扰项后的city和district得分---", finishP, j.FullCityScore, j.FullDistrictScore)
|
|
|
+ //获取结果
|
|
|
+ finishC := HighestScoreArr(j.FullCityScore)
|
|
|
+ finishD := HighestScoreArr(j.FullDistrictScore)
|
|
|
arearesult := ""
|
|
|
cityresult := ""
|
|
|
districtresult := ""
|
|
@@ -112,6 +140,9 @@ func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}
|
|
|
//直辖市
|
|
|
if arearesult == "北京" {
|
|
|
cityresult = "北京市"
|
|
|
+ if districtresult == "北京朝阳" { //特殊情况(北京朝阳中西医结合急诊抢救中心:5a84079740d2d9bbe88bad90)
|
|
|
+ districtresult = "朝阳区"
|
|
|
+ }
|
|
|
} else if arearesult == "天津" {
|
|
|
cityresult = "天津市"
|
|
|
} else if arearesult == "上海" {
|
|
@@ -146,7 +177,7 @@ func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district,
|
|
|
province, _ = jsondata["area"].(string) //province简称
|
|
|
district, _ = jsondata["district"].(string) //district全称
|
|
|
}
|
|
|
- PCDScore(j, "district", district, 5) //district打分
|
|
|
+ PCDScore(j, "district", district, 5, true) //district打分
|
|
|
bp := false
|
|
|
if province != "" {
|
|
|
if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
|
|
@@ -168,16 +199,16 @@ func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district,
|
|
|
}
|
|
|
if bp {
|
|
|
if pbrief == province { //爬虫的province和city匹配
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
} else { //pbrief不匹配province(此时city为空或者错误)
|
|
|
city = ""
|
|
|
}
|
|
|
- PCDScore(j, "province", province, 5)
|
|
|
+ PCDScore(j, "province", province, 5, true)
|
|
|
} else { //省份错误或为空,取city的对应的pbrief为province
|
|
|
if pbrief != "" {
|
|
|
province = pbrief
|
|
|
- PCDScore(j, "province", province, 5)
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
+ PCDScore(j, "province", province, 5, true)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
} else {
|
|
|
province = ""
|
|
|
city = ""
|
|
@@ -195,7 +226,7 @@ func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.
|
|
|
if e.Trie_Full_Province.Get(full) { //a_c_d有province全称
|
|
|
if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
|
|
|
pbrief = tmpPbrief //省简称
|
|
|
- PCDScore(j, "province", pbrief, 5)
|
|
|
+ PCDScore(j, "province", pbrief, 5, true)
|
|
|
}
|
|
|
} else if e.Trie_Full_City.Get(full) { //a_c_d有city全称
|
|
|
if cfMap := e.CityFullMap[full]; cfMap != nil {
|
|
@@ -203,35 +234,35 @@ func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.
|
|
|
tmpPbrief := cfMap.P.Brief //省简称
|
|
|
if pbrief != "" && pbrief == tmpPbrief { //已获取省简称
|
|
|
city = tmpcity
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
} else if pbrief == "" {
|
|
|
city = tmpcity
|
|
|
pbrief = tmpPbrief
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
- PCDScore(j, "province", pbrief, 5)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
+ PCDScore(j, "province", pbrief, 5, true)
|
|
|
}
|
|
|
}
|
|
|
} else if e.Trie_Full_District.Get(full) { //a_c_d有district全称(district可能对应多个城市)
|
|
|
carr := e.NewDistrictCityMap[full]
|
|
|
if len(carr) > 0 {
|
|
|
district = full
|
|
|
- PCDScore(j, "district", district, 5)
|
|
|
+ PCDScore(j, "district", district, 5, true)
|
|
|
for _, c := range carr {
|
|
|
tmpcity := c.Name //城市全称
|
|
|
tmpPbrief := c.P.Brief //省简称
|
|
|
if pbrief == "" { //之前没有匹配到省份
|
|
|
- PCDScore(j, "city", tmpcity, 5)
|
|
|
+ PCDScore(j, "city", tmpcity, 5, true)
|
|
|
if !repeatPb[tmpPbrief] {
|
|
|
- PCDScore(j, "province", tmpPbrief, 5)
|
|
|
+ PCDScore(j, "province", tmpPbrief, 5, true)
|
|
|
repeatPb[tmpPbrief] = true
|
|
|
}
|
|
|
} else { //已有省份
|
|
|
if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
|
|
|
- PCDScore(j, "city", tmpcity, -5)
|
|
|
- PCDScore(j, "province", tmpPbrief, -5)
|
|
|
+ PCDScore(j, "city", tmpcity, -5, true)
|
|
|
+ PCDScore(j, "province", tmpPbrief, -5, true)
|
|
|
} else { //与之前匹配结果一致
|
|
|
if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
|
|
|
- PCDScore(j, "city", tmpcity, 5)
|
|
|
+ PCDScore(j, "city", tmpcity, 5, true)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -250,7 +281,7 @@ func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.J
|
|
|
if pbrief == "" && e.Trie_Sim_Province.Get(sim) { //全称未匹配到确定的province
|
|
|
if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
|
|
|
pbrief = pbMap.Brief
|
|
|
- PCDScore(j, "province", pbrief, 5) //打分
|
|
|
+ PCDScore(j, "province", pbrief, 5, true) //打分
|
|
|
//PCDSubtractScore(e, j, "province", pbrief, -5) //减分(area_city_district:河南鼓楼区)
|
|
|
}
|
|
|
} else if city == "" && e.Trie_Sim_City.Get(sim) { //全称未匹配到确定的city
|
|
@@ -259,36 +290,36 @@ func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.J
|
|
|
tmpPbrief := cbMap.P.Brief
|
|
|
if pbrief != "" && pbrief == tmpPbrief {
|
|
|
city = tmpcity
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
} else if pbrief == "" {
|
|
|
city = tmpcity
|
|
|
pbrief = tmpPbrief
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
- PCDScore(j, "province", pbrief, 5)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
+ PCDScore(j, "province", pbrief, 5, true)
|
|
|
//PCDSubtractScore(e, j, "city", tmpcity, -5) //减分(area_city_district:开封鼓楼区)
|
|
|
}
|
|
|
}
|
|
|
} else if district == "" && e.Trie_Sim_District.Get(sim) { //全称未匹配到确定的district
|
|
|
dfullarr := e.NewDistrictSimAndAll[sim]
|
|
|
if len(dfullarr) > 0 {
|
|
|
- PCDScore(j, "district", sim, 5)
|
|
|
+ PCDScore(j, "district", sim, 5, true)
|
|
|
for _, dfullAndCity := range dfullarr { //district简称对应的所有全称
|
|
|
for _, c := range dfullAndCity {
|
|
|
tmpcity := c.Name //城市全称
|
|
|
tmpPbrief := c.P.Brief //省简称
|
|
|
if pbrief == "" { //之前没有匹配到省份
|
|
|
- PCDScore(j, "city", tmpcity, 5)
|
|
|
+ PCDScore(j, "city", tmpcity, 5, true)
|
|
|
if !repeatPb[tmpPbrief] {
|
|
|
- PCDScore(j, "province", tmpPbrief, 5)
|
|
|
+ PCDScore(j, "province", tmpPbrief, 5, true)
|
|
|
repeatPb[tmpPbrief] = true
|
|
|
}
|
|
|
} else { //已有省份
|
|
|
if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
|
|
|
- PCDScore(j, "city", tmpcity, -5)
|
|
|
- PCDScore(j, "province", tmpPbrief, -5)
|
|
|
+ PCDScore(j, "city", tmpcity, -5, true)
|
|
|
+ PCDScore(j, "province", tmpPbrief, -5, true)
|
|
|
} else { //与之前匹配结果一致
|
|
|
if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
|
|
|
- PCDScore(j, "city", tmpcity, 5)
|
|
|
+ PCDScore(j, "city", tmpcity, 5, true)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -305,13 +336,13 @@ func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
|
|
|
//qu.Debug("site--------", site)
|
|
|
if scMap := e.SiteCityMap[site]; scMap != nil {
|
|
|
if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
|
|
|
- PCDScore(j, "province", scMap.P, 5)
|
|
|
+ PCDScore(j, "province", scMap.P, 5, true)
|
|
|
}
|
|
|
if scMap.C != "" && scMap.C != "null" {
|
|
|
- PCDScore(j, "city", scMap.C, 5)
|
|
|
+ PCDScore(j, "city", scMap.C, 5, true)
|
|
|
}
|
|
|
if scMap.D != "" && scMap.D != "null" {
|
|
|
- PCDScore(j, "district", scMap.D, 5)
|
|
|
+ PCDScore(j, "district", scMap.D, 5, true)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -324,15 +355,15 @@ func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province
|
|
|
province = pc.P
|
|
|
city = pc.C
|
|
|
districtTmp := pc.D //邮编可能对应多个区
|
|
|
- score := 3
|
|
|
+ score := 3.0
|
|
|
if len(districtTmp) == 1 && districtTmp[0] != "" {
|
|
|
- score = 5
|
|
|
+ score = 5.0
|
|
|
}
|
|
|
for _, district := range districtTmp {
|
|
|
- PCDScore(j, "district", district, score)
|
|
|
+ PCDScore(j, "district", district, score, true)
|
|
|
}
|
|
|
- PCDScore(j, "province", province, 5)
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
+ PCDScore(j, "province", province, 5, true)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
}
|
|
|
return
|
|
|
}
|
|
@@ -351,36 +382,40 @@ func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province
|
|
|
citytmp := ac.C
|
|
|
if len(citytmp) == 1 { //对应多个city舍去
|
|
|
city = citytmp[0]
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
}
|
|
|
- PCDScore(j, "province", province, 5)
|
|
|
+ PCDScore(j, "province", province, 5, true)
|
|
|
} else {
|
|
|
n = n - 1
|
|
|
if n >= 3 {
|
|
|
goto L
|
|
|
}
|
|
|
}
|
|
|
- } else if buyertel[:3] == "853" { //澳门
|
|
|
+ } /* else if buyertel[:3] == "853" { //澳门
|
|
|
province = "澳门"
|
|
|
city = "澳门"
|
|
|
- PCDScore(j, "province", province, 5)
|
|
|
- PCDScore(j, "city", city, 5)
|
|
|
- }
|
|
|
+ PCDScore(j, "province", province, 5, true)
|
|
|
+ PCDScore(j, "city", city, 5, true)
|
|
|
+ }*/
|
|
|
}
|
|
|
return
|
|
|
}
|
|
|
|
|
|
-func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]int) {
|
|
|
+func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
|
|
|
/*
|
|
|
1.对字段进行分词
|
|
|
2.省、市、区、街道、居委会全称进行匹配打分
|
|
|
3.省、市、区简称进行匹配打分
|
|
|
*/
|
|
|
- for _, from := range sm.Keys { //buyeraddr;title;projectname
|
|
|
+ ts := 0.5
|
|
|
+ for i, from := range sm.Keys { //buyer;buyeraddr;title;projectname
|
|
|
+ if i > 1 {
|
|
|
+ ts = 0.2
|
|
|
+ }
|
|
|
p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" //每个字段抽取的时候重新定义该字段抽取的province,city,district
|
|
|
str, _ := sm.Map[from].(string)
|
|
|
jbText := e.Seg_SV.Cut(str, true)
|
|
|
- for _, text := range jbText { //结巴分词
|
|
|
+ for _, text := range jbText {
|
|
|
if len([]rune(text)) == 1 {
|
|
|
continue
|
|
|
}
|
|
@@ -391,7 +426,7 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
if pos_full == 0 && p_full == "" { //省全称
|
|
|
if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
|
|
|
p_full = tmpPbrief
|
|
|
- PCDScore(j, "province", p_full, 4)
|
|
|
+ PCDScore(j, "province", p_full, 4+ts, true)
|
|
|
break
|
|
|
}
|
|
|
} else if pos_full == 1 && c_full == "" { //市全称
|
|
@@ -400,12 +435,12 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
if p_full == "" {
|
|
|
p_full = tmpPbrief
|
|
|
c_full = cfMap.Name
|
|
|
- PCDScore(j, "province", p_full, 4)
|
|
|
- PCDScore(j, "city", c_full, 4)
|
|
|
+ PCDScore(j, "province", p_full, 4+ts, true)
|
|
|
+ PCDScore(j, "city", c_full, 4+ts, true)
|
|
|
break
|
|
|
} else if p_full == tmpPbrief {
|
|
|
c_full = cfMap.Name
|
|
|
- PCDScore(j, "city", c_full, 4)
|
|
|
+ PCDScore(j, "city", c_full, 4+ts, true)
|
|
|
break
|
|
|
} else if p_full != "" && p_full != tmpPbrief {
|
|
|
//city不做处理
|
|
@@ -422,7 +457,7 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
d_full = text
|
|
|
if c_full == "" {
|
|
|
c_full = c.Name
|
|
|
- PCDScore(j, "city", c_full, 4)
|
|
|
+ PCDScore(j, "city", c_full, 4+ts, true)
|
|
|
}
|
|
|
isOk = true
|
|
|
districtOk = true
|
|
@@ -432,30 +467,30 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
p_full = tmpPbrief
|
|
|
c_full = c.Name
|
|
|
d_full = text
|
|
|
- PCDScore(j, "province", p_full, 4)
|
|
|
- PCDScore(j, "city", c_full, 4)
|
|
|
+ PCDScore(j, "province", p_full, 4+ts, true)
|
|
|
+ PCDScore(j, "city", c_full, 4+ts, true)
|
|
|
isOk = true
|
|
|
} else { //多个city,只打分,不赋值
|
|
|
if !repeatPb[tmpPbrief] {
|
|
|
- PCDScore(j, "province", tmpPbrief, 2)
|
|
|
+ PCDScore(j, "province", tmpPbrief, 2+ts, true)
|
|
|
repeatPb[tmpPbrief] = true
|
|
|
}
|
|
|
- //PCDScore(j, "province", tmpPbrief, 2)
|
|
|
- PCDScore(j, "city", c.Name, 2)
|
|
|
+ //PCDScore(j, "province", tmpPbrief, 2, true)
|
|
|
+ PCDScore(j, "city", c.Name, 2+ts, true)
|
|
|
}
|
|
|
} else if p_full != "" && p_full != tmpPbrief { //干扰项减分
|
|
|
if !repeatPb[tmpPbrief] {
|
|
|
- PCDScore(j, "province", tmpPbrief, -5)
|
|
|
+ PCDScore(j, "province", tmpPbrief, -5, true)
|
|
|
repeatPb[tmpPbrief] = true
|
|
|
}
|
|
|
- //PCDScore(j, "province", tmpPbrief, -5)
|
|
|
- PCDScore(j, "city", c.Name, -5)
|
|
|
+ //PCDScore(j, "province", tmpPbrief, -5, true)
|
|
|
+ PCDScore(j, "city", c.Name, -5, true)
|
|
|
}
|
|
|
}
|
|
|
if districtOk {
|
|
|
- PCDScore(j, "district", text, 4)
|
|
|
+ PCDScore(j, "district", text, 4+ts, true)
|
|
|
} else {
|
|
|
- PCDScore(j, "district", text, -5)
|
|
|
+ PCDScore(j, "district", text, -5, true)
|
|
|
}
|
|
|
if isOk {
|
|
|
break
|
|
@@ -463,40 +498,46 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
} else if pos_full == 3 { //街道全称
|
|
|
districts := e.NewStreetDistrictMap[text]
|
|
|
if len(districts) == 1 { //街道唯一
|
|
|
- DealMultipleDistrict(e, j, districts, 2, p_full, nil, nil, nil)
|
|
|
+ DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
|
|
|
}
|
|
|
} else if pos_full == 4 { //居委会全称
|
|
|
districts := e.CommunityDistrictMap[text]
|
|
|
if len(districts) == 1 { //居委会唯一
|
|
|
- DealMultipleDistrict(e, j, districts, 2, p_full, nil, nil, nil)
|
|
|
+ DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- //qu.Debug("全称后--", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
//简称匹配
|
|
|
for pos_sim, trie_sim := range e.Trie_Sims {
|
|
|
if trie_sim.Get(text) {
|
|
|
if pos_sim == 0 && p_sim == "" { //省简称
|
|
|
p_sim = text
|
|
|
- PCDScore(j, "province", p_sim, 3)
|
|
|
+ PCDScore(j, "province", p_sim, 3+ts, false)
|
|
|
break
|
|
|
- } else if pos_sim == 1 && c_sim == "" { //市简称
|
|
|
+ } else if pos_sim == 1 { //市简称
|
|
|
if cbMap := e.CityBriefMap[text]; cbMap != nil {
|
|
|
tmpPbrief := cbMap.P.Brief
|
|
|
if p_sim == "" {
|
|
|
+ score := 2.0 + ts
|
|
|
+ if tmpPbrief == p_full {
|
|
|
+ score += 1.0
|
|
|
+ }
|
|
|
p_sim = tmpPbrief
|
|
|
c_sim = cbMap.Brief
|
|
|
- PCDScore(j, "province", p_sim, 2)
|
|
|
- PCDScore(j, "city", cbMap.Name, 2)
|
|
|
+ PCDScore(j, "province", p_sim, score, false)
|
|
|
+ PCDScore(j, "city", cbMap.Name, score, false)
|
|
|
break
|
|
|
} else if p_sim == tmpPbrief {
|
|
|
c_sim = cbMap.Brief
|
|
|
- PCDScore(j, "city", cbMap.Name, 3)
|
|
|
+ PCDScore(j, "city", cbMap.Name, 3+ts, false)
|
|
|
break
|
|
|
- } else if p_sim != "" && p_sim != tmpPbrief { //北京师范大学广州实验学校
|
|
|
- PCDScore(j, "province", tmpPbrief, 1)
|
|
|
- PCDScore(j, "city", cbMap.Name, 1)
|
|
|
+ } else if p_sim != "" && p_sim != tmpPbrief { //上海宝冶集团有限公司南京分公司 北京朝阳中西医结合急诊抢救中心
|
|
|
+ delete(j.SimAreaScore, p_sim)
|
|
|
+ p_sim = text
|
|
|
+ PCDScore(j, "province", tmpPbrief, 3+ts, false)
|
|
|
+ PCDScore(j, "city", cbMap.Name, 3+ts, false)
|
|
|
}
|
|
|
}
|
|
|
} else if pos_sim == 2 && d_sim == "" { //区简称
|
|
@@ -506,49 +547,53 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
for _, dfull_city := range dfull_citys {
|
|
|
for dfull, c := range dfull_city { //dfull:简称对应的全称
|
|
|
tmpPbrief := c.P.Brief
|
|
|
- if p_sim == tmpPbrief { //省份一致
|
|
|
+ if p_sim == tmpPbrief || p_full == tmpPbrief { //省份一致
|
|
|
d_sim = text
|
|
|
- PCDScore(j, "district", dfull, 2)
|
|
|
+ PCDScore(j, "district", dfull, 2+ts, false)
|
|
|
if c_sim == "" {
|
|
|
c_sim = c.Brief
|
|
|
- PCDScore(j, "city", c.Name, 2)
|
|
|
+ PCDScore(j, "city", c.Name, 2+ts, false)
|
|
|
}
|
|
|
} else if p_sim == "" {
|
|
|
if !repeatDb[dfull] {
|
|
|
- PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
|
|
|
repeatDb[dfull] = true
|
|
|
}
|
|
|
if len(dfull_citys) == 1 {
|
|
|
- PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
|
|
|
- PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
|
|
|
} else {
|
|
|
if !repeatPb[tmpPbrief] {
|
|
|
- PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
|
|
|
repeatPb[tmpPbrief] = true
|
|
|
}
|
|
|
- PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
|
|
|
}
|
|
|
} else if p_sim != "" && p_sim != tmpPbrief {
|
|
|
if !repeatPb[tmpPbrief] {
|
|
|
- PCDScoreByDistrictSim("p", tmpPbrief, -5, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
|
|
|
repeatPb[tmpPbrief] = true
|
|
|
}
|
|
|
- PCDScoreByDistrictSim("c", c.Name, -5, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- //qu.Debug("简称后--", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
- repeatP := map[string]bool{}
|
|
|
- repeatC := map[string]bool{}
|
|
|
- repeatD := map[string]bool{}
|
|
|
+ repeatP_full := map[string]bool{}
|
|
|
+ repeatC_full := map[string]bool{}
|
|
|
+ repeatD_full := map[string]bool{}
|
|
|
+ repeatP_sim := map[string]bool{}
|
|
|
+ repeatC_sim := map[string]bool{}
|
|
|
+ repeatD_sim := map[string]bool{}
|
|
|
detailRune := []rune(j.Content)
|
|
|
detail := j.Content
|
|
|
if len(detailRune) > 600 {
|
|
@@ -556,45 +601,47 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
end := detailRune[len(detailRune)-300:]
|
|
|
detail = string(start) + string(end)
|
|
|
}
|
|
|
- detail = AgencyReg.ReplaceAllString(detail, "")
|
|
|
+ for _, reg := range AgencyReg {
|
|
|
+ detail = reg.ReplaceAllString(detail, "")
|
|
|
+ }
|
|
|
for _, text := range e.Seg_SV.Cut(detail, true) {
|
|
|
if len([]rune(text)) > 1 {
|
|
|
//全称匹配
|
|
|
for pos_full, trie_full := range e.Trie_Fulls {
|
|
|
if trie_full.Get(text) {
|
|
|
if pos_full == 0 { //省全称
|
|
|
- if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP[tmpPbrief] { //取简称
|
|
|
- PCDScore(j, "province", tmpPbrief, 1)
|
|
|
- repeatP[tmpPbrief] = true
|
|
|
+ if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { //取简称
|
|
|
+ PCDScore(j, "province", tmpPbrief, 1, true)
|
|
|
+ repeatP_full[tmpPbrief] = true
|
|
|
break
|
|
|
}
|
|
|
} else if pos_full == 1 { //市全称
|
|
|
if cfMap := e.CityFullMap[text]; cfMap != nil {
|
|
|
- if !repeatP[cfMap.P.Brief] {
|
|
|
- PCDScore(j, "province", cfMap.P.Brief, 1)
|
|
|
- repeatP[cfMap.P.Brief] = true
|
|
|
+ if !repeatP_full[cfMap.P.Brief] {
|
|
|
+ PCDScore(j, "province", cfMap.P.Brief, 1, true)
|
|
|
+ repeatP_full[cfMap.P.Brief] = true
|
|
|
}
|
|
|
- if !repeatC[cfMap.Name] {
|
|
|
- PCDScore(j, "city", cfMap.Name, 1)
|
|
|
- repeatC[cfMap.Name] = true
|
|
|
+ if !repeatC_full[cfMap.Name] {
|
|
|
+ PCDScore(j, "city", cfMap.Name, 1, true)
|
|
|
+ repeatC_full[cfMap.Name] = true
|
|
|
}
|
|
|
break
|
|
|
}
|
|
|
} else if pos_full == 2 { //区全称
|
|
|
citys := e.NewDistrictCityMap[text]
|
|
|
if len(citys) > 0 {
|
|
|
- if !repeatD[text] {
|
|
|
- PCDScore(j, "district", text, 1)
|
|
|
- repeatD[text] = true
|
|
|
+ if !repeatD_full[text] {
|
|
|
+ PCDScore(j, "district", text, 1, true)
|
|
|
+ repeatD_full[text] = true
|
|
|
}
|
|
|
for _, c := range citys {
|
|
|
- if !repeatC[c.Name] {
|
|
|
- PCDScore(j, "city", c.Name, 1)
|
|
|
- repeatC[c.Name] = true
|
|
|
+ if !repeatC_full[c.Name] {
|
|
|
+ PCDScore(j, "city", c.Name, 1, true)
|
|
|
+ repeatC_full[c.Name] = true
|
|
|
}
|
|
|
- if !repeatP[c.P.Brief] {
|
|
|
- PCDScore(j, "province", c.P.Brief, 1)
|
|
|
- repeatP[c.P.Brief] = true
|
|
|
+ if !repeatP_full[c.P.Brief] {
|
|
|
+ PCDScore(j, "province", c.P.Brief, 1, true)
|
|
|
+ repeatP_full[c.P.Brief] = true
|
|
|
}
|
|
|
}
|
|
|
break
|
|
@@ -602,12 +649,12 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
} else if pos_full == 3 { //街道全称
|
|
|
districts := e.NewStreetDistrictMap[text]
|
|
|
if len(districts) == 1 {
|
|
|
- DealMultipleDistrict(e, j, districts, 1, "", &repeatP, &repeatC, &repeatD)
|
|
|
+ DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
|
|
|
}
|
|
|
} else if pos_full == 4 { //居委会全称
|
|
|
districts := e.CommunityDistrictMap[text]
|
|
|
if len(districts) == 1 {
|
|
|
- DealMultipleDistrict(e, j, districts, 1, "", &repeatP, &repeatC, &repeatD)
|
|
|
+ DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -616,34 +663,43 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
//简称匹配
|
|
|
for pos_sim, trie_sim := range e.Trie_Sims {
|
|
|
if trie_sim.Get(text) {
|
|
|
- if pos_sim == 0 && !repeatP[text] { //省简称
|
|
|
- PCDScore(j, "province", text, 1)
|
|
|
- repeatP[text] = true
|
|
|
+ if pos_sim == 0 && !repeatP_sim[text] { //省简称
|
|
|
+ PCDScore(j, "province", text, 1, false)
|
|
|
+ repeatP_sim[text] = true
|
|
|
break
|
|
|
} else if pos_sim == 1 { //市简称
|
|
|
if cbMap := e.CityBriefMap[text]; cbMap != nil {
|
|
|
- if !repeatP[cbMap.P.Brief] {
|
|
|
- PCDScore(j, "province", cbMap.P.Brief, 1)
|
|
|
- repeatP[cbMap.P.Brief] = true
|
|
|
+ if !repeatP_sim[cbMap.P.Brief] {
|
|
|
+ PCDScore(j, "province", cbMap.P.Brief, 1, false)
|
|
|
+ repeatP_sim[cbMap.P.Brief] = true
|
|
|
}
|
|
|
- if !repeatC[cbMap.Name] {
|
|
|
- PCDScore(j, "city", cbMap.Name, 1)
|
|
|
- repeatC[cbMap.Name] = true
|
|
|
+ if !repeatC_sim[cbMap.Name] {
|
|
|
+ PCDScore(j, "city", cbMap.Name, 1, false)
|
|
|
+ repeatC_sim[cbMap.Name] = true
|
|
|
}
|
|
|
break
|
|
|
}
|
|
|
- } /* else if pos_sim == 2 { //区简称
|
|
|
- repeatDb := map[string]bool{}
|
|
|
+ } else if pos_sim == 2 { //区简称
|
|
|
dfull_citys := e.NewDistrictSimAndAll[text]
|
|
|
- for _, dfull_city := range dfull_citys {
|
|
|
- for dfull, _ := range dfull_city { //dfull:简称对应的全称
|
|
|
- if !repeatDb[dfull] {
|
|
|
- PCDScore(j, "district", dfull, 1)
|
|
|
- repeatDb[dfull] = true
|
|
|
+ if len(dfull_citys) == 1 {
|
|
|
+ for _, dfull_city := range dfull_citys {
|
|
|
+ for dfull, ctmp := range dfull_city { //dfull:简称对应的全称
|
|
|
+ if !repeatD_sim[dfull] {
|
|
|
+ PCDScore(j, "district", dfull, 1, false)
|
|
|
+ repeatD_sim[dfull] = true
|
|
|
+ }
|
|
|
+ if !repeatC_sim[ctmp.Name] {
|
|
|
+ PCDScore(j, "city", ctmp.Name, 1, false)
|
|
|
+ repeatC_sim[ctmp.Name] = true
|
|
|
+ }
|
|
|
+ if !repeatP_sim[ctmp.P.Brief] {
|
|
|
+ PCDScore(j, "province", ctmp.P.Brief, 1, false)
|
|
|
+ repeatP_sim[ctmp.P.Brief] = true
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- }*/
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
//qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
@@ -652,57 +708,36 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
}
|
|
|
|
|
|
//街道、居委会对应多地市处理
|
|
|
-func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score int, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
|
|
|
+func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
|
|
|
if len(districts) == 1 {
|
|
|
district := districts[0]
|
|
|
city := district.C.Name
|
|
|
tmpPbrief := district.C.P.Brief
|
|
|
if pbrief != "" && tmpPbrief == pbrief {
|
|
|
- PCDScore(j, "province", tmpPbrief, score)
|
|
|
- PCDScore(j, "city", city, score)
|
|
|
- PCDScore(j, "district", district.Name, score)
|
|
|
+ PCDScore(j, "province", tmpPbrief, score, true)
|
|
|
+ PCDScore(j, "city", city, score, true)
|
|
|
+ PCDScore(j, "district", district.Name, score, true)
|
|
|
} else if pbrief == "" {
|
|
|
if repeatP != nil && !(*repeatP)[tmpPbrief] {
|
|
|
- PCDScore(j, "province", tmpPbrief, score)
|
|
|
+ PCDScore(j, "province", tmpPbrief, score, true)
|
|
|
(*repeatP)[tmpPbrief] = true
|
|
|
} else if repeatP == nil {
|
|
|
- PCDScore(j, "province", tmpPbrief, score)
|
|
|
+ PCDScore(j, "province", tmpPbrief, score, true)
|
|
|
}
|
|
|
if repeatC != nil && !(*repeatC)[city] {
|
|
|
- PCDScore(j, "city", city, score)
|
|
|
+ PCDScore(j, "city", city, score, true)
|
|
|
(*repeatC)[city] = true
|
|
|
} else if repeatC == nil {
|
|
|
- PCDScore(j, "city", city, score)
|
|
|
+ PCDScore(j, "city", city, score, true)
|
|
|
}
|
|
|
if repeatD != nil && !(*repeatD)[tmpPbrief] {
|
|
|
- PCDScore(j, "district", district.Name, score)
|
|
|
+ PCDScore(j, "district", district.Name, score, true)
|
|
|
(*repeatD)[district.Name] = true
|
|
|
} else if repeatD == nil {
|
|
|
- PCDScore(j, "district", district.Name, score)
|
|
|
+ PCDScore(j, "district", district.Name, score, true)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- // repeatPb := map[string]bool{}
|
|
|
- // repeatCb := map[string]bool{}
|
|
|
- // repeatDb := map[string]bool{}
|
|
|
- // for _, district := range districts {
|
|
|
- // tmpDistrict := district.Name
|
|
|
- // tmpCity := district.C.Name
|
|
|
- // tmpPbrief := district.C.P.Brief
|
|
|
- // if !repeatPb[tmpPbrief] {
|
|
|
- // PCDScore(j, "province", tmpPbrief, score)
|
|
|
- // repeatPb[tmpPbrief] = true
|
|
|
- // }
|
|
|
- // if !repeatCb[tmpCity] {
|
|
|
- // PCDScore(j, "city", tmpCity, score)
|
|
|
- // repeatCb[tmpCity] = true
|
|
|
- // }
|
|
|
- // if !repeatDb[tmpDistrict] {
|
|
|
- // PCDScore(j, "district", tmpDistrict, score)
|
|
|
- // repeatDb[tmpDistrict] = true
|
|
|
- // }
|
|
|
- // }
|
|
|
}
|
|
|
|
|
|
func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
|
|
@@ -736,7 +771,7 @@ func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcit
|
|
|
district = d
|
|
|
return city, district
|
|
|
}
|
|
|
- } /*else { //多个city
|
|
|
+ } else { //多个city
|
|
|
for _, tc := range tmpcity { //多个city根据district最高分取
|
|
|
if tc == c.Name && len(finishD) == 1 {
|
|
|
city = c.Name
|
|
@@ -744,27 +779,14 @@ func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcit
|
|
|
return city, district
|
|
|
}
|
|
|
}
|
|
|
- }*/
|
|
|
-
|
|
|
- // if len(citys) == 1 { //区对应一个市
|
|
|
- // if c.P.Brief == area {
|
|
|
- // district = d
|
|
|
- // city = c.Name
|
|
|
- // return city, district
|
|
|
- // }
|
|
|
- // } else {
|
|
|
- // if c.P.Brief == area && c.Name == city {
|
|
|
- // district = d
|
|
|
- // return city, district
|
|
|
- // }
|
|
|
- // }
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
return city, district
|
|
|
}
|
|
|
|
|
|
//计算province,city,district区或县匹配的得分
|
|
|
-func PCDScoreByDistrictSim(stype, t string, score int, ps, cs, ds *map[string]int) {
|
|
|
+func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
|
|
|
defer qu.Catch()
|
|
|
if t != "" {
|
|
|
if stype == "d" {
|
|
@@ -780,19 +802,98 @@ func PCDScoreByDistrictSim(stype, t string, score int, ps, cs, ds *map[string]in
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]int) {
|
|
|
- if len(j.AreaScore) > 0 {
|
|
|
+func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
|
|
|
+ if len(j.FullAreaScore) > 0 {
|
|
|
for pt, ps := range *pscore {
|
|
|
- j.AreaScore[pt] = j.AreaScore[pt] + ps
|
|
|
+ j.FullAreaScore[pt] = j.FullAreaScore[pt] + ps
|
|
|
}
|
|
|
for ct, cs := range *cscore {
|
|
|
- j.CityScore[ct] = j.CityScore[ct] + cs
|
|
|
+ j.FullCityScore[ct] = j.FullCityScore[ct] + cs
|
|
|
}
|
|
|
for dt, ds := range *dscore {
|
|
|
- j.DistrictScore[dt] = j.DistrictScore[dt] + ds
|
|
|
+ j.FullDistrictScore[dt] = j.FullDistrictScore[dt] + ds
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+func MergeFullSimScore(j *ju.Job) {
|
|
|
+ if len(j.FullAreaScore) == 0 {
|
|
|
+ j.FullAreaScore = j.SimAreaScore
|
|
|
+ } else {
|
|
|
+ for p_text, p_score := range j.FullAreaScore {
|
|
|
+ j.FullAreaScore[p_text] = j.SimAreaScore[p_text] + p_score
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for c_text, c_score := range j.SimCityScore {
|
|
|
+ j.FullCityScore[c_text] = j.FullCityScore[c_text] + c_score
|
|
|
+ }
|
|
|
+
|
|
|
+ for d_text, d_score := range j.SimDistrictScore {
|
|
|
+ j.FullDistrictScore[d_text] = j.FullDistrictScore[d_text] + d_score
|
|
|
+ }
|
|
|
+ // if len(j.FullCityScore) == 0 {
|
|
|
+ // j.FullCityScore = j.SimCityScore
|
|
|
+ // } else {
|
|
|
+ // for c_text, c_score := range j.FullCityScore {
|
|
|
+ // j.FullCityScore[c_text] = j.SimCityScore[c_text] + c_score
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // if len(j.FullDistrictScore) == 0 {
|
|
|
+ // j.FullDistrictScore = j.SimDistrictScore
|
|
|
+ // } else {
|
|
|
+ // for d_text, d_score := range j.FullDistrictScore {
|
|
|
+ // j.FullDistrictScore[d_text] = j.SimDistrictScore[d_text] + d_score
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+}
|
|
|
+
|
|
|
+func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
|
|
|
+ if len(j.FullDistrictScore) > 0 {
|
|
|
+ for d, _ := range j.FullDistrictScore {
|
|
|
+ tmpCitys := e.NewDistrictCityMap[d]
|
|
|
+ for _, c := range tmpCitys {
|
|
|
+ if j.FullCityScore[c.Name] != 0 {
|
|
|
+ tmpPb := c.P.Brief
|
|
|
+ //if j.FullAreaScore[tmpPb] != 0 {
|
|
|
+ flag := false
|
|
|
+ for _, p := range finishP {
|
|
|
+ if tmpPb == p {
|
|
|
+ flag = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if !flag {
|
|
|
+ delete(j.FullCityScore, c.Name)
|
|
|
+ delete(j.FullDistrictScore, d)
|
|
|
+ }
|
|
|
+ //}
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if len(j.FullCityScore) > 0 {
|
|
|
+ for tmpcity, _ := range j.FullCityScore {
|
|
|
+ c := e.CityFullMap[tmpcity]
|
|
|
+ if c == nil {
|
|
|
+ qu.Debug("行政区划错误数据:", tmpcity, j.SourceMid)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ tmpPb := c.P.Brief
|
|
|
+ //if j.FullAreaScore[tmpPb] != 0 {
|
|
|
+ flag := false
|
|
|
+ for _, p := range finishP {
|
|
|
+ if tmpPb == p {
|
|
|
+ flag = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if !flag {
|
|
|
+ delete(j.FullCityScore, tmpcity)
|
|
|
+ }
|
|
|
+ //}
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+}
|
|
|
|
|
|
//province,city,district干扰项减分
|
|
|
//func PCDSubtractScore(e *ExtractTask, j *ju.Job, stype, text string, score int) {
|