|
@@ -4,9 +4,12 @@ import (
|
|
|
. "jy/pretreated"
|
|
|
ju "jy/util"
|
|
|
qu "qfw/util"
|
|
|
+ "regexp"
|
|
|
"strings"
|
|
|
)
|
|
|
|
|
|
// AgencyReg matches text fragments that refer to an agency or supplier rather
// than to the project itself. It matches either:
//
//   - "代理机构" (agency) or "中标供应商" (winning supplier) followed by up to
//     30 further characters on the same line, or
//   - 2 to 15 characters immediately followed by "代理" (optionally written
//     as "招标代理"), "咨询" or "政府采购".
//
// "." does not match a newline, so a match never spans lines. Within this
// file the pattern is used with ReplaceAllString to strip such fragments
// from the detail text before place names are looked up.
var AgencyReg = regexp.MustCompile("((代理机构|中标供应商).{0,30}|.{2,15}((招标)?代理|咨询|政府采购))")
|
|
|
+
|
|
|
//抽取city
|
|
|
func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
|
|
|
/*
|
|
@@ -92,26 +95,38 @@ func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}
|
|
|
cityresult = finishC[0]
|
|
|
if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
|
|
|
arearesult = cfMap.P.Brief
|
|
|
+ tmpcity = append(tmpcity, cityresult)
|
|
|
cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
}
|
|
|
} else { //对应的city有多个(多个province和city)
|
|
|
- arearesult = finishP[0] //抽取结果直接赋值
|
|
|
- cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
|
|
|
- cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
+ //arearesult = finishP[0] //抽取结果直接赋值
|
|
|
+ //cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
|
|
|
+ //cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
+ arearesult = "全国"
|
|
|
}
|
|
|
}
|
|
|
- if cityresult == districtresult {
|
|
|
+ if cityresult != "" && cityresult == districtresult {
|
|
|
districtresult = ""
|
|
|
}
|
|
|
//qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
|
|
|
+ //直辖市
|
|
|
+ if arearesult == "北京" {
|
|
|
+ cityresult = "北京市"
|
|
|
+ } else if arearesult == "天津" {
|
|
|
+ cityresult = "天津市"
|
|
|
+ } else if arearesult == "上海" {
|
|
|
+ cityresult = "上海市"
|
|
|
+ } else if arearesult == "重庆" {
|
|
|
+ cityresult = "重庆市"
|
|
|
+ }
|
|
|
if arearesult == "" {
|
|
|
arearesult = "全国"
|
|
|
- } else if cityresult == "" {
|
|
|
+ } /* else if cityresult == "" {
|
|
|
if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
|
|
|
cityresult = pbMap.Cap
|
|
|
resulttmp["defaultpcap"] = true
|
|
|
}
|
|
|
- }
|
|
|
+ }*/
|
|
|
//qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
|
|
|
resulttmp["area"] = arearesult
|
|
|
resulttmp["city"] = cityresult
|
|
@@ -382,7 +397,6 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
} else if pos_full == 1 && c_full == "" { //市全称
|
|
|
if cfMap := e.CityFullMap[text]; cfMap != nil {
|
|
|
tmpPbrief := cfMap.P.Brief
|
|
|
- //qu.Debug("市--------", text, tmpPbrief, p_full)
|
|
|
if p_full == "" {
|
|
|
p_full = tmpPbrief
|
|
|
c_full = cfMap.Name
|
|
@@ -398,7 +412,6 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
}
|
|
|
}
|
|
|
} else if pos_full == 2 && d_full == "" { //区全称
|
|
|
- //qu.Debug("区全称===========")
|
|
|
repeatPb := map[string]bool{}
|
|
|
isOk := false
|
|
|
districtOk := false
|
|
@@ -449,10 +462,14 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
}
|
|
|
} else if pos_full == 3 { //街道全称
|
|
|
districts := e.NewStreetDistrictMap[text]
|
|
|
- DealMultipleDistrict(e, j, districts, 2, p_full)
|
|
|
+ if len(districts) == 1 { //街道唯一
|
|
|
+ DealMultipleDistrict(e, j, districts, 2, p_full, nil, nil, nil)
|
|
|
+ }
|
|
|
} else if pos_full == 4 { //居委会全称
|
|
|
districts := e.CommunityDistrictMap[text]
|
|
|
- DealMultipleDistrict(e, j, districts, 2, p_full)
|
|
|
+ if len(districts) == 1 { //居委会唯一
|
|
|
+ DealMultipleDistrict(e, j, districts, 2, p_full, nil, nil, nil)
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -477,58 +494,46 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
c_sim = cbMap.Brief
|
|
|
PCDScore(j, "city", cbMap.Name, 3)
|
|
|
break
|
|
|
- } else if p_sim != "" && p_sim != tmpPbrief {
|
|
|
- //city不做处理
|
|
|
+ } else if p_sim != "" && p_sim != tmpPbrief { //北京师范大学广州实验学校
|
|
|
+ PCDScore(j, "province", tmpPbrief, 1)
|
|
|
+ PCDScore(j, "city", cbMap.Name, 1)
|
|
|
}
|
|
|
}
|
|
|
} else if pos_sim == 2 && d_sim == "" { //区简称
|
|
|
repeatPb := map[string]bool{}
|
|
|
repeatDb := map[string]bool{}
|
|
|
dfull_citys := e.NewDistrictSimAndAll[text]
|
|
|
- //qu.Debug(text, dfull_citys, p_sim)
|
|
|
for _, dfull_city := range dfull_citys {
|
|
|
for dfull, c := range dfull_city { //dfull:简称对应的全称
|
|
|
tmpPbrief := c.P.Brief
|
|
|
if p_sim == tmpPbrief { //省份一致
|
|
|
d_sim = text
|
|
|
- //PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
|
|
|
PCDScore(j, "district", dfull, 2)
|
|
|
if c_sim == "" {
|
|
|
c_sim = c.Brief
|
|
|
- //PCDScoreByDistrictSim("c", c.Name, 2, pscore, cscore, dscore)
|
|
|
PCDScore(j, "city", c.Name, 2)
|
|
|
}
|
|
|
} else if p_sim == "" {
|
|
|
if !repeatDb[dfull] {
|
|
|
PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
|
|
|
- //PCDScore(j, "district", dfull, 1)
|
|
|
repeatDb[dfull] = true
|
|
|
}
|
|
|
if len(dfull_citys) == 1 {
|
|
|
- //p_sim = tmpPbrief
|
|
|
- //c_sim = c.Brief
|
|
|
- //d_sim = text
|
|
|
PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
|
|
|
PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
|
|
|
- //PCDScore(j, "province", p_sim, 2)
|
|
|
- //PCDScore(j, "city", c.Name, 2)
|
|
|
} else {
|
|
|
if !repeatPb[tmpPbrief] {
|
|
|
PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
|
|
|
- //PCDScore(j, "province", tmpPbrief, 1)
|
|
|
repeatPb[tmpPbrief] = true
|
|
|
}
|
|
|
- //PCDScore(j, "city", c.Name, 1)
|
|
|
PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
|
|
|
}
|
|
|
} else if p_sim != "" && p_sim != tmpPbrief {
|
|
|
if !repeatPb[tmpPbrief] {
|
|
|
PCDScoreByDistrictSim("p", tmpPbrief, -5, pscore, cscore, dscore)
|
|
|
- //PCDScore(j, "province", tmpPbrief, -5)
|
|
|
repeatPb[tmpPbrief] = true
|
|
|
}
|
|
|
PCDScoreByDistrictSim("c", c.Name, -5, pscore, cscore, dscore)
|
|
|
- //PCDScore(j, "city", c.Name, -5)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -541,6 +546,9 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
|
|
|
}
|
|
|
|
|
|
func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
+ repeatP := map[string]bool{}
|
|
|
+ repeatC := map[string]bool{}
|
|
|
+ repeatD := map[string]bool{}
|
|
|
detailRune := []rune(j.Content)
|
|
|
detail := j.Content
|
|
|
if len(detailRune) > 600 {
|
|
@@ -548,42 +556,59 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
end := detailRune[len(detailRune)-300:]
|
|
|
detail = string(start) + string(end)
|
|
|
}
|
|
|
+ detail = AgencyReg.ReplaceAllString(detail, "")
|
|
|
for _, text := range e.Seg_SV.Cut(detail, true) {
|
|
|
if len([]rune(text)) > 1 {
|
|
|
//全称匹配
|
|
|
for pos_full, trie_full := range e.Trie_Fulls {
|
|
|
if trie_full.Get(text) {
|
|
|
if pos_full == 0 { //省全称
|
|
|
- if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
|
|
|
+ if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP[tmpPbrief] { //取简称
|
|
|
PCDScore(j, "province", tmpPbrief, 1)
|
|
|
+ repeatP[tmpPbrief] = true
|
|
|
break
|
|
|
}
|
|
|
} else if pos_full == 1 { //市全称
|
|
|
if cfMap := e.CityFullMap[text]; cfMap != nil {
|
|
|
- PCDScore(j, "province", cfMap.P.Brief, 1)
|
|
|
- PCDScore(j, "city", cfMap.Name, 1)
|
|
|
+ if !repeatP[cfMap.P.Brief] {
|
|
|
+ PCDScore(j, "province", cfMap.P.Brief, 1)
|
|
|
+ repeatP[cfMap.P.Brief] = true
|
|
|
+ }
|
|
|
+ if !repeatC[cfMap.Name] {
|
|
|
+ PCDScore(j, "city", cfMap.Name, 1)
|
|
|
+ repeatC[cfMap.Name] = true
|
|
|
+ }
|
|
|
break
|
|
|
}
|
|
|
} else if pos_full == 2 { //区全称
|
|
|
citys := e.NewDistrictCityMap[text]
|
|
|
if len(citys) > 0 {
|
|
|
- repeatPb := map[string]bool{}
|
|
|
- PCDScore(j, "district", text, 1)
|
|
|
+ if !repeatD[text] {
|
|
|
+ PCDScore(j, "district", text, 1)
|
|
|
+ repeatD[text] = true
|
|
|
+ }
|
|
|
for _, c := range citys {
|
|
|
- PCDScore(j, "city", c.Name, 1)
|
|
|
- if !repeatPb[c.P.Brief] {
|
|
|
+ if !repeatC[c.Name] {
|
|
|
+ PCDScore(j, "city", c.Name, 1)
|
|
|
+ repeatC[c.Name] = true
|
|
|
+ }
|
|
|
+ if !repeatP[c.P.Brief] {
|
|
|
PCDScore(j, "province", c.P.Brief, 1)
|
|
|
- repeatPb[c.P.Brief] = true
|
|
|
+ repeatP[c.P.Brief] = true
|
|
|
}
|
|
|
}
|
|
|
break
|
|
|
}
|
|
|
} else if pos_full == 3 { //街道全称
|
|
|
districts := e.NewStreetDistrictMap[text]
|
|
|
- DealMultipleDistrict(e, j, districts, 1, "")
|
|
|
+ if len(districts) == 1 {
|
|
|
+ DealMultipleDistrict(e, j, districts, 1, "", &repeatP, &repeatC, &repeatD)
|
|
|
+ }
|
|
|
} else if pos_full == 4 { //居委会全称
|
|
|
districts := e.CommunityDistrictMap[text]
|
|
|
- DealMultipleDistrict(e, j, districts, 1, "")
|
|
|
+ if len(districts) == 1 {
|
|
|
+ DealMultipleDistrict(e, j, districts, 1, "", &repeatP, &repeatC, &repeatD)
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -591,13 +616,20 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
//简称匹配
|
|
|
for pos_sim, trie_sim := range e.Trie_Sims {
|
|
|
if trie_sim.Get(text) {
|
|
|
- if pos_sim == 0 { //省简称
|
|
|
+ if pos_sim == 0 && !repeatP[text] { //省简称
|
|
|
PCDScore(j, "province", text, 1)
|
|
|
+ repeatP[text] = true
|
|
|
break
|
|
|
} else if pos_sim == 1 { //市简称
|
|
|
if cbMap := e.CityBriefMap[text]; cbMap != nil {
|
|
|
- PCDScore(j, "city", cbMap.Name, 1)
|
|
|
- PCDScore(j, "province", cbMap.P.Brief, 1)
|
|
|
+ if !repeatP[cbMap.P.Brief] {
|
|
|
+ PCDScore(j, "province", cbMap.P.Brief, 1)
|
|
|
+ repeatP[cbMap.P.Brief] = true
|
|
|
+ }
|
|
|
+ if !repeatC[cbMap.Name] {
|
|
|
+ PCDScore(j, "city", cbMap.Name, 1)
|
|
|
+ repeatC[cbMap.Name] = true
|
|
|
+ }
|
|
|
break
|
|
|
}
|
|
|
} /* else if pos_sim == 2 { //区简称
|
|
@@ -620,60 +652,57 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
}
|
|
|
|
|
|
//街道、居委会对应多地市处理
|
|
|
-func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score int, pbrief string) {
|
|
|
- repeatPb := map[string]bool{}
|
|
|
- repeatCb := map[string]bool{}
|
|
|
- repeatDb := map[string]bool{}
|
|
|
- for _, district := range districts {
|
|
|
- tmpDistrict := district.Name
|
|
|
- tmpCity := district.C.Name
|
|
|
+func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score int, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
|
|
|
+ if len(districts) == 1 {
|
|
|
+ district := districts[0]
|
|
|
+ city := district.C.Name
|
|
|
tmpPbrief := district.C.P.Brief
|
|
|
- if !repeatPb[tmpPbrief] {
|
|
|
+ if pbrief != "" && tmpPbrief == pbrief {
|
|
|
PCDScore(j, "province", tmpPbrief, score)
|
|
|
- repeatPb[tmpPbrief] = true
|
|
|
- }
|
|
|
- if !repeatCb[tmpCity] {
|
|
|
- PCDScore(j, "city", tmpCity, score)
|
|
|
- repeatCb[tmpCity] = true
|
|
|
- }
|
|
|
- if !repeatDb[tmpDistrict] {
|
|
|
- PCDScore(j, "district", tmpDistrict, score)
|
|
|
- repeatDb[tmpDistrict] = true
|
|
|
+ PCDScore(j, "city", city, score)
|
|
|
+ PCDScore(j, "district", district.Name, score)
|
|
|
+ } else if pbrief == "" {
|
|
|
+ if repeatP != nil && !(*repeatP)[tmpPbrief] {
|
|
|
+ PCDScore(j, "province", tmpPbrief, score)
|
|
|
+ (*repeatP)[tmpPbrief] = true
|
|
|
+ } else if repeatP == nil {
|
|
|
+ PCDScore(j, "province", tmpPbrief, score)
|
|
|
+ }
|
|
|
+ if repeatC != nil && !(*repeatC)[city] {
|
|
|
+ PCDScore(j, "city", city, score)
|
|
|
+ (*repeatC)[city] = true
|
|
|
+ } else if repeatC == nil {
|
|
|
+ PCDScore(j, "city", city, score)
|
|
|
+ }
|
|
|
+ if repeatD != nil && !(*repeatD)[tmpPbrief] {
|
|
|
+ PCDScore(j, "district", district.Name, score)
|
|
|
+ (*repeatD)[district.Name] = true
|
|
|
+ } else if repeatD == nil {
|
|
|
+ PCDScore(j, "district", district.Name, score)
|
|
|
+ }
|
|
|
}
|
|
|
- // citys := e.NewDistrictCityMap[tmpDistrict]
|
|
|
- // for _, c := range citys {
|
|
|
- // tmpPbrief := c.P.Brief
|
|
|
- // if pbrief != "" && pbrief == tmpPbrief {
|
|
|
- // if !repeatPb[tmpPbrief] {
|
|
|
- // PCDScore(j, "province", tmpPbrief, score)
|
|
|
- // repeatPb[tmpPbrief] = true
|
|
|
- // }
|
|
|
- // tmpCity := c.Name
|
|
|
- // if !repeatCb[tmpCity] {
|
|
|
- // PCDScore(j, "city", tmpCity, score)
|
|
|
- // repeatCb[tmpCity] = true
|
|
|
- // }
|
|
|
- // if !repeatDb[tmpDistrict] {
|
|
|
- // PCDScore(j, "district", tmpDistrict, score)
|
|
|
- // repeatDb[tmpDistrict] = true
|
|
|
- // }
|
|
|
- // } else if pbrief == "" {
|
|
|
- // if !repeatPb[tmpPbrief] {
|
|
|
- // PCDScore(j, "province", tmpPbrief, score)
|
|
|
- // repeatPb[tmpPbrief] = true
|
|
|
- // }
|
|
|
- // tmpCity := c.Name
|
|
|
- // if !repeatCb[tmpCity] {
|
|
|
- // PCDScore(j, "city", tmpCity, score)
|
|
|
- // repeatCb[tmpCity] = true
|
|
|
- // }
|
|
|
- // if !repeatDb[tmpDistrict] {
|
|
|
- // PCDScore(j, "district", tmpDistrict, score)
|
|
|
- // repeatDb[tmpDistrict] = true
|
|
|
- // }
|
|
|
- // }
|
|
|
- // }
|
|
|
}
|
|
|
+
|
|
|
+ // repeatPb := map[string]bool{}
|
|
|
+ // repeatCb := map[string]bool{}
|
|
|
+ // repeatDb := map[string]bool{}
|
|
|
+ // for _, district := range districts {
|
|
|
+ // tmpDistrict := district.Name
|
|
|
+ // tmpCity := district.C.Name
|
|
|
+ // tmpPbrief := district.C.P.Brief
|
|
|
+ // if !repeatPb[tmpPbrief] {
|
|
|
+ // PCDScore(j, "province", tmpPbrief, score)
|
|
|
+ // repeatPb[tmpPbrief] = true
|
|
|
+ // }
|
|
|
+ // if !repeatCb[tmpCity] {
|
|
|
+ // PCDScore(j, "city", tmpCity, score)
|
|
|
+ // repeatCb[tmpCity] = true
|
|
|
+ // }
|
|
|
+ // if !repeatDb[tmpDistrict] {
|
|
|
+ // PCDScore(j, "district", tmpDistrict, score)
|
|
|
+ // repeatDb[tmpDistrict] = true
|
|
|
+ // }
|
|
|
+ // }
|
|
|
}
|
|
|
|
|
|
func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
|
|
@@ -691,6 +720,7 @@ func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (s
|
|
|
}
|
|
|
return city, tmpcity
|
|
|
}
|
|
|
+
|
|
|
func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
|
|
|
for _, d := range finishD { //取最高分与province匹配的district
|
|
|
citys := e.NewDistrictCityMap[d]
|
|
@@ -706,15 +736,15 @@ func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcit
|
|
|
district = d
|
|
|
return city, district
|
|
|
}
|
|
|
- } else { //多个city
|
|
|
- for _, tc := range tmpcity {
|
|
|
- if tc == c.Name {
|
|
|
+ } /*else { //多个city
|
|
|
+ for _, tc := range tmpcity { //多个city根据district最高分取
|
|
|
+ if tc == c.Name && len(finishD) == 1 {
|
|
|
city = c.Name
|
|
|
district = d
|
|
|
return city, district
|
|
|
}
|
|
|
}
|
|
|
- }
|
|
|
+ }*/
|
|
|
|
|
|
// if len(citys) == 1 { //区对应一个市
|
|
|
// if c.P.Brief == area {
|