|
@@ -0,0 +1,226 @@
|
|
|
|
+package extract
|
|
|
|
+
|
|
|
|
import (
	"sort"

	ju "jy/util"
)
|
|
|
|
+
|
|
|
|
+// 未提取到标准地域,进行补充
|
|
|
|
+func (e *ExtractTask) ExtractRegionOtherInfo(j *ju.Job, tmp *map[string]interface{}) {
|
|
|
|
+ e.GetMatchScores(j)
|
|
|
|
+ finishA, finishC, finishD := []string{}, []string{}, []string{}
|
|
|
|
+ if len(j.FullAreaScore) > 0 {
|
|
|
|
+ finishA = GetHighestScoreArr(j.FullAreaScore)
|
|
|
|
+ e.RemoveScoreRegion(finishA, j)
|
|
|
|
+ finishC = GetHighestScoreArr(j.FullCityScore)
|
|
|
|
+ finishD = GetHighestScoreArr(j.FullDistrictScore)
|
|
|
|
+ } else {
|
|
|
|
+ finishA = GetHighestScoreArr(j.SimAreaScore)
|
|
|
|
+ e.RemoveScoreRegion(finishA, j)
|
|
|
|
+ finishC = GetHighestScoreArr(j.SimCityScore)
|
|
|
|
+ finishD = GetHighestScoreArr(j.SimDistrictScore)
|
|
|
|
+ }
|
|
|
|
+ s_area, s_city, s_district := e.GetFinallyScoreRegion(finishA, finishC, finishD)
|
|
|
|
+ e.StandardizedegionInfo(&s_area, &s_city, &s_district)
|
|
|
|
+ //对于补充的地域信息进行标准化校验......
|
|
|
|
+ if s_area != "" && s_area != "全国" {
|
|
|
|
+ (*tmp)["s_area"] = s_area
|
|
|
|
+ (*tmp)["s_city"] = s_city
|
|
|
|
+ (*tmp)["s_district"] = s_district
|
|
|
|
+ s_rdata := e.StandardCheckCity(s_area, s_city, s_district)
|
|
|
|
+ for k, v := range s_rdata {
|
|
|
|
+ (*tmp)["s_"+k] = v
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// 获取所有匹配地域并赋分数
|
|
|
|
+func (e *ExtractTask) GetMatchScores(j *ju.Job) {
|
|
|
|
+ j.FullAreaScore, j.FullCityScore, j.FullDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{}
|
|
|
|
+ j.SimAreaScore, j.SimCityScore, j.SimDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{}
|
|
|
|
+ rf_area, rf_city, rf_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
|
|
|
|
+ rs_area, rs_city, rs_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
|
|
|
|
+ for _, text := range e.Seg_SV.Cut(j.Content, true) {
|
|
|
|
+ if text == "" {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ for pos_full, trie_full := range e.Trie_Fulls {
|
|
|
|
+ if trie_full.Get(text) {
|
|
|
|
+ if pos_full == 0 { //省全称
|
|
|
|
+ if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !rf_area[tmpPbrief] { //取简称
|
|
|
|
+ j.FullAreaScore[tmpPbrief] += 1.0
|
|
|
|
+ rf_area[tmpPbrief] = true
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ } else if pos_full == 1 { //市全称
|
|
|
|
+ if cfMap := e.CityFullMap[text]; cfMap != nil {
|
|
|
|
+ if !rf_area[cfMap.P.Brief] {
|
|
|
|
+ j.FullAreaScore[cfMap.P.Brief] += 1.0
|
|
|
|
+ rf_area[cfMap.P.Brief] = true
|
|
|
|
+ }
|
|
|
|
+ if !rf_city[cfMap.Name] {
|
|
|
|
+ j.FullCityScore[cfMap.Name] += 1.0
|
|
|
|
+ rf_city[cfMap.Name] = true
|
|
|
|
+ }
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ } else if pos_full == 2 { //区全称
|
|
|
|
+ citys := e.DistrictCityMap[text]
|
|
|
|
+ if len(citys) > 0 {
|
|
|
|
+ if !rf_district[text] {
|
|
|
|
+ j.FullDistrictScore[text] += 1.0
|
|
|
|
+ rf_district[text] = true
|
|
|
|
+ }
|
|
|
|
+ for _, c := range citys {
|
|
|
|
+ if !rf_city[c.Name] {
|
|
|
|
+ j.FullCityScore[c.Name] += 1.0
|
|
|
|
+ rf_city[c.Name] = true
|
|
|
|
+ }
|
|
|
|
+ if !rf_area[c.P.Brief] {
|
|
|
|
+ j.FullAreaScore[c.P.Brief] += 1.0
|
|
|
|
+ rf_area[c.P.Brief] = true
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ //简称匹配
|
|
|
|
+ for pos_sim, trie_sim := range e.Trie_Sims {
|
|
|
|
+ if trie_sim.Get(text) {
|
|
|
|
+ if pos_sim == 0 && !rs_area[text] { //省简称
|
|
|
|
+ j.SimAreaScore[text] += 1.0
|
|
|
|
+ rs_area[text] = true
|
|
|
|
+ break
|
|
|
|
+ } else if pos_sim == 1 { //市简称
|
|
|
|
+ if cbMap := e.CityBriefMap[text]; cbMap != nil {
|
|
|
|
+ if !rs_area[cbMap.P.Brief] {
|
|
|
|
+ j.SimAreaScore[cbMap.P.Brief] += 1.0
|
|
|
|
+ rs_area[cbMap.P.Brief] = true
|
|
|
|
+ }
|
|
|
|
+ if !rs_city[cbMap.Name] {
|
|
|
|
+ j.SimCityScore[cbMap.Name] += 1.0
|
|
|
|
+ rs_city[cbMap.Name] = true
|
|
|
|
+ }
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ } else if pos_sim == 2 { //区简称
|
|
|
|
+ dfull_citys := e.DistrictSimAndAll[text]
|
|
|
|
+ if len(dfull_citys) == 1 {
|
|
|
|
+ for _, dfull_city := range dfull_citys {
|
|
|
|
+ for dfull, ctmp := range dfull_city { //dfull:简称对应的全称
|
|
|
|
+ if ctmp == nil {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ if !rs_district[dfull] {
|
|
|
|
+ j.SimDistrictScore[dfull] += 1.0
|
|
|
|
+ rs_district[dfull] = true
|
|
|
|
+ }
|
|
|
|
+ if !rs_city[ctmp.Name] {
|
|
|
|
+ j.SimCityScore[ctmp.Name] += 1.0
|
|
|
|
+ rs_city[ctmp.Name] = true
|
|
|
|
+ }
|
|
|
|
+ if !rs_area[ctmp.P.Brief] {
|
|
|
|
+ j.SimAreaScore[ctmp.P.Brief] += 1.0
|
|
|
|
+ rs_area[ctmp.P.Brief] = true
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
// GetHighestScoreArr returns the names that share the highest score in m.
//
// Entries with an empty name are ignored, as are entries whose score is below
// zero (or NaN), so a map holding only such entries yields nil. When several
// names tie for the top score all of them are returned, sorted in ascending
// order: map iteration order is randomised in Go, and callers that take the
// first element on a tie must see the same winner on every run.
func GetHighestScoreArr(m map[string]float64) []string {
	var winners []string
	best := 0.0
	for name, score := range m {
		// Written as !(best <= score) rather than score < best so that NaN
		// scores keep being skipped.
		if name == "" || !(best <= score) {
			continue
		}
		if score != best {
			// Strictly better score: earlier winners are obsolete.
			winners = winners[:0]
			best = score
		}
		winners = append(winners, name)
	}
	sort.Strings(winners)
	return winners
}
|
|
|
|
+
|
|
|
|
+// 移除干扰地域
|
|
|
|
+func (e *ExtractTask) RemoveScoreRegion(finishP []string, j *ju.Job) {
|
|
|
|
+ if len(j.FullDistrictScore) > 0 {
|
|
|
|
+ for d, _ := range j.FullDistrictScore {
|
|
|
|
+ tmpCitys := e.DistrictCityMap[d]
|
|
|
|
+ for _, c := range tmpCitys {
|
|
|
|
+ if j.FullCityScore[c.Name] != 0 {
|
|
|
|
+ tmpPb := c.P.Brief
|
|
|
|
+ flag := false
|
|
|
|
+ for _, p := range finishP {
|
|
|
|
+ if tmpPb == p {
|
|
|
|
+ flag = true
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if !flag {
|
|
|
|
+ delete(j.FullCityScore, c.Name)
|
|
|
|
+ delete(j.FullDistrictScore, d)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if len(j.FullCityScore) > 0 {
|
|
|
|
+ for tmpcity, _ := range j.FullCityScore {
|
|
|
|
+ c := e.CityFullMap[tmpcity]
|
|
|
|
+ if c == nil {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ tmpPb := c.P.Brief
|
|
|
|
+ flag := false
|
|
|
|
+ for _, p := range finishP {
|
|
|
|
+ if tmpPb == p {
|
|
|
|
+ flag = true
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if !flag {
|
|
|
|
+ delete(j.FullCityScore, tmpcity)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// 获取最后分数地域
|
|
|
|
+func (e *ExtractTask) GetFinallyScoreRegion(finishA, finishC, finishD []string) (string, string, string) {
|
|
|
|
+ s_area, s_city, s_district := "", "", ""
|
|
|
|
+ tmpcity := []string{}
|
|
|
|
+ if len(finishA) == 1 {
|
|
|
|
+ s_area = finishA[0]
|
|
|
|
+ s_city, tmpcity = NewGetCity(s_area, s_city, e, finishC, tmpcity)
|
|
|
|
+ s_city, s_district = NewGetDistrict(s_area, s_city, s_district, e, finishD, tmpcity)
|
|
|
|
+ } else if len(finishA) > 1 {
|
|
|
|
+ if len(finishC) == 1 {
|
|
|
|
+ s_city = finishC[0]
|
|
|
|
+ if cfMap := e.CityFullMap[s_city]; cfMap != nil {
|
|
|
|
+ s_area = cfMap.P.Brief
|
|
|
|
+ tmpcity = append(tmpcity, s_city)
|
|
|
|
+ s_city, s_district = NewGetDistrict(s_area, s_city, s_district, e, finishD, tmpcity)
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ s_area = finishA[0] //抽取结果直接赋值
|
|
|
|
+ s_city, tmpcity = NewGetCity(s_area, s_city, e, finishC, tmpcity)
|
|
|
|
+ s_city, s_district = NewGetDistrict(s_area, s_city, s_district, e, finishD, tmpcity)
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if s_city != "" && s_city == s_district {
|
|
|
|
+ s_district = ""
|
|
|
|
+ }
|
|
|
|
+ return s_area, s_city, s_district
|
|
|
|
+}
|