Forráskód Böngészése

抽取备份,修改逻辑

zhengkun 1 éve
szülő
commit
4d29c7d023

+ 2 - 1
extcity/src/ext/extRegion.go

@@ -221,7 +221,7 @@ func (e *ExtractTask) GetRegionByTentativeJsonData(jsondata map[string]interface
 	regions := map[string]map[string]map[string]string{}
 	if jsondata != nil {
 		if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
-			e.GetRegionFromText(a_c_d, &regions, false, false, 1)
+			e.GetRegionFromText(a_c_d, &regions, false, false, 2)
 		}
 	}
 	if len(regions) == 1 {
@@ -379,6 +379,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称城市
 	fullCityArr := e.SensitiveFullCity.FindAll(detail)
 	if len(fullCityArr) == 1 {
+		//校验...
 		for _, v := range fullCityArr {
 			if cityMap := e.CityFullMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" || cityMap.P.Brief == *area {

+ 18 - 2
src/jy/extract/extractcity_new.go

@@ -148,7 +148,7 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, jf *ju.Job, tmp *map[string]i
 		})
 	}
 
-	//文本正文-提取补充
+	//文本正文-提取补充-敏感词识别...
 	if f_area == "全国" || f_area == "" || f_city == "" {
 		if b := e.NewVerifySensitiveInfo(j.Title+"\n"+j.Content, &f_area, &f_city, &f_district); b && isLog {
 			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
@@ -531,7 +531,8 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全程省份
 	if *area == "" || *area == "全国" {
 		fullProvinceArr := e.SensitiveFullProvince.FindAll(detail)
-		if len(fullProvinceArr) == 1 {
+		if len(fullProvinceArr) == 1 { //再次计算
+			fullProvinceArr = e.findAmbiguityRegion(detail, fullProvinceArr[0])
 			for _, v := range fullProvinceArr {
 				if sim_province := e.ProvinceMap[v]; sim_province != "" {
 					*area = sim_province
@@ -543,6 +544,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称城市
 	fullCityArr := e.SensitiveFullCity.FindAll(detail)
 	if len(fullCityArr) == 1 {
+		fullCityArr = e.findAmbiguityRegion(detail, fullCityArr[0])
 		for _, v := range fullCityArr {
 			if cityMap := e.CityFullMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
@@ -556,6 +558,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称区县
 	fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
 	if len(fullDistrictArr) == 1 {
+		fullDistrictArr = e.findAmbiguityRegion(detail, fullDistrictArr[0])
 		for _, v := range fullDistrictArr {
 			if citys := e.DistrictCityMap[v]; len(citys) == 1 {
 				if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
@@ -570,6 +573,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//简称城市
 	simCityArr := e.SensitiveSimCity.FindAll(detail)
 	if len(simCityArr) == 1 {
+		simCityArr = e.findAmbiguityRegion(detail, simCityArr[0])
 		for _, v := range simCityArr {
 			if cityMap := e.CityBriefMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" {
@@ -591,6 +595,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	if *area == "" || *area == "全国" {
 		simProvinceArr := e.SensitiveSimProvince.FindAll(detail)
 		if len(simProvinceArr) == 1 {
+			simProvinceArr = e.findAmbiguityRegion(detail, simProvinceArr[0])
 			for _, v := range simProvinceArr {
 				if v != "" {
 					*area = v
@@ -683,6 +688,17 @@ func (e *ExtractTask) GetRegionByTentativeOperator(winner string, all_regions *m
 	}
 }
 
+// findAmbiguityRegion cuts text with e.Seg_Full and returns []string{key} only when key is itself one of the cut words, otherwise an empty slice.
+func (e *ExtractTask) findAmbiguityRegion(text string, key string) []string {
+	for _, token := range e.Seg_Full.Cut(text, true) {
+		if token != key {
+			continue
+		}
+		return []string{key}
+	}
+	return []string{}
+}
+
 //初步确认~站点
 //func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
 //	area, city, district := "", "", ""

+ 0 - 1054
src/jy/extract/extractcity_old.go

@@ -1,1054 +0,0 @@
-package extract
-
-import (
-	. "jy/pretreated"
-	ju "jy/util"
-	qu "qfw/util"
-	"strings"
-
-	log "github.com/donnie4w/go-logger/logger"
-)
-
-// NewExtractCity scores province/city/district candidates for job j and writes the chosen "area", "city" and "district" into resulttmp.
-func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}) {
-	/*
-		High accuracy:
-			1. crawler data (jsondata)
-			2. buyer (purchasing unit) library
-			3. postcode
-			4. landline phone number
-			5. site(todo)
-		Low accuracy: (when the full-name library matches, the short-name library is not used)
-			1. city full-name library (buyeraddr;title,projectname)
-			2. city short-name library (buyeraddr;title,projectname)
-	*/
-	defer qu.Catch()
-
-	// Initialise the score maps on the job if they are missing.
-	if j.FullAreaScore == nil {
-		j.FullAreaScore = make(map[string]float64)
-	}
-	if j.FullCityScore == nil {
-		j.FullCityScore = make(map[string]float64)
-	}
-	if j.FullDistrictScore == nil {
-		j.FullDistrictScore = make(map[string]float64)
-	}
-	if j.SimAreaScore == nil {
-		j.SimAreaScore = make(map[string]float64)
-	}
-	if j.SimCityScore == nil {
-		j.SimCityScore = make(map[string]float64)
-	}
-	if j.SimDistrictScore == nil {
-		j.SimDistrictScore = make(map[string]float64)
-	}
-	// Scores for p/c/d produced by district/county short-name matches; they are merged later only if full-name or p/c short-name matching produced results, otherwise they are dropped.
-	pscore := make(map[string]float64)
-	cscore := make(map[string]float64)
-	dscore := make(map[string]float64)
-
-	sm := NewSortMap()
-	// 1. extract from jsondata
-	e.NewGetCityByJsonData(j)
-	// 2. extract from the site library
-	e.NewGetCityBySite(j)
-	// 3. extract from the buyer library (no buyer library available yet)
-
-	// 4. extract from the postcode
-	buyerzipcode := qu.ObjToString((*resulttmp)["buyerzipcode"])
-	e.NewGetCityByPostCode(j, buyerzipcode)
-	// 5. extract from the landline area code
-	buyertel := qu.ObjToString((*resulttmp)["buyertel"])
-	e.NewGetCityByAreaCode(j, buyertel)
-	// 6. collect buyeraddr, title, projectname (and related fields) for extraction
-	buyeraddr := qu.ObjToString((*resulttmp)["buyeraddr"])
-	title := qu.ObjToString((*resulttmp)["title"])
-	projectname := qu.ObjToString((*resulttmp)["projectname"])
-	buyer := qu.ObjToString((*resulttmp)["buyer"])
-	addressing := qu.ObjToString((*resulttmp)["addressing"])
-	sm.AddKey("buyeraddr", buyeraddr)
-	sm.AddKey("buyer", buyer)
-	sm.AddKey("title", title)
-	sm.AddKey("projectname", projectname)
-	sm.AddKey("addressing", addressing) // newly added auxiliary address field
-	if projectaddr, isok := (*resulttmp)["projectaddr"].(string); isok {
-		sm.AddKey("projectaddr", projectaddr)
-	}
-	if bidopenaddress, isok := (*resulttmp)["bidopenaddress"].(string); isok {
-		sm.AddKey("bidopenaddress", bidopenaddress)
-	}
-	// 7. extract from buyeraddr, buyer, title, projectname
-	e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
-	//qu.Debug("全称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
-	//qu.Debug("简称打分后结果---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
-	// Merge full-name and short-name scores.
-	MergeFullSimScore(j) // merges full/short-name scores from buyer, buyeraddr, title, projectname
-	//qu.Debug("全称简称合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
-	// Merge the district short-name scores.
-	//qu.Debug("pcd=====", pscore, cscore, dscore)
-	MergeScores(j, &pscore, &cscore, &dscore) // merges the p/c/d scores from district short-name matches
-	//qu.Debug("合并区简称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
-
-	j.SimAreaScore = map[string]float64{}
-	j.SimCityScore = map[string]float64{}
-	j.SimDistrictScore = map[string]float64{}
-
-	// 8. extract from detail
-	if len(j.FullAreaScore) > 0 && len(j.FullCityScore) > 0 { // only when the steps above found both a province and a city is detail consulted
-		e.NewGetCityByDetail(j)
-	}
-	//qu.Debug("detail打分后全称---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
-	//qu.Debug("detail打分后简称---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
-	MergeFullSimScore(j) // merges full/short-name scores from detail
-	//qu.Debug("detail合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
-
-	finishP := HighestScoreArr(j.FullAreaScore) // provinces with the highest score
-	e.RemoveCD(finishP, j)                      // drop cities that belong to noise provinces, and their districts as well. 5d2bd4aba5cb26b9b769d18e
-	//qu.Debug("去除干扰项后的city和district得分---", finishP, j.FullCityScore, j.FullDistrictScore)
-	// Pick the results.
-	finishC := HighestScoreArr(j.FullCityScore)
-	finishD := HighestScoreArr(j.FullDistrictScore)
-	arearesult := ""
-	cityresult := ""
-	districtresult := ""
-	tmpcity := []string{}
-	if len(finishP) == 1 { // exactly one top-scoring province
-		arearesult = finishP[0] // take the extracted result directly
-		cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
-		cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
-	} else if len(finishP) > 1 { // several top-scoring provinces
-		if len(finishC) == 1 {
-			cityresult = finishC[0]
-			if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
-				arearesult = cfMap.P.Brief
-				tmpcity = append(tmpcity, cityresult)
-				cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
-			}
-		} else { // several candidate cities too (multiple provinces and cities)
-			//arearesult = finishP[0] // take the extracted result directly
-			//cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
-			//cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
-			arearesult = "全国"
-		}
-	}
-	if cityresult != "" && cityresult == districtresult {
-		districtresult = ""
-	}
-	//qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
-	// Municipalities: the city is fixed by the province.
-	if arearesult == "北京" {
-		cityresult = "北京市"
-		if districtresult == "北京朝阳" { // special case (北京朝阳中西医结合急诊抢救中心: 5a84079740d2d9bbe88bad90)
-			districtresult = "朝阳区"
-		}
-	} else if arearesult == "天津" {
-		cityresult = "天津市"
-	} else if arearesult == "上海" {
-		cityresult = "上海市"
-	} else if arearesult == "重庆" {
-		cityresult = "重庆市"
-	}
-	if arearesult == "" {
-		arearesult = "全国"
-	} /* else if cityresult == "" {
-		if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
-			cityresult = pbMap.Cap
-			resulttmp["defaultpcap"] = true
-		}
-	}*/
-	//qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
-	(*resulttmp)["area"] = arearesult
-	(*resulttmp)["city"] = cityresult
-	(*resulttmp)["district"] = districtresult
-
-	// Verification: when buyer matches XjbtReg and no city was found, map it via CheckingXjbtCity (Xinjiang Production and Construction Corps).
-	if XjbtReg.MatchString(buyer) && cityresult == "" {
-		a, c, d, ok := e.CheckingXjbtCity(buyer)
-		if ok {
-			(*resulttmp)["area"] = a
-			(*resulttmp)["city"] = c
-			(*resulttmp)["district"] = d
-		}
-	}
-
-	// When only a province was found, try the sensitive-word check on detail to fill in the city.
-	if arearesult != "全国" && cityresult == "" {
-		sensitive_city := e.SensitiveCityData(qu.ObjToString((*j.Data)["detail"]), arearesult)
-		if sensitive_city != "" {
-			(*resulttmp)["city"] = sensitive_city
-			(*resulttmp)["is_sensitive"] = 1
-		}
-	}
-
-}
-
-// NewGetCityByJsonData scores province/city/district taken from the crawler's j.Jsondata and returns the validated province and city plus the raw district.
-func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
-	defer qu.Catch()
-	if j.Jsondata != nil {
-		jsondata := *j.Jsondata
-		// Read province and city information from jsondata.
-		if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
-			p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) // full-name matching
-			GetByACDSimJb(p, c, d, a_c_d, e, j)            // short-name matching
-		}
-		city, _ = jsondata["city"].(string)         // city full name or short name
-		province, _ = jsondata["area"].(string)     // province short name
-		district, _ = jsondata["district"].(string) // district full name
-	}
-	PCDScore(j, "district", district, 5, true) // score the district
-	bp := false
-	if province != "" {
-		if e.ProvinceBriefMap[province] != nil { // check whether the crawled province is a known one (e.g. not 全国)
-			bp = true // province is valid
-		}
-	}
-	pbrief := ""
-	if city != "" {
-		cityfullmap := e.CityFullMap[city] // check whether city is a known full name
-		if cityfullmap != nil {
-			pbrief = cityfullmap.P.Brief // province short name
-		} else {
-			citybriefmap := e.CityBriefMap[city] // check whether city is a known short name
-			if citybriefmap != nil {
-				city = citybriefmap.Name // replace the city short name with its full name
-				pbrief = citybriefmap.P.Brief
-			}
-		}
-	}
-	if bp {
-		if pbrief == province { // crawled province and city agree
-			PCDScore(j, "city", city, 5, true)
-		} else { // pbrief does not match province (city is empty or wrong here)
-			city = ""
-		}
-		PCDScore(j, "province", province, 5, true)
-	} else { // province wrong or empty: use the city's pbrief as the province
-		if pbrief != "" {
-			province = pbrief
-			PCDScore(j, "province", province, 5, true)
-			PCDScore(j, "city", city, 5, true)
-		} else {
-			province = ""
-			city = ""
-		}
-	}
-	return
-
-}
-
-// GetByACDFullJb scores full-name province/city/district matches found in the area_city_district string and returns the updated pbrief, city and district.
-func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
-	text := e.Seg_PCD.Cut(a_c_d, true)
-	repeatPb := map[string]bool{}
-	for _, full := range text {
-		if e.Trie_Full_Province.Get(full) { // a_c_d contains a province full name
-			if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
-				pbrief = tmpPbrief // province short name
-				PCDScore(j, "province", pbrief, 5, true)
-			}
-		} else if e.Trie_Full_City.Get(full) { // a_c_d contains a city full name
-			if cfMap := e.CityFullMap[full]; cfMap != nil {
-				tmpcity := cfMap.Name                    // city full name
-				tmpPbrief := cfMap.P.Brief               // province short name
-				if pbrief != "" && pbrief == tmpPbrief { // province short name already known
-					city = tmpcity
-					PCDScore(j, "city", city, 5, true)
-				} else if pbrief == "" {
-					city = tmpcity
-					pbrief = tmpPbrief
-					PCDScore(j, "city", city, 5, true)
-					PCDScore(j, "province", pbrief, 5, true)
-				}
-			}
-		} else if e.Trie_Full_District.Get(full) { // a_c_d contains a district full name (a district may map to several cities)
-			carr := e.DistrictCityMap[full]
-			if len(carr) > 0 {
-				district = full
-				PCDScore(j, "district", district, 5, true)
-				for _, c := range carr {
-					tmpcity := c.Name      // city full name
-					tmpPbrief := c.P.Brief // province short name
-					if pbrief == "" {      // no province matched so far
-						PCDScore(j, "city", tmpcity, 5, true)
-						if !repeatPb[tmpPbrief] {
-							PCDScore(j, "province", tmpPbrief, 5, true)
-							repeatPb[tmpPbrief] = true
-						}
-					} else { // a province is already known
-						if pbrief != tmpPbrief { // city for this district disagrees with the earlier match: treated as noise
-							PCDScore(j, "city", tmpcity, -5, true)
-							PCDScore(j, "province", tmpPbrief, -5, true)
-						} else { // agrees with the earlier match
-							if city == "" { // handles area_city_district like (河南省二七区): the city was not matched above, so it is supplied via the district
-								PCDScore(j, "city", tmpcity, 5, true)
-							}
-						}
-					}
-				}
-			}
-		}
-	}
-	return pbrief, city, district
-}
-
-// GetByACDSimJb scores short-name province/city/district matches found in the area_city_district string, for the levels that full-name matching left empty.
-func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
-	text := e.Seg_PCD.Cut(a_c_d, true)
-	repeatPb := map[string]bool{}
-	for _, sim := range text {
-		if pbrief == "" && e.Trie_Sim_Province.Get(sim) { // full-name matching found no definite province
-			if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
-				pbrief = pbMap.Brief
-				PCDScore(j, "province", pbrief, 5, true) // add score
-				//PCDSubtractScore(e, j, "province", pbrief, -5) // deduct score (area_city_district:河南鼓楼区)
-			}
-		} else if city == "" && e.Trie_Sim_City.Get(sim) { // full-name matching found no definite city
-			if cbMap := e.CityBriefMap[sim]; cbMap != nil {
-				tmpcity := cbMap.Name
-				tmpPbrief := cbMap.P.Brief
-				if pbrief != "" && pbrief == tmpPbrief {
-					city = tmpcity
-					PCDScore(j, "city", city, 5, true)
-				} else if pbrief == "" {
-					city = tmpcity
-					pbrief = tmpPbrief
-					PCDScore(j, "city", city, 5, true)
-					PCDScore(j, "province", pbrief, 5, true)
-					//PCDSubtractScore(e, j, "city", tmpcity, -5) // deduct score (area_city_district:开封鼓楼区)
-				}
-			}
-		} else if district == "" && e.Trie_Sim_District.Get(sim) { // full-name matching found no definite district
-			dfullarr := e.DistrictSimAndAll[sim]
-			if len(dfullarr) > 0 {
-				PCDScore(j, "district", sim, 5, true)
-				for _, dfullAndCity := range dfullarr { // every full name that this district short name maps to
-					for _, c := range dfullAndCity {
-						if c == nil {
-							continue
-						}
-						tmpcity := c.Name      // city full name
-						tmpPbrief := c.P.Brief // province short name
-						if pbrief == "" {      // no province matched so far
-							PCDScore(j, "city", tmpcity, 5, true)
-							if !repeatPb[tmpPbrief] {
-								PCDScore(j, "province", tmpPbrief, 5, true)
-								repeatPb[tmpPbrief] = true
-							}
-						} else { // a province is already known
-							if pbrief != tmpPbrief { // city for this district disagrees with the earlier match: treated as noise
-								PCDScore(j, "city", tmpcity, -5, true)
-								PCDScore(j, "province", tmpPbrief, -5, true)
-							} else { // agrees with the earlier match
-								if city == "" { // handles area_city_district like (河南省二七区): the city was not matched above, so it is supplied via the district
-									PCDScore(j, "city", tmpcity, 5, true)
-								}
-							}
-						}
-					}
-				}
-			}
-		}
-	}
-}
-
-// NewGetCityBySite looks up the job's "site" value in e.SiteCityMap and scores
-// whichever of province, city and district the entry provides. Empty values and
-// the literal "null" are ignored; the province "全国" is ignored as well.
-func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
-	site, _ := (*j.Data)["site"].(string)
-	entry := e.SiteCityMap[site]
-	if entry == nil {
-		return
-	}
-	if p := entry.P; p != "" && p != "全国" && p != "null" {
-		PCDScore(j, "province", p, 5, true)
-	}
-	if c := entry.C; c != "" && c != "null" {
-		PCDScore(j, "city", c, 5, true)
-	}
-	if d := entry.D; d != "" && d != "null" {
-		PCDScore(j, "district", d, 5, true)
-	}
-}
-
-// NewGetCityByPostCode looks postcode up in e.PostCodeMap and scores the
-// province, city and candidate districts of the entry. A postcode may map to
-// several districts: a single non-empty candidate scores 5, otherwise each
-// candidate scores 3. The returned district is always empty; only province and
-// city are reported back.
-func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
-	defer qu.Catch()
-	entry := e.PostCodeMap[postcode]
-	if entry == nil {
-		return
-	}
-	province, city = entry.P, entry.C
-	candidates := entry.D
-	districtScore := 3.0
-	if len(candidates) == 1 && candidates[0] != "" {
-		districtScore = 5.0
-	}
-	for _, name := range candidates {
-		PCDScore(j, "district", name, districtScore, true)
-	}
-	PCDScore(j, "province", province, 5, true)
-	PCDScore(j, "city", city, 5, true)
-	return
-}
-
-// NewGetCityByAreaCode derives province/city from the landline area code at
-// the start of buyertel. Only numbers of at least 11 bytes that begin with "0"
-// are considered. The 4-character prefix is tried first, then the 3-character
-// one; the first prefix present in e.AreaCodeMap wins. The city is scored only
-// when the area code maps to exactly one city (score 4 for "0371", else 5).
-// The returned district is always empty.
-func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
-	defer qu.Catch()
-	if len(buyertel) < 11 || !strings.HasPrefix(buyertel, "0") {
-		return
-	}
-	for width := 4; width >= 3; width-- {
-		prefix := buyertel[:width]
-		entry := e.AreaCodeMap[prefix]
-		if entry == nil {
-			continue
-		}
-		province = entry.P
-		if cities := entry.C; len(cities) == 1 {
-			city = cities[0]
-			cityScore := float64(5)
-			if prefix == "0371" {
-				cityScore = float64(4)
-			}
-			PCDScore(j, "city", city, cityScore, true)
-		}
-		PCDScore(j, "province", province, 5, true)
-		break
-	}
-	return
-}
-
-func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
-	/*
-		1. segment each field in sm into words
-		2. match and score full names of province, city, district, street and neighbourhood committee
-		3. match and score short names of province, city and district
-	*/
-	ts := 0.5 // bonus added to scores: 0.5 for the first two keys, 0.2 for the rest
-	for i, from := range sm.Keys { //buyer;buyeraddr;title;projectname
-		if i > 1 {
-			ts = 0.2
-		}
-		p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" // province/city/district found so far are reset for every field
-		str, _ := sm.Map[from].(string)
-		jbText := e.Seg_SV.Cut(str, true)
-		for jb_index, text := range jbText {
-			if len([]rune(text)) == 1 {
-				continue
-			}
-			// Full-name matching
-			//qu.Debug("text------", text)
-			for pos_full, trie_full := range e.Trie_Fulls {
-				if trie_full.Get(text) {
-					if pos_full == 0 && p_full == "" { // province full name
-						if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { // convert to short name
-							p_full = tmpPbrief
-							PCDScore(j, "province", p_full, 4+ts, true)
-							break
-						}
-					} else if pos_full == 1 && c_full == "" { // city full name
-						if cfMap := e.CityFullMap[text]; cfMap != nil {
-							tmpPbrief := cfMap.P.Brief
-							if p_full == "" {
-								p_full = tmpPbrief
-								c_full = cfMap.Name
-								PCDScore(j, "province", p_full, 4+ts, true)
-								PCDScore(j, "city", c_full, 4+ts, true)
-								break
-							} else if p_full == tmpPbrief {
-								c_full = cfMap.Name
-								PCDScore(j, "province", tmpPbrief, 4+ts, true) //
-								PCDScore(j, "city", c_full, 4+ts, true)
-								break
-							} else if p_full != "" && p_full != tmpPbrief {
-								// city belongs to a different province: leave it untouched
-							}
-						}
-					} else if pos_full == 2 && d_full == "" { // district full name
-						repeatPb := map[string]bool{}
-						isOk := false
-						districtOk := false
-						citys := e.DistrictCityMap[text]
-						for _, c := range citys {
-							tmpPbrief := c.P.Brief
-							if p_full == tmpPbrief { // same province
-								d_full = text
-								if c_full == "" {
-									c_full = c.Name
-									PCDScore(j, "city", c_full, 4+ts, true)
-									PCDScore(j, "province", tmpPbrief, 4+ts, true) //
-								}
-								isOk = true
-								districtOk = true
-							} else if p_full == "" { // no province yet
-								districtOk = true
-								if len(citys) == 1 { // maps to exactly one city
-									p_full = tmpPbrief
-									c_full = c.Name
-									d_full = text
-									PCDScore(j, "province", p_full, 4+ts, true)
-									PCDScore(j, "city", c_full, 4+ts, true)
-									isOk = true
-								} else { // several cities: score only, do not assign
-									if !repeatPb[tmpPbrief] {
-										PCDScore(j, "province", tmpPbrief, 2+ts, true)
-										repeatPb[tmpPbrief] = true
-									}
-									//PCDScore(j, "province", tmpPbrief, 2, true)
-									PCDScore(j, "city", c.Name, 2+ts, true)
-								}
-							} else if p_full != "" && p_full != tmpPbrief { // noise: deduct score
-								if !repeatPb[tmpPbrief] {
-									PCDScore(j, "province", tmpPbrief, -5, true)
-									repeatPb[tmpPbrief] = true
-								}
-								//PCDScore(j, "province", tmpPbrief, -5, true)
-								PCDScore(j, "city", c.Name, -5, true)
-							}
-						}
-						if districtOk {
-							PCDScore(j, "district", text, 4+ts, true)
-						} else {
-							PCDScore(j, "district", text, -5, true)
-						}
-						if isOk {
-							break
-						}
-					} else if pos_full == 3 { // street full name
-						districts := e.StreetDistrictMap[text]
-						if len(districts) == 1 { // street is unique
-							DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
-						}
-					} else if pos_full == 4 { // neighbourhood committee full name (currently disabled)
-						//districts := e.CommunityDistrictMap[text]
-						//if len(districts) == 1 { // committee is unique
-						//	DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
-						//}
-					}
-				}
-			}
-			//qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
-			// Short-name matching
-			for pos_sim, trie_sim := range e.Trie_Sims {
-				if trie_sim.Get(text) {
-					if pos_sim == 0 && p_sim == "" { // province short name
-						p_sim = text
-						PCDScore(j, "province", p_sim, 3+ts, false)
-						break
-					} else if pos_sim == 1 { // city short name
-						if cbMap := e.CityBriefMap[text]; cbMap != nil {
-							tmpPbrief := cbMap.P.Brief
-							if p_sim == "" {
-								score := 2.0 + ts
-								if tmpPbrief == p_full {
-									score += 1.0
-								}
-								p_sim = tmpPbrief
-								c_sim = cbMap.Brief
-								PCDScore(j, "province", p_sim, score, false)
-								PCDScore(j, "city", cbMap.Name, score, false)
-								break
-							} else if p_sim == tmpPbrief {
-								c_sim = cbMap.Brief
-								PCDScore(j, "city", cbMap.Name, 3+ts, false)
-								PCDScore(j, "province", tmpPbrief, 3+ts, false)
-								break
-							} else if p_sim != "" && p_sim != tmpPbrief { // examples: 上海宝冶集团有限公司南京分公司, 北京朝阳中西医结合急诊抢救中心
-								delete(j.SimAreaScore, p_sim)
-								c_sim = text      //
-								p_sim = tmpPbrief //
-								PCDScore(j, "province", tmpPbrief, 3+ts, false)
-								PCDScore(j, "city", cbMap.Name, 3+ts, false)
-							}
-						}
-					} else if pos_sim == 2 && d_sim == "" { // district short name
-						repeatPb := map[string]bool{}
-						repeatDb := map[string]bool{}
-						dfull_citys := e.DistrictSimAndAll[text]
-						for _, dfull_city := range dfull_citys {
-							for dfull, c := range dfull_city { // dfull: the full name behind the short name
-								if c == nil || c.P == nil {
-									continue
-								}
-								tmpPbrief := c.P.Brief
-								if p_sim == tmpPbrief { // same province
-									d_sim = text
-									PCDScore(j, "district", dfull, 2+ts, false)
-									if c_sim == "" {
-										c_sim = c.Brief
-										PCDScore(j, "city", c.Name, 2+ts, false)
-									}
-									PCDScore(j, "province", tmpPbrief, 2+ts, false) //
-								} else if p_sim == "" { // no province matched yet
-									if !repeatDb[dfull] {
-										PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
-										repeatDb[dfull] = true
-									}
-									if len(dfull_citys) == 1 {
-										PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
-										PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
-									} else {
-										if !repeatPb[tmpPbrief] {
-											PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
-											repeatPb[tmpPbrief] = true
-										}
-										PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
-									}
-
-									// Added special case, e.g. 津市高新区管委会: the first cut word is "津市" and nothing has been matched yet.
-									if jb_index == 0 && len(dfull_citys) == 1 && len(j.FullAreaScore) == 0 && len(j.SimAreaScore) == 0 {
-										PCDScore(j, "district", dfull, 0, false)
-										PCDScore(j, "city", c.Name, 0, false)
-										PCDScore(j, "province", tmpPbrief, 0, false) //
-									}
-
-								} else if p_sim != "" && p_sim != tmpPbrief {
-									if !repeatPb[tmpPbrief] {
-										PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
-										repeatPb[tmpPbrief] = true
-									}
-									PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
-									PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
-								}
-							}
-						}
-					}
-				}
-			}
-			//qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
-		}
-	}
-}
-
-func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) { // scores regions found in j.Content, each name counted at most once (score 1)
-	repeatP_full := map[string]bool{}
-	repeatC_full := map[string]bool{}
-	repeatD_full := map[string]bool{}
-	repeatP_sim := map[string]bool{}
-	repeatC_sim := map[string]bool{}
-	repeatD_sim := map[string]bool{}
-	detailRune := []rune(j.Content)
-	detail := j.Content
-	if len(detailRune) > 600 { // long content: keep only the first and last 300 runes
-		start := detailRune[:300]
-		end := detailRune[len(detailRune)-300:]
-		detail = string(start) + string(end)
-	}
-	for _, reg := range AgencyReg { // strip everything matched by the AgencyReg patterns before matching
-		detail = reg.ReplaceAllString(detail, "")
-	}
-	for _, text := range e.Seg_SV.Cut(detail, true) {
-		if len([]rune(text)) > 1 {
-			// Full-name matching
-			for pos_full, trie_full := range e.Trie_Fulls {
-				if trie_full.Get(text) {
-					if pos_full == 0 { // province full name
-						if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { // convert to short name
-							PCDScore(j, "province", tmpPbrief, 1, true)
-							repeatP_full[tmpPbrief] = true
-							break
-						}
-					} else if pos_full == 1 { // city full name
-						if cfMap := e.CityFullMap[text]; cfMap != nil {
-							if !repeatP_full[cfMap.P.Brief] {
-								PCDScore(j, "province", cfMap.P.Brief, 1, true)
-								repeatP_full[cfMap.P.Brief] = true
-							}
-							if !repeatC_full[cfMap.Name] {
-								PCDScore(j, "city", cfMap.Name, 1, true)
-								repeatC_full[cfMap.Name] = true
-							}
-							break
-						}
-					} else if pos_full == 2 { // district full name
-						citys := e.DistrictCityMap[text]
-						if len(citys) > 0 {
-							if !repeatD_full[text] {
-								PCDScore(j, "district", text, 1, true)
-								repeatD_full[text] = true
-							}
-							for _, c := range citys {
-								if !repeatC_full[c.Name] {
-									PCDScore(j, "city", c.Name, 1, true)
-									repeatC_full[c.Name] = true
-								}
-								if !repeatP_full[c.P.Brief] {
-									PCDScore(j, "province", c.P.Brief, 1, true)
-									repeatP_full[c.P.Brief] = true
-								}
-							}
-							break
-						}
-					} else if pos_full == 3 { // street full name
-						districts := e.StreetDistrictMap[text]
-						if len(districts) == 1 {
-							DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
-						}
-					} else if pos_full == 4 { // neighbourhood committee full name (currently disabled)
-						//districts := e.CommunityDistrictMap[text]
-						//if len(districts) == 1 {
-						//	DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
-						//}
-					}
-				}
-			}
-			//qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
-			// Short-name matching
-			for pos_sim, trie_sim := range e.Trie_Sims {
-				if trie_sim.Get(text) {
-					if pos_sim == 0 && !repeatP_sim[text] { // province short name
-						PCDScore(j, "province", text, 1, false)
-						repeatP_sim[text] = true
-						break
-					} else if pos_sim == 1 { // city short name
-						if cbMap := e.CityBriefMap[text]; cbMap != nil {
-							if !repeatP_sim[cbMap.P.Brief] {
-								PCDScore(j, "province", cbMap.P.Brief, 1, false)
-								repeatP_sim[cbMap.P.Brief] = true
-							}
-							if !repeatC_sim[cbMap.Name] {
-								PCDScore(j, "city", cbMap.Name, 1, false)
-								repeatC_sim[cbMap.Name] = true
-							}
-							break
-						}
-					} else if pos_sim == 2 { // district short name
-						dfull_citys := e.DistrictSimAndAll[text]
-						if len(dfull_citys) == 1 {
-							for _, dfull_city := range dfull_citys {
-								for dfull, ctmp := range dfull_city { // dfull: the full name behind the short name
-									if !repeatD_sim[dfull] {
-										PCDScore(j, "district", dfull, 1, false)
-										repeatD_sim[dfull] = true
-									}
-									if ctmp == nil {
-										continue
-									}
-									if !repeatC_sim[ctmp.Name] {
-										PCDScore(j, "city", ctmp.Name, 1, false)
-										repeatC_sim[ctmp.Name] = true
-									}
-									if !repeatP_sim[ctmp.P.Brief] {
-										PCDScore(j, "province", ctmp.P.Brief, 1, false)
-										repeatP_sim[ctmp.P.Brief] = true
-									}
-								}
-							}
-						}
-					}
-				}
-			}
-			//qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
-		}
-	}
-}
-
-// DealMultipleDistrict scores the province/city/district behind a street or neighbourhood-committee match; it acts only when districts has exactly one entry.
-func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
-	if len(districts) == 1 {
-		district := districts[0]
-		city := district.C.Name
-		tmpPbrief := district.C.P.Brief
-		if pbrief != "" && tmpPbrief == pbrief { // known province agrees: score all three levels
-			PCDScore(j, "province", tmpPbrief, score, true)
-			PCDScore(j, "city", city, score, true)
-			PCDScore(j, "district", district.Name, score, true)
-		} else if pbrief == "" { // no province known: score each name, at most once when a repeat map is supplied
-			if repeatP != nil && !(*repeatP)[tmpPbrief] {
-				PCDScore(j, "province", tmpPbrief, score, true)
-				(*repeatP)[tmpPbrief] = true
-			} else if repeatP == nil {
-				PCDScore(j, "province", tmpPbrief, score, true)
-			}
-			if repeatC != nil && !(*repeatC)[city] {
-				PCDScore(j, "city", city, score, true)
-				(*repeatC)[city] = true
-			} else if repeatC == nil {
-				PCDScore(j, "city", city, score, true)
-			}
-			if repeatD != nil && !(*repeatD)[district.Name] { // fixed: the guard must use the same key (district name) that is recorded below, not the province brief
-				PCDScore(j, "district", district.Name, score, true)
-				(*repeatD)[district.Name] = true
-			} else if repeatD == nil {
-				PCDScore(j, "district", district.Name, score, true)
-			}
-		}
-	}
-}
-// NewGetCity appends to tmpcity every top-scoring city in finishC whose
-// province brief equals area. When exactly one city ends up in tmpcity it is
-// returned as the city; otherwise the incoming city is returned unchanged.
-func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
-	for _, name := range finishC {
-		info := e.CityFullMap[name]
-		if info == nil || info.P.Brief != area {
-			continue
-		}
-		tmpcity = append(tmpcity, name)
-	}
-	if len(tmpcity) != 1 {
-		return city, tmpcity
-	}
-	return tmpcity[0], tmpcity
-}
-// NewGetDistrict picks a district from the top-scoring candidates in finishD
-// and returns the (possibly updated) city together with it. How a candidate is
-// accepted depends on how many cities were kept in tmpcity:
-//   - none: the first city of the district that lies in area wins and is returned as the city;
-//   - one: the district must belong to that city and to area;
-//   - several: accepted only when finishD has a single entry and the district's city is one of them.
-// When nothing qualifies, city and district are returned unchanged.
-func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
-	for _, candidate := range finishD {
-		for _, owner := range e.DistrictCityMap[candidate] {
-			switch len(tmpcity) {
-			case 0:
-				if owner.P.Brief == area {
-					return owner.Name, candidate
-				}
-			case 1:
-				if owner.Name == city && owner.P.Brief == area {
-					return city, candidate
-				}
-			default:
-				for _, kept := range tmpcity {
-					if kept == owner.Name && len(finishD) == 1 {
-						return owner.Name, candidate
-					}
-				}
-			}
-		}
-	}
-	return city, district
-}
-
-// PCDScoreByDistrictSim adds score to the entry t of one of the three score
-// maps, selected by stype: "p" for ps, "c" for cs, "d" for ds. An empty t or
-// any other stype leaves all maps untouched.
-func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
-	defer qu.Catch()
-	if t == "" {
-		return
-	}
-	switch stype {
-	case "d":
-		(*ds)[t] += score
-	case "c":
-		(*cs)[t] += score
-	case "p":
-		(*ps)[t] += score
-	}
-}
-// MergeScores adds the side scores in pscore, cscore and dscore onto the job's
-// full-name score maps. Nothing is merged while j.FullAreaScore is empty.
-func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
-	if len(j.FullAreaScore) == 0 {
-		return
-	}
-	for name, extra := range *pscore {
-		j.FullAreaScore[name] += extra
-	}
-	for name, extra := range *cscore {
-		j.FullCityScore[name] += extra
-	}
-	for name, extra := range *dscore {
-		j.FullDistrictScore[name] += extra
-	}
-}
-// MergeFullSimScore folds the job's short-name scores into its full-name
-// scores. Provinces: if no full-name province exists, the short-name map is
-// adopted as is (the same map, not a copy); otherwise only provinces already
-// present in the full-name map receive their short-name score. Cities and
-// districts: every short-name entry is added onto the full-name map.
-func MergeFullSimScore(j *ju.Job) {
-	if len(j.FullAreaScore) > 0 {
-		for name := range j.FullAreaScore {
-			j.FullAreaScore[name] += j.SimAreaScore[name]
-		}
-	} else {
-		j.FullAreaScore = j.SimAreaScore
-	}
-	for name, extra := range j.SimCityScore {
-		j.FullCityScore[name] += extra
-	}
-	for name, extra := range j.SimDistrictScore {
-		j.FullDistrictScore[name] += extra
-	}
-
-	//	if len(j.FullCityScore) == 0 {
-	//		j.FullCityScore = j.SimCityScore
-	//	} else {
-	//		for c_text, c_score := range j.FullCityScore {
-	//			j.FullCityScore[c_text] = j.SimCityScore[c_text] + c_score
-	//		}
-	//	}
-	//	if len(j.FullDistrictScore) == 0 {
-	//		j.FullDistrictScore = j.SimDistrictScore
-	//	} else {
-	//		for d_text, d_score := range j.FullDistrictScore {
-	//			j.FullDistrictScore[d_text] = j.SimDistrictScore[d_text] + d_score
-	//		}
-	//	}
-}
-
-func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) { // RemoveCD deletes entries from j.FullCityScore and j.FullDistrictScore whose province brief (P.Brief) is not listed in finishP.
-	if len(j.FullDistrictScore) > 0 {
-		for d, _ := range j.FullDistrictScore { // pass 1: walk every scored district
-			tmpCitys := e.DistrictCityMap[d]
-			for _, c := range tmpCitys {
-				if j.FullCityScore[c.Name] != 0 { // only cities that currently carry a non-zero score
-					tmpPb := c.P.Brief
-					//if j.FullAreaScore[tmpPb] != 0 {
-					flag := false // set to true when the city's province is one of finishP
-					for _, p := range finishP {
-						if tmpPb == p {
-							flag = true
-							break
-						}
-					}
-					if !flag { // province not in finishP: drop both the city and this district
-						delete(j.FullCityScore, c.Name)
-						delete(j.FullDistrictScore, d)
-					}
-					//}
-				}
-			}
-		}
-	}
-	if len(j.FullCityScore) > 0 {
-		for tmpcity, _ := range j.FullCityScore { // pass 2: walk the remaining scored cities
-			c := e.CityFullMap[tmpcity]
-			if c == nil { // city name missing from CityFullMap: log it and leave the entry in place
-				log.Debug("行政区划错误数据:", tmpcity, j.SourceMid)
-				continue
-			}
-			tmpPb := c.P.Brief
-			//if j.FullAreaScore[tmpPb] != 0 {
-			flag := false // set to true when the city's province is one of finishP
-			for _, p := range finishP {
-				if tmpPb == p {
-					flag = true
-					break
-				}
-			}
-			if !flag { // province not in finishP: drop the city
-				delete(j.FullCityScore, tmpcity)
-			}
-			//}
-		}
-	}
-
-}
-
-func HighestScoreArr(m map[string]float64) []string { // HighestScoreArr returns the non-empty keys of m that share the highest score, or nil when no key qualifies.
-	result := make(map[float64][]string) // score -> keys seen with that score
-	tmpscore := 0.0 // best score so far; starting at 0 means negative scores never qualify
-	for str, score := range m {
-		if str != "" && tmpscore <= score {
-			if result[tmpscore] != nil && tmpscore != score { // a strictly higher score was found: discard the previous best bucket
-				delete(result, tmpscore)
-			}
-			if r := result[score]; r != nil { // tie with the current best: extend its bucket
-				r = append(r, str)
-				result[score] = r
-			} else { // first key seen with this score
-				result[score] = []string{str}
-			}
-			tmpscore = score
-		}
-	}
-	return result[tmpscore] // element order follows map iteration and is therefore not fixed
-}
-
-// PCDScore adds score to text's entry in the job's district, city or province score map chosen by stype ("district", "city", "province"); isfull selects the j.Full* maps, otherwise the j.Sim* maps. Empty text or any other stype is a no-op.
-func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
-	defer qu.Catch() // presumably recovers from a panic raised below; confirm against qu.Catch
-	if text != "" {
-		if stype == "district" {
-			tmpdistrict := make(map[string]float64) // placeholder only; always replaced by one of the job's maps just below
-			if isfull {
-				tmpdistrict = j.FullDistrictScore
-			} else {
-				tmpdistrict = j.SimDistrictScore
-			}
-			scoretmp := tmpdistrict[text] // a missing key reads as 0
-			tmpdistrict[text] = scoretmp + score
-		} else if stype == "city" {
-			tmpcity := make(map[string]float64) // placeholder only; always replaced just below
-			if isfull {
-				tmpcity = j.FullCityScore
-			} else {
-				tmpcity = j.SimCityScore
-			}
-			scoretmp := tmpcity[text]
-			tmpcity[text] = scoretmp + score
-		} else if stype == "province" {
-			tmpprovince := make(map[string]float64) // placeholder only; always replaced just below
-			if isfull {
-				tmpprovince = j.FullAreaScore
-			} else {
-				tmpprovince = j.SimAreaScore
-			}
-			scoretmp := tmpprovince[text]
-			tmpprovince[text] = scoretmp + score
-		}
-	}
-}
-
-// CheckingXjbtCity (legacy version; original note: Xinjiang Corps) searches e.XjbtCityArr for the first entry whose "name" or "alias" occurs in buyer and returns that entry's area/city/district; ok reports whether an entry matched.
-func (e *ExtractTask) CheckingXjbtCity(buyer string) (new_a, new_c, new_d string, ok bool) {
-	buyer = strings.ReplaceAll(buyer, "新疆兵团", "新疆生产建设兵团") // rewrite the shorter spelling to the longer one before matching (presumably the spelling used by the XjbtCityArr entries; confirm against the data)
-	ok = false
-	for _, info := range e.XjbtCityArr {
-		name := qu.ObjToString(info["name"])
-		alias := qu.ObjToString(info["alias"])
-		if strings.Contains(buyer, name) || strings.Contains(buyer, alias) { // NOTE(review): an empty name or alias matches every buyer, because strings.Contains(s, "") is true
-			new_a = qu.ObjToString(info["area"])
-			new_c = qu.ObjToString(info["city"])
-			new_d = qu.ObjToString(info["district"])
-			ok = true
-			if res, ok := info["list"].([]interface{}); ok { // this ok shadows the named result; the outer ok stays true
-				list := qu.ObjArrToMapArr(res)
-				for _, c := range list { // the first sub-entry whose name, appended to the parent name or alias, occurs in buyer overrides the result
-					c_name := qu.ObjToString(c["name"])
-					if strings.Contains(buyer, name+c_name) || strings.Contains(buyer, alias+c_name) {
-						new_a = qu.ObjToString(c["area"])
-						new_c = qu.ObjToString(c["city"])
-						new_d = qu.ObjToString(c["district"])
-						break
-					}
-				}
-			}
-			break
-		}
-	}
-	return new_a, new_c, new_d, ok
-}
-
-// SensitiveCityData (legacy version; original note: sensitive-word recognition) returns the Name of a city matched in detail whose province brief equals area, or "" when none qualifies.
-func (e *ExtractTask) SensitiveCityData(detail string, area string) string {
-	// work on the body text: strip everything SensitiveReg matches
-	detail = SensitiveReg.ReplaceAllString(detail, "")
-	// drop table-related text
-	detail = TextAfterRemoveTable(detail)
-
-	sim_arr := e.SensitiveSimCity.FindAll(detail)
-	full_arr := e.SensitiveFullCity.FindAll(detail)
-	if len(full_arr) < 3 { // SensitiveFullCity matches are considered only when fewer than three were found
-		for _, v := range full_arr {
-			if cityMap := e.CityFullMap[v]; cityMap != nil {
-				if cityMap.P.Brief == area { // first match located in province `area` wins
-					return cityMap.Name
-				}
-			}
-		}
-	}
-	if len(sim_arr) < 3 { // fallback: SensitiveSimCity matches, under the same fewer-than-three rule
-		for _, v := range sim_arr {
-			if cityMap := e.CityBriefMap[v]; cityMap != nil {
-				if cityMap.P.Brief == area && !strings.Contains(area, v) { // skip a match that is itself a substring of the province brief
-					return cityMap.Name
-				}
-			}
-		}
-	}
-	return ""
-}

+ 45 - 1
src/jy/extract/extractcity_other.go

@@ -37,7 +37,7 @@ func (e *ExtractTask) GetMatchScores(j *ju.Job) {
 	j.SimAreaScore, j.SimCityScore, j.SimDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{}
 	rf_area, rf_city, rf_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
 	rs_area, rs_city, rs_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
-	for _, text := range e.Seg_SV.Cut(j.Content, true) {
+	for _, text := range e.Seg_Full.Cut(j.Content, true) {
 		if text == "" {
 			continue
 		}
@@ -224,3 +224,47 @@ func (e *ExtractTask) GetFinallyScoreRegion(finishA, finishC, finishD []string)
 	}
 	return s_area, s_city, s_district
 }
+
+func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) { // NewGetCity appends to tmpcity each finishC entry whose e.CityFullMap record has P.Brief == area, then returns (city, tmpcity); city is replaced only when tmpcity ends up with exactly one element.
+	for _, c := range finishC { // keep the candidates that belong to province `area` (original note: finishC holds the top-scoring cities; not verifiable from this function)
+		if cfMap := e.CityFullMap[c]; cfMap != nil {
+			if cfMap.P.Brief == area {
+				//				city = c
+				//				break
+				tmpcity = append(tmpcity, c)
+			}
+		}
+	}
+	if len(tmpcity) == 1 { // unambiguous: a single candidate (counting any the caller passed in) becomes the city
+		city = tmpcity[0]
+	}
+	return city, tmpcity
+}
+func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) { // NewGetDistrict returns (city, district) for the first finishD entry that fits province `area` and the candidate cities in tmpcity; when nothing fits, the incoming city and district are returned unchanged.
+	for _, d := range finishD { // walk the candidate districts (original note: the top-scoring ones matching the province)
+		citys := e.DistrictCityMap[d]
+		for _, c := range citys {
+			if len(tmpcity) == 0 { // no candidate city
+				if c.P.Brief == area {
+					city = c.Name
+					district = d
+					return city, district
+				}
+			} else if len(tmpcity) == 1 { // exactly one candidate city
+				if c.Name == city && c.P.Brief == area {
+					district = d
+					return city, district
+				}
+			} else { // several candidate cities
+				for _, tc := range tmpcity { // accept a candidate city owning this district, but only when finishD holds a single district
+					if tc == c.Name && len(finishD) == 1 {
+						city = c.Name
+						district = d
+						return city, district
+					}
+				}
+			}
+		}
+	}
+	return city, district
+}

+ 14 - 11
src/jy/extract/extractcity_way.go

@@ -197,11 +197,12 @@ func (e *ExtractTask) GetRegionFromText(text string, regions *map[string]map[str
 		return regionValues
 	}
 	wordsArr := []string{}
-	if from == 1 {
-		wordsArr = e.Seg_PCD.Cut(text, true)
-	} else if from == 2 {
-		wordsArr = e.Seg_SV.Cut(text, true)
-	}
+	//if from == 1 {
+	//	wordsArr = e.Seg_PCD.Cut(text, true)
+	//} else if from == 2 {
+	//	wordsArr = e.Seg_SV.Cut(text, true)
+	//}
+	wordsArr = e.Seg_Full.Cut(text, true)
 	//词组清洗
 	wordsArr = CleanRegionTextWords(wordsArr)
 	regionsArr := []map[string]string{}
@@ -506,7 +507,8 @@ func CleanRegionTextWords(wordsArr []string) []string {
 // 链路补充~全称类
 func (e *ExtractTask) LinkSpecialRuleFullStep(text string, area *string, city *string, district *string) {
 	regions := map[string]map[string]map[string]string{}
-	wordsArr := e.Seg_SV.Cut(text, true)
+	//wordsArr := e.Seg_SV.Cut(text, true)
+	wordsArr := e.Seg_Full.Cut(text, true)
 	for _, word := range wordsArr {
 		for pos_full, trie_full := range e.Trie_Fulls {
 			if pos_full == 3 {
@@ -563,11 +565,12 @@ func (e *ExtractTask) LinkSpecialRuleBriefStep2(text string, area *string, city
 func (e *ExtractTask) FetchBriefRules(text string, cutype int) map[string]map[string]map[string]string {
 	regions := map[string]map[string]map[string]string{}
 	wordsArr := []string{}
-	if cutype == 1 {
-		wordsArr = e.Seg_Full.Cut(text, true)
-	} else {
-		wordsArr = e.Seg_SV.Cut(text, true)
-	}
+	//if cutype == 1 {
+	//	wordsArr = e.Seg_Full.Cut(text, true)
+	//} else {
+	//	wordsArr = e.Seg_SV.Cut(text, true)
+	//}
+	wordsArr = e.Seg_Full.Cut(text, true)
 	for _, word := range wordsArr {
 		for pos_sim, trie_sim := range e.Trie_Sims {
 			if pos_sim == 2 {

+ 19 - 19
src/jy/extract/extractinit.go

@@ -129,21 +129,21 @@ type ExtractTask struct {
 	AreaCodeMap map[string]*AreaCode //区号
 	InfoType    []map[string]interface{}
 
-	Trie_Full_Province  *ju.Trie       //省全称 省、直辖市、自治区
-	Trie_Full_City      *ju.Trie       //市全称 地级市
-	Trie_Full_District  *ju.Trie       //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
-	Trie_Full_Street    *ju.Trie       //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
-	Trie_Full_Community *ju.Trie       //村/委员会全称  村、居委会
-	Trie_Sim_Province   *ju.Trie       //省简称
-	Trie_Sim_City       *ju.Trie       //市简称
-	Trie_Sim_District   *ju.Trie       //县简称
-	Trie_Fulls          []*ju.Trie     //所有全称
-	Trie_Sims           []*ju.Trie     //所有简称
-	Seg_PCD             *gse.Segmenter //分词
-	Seg_SV              *gse.Segmenter //分词
-	Seg_Full            *gse.Segmenter //分词
-	Luacodes            *sync.Map      //站点规则
-	SiteMerge           *sync.Map      //抽取合并
+	Trie_Full_Province  *ju.Trie   //省全称 省、直辖市、自治区
+	Trie_Full_City      *ju.Trie   //市全称 地级市
+	Trie_Full_District  *ju.Trie   //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
+	Trie_Full_Street    *ju.Trie   //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
+	Trie_Full_Community *ju.Trie   //村/委员会全称  村、居委会
+	Trie_Sim_Province   *ju.Trie   //省简称
+	Trie_Sim_City       *ju.Trie   //市简称
+	Trie_Sim_District   *ju.Trie   //县简称
+	Trie_Fulls          []*ju.Trie //所有全称
+	Trie_Sims           []*ju.Trie //所有简称
+	//Seg_PCD             *gse.Segmenter //分词
+	//Seg_SV              *gse.Segmenter //分词
+	Seg_Full  *gse.Segmenter //分词
+	Luacodes  *sync.Map      //站点规则
+	SiteMerge *sync.Map      //抽取合并
 }
 
 type SiteCity struct {
@@ -1333,11 +1333,11 @@ func (e *ExtractTask) InitVar() {
 	e.Trie_Sim_District = &ju.Trie{}
 
 	//初始化分词
-	e.Seg_PCD = &gse.Segmenter{}
-	e.Seg_SV = &gse.Segmenter{}
+	//e.Seg_PCD = &gse.Segmenter{}
+	//e.Seg_SV = &gse.Segmenter{}
 	e.Seg_Full = &gse.Segmenter{}
-	e.Seg_PCD.LoadDict("./res/pcd.txt")
-	e.Seg_SV.LoadDict("./res/sv.txt")
+	//e.Seg_PCD.LoadDict("./res/pcd.txt")
+	//e.Seg_SV.LoadDict("./res/sv.txt")
 	e.Seg_Full.LoadDict("./res/dictionary.txt")
 
 	//初始化城市相关