Преглед изворног кода

城市优化...
1、切词
2、标签
3、站点配置

zhengkun пре 1 године
родитељ
комит
7a35e4edfe

+ 14 - 14
src/jy/extract/extractcity_new.go

@@ -532,7 +532,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	if *area == "" || *area == "全国" {
 		fullProvinceArr := e.SensitiveFullProvince.FindAll(detail)
 		if len(fullProvinceArr) == 1 { //再次计算
-			//fullProvinceArr = e.findAmbiguityRegion(detail, fullProvinceArr[0])
+			fullProvinceArr = e.findAmbiguityRegion(detail, fullProvinceArr[0])
 			for _, v := range fullProvinceArr {
 				if sim_province := e.ProvinceMap[v]; sim_province != "" {
 					*area = sim_province
@@ -544,7 +544,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称城市
 	fullCityArr := e.SensitiveFullCity.FindAll(detail)
 	if len(fullCityArr) == 1 {
-		//fullCityArr = e.findAmbiguityRegion(detail, fullCityArr[0])
+		fullCityArr = e.findAmbiguityRegion(detail, fullCityArr[0])
 		for _, v := range fullCityArr {
 			if cityMap := e.CityFullMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
@@ -558,7 +558,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称区县
 	fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
 	if len(fullDistrictArr) == 1 {
-		//fullDistrictArr = e.findAmbiguityRegion(detail, fullDistrictArr[0])
+		fullDistrictArr = e.findAmbiguityRegion(detail, fullDistrictArr[0])
 		for _, v := range fullDistrictArr {
 			if citys := e.DistrictCityMap[v]; len(citys) == 1 {
 				if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
@@ -573,7 +573,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//简称城市
 	simCityArr := e.SensitiveSimCity.FindAll(detail)
 	if len(simCityArr) == 1 {
-		//simCityArr = e.findAmbiguityRegion(detail, simCityArr[0])
+		simCityArr = e.findAmbiguityRegion(detail, simCityArr[0])
 		for _, v := range simCityArr {
 			if cityMap := e.CityBriefMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" {
@@ -595,7 +595,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	if *area == "" || *area == "全国" {
 		simProvinceArr := e.SensitiveSimProvince.FindAll(detail)
 		if len(simProvinceArr) == 1 {
-			//simProvinceArr = e.findAmbiguityRegion(detail, simProvinceArr[0])
+			simProvinceArr = e.findAmbiguityRegion(detail, simProvinceArr[0])
 			for _, v := range simProvinceArr {
 				if v != "" {
 					*area = v
@@ -689,15 +689,15 @@ func (e *ExtractTask) GetRegionByTentativeOperator(winner string, all_regions *m
 }
 
 // 通用方法找到指定地域有效词组
-//func (e *ExtractTask) findAmbiguityRegion(text string, key string) []string {
-//	wordsArr := e.Seg_Full.Cut(text, true)
-//	for _, word := range wordsArr {
-//		if word == key {
-//			return []string{key}
-//		}
-//	}
-//	return []string{}
-//}
+func (e *ExtractTask) findAmbiguityRegion(text string, key string) []string { // Checks whether key appears as a standalone word once text has been segmented.
+	wordsArr := e.Seg_Full.Cut(text, true) // Segment text with Seg_Full; NOTE(review): the true flag presumably enables gse's HMM mode — confirm.
+	for _, word := range wordsArr {
+		if word == key { // Exact match only: key must be a whole segment, not a substring of one.
+			return []string{key}
+		}
+	}
+	return []string{} // No segment equals key: return an empty, non-nil slice so a caller's range loop does nothing.
+}
 
 //初步确认~站点
 //func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {

+ 1 - 2
src/jy/extract/extractcity_other.go

@@ -37,8 +37,7 @@ func (e *ExtractTask) GetMatchScores(j *ju.Job) {
 	j.SimAreaScore, j.SimCityScore, j.SimDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{}
 	rf_area, rf_city, rf_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
 	rs_area, rs_city, rs_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
-	//for _, text := range e.Seg_Full.Cut(j.Content, true) {
-	for _, text := range e.Seg_SV.Cut(j.Content, true) {
+	for _, text := range e.Seg_Full.Cut(j.Content, true) {
 		if text == "" {
 			continue
 		}

+ 14 - 14
src/jy/extract/extractcity_way.go

@@ -197,12 +197,12 @@ func (e *ExtractTask) GetRegionFromText(text string, regions *map[string]map[str
 		return regionValues
 	}
 	wordsArr := []string{}
-	if from == 1 {
-		wordsArr = e.Seg_PCD.Cut(text, true)
-	} else if from == 2 {
-		wordsArr = e.Seg_SV.Cut(text, true)
-	}
-	//wordsArr = e.Seg_Full.Cut(text, true)
+	//if from == 1 {
+	//	wordsArr = e.Seg_PCD.Cut(text, true)
+	//} else if from == 2 {
+	//	wordsArr = e.Seg_SV.Cut(text, true)
+	//}
+	wordsArr = e.Seg_Full.Cut(text, true)
 	//词组清洗
 	wordsArr = CleanRegionTextWords(wordsArr)
 	regionsArr := []map[string]string{}
@@ -507,8 +507,8 @@ func CleanRegionTextWords(wordsArr []string) []string {
 // 链路补充~全称类
 func (e *ExtractTask) LinkSpecialRuleFullStep(text string, area *string, city *string, district *string) {
 	regions := map[string]map[string]map[string]string{}
-	wordsArr := e.Seg_SV.Cut(text, true)
-	//wordsArr := e.Seg_Full.Cut(text, true)
+	//wordsArr := e.Seg_SV.Cut(text, true)
+	wordsArr := e.Seg_Full.Cut(text, true)
 	for _, word := range wordsArr {
 		for pos_full, trie_full := range e.Trie_Fulls {
 			if pos_full == 3 {
@@ -565,12 +565,12 @@ func (e *ExtractTask) LinkSpecialRuleBriefStep2(text string, area *string, city
 func (e *ExtractTask) FetchBriefRules(text string, cutype int) map[string]map[string]map[string]string {
 	regions := map[string]map[string]map[string]string{}
 	wordsArr := []string{}
-	if cutype == 1 {
-		wordsArr = e.Seg_Full.Cut(text, true)
-	} else {
-		wordsArr = e.Seg_SV.Cut(text, true)
-	}
-	//wordsArr = e.Seg_Full.Cut(text, true)
+	//if cutype == 1 {
+	//	wordsArr = e.Seg_Full.Cut(text, true)
+	//} else {
+	//	wordsArr = e.Seg_SV.Cut(text, true)
+	//}
+	wordsArr = e.Seg_Full.Cut(text, true)
 	for _, word := range wordsArr {
 		for pos_sim, trie_sim := range e.Trie_Sims {
 			if pos_sim == 2 {

+ 19 - 19
src/jy/extract/extractinit.go

@@ -129,21 +129,21 @@ type ExtractTask struct {
 	AreaCodeMap map[string]*AreaCode //区号
 	InfoType    []map[string]interface{}
 
-	Trie_Full_Province  *ju.Trie       //省全称 省、直辖市、自治区
-	Trie_Full_City      *ju.Trie       //市全称 地级市
-	Trie_Full_District  *ju.Trie       //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
-	Trie_Full_Street    *ju.Trie       //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
-	Trie_Full_Community *ju.Trie       //村/委员会全称  村、居委会
-	Trie_Sim_Province   *ju.Trie       //省简称
-	Trie_Sim_City       *ju.Trie       //市简称
-	Trie_Sim_District   *ju.Trie       //县简称
-	Trie_Fulls          []*ju.Trie     //所有全称
-	Trie_Sims           []*ju.Trie     //所有简称
-	Seg_PCD             *gse.Segmenter //分词
-	Seg_SV              *gse.Segmenter //分词
-	Seg_Full            *gse.Segmenter //分词
-	Luacodes            *sync.Map      //站点规则
-	SiteMerge           *sync.Map      //抽取合并
+	Trie_Full_Province  *ju.Trie   //省全称 省、直辖市、自治区
+	Trie_Full_City      *ju.Trie   //市全称 地级市
+	Trie_Full_District  *ju.Trie   //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
+	Trie_Full_Street    *ju.Trie   //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
+	Trie_Full_Community *ju.Trie   //村/委员会全称  村、居委会
+	Trie_Sim_Province   *ju.Trie   //省简称
+	Trie_Sim_City       *ju.Trie   //市简称
+	Trie_Sim_District   *ju.Trie   //县简称
+	Trie_Fulls          []*ju.Trie //所有全称
+	Trie_Sims           []*ju.Trie //所有简称
+	//Seg_PCD             *gse.Segmenter //分词
+	//Seg_SV              *gse.Segmenter //分词
+	Seg_Full  *gse.Segmenter //分词
+	Luacodes  *sync.Map      //站点规则
+	SiteMerge *sync.Map      //抽取合并
 }
 
 type SiteCity struct {
@@ -1333,11 +1333,11 @@ func (e *ExtractTask) InitVar() {
 	e.Trie_Sim_District = &ju.Trie{}
 
 	//初始化分词
-	e.Seg_PCD = &gse.Segmenter{}
-	e.Seg_SV = &gse.Segmenter{}
+	//e.Seg_PCD = &gse.Segmenter{}
+	//e.Seg_SV = &gse.Segmenter{}
 	e.Seg_Full = &gse.Segmenter{}
-	e.Seg_PCD.LoadDict("./res/pcd.txt")
-	e.Seg_SV.LoadDict("./res/sv.txt")
+	//e.Seg_PCD.LoadDict("./res/pcd.txt")
+	//e.Seg_SV.LoadDict("./res/sv.txt")
 	e.Seg_Full.LoadDict("./res/dictionary.txt")
 
 	//初始化城市相关