فهرست منبع

城市抽取修改

maxiaoshan 5 سال پیش
والد
کامیت
e7bdaae9a6
4 فایلهای تغییر یافته به همراه 36 افزوده شده و 33 حذف شده
  1. 29 26
      src/jy/extract/newextractcity.go
  2. 1 1
      src/main_test.go
  3. 3 3
      src/res/pcd.txt
  4. 3 3
      src/res/sv.txt

+ 29 - 26
src/jy/extract/newextractcity.go

@@ -8,7 +8,7 @@ import (
 	"strings"
 )
 
-var AgencyReg = regexp.MustCompile("(代理机构.{0,30}|.{2,15}((招标)?代理|咨询|政府采购))")
+var AgencyReg = regexp.MustCompile("((代理机构|中标供应商).{0,30}|.{2,15}((招标)?代理|咨询|政府采购))")
 
 //抽取city
 func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
@@ -109,6 +109,16 @@ func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}
 		districtresult = ""
 	}
 	//qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
+	//直辖市
+	if arearesult == "北京" {
+		cityresult = "北京市"
+	} else if arearesult == "天津" {
+		cityresult = "天津市"
+	} else if arearesult == "上海" {
+		cityresult = "上海市"
+	} else if arearesult == "重庆" {
+		cityresult = "重庆市"
+	}
 	if arearesult == "" {
 		arearesult = "全国"
 	} /* else if cityresult == "" {
@@ -118,9 +128,9 @@ func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}
 		}
 	}*/
 	//qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
-	resulttmp["area10"] = arearesult
-	resulttmp["city10"] = cityresult
-	resulttmp["district10"] = districtresult
+	resulttmp["area"] = arearesult
+	resulttmp["city"] = cityresult
+	resulttmp["district"] = districtresult
 }
 
 //jsondata中抽取城市
@@ -387,7 +397,6 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
 					} else if pos_full == 1 && c_full == "" { //市全称
 						if cfMap := e.CityFullMap[text]; cfMap != nil {
 							tmpPbrief := cfMap.P.Brief
-							//qu.Debug("市--------", text, tmpPbrief, p_full)
 							if p_full == "" {
 								p_full = tmpPbrief
 								c_full = cfMap.Name
@@ -403,7 +412,6 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
 							}
 						}
 					} else if pos_full == 2 && d_full == "" { //区全称
-						//qu.Debug("区全称===========")
 						repeatPb := map[string]bool{}
 						isOk := false
 						districtOk := false
@@ -486,58 +494,46 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
 								c_sim = cbMap.Brief
 								PCDScore(j, "city", cbMap.Name, 3)
 								break
-							} else if p_sim != "" && p_sim != tmpPbrief {
-								//city不做处理
+							} else if p_sim != "" && p_sim != tmpPbrief { //北京师范大学广州实验学校
+								PCDScore(j, "province", tmpPbrief, 1)
+								PCDScore(j, "city", cbMap.Name, 1)
 							}
 						}
 					} else if pos_sim == 2 && d_sim == "" { //区简称
 						repeatPb := map[string]bool{}
 						repeatDb := map[string]bool{}
 						dfull_citys := e.NewDistrictSimAndAll[text]
-						//qu.Debug(text, dfull_citys, p_sim)
 						for _, dfull_city := range dfull_citys {
 							for dfull, c := range dfull_city { //dfull:简称对应的全称
 								tmpPbrief := c.P.Brief
 								if p_sim == tmpPbrief { //省份一致
 									d_sim = text
-									//PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
 									PCDScore(j, "district", dfull, 2)
 									if c_sim == "" {
 										c_sim = c.Brief
-										//PCDScoreByDistrictSim("c", c.Name, 2, pscore, cscore, dscore)
 										PCDScore(j, "city", c.Name, 2)
 									}
 								} else if p_sim == "" {
 									if !repeatDb[dfull] {
 										PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
-										//PCDScore(j, "district", dfull, 1)
 										repeatDb[dfull] = true
 									}
 									if len(dfull_citys) == 1 {
-										//p_sim = tmpPbrief
-										//c_sim = c.Brief
-										//d_sim = text
 										PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
 										PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
-										//PCDScore(j, "province", p_sim, 2)
-										//PCDScore(j, "city", c.Name, 2)
 									} else {
 										if !repeatPb[tmpPbrief] {
 											PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
-											//PCDScore(j, "province", tmpPbrief, 1)
 											repeatPb[tmpPbrief] = true
 										}
-										//PCDScore(j, "city", c.Name, 1)
 										PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
 									}
 								} else if p_sim != "" && p_sim != tmpPbrief {
 									if !repeatPb[tmpPbrief] {
 										PCDScoreByDistrictSim("p", tmpPbrief, -5, pscore, cscore, dscore)
-										//PCDScore(j, "province", tmpPbrief, -5)
 										repeatPb[tmpPbrief] = true
 									}
 									PCDScoreByDistrictSim("c", c.Name, -5, pscore, cscore, dscore)
-									//PCDScore(j, "city", c.Name, -5)
 								}
 							}
 						}
@@ -620,13 +616,20 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
 			//简称匹配
 			for pos_sim, trie_sim := range e.Trie_Sims {
 				if trie_sim.Get(text) {
-					if pos_sim == 0 { //省简称
+					if pos_sim == 0 && !repeatP[text] { //省简称
 						PCDScore(j, "province", text, 1)
+						repeatP[text] = true
 						break
 					} else if pos_sim == 1 { //市简称
 						if cbMap := e.CityBriefMap[text]; cbMap != nil {
-							PCDScore(j, "city", cbMap.Name, 1)
-							PCDScore(j, "province", cbMap.P.Brief, 1)
+							if !repeatP[cbMap.P.Brief] {
+								PCDScore(j, "province", cbMap.P.Brief, 1)
+								repeatP[cbMap.P.Brief] = true
+							}
+							if !repeatC[cbMap.Name] {
+								PCDScore(j, "city", cbMap.Name, 1)
+								repeatC[cbMap.Name] = true
+							}
 							break
 						}
 					} /* else if pos_sim == 2 { //区简称
@@ -733,7 +736,7 @@ func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcit
 					district = d
 					return city, district
 				}
-			} else { //多个city
+			} /*else { //多个city
 				for _, tc := range tmpcity { //多个city根据district最高分取
 					if tc == c.Name && len(finishD) == 1 {
 						city = c.Name
@@ -741,7 +744,7 @@ func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcit
 						return city, district
 					}
 				}
-			}
+			}*/
 
 			//			if len(citys) == 1 { //区对应一个市
 			//				if c.P.Brief == area {

+ 1 - 1
src/main_test.go

@@ -28,7 +28,7 @@ func Test_han(t *testing.T) {
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5a84269a40d2d9bbe88c177b", "1", "mxs_v1", "mxs_v1")
+	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5a83eab640d2d9bbe88b5711", "1", "mxs_v1", "mxs_v1")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }

+ 3 - 3
src/res/pcd.txt

@@ -4316,7 +4316,7 @@
 留坝 3 n
 冕宁 3 n
 博爱 3 n
-彭水 3 n
+彭水 3 n
 蓬江 3 n
 马山 3 n
 萨尔图 3 n
@@ -4484,7 +4484,7 @@
 五莲 3 n
 杨陵 3 n
 潮州 3 n
-秀山 3 n
+秀山 3 n
 鼎湖 3 n
 定兴 3 n
 定南 3 n
@@ -4967,7 +4967,7 @@
 永丰 3 n
 松江 3 n
 荥阳 3 n
-石柱 3 n
+石柱 3 n
 花都 3 n
 东乡 3 n
 宜丰 3 n

+ 3 - 3
src/res/sv.txt

@@ -4316,7 +4316,7 @@
 留坝 3 n
 冕宁 3 n
 博爱 3 n
-彭水 3 n
+彭水 3 n
 蓬江 3 n
 马山 3 n
 萨尔图 3 n
@@ -4484,7 +4484,7 @@
 五莲 3 n
 杨陵 3 n
 潮州 3 n
-秀山 3 n
+秀山 3 n
 鼎湖 3 n
 定兴 3 n
 定南 3 n
@@ -4967,7 +4967,7 @@
 永丰 3 n
 松江 3 n
 荥阳 3 n
-石柱 3 n
+石柱 3 n
 花都 3 n
 东乡 3 n
 宜丰 3 n