zhengkun 2 года назад
Родитель
Коммит
414389b4af

+ 38 - 48
src/jy/extract/extractInit.go

@@ -104,26 +104,23 @@ type ExtractTask struct {
 	} //规则
 	AuditFields []string //需要审核的字段名称
 
-	SiteCityMap          map[string]*SiteCity //站点对应的省市区
-	ProvinceMap          map[string]string    //省全称简称(key:浙江省 val:浙江)
-	ProvinceBriefMap     map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
-	CityMap              map[string]string    //市全称简称(key:杭州市 val:杭州)
-	CityBriefMap         map[string]*City     //市简称对应的市信息(key:杭州 val:&City{})
-	CityFullMap          map[string]*City     //市全称对应的市信息(key:杭州市 val:&City{})
-	DistrictCityMap      map[string]*City
-	NewDistrictCityMap   map[string][]*City            //区或县全称对应的city(全国有相同名称的区或县,这里对应的city用slice)
-	DistrictSimAndAll    map[string]string             //区或县(key:简称 val:全称)
-	NewDistrictSimAndAll map[string][]map[string]*City //区或县(key:简称 val:  相同简称的区全称:所在市)
-	StreetDistrictMap    map[string]*District          //街道对应的区或县
-	NewStreetDistrictMap map[string][]*District        //街道全称对应的区或县
-	CommunityDistrictMap map[string][]*District        //村、居委会对应的区或县
-	ProvinceAllGet       *ju.DFA                       //省全称
-	ProvinceSimGet       *ju.DFA                       //省简称
-	CityAllGet           *ju.DFA                       //市全称
-	CitySimGet           *ju.DFA                       //市简称
-	DistrictAllGet       *ju.DFA                       //区或县全称
-	DistrictSimGet       *ju.DFA                       //区或县简称
-	StreetGet            *ju.DFA                       //街道
+	SiteCityMap       map[string]*SiteCity          //站点对应的省市区
+	ProvinceMap       map[string]string             //省全称简称(key:浙江省 val:浙江)
+	ProvinceBriefMap  map[string]*Province          //省简称对应的省信息(key:浙江 val:&Province{})
+	CityMap           map[string]string             //市全称简称(key:杭州市 val:杭州)
+	CityBriefMap      map[string]*City              //市简称对应的市信息(key:杭州 val:&City{})
+	CityFullMap       map[string]*City              //市全称对应的市信息(key:杭州市 val:&City{})
+	DistrictCityMap   map[string][]*City            //区或县全称对应的city(全国有相同名称的区或县,这里对应的city用slice)
+	DistrictSimAndAll map[string][]map[string]*City //区或县简称对应的city(全国有相同名称的区或县,这里对应的city用slice)
+	StreetDistrictMap map[string][]*District        //街道全称对应的区或县
+
+	ProvinceAllGet *ju.DFA //省全称
+	ProvinceSimGet *ju.DFA //省简称
+	CityAllGet     *ju.DFA //市全称
+	CitySimGet     *ju.DFA //市简称
+	DistrictAllGet *ju.DFA //区或县全称
+	DistrictSimGet *ju.DFA //区或县简称
+	StreetGet      *ju.DFA //街道
 
 	PostCodeMap map[string]*PostCode //邮编
 	AreaCodeMap map[string]*AreaCode //区号
@@ -131,8 +128,7 @@ type ExtractTask struct {
 	XjbtCityArr       []map[string]interface{} //新疆兵团相关数据
 	SensitiveFullCity *sensitive.Filter
 	SensitiveSimCity  *sensitive.Filter
-	InfoType          []map[string]interface {
-	}
+	InfoType          []map[string]interface{}
 
 	Trie_Full_Province  *ju.Trie       //省全称 省、直辖市、自治区
 	Trie_Full_City      *ju.Trie       //市全称 地级市
@@ -148,7 +144,6 @@ type ExtractTask struct {
 	Seg_SV              *gse.Segmenter //分词
 	Luacodes            *sync.Map      //站点规则
 	SiteMerge           *sync.Map      //抽取合并
-
 }
 
 type SiteCity struct {
@@ -1080,7 +1075,6 @@ func (e *ExtractTask) InitCityInfo() {
 		}
 	}
 	alldata := InitProvincesx()
-
 	fnx := make([]map[string]interface{}, 0)
 	citys_maps := make(map[string][]map[string]interface{}, 0)
 	districts_maps := make(map[string]map[string][]map[string]interface{}, 0)
@@ -1199,32 +1193,32 @@ func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
 			e.Trie_Sim_District.AddWords(jc_district) //加入区或县简称Trie
 			//初始化城市简称
 			c := e.CityBriefMap[jc_city]
-			dfullarr := e.NewDistrictSimAndAll[jc_district]
+			dfullarr := e.DistrictSimAndAll[jc_district]
 			dfullcity := map[string]*City{qc_district: c}
 			if len(dfullarr) == 0 {
 				tmparr := []map[string]*City{dfullcity}
-				e.NewDistrictSimAndAll[jc_district] = tmparr
+				e.DistrictSimAndAll[jc_district] = tmparr
 			} else {
-				e.NewDistrictSimAndAll[jc_district] = append(e.NewDistrictSimAndAll[jc_district], dfullcity)
+				e.DistrictSimAndAll[jc_district] = append(e.DistrictSimAndAll[jc_district], dfullcity)
 			}
 		}
-		ctmp := e.NewDistrictCityMap[qc_district]
+		ctmp := e.DistrictCityMap[qc_district]
 		if len(ctmp) == 0 {
 			tmpcarr := []*City{c}
-			e.NewDistrictCityMap[qc_district] = tmpcarr
+			e.DistrictCityMap[qc_district] = tmpcarr
 		} else {
-			e.NewDistrictCityMap[qc_district] = append(e.NewDistrictCityMap[qc_district], c)
+			e.DistrictCityMap[qc_district] = append(e.DistrictCityMap[qc_district], c)
 		}
 		if district_alias, ok := vdistricts["district_alias"].([]interface{}); ok {
 			for _, vdistrict_alias := range district_alias {
 				strvdistrict_alias := qu.ObjToString(vdistrict_alias)
 				e.Trie_Full_District.AddWords(strvdistrict_alias) //加入区或县全称Trie
-				ctmp := e.NewDistrictCityMap[strvdistrict_alias]
-				if len(ctmp) == 0 {
+				c_tmp := e.DistrictCityMap[strvdistrict_alias]
+				if len(c_tmp) == 0 {
 					tmpcarr := []*City{c}
-					e.NewDistrictCityMap[strvdistrict_alias] = tmpcarr
+					e.DistrictCityMap[strvdistrict_alias] = tmpcarr
 				} else {
-					e.NewDistrictCityMap[strvdistrict_alias] = append(e.NewDistrictCityMap[strvdistrict_alias], c)
+					e.DistrictCityMap[strvdistrict_alias] = append(e.DistrictCityMap[strvdistrict_alias], c)
 				}
 			}
 		}
@@ -1236,12 +1230,12 @@ func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
 			s.Name = strvtown
 			s.D = d
 			e.Trie_Full_Street.AddWords(strvtown) //加入街道全称Trie
-			dtmp := e.NewStreetDistrictMap[strvtown]
+			dtmp := e.StreetDistrictMap[strvtown]
 			if len(dtmp) == 0 {
 				tmpdarr := []*District{d}
-				e.NewStreetDistrictMap[strvtown] = tmpdarr
+				e.StreetDistrictMap[strvtown] = tmpdarr
 			} else {
-				e.NewStreetDistrictMap[strvtown] = append(e.NewStreetDistrictMap[strvtown], d)
+				e.StreetDistrictMap[strvtown] = append(e.StreetDistrictMap[strvtown], d)
 			}
 			//村、居委会
 			//jwhs := jwhs_maps[jc_province][qc_city][qc_district][strvtown]
@@ -1289,6 +1283,7 @@ func (e *ExtractTask) InitVar() {
 	//敏感词-筛选
 	e.SensitiveFullCity = sensitive.New()
 	e.SensitiveSimCity = sensitive.New()
+
 	//初始化map
 	if e.SiteCityMap == nil {
 		e.SiteCityMap = make(map[string]*SiteCity)
@@ -1299,11 +1294,9 @@ func (e *ExtractTask) InitVar() {
 	if e.CityMap == nil {
 		e.CityMap = make(map[string]string)
 	}
+
 	if e.DistrictSimAndAll == nil {
-		e.DistrictSimAndAll = make(map[string]string)
-	}
-	if e.NewDistrictSimAndAll == nil {
-		e.NewDistrictSimAndAll = make(map[string][]map[string]*City)
+		e.DistrictSimAndAll = make(map[string][]map[string]*City)
 	}
 
 	if e.CityBriefMap == nil {
@@ -1315,15 +1308,12 @@ func (e *ExtractTask) InitVar() {
 	if e.ProvinceBriefMap == nil {
 		e.ProvinceBriefMap = make(map[string]*Province)
 	}
-	if e.NewDistrictCityMap == nil {
-		e.NewDistrictCityMap = make(map[string][]*City)
+	if e.DistrictCityMap == nil {
+		e.DistrictCityMap = make(map[string][]*City)
 	}
 
-	if e.NewStreetDistrictMap == nil {
-		e.NewStreetDistrictMap = make(map[string][]*District)
-	}
-	if e.CommunityDistrictMap == nil {
-		e.CommunityDistrictMap = make(map[string][]*District)
+	if e.StreetDistrictMap == nil {
+		e.StreetDistrictMap = make(map[string][]*District)
 	}
 
 }

+ 6 - 598
src/jy/extract/extractcity.go

@@ -1,13 +1,7 @@
 package extract
 
-import (
-	. "jy/pretreated"
-	ju "jy/util"
-	qu "qfw/util"
-	"strings"
-)
+import "regexp"
 
-//省
 type Province struct {
 	Name    string
 	Brief   string
@@ -62,596 +56,10 @@ type AreaCode struct {
 	C    []string
 }
 
-//抽取city
-func (e *ExtractTask) ExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
-	/*
-		高准确率:
-			1.爬虫数据jsondata
-			2.采购单位库
-			3.邮编
-			4.固话
-			5.site(todo)
-		低准确率:(全称库匹配到不走简称库)
-			1.city全称库(buyeraddr;title,projectname)
-			2.city简称库(buyeraddr;title,projectname)
-	*/
-	defer qu.Catch()
-	//初始化
-	if j.FullAreaScore == nil {
-		j.FullAreaScore = make(map[string]float64)
-	}
-	if j.FullCityScore == nil {
-		j.FullCityScore = make(map[string]float64)
-	}
-	if j.FullDistrictScore == nil {
-		j.FullDistrictScore = make(map[string]float64)
-	}
-	sm := NewSortMap()
-	//高精度抽取city
-	//存储每个流程的抽取结果
-	area1 := make([]map[string]string, 4)
-	city1 := make([]map[string]string, 4)
-	district1 := make([]map[string]string, 4)
-
-	//jsondata
-	p0, c0, d0, p, c, d := e.GetCityByJsonData(j)
-	area1 = append(area1, map[string]string{"a_c_d": p})
-	city1 = append(city1, map[string]string{"a_c_d": c})
-	district1 = append(district1, map[string]string{"a_c_d": d})
-	area1[0] = map[string]string{"jsondata": p0}
-	city1[0] = map[string]string{"jsondata": c0}
-	district1[0] = map[string]string{"jsondata": d0}
-	//qu.Debug("=====jsondata打分---", j.AreaScore, j.CityScore, j.DistrictScore)
-	//采购单位库
-	buyer, _ := resulttmp["buyer"].(string)
-	p1, c1, d1 := e.GetCityByBuyer(j, buyer)
-	//qu.Debug("buyer	p--", p1, "c--", c1, "d--", d1)
-	area1[1] = map[string]string{"buyer": p1}
-	city1[1] = map[string]string{"buyer": c1}
-	district1[1] = map[string]string{"buyer": d1}
-	//qu.Debug("=====采购单位库打分---", j.AreaScore, j.CityScore, j.DistrictScore)
-	//postcode邮编
-	buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
-	p2, c2, d2 := e.GetCityByPostCode(j, buyerzipcode)
-	//qu.Debug("postcode	p--", p2, "c--", c2, "d--", d2)
-	area1[2] = map[string]string{"postcode": p2}
-	city1[2] = map[string]string{"postcode": c2}
-	district1[2] = map[string]string{"postcode": d2}
-	//qu.Debug("=====postcode邮编打分---", j.AreaScore, j.CityScore, j.DistrictScore)
-	//areacode固话区号
-	buyertel, _ := resulttmp["buyertel"].(string)
-	p3, c3, d3 := e.GetCityByAreaCode(j, buyertel)
-	//qu.Debug("areacode	p--", p3, "c--", c3, "d--", d3, buyertel)
-	area1[3] = map[string]string{"areacode": p3}
-	city1[3] = map[string]string{"areacode": c3}
-	district1[3] = map[string]string{"areacode": d3}
-	//qu.Debug("=====areacode固话区号打分---", j.AreaScore, j.CityScore, j.DistrictScore)
-	HighPreCity := make(map[string]interface{})
-	HighPreCity["area"] = area1
-	HighPreCity["city"] = city1
-	HighPreCity["district"] = district1
-	//低精度抽取city
-	//buyeraddr,title,projectname
-	buyeraddr, _ := resulttmp["buyeraddr"].(string)
-	title, _ := resulttmp["title"].(string)
-	projectname, _ := resulttmp["projectname"].(string)
-	//qu.Debug(buyeraddr, "--", buyer, "--", title, "--", projectname)
-	sm.AddKey("buyeraddr", buyeraddr)
-	sm.AddKey("buyer", buyer)
-	sm.AddKey("title", title)
-	sm.AddKey("projectname", projectname)
-	area2, city2, district2 := e.GetCityByOthers(j, sm)
-	LowPreCity := make(map[string]interface{})
-	LowPreCity["area"] = area2
-	LowPreCity["city"] = city2
-	LowPreCity["district"] = district2
-	//	resulttmp["highprecity"] = HighPreCity
-	//	resulttmp["lowprecity"] = LowPreCity
-	//qu.Debug("最终打分---", j.AreaScore, j.CityScore, j.DistrictScore)
-	//最终抽取结果
-	finishP := HighestScoreArr(j.FullAreaScore)
-	finishC := HighestScoreArr(j.FullCityScore)
-	finishD := HighestScoreArr(j.FullDistrictScore)
-
-	//	area, _ := resulttmp["area"].(string)
-	//	city, _ := resulttmp["city"].(string)
-	//	district, _ := resulttmp["district"].(string)
-	//	qu.Debug("之前结果结果===", area, city, district)
-	arearesult := ""
-	cityresult := ""
-	districtresult := ""
-
-	if len(finishP) == 1 { //最高分一个
-		arearesult = finishP[0] //抽取结果直接赋值
-		cityresult = GetCity(arearesult, cityresult, e, finishC)
-		cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
-	} else if len(finishP) > 1 { //province最高分多个
-		if len(finishC) == 1 {
-			cityresult = finishC[0]
-			if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
-				arearesult = cfMap.P.Brief
-				cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
-			}
-		} else { //对应的city有多个(多个province和city)
-			arearesult = finishP[0] //抽取结果直接赋值
-			cityresult = GetCity(arearesult, cityresult, e, finishC)
-			cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
-		}
-	}
-	//qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
-	if arearesult == "" {
-		arearesult = "全国"
-	} else if cityresult == "" {
-		if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
-			cityresult = pbMap.Cap
-			resulttmp["defaultpcap"] = true
-		}
-	}
-
-	//qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
-	resulttmp["area"] = arearesult
-	resulttmp["city"] = cityresult
-	resulttmp["district"] = districtresult
-}
-func (e *ExtractTask) GetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
-	defer qu.Catch()
-	jsondata := *j.Jsondata
-	if jsondata != nil { //jsondata中获取province和city
-		if acd, ok := jsondata["area_city_district"].(string); ok && acd != "" {
-			flag := false
-			p, flag = GetPCDByAreaDFA(p, acd, e, j, flag)
-			if !flag {
-				p, c, flag = GetPCDByCityDFA(p, c, acd, e, j, flag)
-			}
-			if !flag {
-				p, city, c = GetPCDByDistrictDFA(p, c, d, acd, e, j)
-			}
-		}
-
-		city, _ = jsondata["city"].(string)         //city全称或者简称
-		province, _ = jsondata["area"].(string)     //province简称
-		district, _ = jsondata["district"].(string) //district全称
-	}
-	PCDScore(j, "district", district, 5, true) //district打分
-	bp := false
-	if province != "" {
-		if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
-			bp = true //省份正确
-		}
-	}
-	pbrief := ""
-	if city != "" {
-		cityfullmap := e.CityFullMap[city] //判断city全称是否正确
-		if cityfullmap != nil {
-			pbrief = cityfullmap.P.Brief //province简称
-		} else {
-			citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
-			if citybriefmap != nil {
-				city = citybriefmap.Name //city简称替换为全称
-				pbrief = citybriefmap.P.Brief
-			}
-		}
-	}
-	if bp {
-		if pbrief == province { //爬虫的province和city匹配
-			PCDScore(j, "city", city, 5, true)
-		} else { //pbrief不匹配province(此时city为空或者错误)
-			city = ""
-		}
-		PCDScore(j, "province", province, 5, true)
-	} else { //省份错误或为空,取city的对应的pbrief为province
-		if pbrief != "" {
-			province = pbrief
-			PCDScore(j, "province", province, 5, true)
-			PCDScore(j, "city", city, 5, true)
-		} else {
-			province = ""
-			city = ""
-		}
-	}
-	return
-
-}
-func (e *ExtractTask) GetCityByBuyer(j *ju.Job, buyer string) (province, city, district string) {
-	defer qu.Catch()
-	return
-}
-func (e *ExtractTask) GetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
-	defer qu.Catch()
-	pc := e.PostCodeMap[postcode]
-	if pc != nil {
-		province = pc.P
-		city = pc.C
-		districtTmp := pc.D
-		if len(districtTmp) == 1 { //对应多个district舍去
-			district = districtTmp[0]
-			PCDScore(j, "district", district, 5, true)
-		}
-		PCDScore(j, "province", province, 5, true)
-		PCDScore(j, "city", city, 5, true)
-	}
-	return
-}
-func (e *ExtractTask) GetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
-	defer qu.Catch()
-	if len(buyertel) >= 11 {
-		if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
-			n := 4
-		L:
-			areacode := buyertel[:n]
-			ac := e.AreaCodeMap[areacode]
-			if ac != nil {
-				province = ac.P
-				citytmp := ac.C
-				if len(citytmp) == 1 { //对应多个city舍去
-					city = citytmp[0]
-					PCDScore(j, "city", city, 5, true)
-				}
-				PCDScore(j, "province", province, 5, true)
-			} else {
-				n = n - 1
-				if n >= 3 {
-					goto L
-				}
-			}
-		} else if buyertel[:3] == "853" { //澳门
-			province = "澳门"
-			city = "澳门"
-			PCDScore(j, "province", province, 5, true)
-			PCDScore(j, "city", city, 5, true)
-		}
-	}
-	return
-}
-func (e *ExtractTask) GetCityByOthers(j *ju.Job, sm *SortMap) ([]map[string]string, []map[string]string, []map[string]string) {
-	//存储每个流程的抽取结果
-	area2 := []map[string]string{}
-	city2 := []map[string]string{}
-	district2 := []map[string]string{}
-	isExtPC := false
-	for _, from := range sm.Keys { //buyeraddr;title;projectname
-		str, _ := sm.Map[from].(string)
-		//分别记录buyeraddr;title;projectname全称匹配的打分情况
-		pscore1 := make(map[string]int)
-		cscore1 := make(map[string]int)
-		dscore1 := make(map[string]int)
-		//优先province,city,district,street全称匹配
-		for pos, GET := range []*ju.DFA{e.ProvinceAllGet, e.CityAllGet, e.DistrictAllGet, e.StreetGet} {
-			word := GET.CheckSensitiveWord(str)
-			if word != "" {
-				if pos == 0 { //province
-					pbrief := e.ProvinceMap[word] //取province简称
-					OtherScore("p", []string{pbrief}, &pscore1, &cscore1, &dscore1)
-				} else if pos == 1 { //city
-					p := ""
-					cityfullmap := e.CityFullMap[word]
-					if cityfullmap != nil {
-						p = cityfullmap.P.Brief //取province简称
-					}
-					OtherScore("c", []string{p, word}, &pscore1, &cscore1, &dscore1)
-				} else if pos == 2 { //district
-					p, c := "", ""
-					dcitymap := e.DistrictCityMap[word] //区对应的city
-
-					if dcitymap != nil {
-						c = dcitymap.Name    //city全称
-						p = dcitymap.P.Brief //province简称
-					}
-					tmpArr := []string{p, c, word}
-					if word == c { //河南济源市
-						tmpArr = []string{p, c}
-					}
-					OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
-				} else if pos == 3 { //street
-					p, c, d := "", "", ""
-					sdmap := e.StreetDistrictMap[word] //对应的区
-
-					if sdmap != nil {
-						d = sdmap.Name
-						c = sdmap.C.Name
-						p = sdmap.C.P.Brief
-					}
-					tmpArr := []string{p, c, d}
-					if c == d { //河南济源市
-						tmpArr = []string{p, c}
-					}
-					OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
-				}
-			}
-		}
-
-		//取最高分的province,city,district
-		ph1 := HighestScore(pscore1)
-		ch1 := HighestScore(cscore1)
-		dh1 := HighestScore(dscore1)
-		isMatch := IsMatch(ph1, ch1, e) //最高分p和最高分c可能不对应
-		if ch1 != "" && ph1 != "" && isMatch {
-			isExtPC = true
-		}
-		//是否相互匹配
-		area2 = append(area2, map[string]string{from + "_all": ph1})
-		city2 = append(city2, map[string]string{from + "_all": ch1})
-		district2 = append(district2, map[string]string{from + "_all": dh1})
-		//buyeraddr,title,projectname匹配对应的结果加入最终得分
-		if isMatch {
-			if from == "buyeraddr" || from == "buyer" { //全称匹配,buyeraddr和buyer3分,title和projectname2分
-				PCDScore(j, "province", ph1, 3, true)
-				PCDScore(j, "city", ch1, 3, true)
-				PCDScore(j, "district", dh1, 3, true)
-			} else {
-				PCDScore(j, "province", ph1, 2, true)
-				PCDScore(j, "city", ch1, 2, true)
-				PCDScore(j, "district", dh1, 2, true)
-			}
-		}
-
-	}
-	//判断全称是否抽出了province和city,一个未抽出走简称抽取
-	if !isExtPC {
-		for _, from := range sm.Keys { //buyeraddr;title;projectname
-			str, _ := sm.Map[from].(string)
-			pscore2 := make(map[string]int)
-			cscore2 := make(map[string]int)
-			dscore2 := make(map[string]int)
-			for pos, GET := range []*ju.DFA{e.ProvinceSimGet, e.CitySimGet, e.DistrictSimGet} {
-				word := GET.CheckSensitiveWord(str)
-				if word != "" {
-					if pos == 0 { //province
-						OtherScore("p", []string{word}, &pscore2, &cscore2, &dscore2)
-					} else if pos == 1 { //city
-						p, c := "", ""
-						citybriefmap := e.CityBriefMap[word]
-						if citybriefmap != nil {
-							p = citybriefmap.P.Brief
-							c = citybriefmap.Name
-						}
-						OtherScore("c", []string{p, c}, &pscore2, &cscore2, &dscore2)
-					} else if pos == 2 { //district
-						p, c := "", ""
-						d := e.DistrictSimAndAll[word]
-
-						dcitymap := e.DistrictCityMap[word]
-						if dcitymap != nil {
-							c = dcitymap.Name
-							p = dcitymap.P.Brief
-						}
-						OtherScore("d", []string{p, c, d}, &pscore2, &cscore2, &dscore2)
-					}
-				}
-			}
-			//取最高分的province,city,district
-			ph2 := HighestScore(pscore2)
-			ch2 := HighestScore(cscore2)
-			dh2 := HighestScore(dscore2)
-			area2 = append(area2, map[string]string{from + "_sim": ph2})
-			city2 = append(city2, map[string]string{from + "_sim": ch2})
-			district2 = append(district2, map[string]string{from + "_sim": dh2})
-			//buyeraddr,title,projectname匹配对应的结果加入最终得分
-			if from == "buyeraddr" {
-				PCDScore(j, "province", ph2, 2, true)
-				PCDScore(j, "city", ch2, 2, true)
-				PCDScore(j, "district", dh2, 2, true)
-			} else {
-				PCDScore(j, "province", ph2, 1, true)
-				PCDScore(j, "city", ch2, 1, true)
-				PCDScore(j, "district", dh2, 1, true)
-			}
-		}
-	}
-
-	return area2, city2, district2
-}
-
-func IsMatch(p, c string, e *ExtractTask) bool {
-	ism := false
-	if p != "" && c == "" {
-		return true
-	}
-	if cfMap := e.CityFullMap[c]; cfMap != nil {
-		if cfMap.P.Brief == p {
-			ism = true
-		}
-	}
-	return ism
-}
-
-//计算province,city,district得分
-func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
-	defer qu.Catch()
-	if text != "" {
-		if stype == "district" {
-			tmpdistrict := make(map[string]float64)
-			if isfull {
-				tmpdistrict = j.FullDistrictScore
-			} else {
-				tmpdistrict = j.SimDistrictScore
-			}
-			scoretmp := tmpdistrict[text]
-			tmpdistrict[text] = scoretmp + score
-		} else if stype == "city" {
-			tmpcity := make(map[string]float64)
-			if isfull {
-				tmpcity = j.FullCityScore
-			} else {
-				tmpcity = j.SimCityScore
-			}
-			scoretmp := tmpcity[text]
-			tmpcity[text] = scoretmp + score
-		} else if stype == "province" {
-			tmpprovince := make(map[string]float64)
-			if isfull {
-				tmpprovince = j.FullAreaScore
-			} else {
-				tmpprovince = j.SimAreaScore
-			}
-			scoretmp := tmpprovince[text]
-			tmpprovince[text] = scoretmp + score
-		}
-	}
-}
-
-func OtherScore(stype string, text []string, ps, cs, ds *map[string]int) {
-	defer qu.Catch()
-	for i, t := range text {
-		if t != "" {
-			if i == 0 { //p
-				tmpscore := (*ps)[t]
-				(*ps)[t] = tmpscore + 1
-			} else if i == 1 { //c
-				tmpscore := (*cs)[t]
-				(*cs)[t] = tmpscore + 1
-			} else if i == 2 { //d
-				tmpscore := (*ds)[t]
-				(*ds)[t] = tmpscore + 1
-			}
-		}
-
-	}
-}
-
-func HighestScore(m map[string]int) string {
-	result := ""
-	tmpscore := 0
-	for str, score := range m {
-		if str != "" && tmpscore < score {
-			result = str
-			tmpscore = score
-		}
-	}
-	return result
-}
-
-func HighestScoreArr(m map[string]float64) []string {
-	result := make(map[float64][]string)
-	tmpscore := 0.0
-	for str, score := range m {
-		if str != "" && tmpscore <= score {
-			if result[tmpscore] != nil && tmpscore != score {
-				delete(result, tmpscore)
-			}
-			if r := result[score]; r != nil {
-				r = append(r, str)
-				result[score] = r
-			} else {
-				result[score] = []string{str}
-			}
-			tmpscore = score
-		}
-	}
-	return result[tmpscore]
-}
-
-func GetCity(area, city string, e *ExtractTask, finishC []string) string {
-	for _, c := range finishC { //取最高分与province匹配的city
-		if cfMap := e.CityFullMap[c]; cfMap != nil {
-			if cfMap.P.Brief == area {
-				city = c
-				break
-			}
-		}
-	}
-	return city
-}
-
-func GetDistrict(area, city, district string, e *ExtractTask, finishD []string) (string, string) {
-	for _, d := range finishD { //取最高分与province匹配的district
-		if dcMap := e.DistrictCityMap[d]; dcMap != nil {
-			if dcMap.P.Brief == area {
-				district = d
-				tmpcity := dcMap.Name
-				if city != tmpcity {
-					if cfMap := e.CityFullMap[tmpcity]; cfMap != nil {
-						if cfMap.P.Brief == area {
-							city = tmpcity
-							break
-						}
-					}
-				}
-			}
-		}
-	}
-	return city, district
+var AgencyReg = []*regexp.Regexp{ // agency-related patterns; where they are applied is not visible in this diff
+	regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"), // an agency keyword or "winning-bid supplier", plus up to 30 following chars ((?s): '.' also matches newlines)
+	regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"), // 2-15 chars followed by "(tendering) agency", "consulting" or "government procurement"
 }
 
-func GetPCDByAreaDFA(province, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, bool) {
-	if word := e.ProvinceSimGet.CheckSensitiveWord(acd); word != "" { //取省
-		if pbMap := e.ProvinceBriefMap[word]; pbMap != nil {
-			province = pbMap.Brief
-			if province == acd || pbMap.Name == acd { //用于判断area_city_district是否只有省份信息,flag为true就不在匹配area_city_district中的city和district
-				flag = true
-			}
-			PCDScore(j, "province", province, 5, true)
-		}
-	}
-	return province, flag
-}
-
-func GetPCDByCityDFA(province, city, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, string, bool) {
-	for pos, GET := range []*ju.DFA{e.CityAllGet, e.CitySimGet} { //取市
-		if word := GET.CheckSensitiveWord(acd); word != "" {
-			if pos == 0 { //全称
-				if cfMap := e.CityFullMap[word]; cfMap != nil {
-					if province != "" && cfMap.P.Brief == province { //acd有province信息
-						city = cfMap.Name
-						if acd == province+city || acd == cfMap.P.Name+city {
-							flag = true
-						}
-					} else if province == "" { //acd有city;city和district信息
-						city = cfMap.Name
-						province = cfMap.P.Brief
-						PCDScore(j, "province", province, 5, true)
-						if acd == city {
-							flag = true
-						}
-					}
-					PCDScore(j, "city", city, 5, true)
-					break
-				}
-			} else { //简称
-				if cbMap := e.CityBriefMap[word]; cbMap != nil {
-					if province != "" && cbMap.P.Brief == province {
-						city = cbMap.Name
-						if acd == province+city || acd == cbMap.P.Name+city {
-							flag = true
-						}
-					} else if province == "" {
-						city = cbMap.Name
-						province = cbMap.P.Brief
-						PCDScore(j, "province", province, 5, true)
-						if acd == city {
-							flag = true
-						}
-					}
-					PCDScore(j, "city", city, 5, true)
-					break
-				}
-			}
-		}
-	}
-	return province, city, flag
-}
-func GetPCDByDistrictDFA(province, city, district, acd string, e *ExtractTask, j *ju.Job) (string, string, string) {
-	//area_city_district字段不会单独存区信息(省市,省,市,省区,省市区)
-	for pos, GET := range []*ju.DFA{e.DistrictAllGet, e.DistrictSimGet} { //取区
-		if word := GET.CheckSensitiveWord(acd); word != "" {
-			if dcMap := e.DistrictCityMap[word]; dcMap != nil {
-				district = word
-				if pos == 1 { //简称换为全称
-					district = e.DistrictSimAndAll[district]
-				}
-				if city == "" && dcMap.P.Brief == province { //只有province和district(are_city_district:河南省二七区)
-					city = dcMap.Name
-					PCDScore(j, "city", city, 5, true)
-				} else if province == "" { //province和city都没有(are_city_district:二七区)
-					city = dcMap.Name
-					province = dcMap.P.Brief
-					PCDScore(j, "city", city, 5, true)
-					PCDScore(j, "province", province, 5, true)
-				}
-				PCDScore(j, "district", district, 5, true)
-				break
-			}
-		}
-	}
-	return province, city, district
-}
+var xjbtReg *regexp.Regexp = regexp.MustCompile("^(新疆生产建设兵团|新疆兵团)") // anchored at the start: matches the Xinjiang Production and Construction Corps prefix (full or short name)
+var sensitiveReg = regexp.MustCompile("(上一[条篇]|下一[条篇])[::].*") // matches a "previous/next item" label, the colon after it, and the rest of that line

+ 14 - 0
src/jy/extract/extractcity_new.go

@@ -0,0 +1,14 @@
+package extract
+
+import (
+	ju "jy/util"
+	qu "qfw/util"
+)
+
+// ExtractRegionInfo extracts region information; at this commit it is still a stub.
+func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}) {
+	defer qu.Catch() // presumably recovers from panics — confirm against qfw/util
+	// TODO: extraction logic not implemented yet; j and tmp are not used so far.
+
+}
+

+ 78 - 73
src/jy/extract/newextractcity.go → src/jy/extract/extractcity_old.go

@@ -4,20 +4,11 @@ import (
 	. "jy/pretreated"
 	ju "jy/util"
 	qu "qfw/util"
-	"regexp"
 	"strings"
 
 	log "github.com/donnie4w/go-logger/logger"
 )
 
-var AgencyReg = []*regexp.Regexp{
-	regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"),
-	regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"),
-}
-
-var xjbtReg *regexp.Regexp = regexp.MustCompile("^(新疆生产建设兵团|新疆兵团)")
-var sensitiveReg = regexp.MustCompile("(上一[条篇]|下一[条篇])[::].*")
-
 //抽取city
 func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}, id string) {
 	/*
@@ -272,7 +263,7 @@ func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.
 				}
 			}
 		} else if e.Trie_Full_District.Get(full) { //a_c_d有district全称(district可能对应多个城市)
-			carr := e.NewDistrictCityMap[full]
+			carr := e.DistrictCityMap[full]
 			if len(carr) > 0 {
 				district = full
 				PCDScore(j, "district", district, 5, true)
@@ -285,11 +276,11 @@ func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.
 							PCDScore(j, "province", tmpPbrief, 5, true)
 							repeatPb[tmpPbrief] = true
 						}
-					} else {                     //已有省份
+					} else { //已有省份
 						if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
 							PCDScore(j, "city", tmpcity, -5, true)
 							PCDScore(j, "province", tmpPbrief, -5, true)
-						} else {            //与之前匹配结果一致
+						} else { //与之前匹配结果一致
 							if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
 								PCDScore(j, "city", tmpcity, 5, true)
 							}
@@ -329,7 +320,7 @@ func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.J
 				}
 			}
 		} else if district == "" && e.Trie_Sim_District.Get(sim) { //全称未匹配到确定的district
-			dfullarr := e.NewDistrictSimAndAll[sim]
+			dfullarr := e.DistrictSimAndAll[sim]
 			if len(dfullarr) > 0 {
 				PCDScore(j, "district", sim, 5, true)
 				for _, dfullAndCity := range dfullarr { //district简称对应的所有全称
@@ -345,11 +336,11 @@ func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.J
 								PCDScore(j, "province", tmpPbrief, 5, true)
 								repeatPb[tmpPbrief] = true
 							}
-						} else {                     //已有省份
+						} else { //已有省份
 							if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
 								PCDScore(j, "city", tmpcity, -5, true)
 								PCDScore(j, "province", tmpPbrief, -5, true)
-							} else {            //与之前匹配结果一致
+							} else { //与之前匹配结果一致
 								if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
 									PCDScore(j, "city", tmpcity, 5, true)
 								}
@@ -487,7 +478,7 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
 						repeatPb := map[string]bool{}
 						isOk := false
 						districtOk := false
-						citys := e.NewDistrictCityMap[text]
+						citys := e.DistrictCityMap[text]
 						for _, c := range citys {
 							tmpPbrief := c.P.Brief
 							if p_full == tmpPbrief { //省份一致
@@ -534,15 +525,15 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
 							break
 						}
 					} else if pos_full == 3 { //街道全称
-						districts := e.NewStreetDistrictMap[text]
+						districts := e.StreetDistrictMap[text]
 						if len(districts) == 1 { //街道唯一
 							DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
 						}
 					} else if pos_full == 4 { //居委会全称
-						districts := e.CommunityDistrictMap[text]
-						if len(districts) == 1 { //居委会唯一
-							DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
-						}
+						//districts := e.CommunityDistrictMap[text]
+						//if len(districts) == 1 { //居委会唯一
+						//	DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
+						//}
 					}
 				}
 			}
@@ -583,7 +574,7 @@ func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore,
 					} else if pos_sim == 2 && d_sim == "" { //区简称
 						repeatPb := map[string]bool{}
 						repeatDb := map[string]bool{}
-						dfull_citys := e.NewDistrictSimAndAll[text]
+						dfull_citys := e.DistrictSimAndAll[text]
 						for _, dfull_city := range dfull_citys {
 							for dfull, c := range dfull_city { //dfull:简称对应的全称
 								if c == nil || c.P == nil {
@@ -680,7 +671,7 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
 							break
 						}
 					} else if pos_full == 2 { //区全称
-						citys := e.NewDistrictCityMap[text]
+						citys := e.DistrictCityMap[text]
 						if len(citys) > 0 {
 							if !repeatD_full[text] {
 								PCDScore(j, "district", text, 1, true)
@@ -699,15 +690,15 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
 							break
 						}
 					} else if pos_full == 3 { //街道全称
-						districts := e.NewStreetDistrictMap[text]
+						districts := e.StreetDistrictMap[text]
 						if len(districts) == 1 {
 							DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
 						}
 					} else if pos_full == 4 { //居委会全称
-						districts := e.CommunityDistrictMap[text]
-						if len(districts) == 1 {
-							DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
-						}
+						//districts := e.CommunityDistrictMap[text]
+						//if len(districts) == 1 {
+						//	DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
+						//}
 					}
 				}
 			}
@@ -732,7 +723,7 @@ func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
 							break
 						}
 					} else if pos_sim == 2 { //区简称
-						dfull_citys := e.NewDistrictSimAndAll[text]
+						dfull_citys := e.DistrictSimAndAll[text]
 						if len(dfull_citys) == 1 {
 							for _, dfull_city := range dfull_citys {
 								for dfull, ctmp := range dfull_city { //dfull:简称对应的全称
@@ -794,7 +785,6 @@ func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, scor
 		}
 	}
 }
-
 func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
 	for _, c := range finishC { //取最高分与province匹配的city
 		if cfMap := e.CityFullMap[c]; cfMap != nil {
@@ -810,10 +800,9 @@ func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (s
 	}
 	return city, tmpcity
 }
-
 func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
 	for _, d := range finishD { //取最高分与province匹配的district
-		citys := e.NewDistrictCityMap[d]
+		citys := e.DistrictCityMap[d]
 		for _, c := range citys {
 			if len(tmpcity) == 0 { //没有city
 				if c.P.Brief == area {
@@ -826,7 +815,7 @@ func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcit
 					district = d
 					return city, district
 				}
-			} else {                         //多个city
+			} else { //多个city
 				for _, tc := range tmpcity { //多个city根据district最高分取
 					if tc == c.Name && len(finishD) == 1 {
 						city = c.Name
@@ -856,7 +845,6 @@ func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[strin
 		}
 	}
 }
-
 func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
 	if len(j.FullAreaScore) > 0 {
 		for pt, ps := range *pscore {
@@ -904,7 +892,7 @@ func MergeFullSimScore(j *ju.Job) {
 func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
 	if len(j.FullDistrictScore) > 0 {
 		for d, _ := range j.FullDistrictScore {
-			tmpCitys := e.NewDistrictCityMap[d]
+			tmpCitys := e.DistrictCityMap[d]
 			for _, c := range tmpCitys {
 				if j.FullCityScore[c.Name] != 0 {
 					tmpPb := c.P.Brief
@@ -950,6 +938,61 @@ func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
 
 }
 
+func HighestScoreArr(m map[string]float64) []string {
+	result := make(map[float64][]string)
+	tmpscore := 0.0
+	for str, score := range m {
+		if str != "" && tmpscore <= score {
+			if result[tmpscore] != nil && tmpscore != score {
+				delete(result, tmpscore)
+			}
+			if r := result[score]; r != nil {
+				r = append(r, str)
+				result[score] = r
+			} else {
+				result[score] = []string{str}
+			}
+			tmpscore = score
+		}
+	}
+	return result[tmpscore]
+}
+
+//计算province,city,district得分
+func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
+	defer qu.Catch()
+	if text != "" {
+		if stype == "district" {
+			tmpdistrict := make(map[string]float64)
+			if isfull {
+				tmpdistrict = j.FullDistrictScore
+			} else {
+				tmpdistrict = j.SimDistrictScore
+			}
+			scoretmp := tmpdistrict[text]
+			tmpdistrict[text] = scoretmp + score
+		} else if stype == "city" {
+			tmpcity := make(map[string]float64)
+			if isfull {
+				tmpcity = j.FullCityScore
+			} else {
+				tmpcity = j.SimCityScore
+			}
+			scoretmp := tmpcity[text]
+			tmpcity[text] = scoretmp + score
+		} else if stype == "province" {
+			tmpprovince := make(map[string]float64)
+			if isfull {
+				tmpprovince = j.FullAreaScore
+			} else {
+				tmpprovince = j.SimAreaScore
+			}
+			scoretmp := tmpprovince[text]
+			tmpprovince[text] = scoretmp + score
+		}
+	}
+}
+
 //-新疆兵团映射-
 func (e *ExtractTask) CheckingXjbtCity(buyer string) (new_a, new_c, new_d string, ok bool) {
 	buyer = strings.ReplaceAll(buyer, "新疆兵团", "新疆生产建设兵团")
@@ -1012,43 +1055,5 @@ func (e *ExtractTask) SensitiveCityData(detail string, area string) string {
 	//		return v
 	//	}
 	//}
-
 	return ""
 }
-
-//province,city,district干扰项减分
-//func PCDSubtractScore(e *ExtractTask, j *ju.Job, stype, text string, score int) {
-//	defer qu.Catch()
-//	if text != "" {
-//		if stype == "city" {
-//			for cn, cscore := range j.CityScore {
-//				if cn != text {
-//					j.CityScore[cn] = cscore + score
-//					//错误的city减分后对应的province也减分
-//					for pb, pscore := range j.AreaScore {
-//						if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
-//							j.AreaScore[pb] = pscore + score
-//						}
-//					}
-//				}
-//			}
-//		} else if stype == "province" {
-//			for pb, pscore := range j.AreaScore {
-//				if pb != text {
-//					j.AreaScore[pb] = pscore + score
-//					//错误的province减分后对应的city也要减分
-//					for cn, cscore := range j.CityScore {
-//						if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
-//							j.CityScore[cn] = cscore + score
-//						}
-//					}
-//				}
-//			}
-//		}
-//		//		for name, tmpscore := range *whichMap {
-//		//			if name != text {
-//		//				(*whichMap)[name] = tmpscore + score
-//		//			}
-//		//		}
-//	}
-//}

+ 1 - 0
src/main.go

@@ -28,6 +28,7 @@ func init() {
 }
 
 func main() {
+	//调试
 	extract.ExtractUdpUpdateMachine() //节点上传~构建
 	extract.ExtractUdp()              //udp通知抽取
 	go Router.Run(":" + qu.ObjToString(u.Config["port"]))

+ 1 - 1
udpcontrol/src/initdata.go

@@ -46,6 +46,7 @@ func initVarData() {
 	using_machine = qu.IntAll(sysconfig["using_machine"])
 	nextNode = qu.ObjArrToMapArr(sysconfig["nextNode"].([]interface{}))
 	lastNodeResponse = time.Now().Unix()
+	isGetask = false
 }
 
 //加载抽取
@@ -95,7 +96,6 @@ func initExtractNode() {
 //重置抽取
 func resetExtNodeArr() {
 	isAction = false
-	isGetask = false
 	using_ext_node = []map[string]interface{}{}
 	standby_ext_node = []map[string]interface{}{}
 	invalid_ext_node = []map[string]interface{}{}

+ 1 - 1
udpcontrol/src/method.go

@@ -43,7 +43,7 @@ func getRepeatTask() {
 			}
 			getasklock.Unlock()
 		} else {
-			time.Sleep(15 * time.Second)
+			time.Sleep(30 * time.Second)
 		}
 	}
 }