Răsfoiți Sursa

1、抽取服务~地域提取
2、备份抽取项目

zhengkun 2 ani în urmă
părinte
comite
e7533c5863

+ 0 - 253
extcity/src/ext/checkcity.go

@@ -1,253 +0,0 @@
-package ext
-
-import (
-	"fmt"
-	log "github.com/donnie4w/go-logger/logger"
-	qu "qfw/util"
-	"regexp"
-	"strings"
-	ul "util"
-)
-
-
-type CP struct {
-	P_Name    string
-}
-type CC struct {
-	P_Name string
-	C_Name string
-}
-type CD struct {
-	P_Name string
-	C_Name string
-	D_Name string
-}
-var (
-	ProvinceDict	map[string][]CP				//省份-map
-	CityDict		map[string][]CC					//城市-map
-	DistrictDict	map[string][]CD				//区县-map
-)
-var cityEndReg *regexp.Regexp = regexp.MustCompile("(区|县|市)$")
-
-//核对城市
-func GetCheckDataCity(tmp map[string]interface{},update_check *map[string]interface{}) {
-
-	area := qu.ObjToString(tmp["area"])
-	city := qu.ObjToString(tmp["city"])
-	district := qu.ObjToString(tmp["district"])
-	rdata := standardCheckCity(area,city,district)//标准城市-校验
-	if len(rdata)>0 {
-		umap:=updateLogging(tmp,rdata,"标准信息")
-		copyUpdateData(umap,update_check)
-	}
-}
-
-
-//标准校验
-func standardCheckCity(area string,city string,district string) map[string]string{
-
-	rdata := make(map[string]string,0)
-	if area=="香港"||area=="澳门"||area=="台湾" || (area=="全国"&&(city==""&&district=="")) {
-		return rdata
-	}
-	//第一步:区校验
-	if district!="" {
-		districtArr := DistrictDict[district]
-		if districtArr==nil {//涉及了 个别别名相关的数据
-			trim_arr := aliasDataDistrict(district)//拆分后缀
-			if len(trim_arr)>0 {
-				for _,alias_district := range trim_arr {
-					alias_districtArr := DistrictDict[alias_district]
-					for _,v:=range alias_districtArr{
-						if  city == v.C_Name && area == v.P_Name {
-							rdata["district"] = alias_district
-							return rdata
-						}
-					}
-				}
-			}
-			rdata["district"] = ""
-		}else {
-			isTrue := false
-			for _,v:=range districtArr{
-				if  city == v.C_Name && area == v.P_Name {
-					isTrue = true
-					break
-				}
-			}
-			if isTrue { //完全匹配
-				return rdata
-			}else { //未完全匹配
-				if len(districtArr)==1 {
-					rdata["area"] = districtArr[0].P_Name
-					rdata["city"] = districtArr[0].C_Name
-					rdata["district"] = districtArr[0].D_Name
-					return rdata
-				}else {
-					rdata["district"] = ""
-				}
-			}
-		}
-	}
-
-	//第二步:区校验-失败   市-校验
-	if city != "" {
-		cityArr := CityDict[city]
-		if cityArr==nil {
-			//把市当成区,匹配三级   - 存在优化空间- city:郑州  别名
-			districtArr := DistrictDict[city]
-			for _,v:=range districtArr{
-				if  city == v.C_Name && area == v.P_Name {
-					rdata["area"] = districtArr[0].P_Name
-					rdata["city"] = districtArr[0].C_Name
-					rdata["district"] = districtArr[0].D_Name
-					return rdata
-				}
-			}
-			rdata["city"] = ""
-		}else {
-			isTrue := false
-			for _,v:=range cityArr{
-				if  area == v.P_Name {
-					isTrue = true
-					break
-				}
-			}
-			if isTrue { //完全匹配
-				return rdata
-			}else { //未完全匹配
-				if len(cityArr)==1 {
-					rdata["area"] = cityArr[0].P_Name
-					rdata["city"] = cityArr[0].C_Name
-					rdata["district"] = ""
-					return rdata
-				}else {
-					rdata["city"] = ""
-				}
-			}
-		}
-	}
-
-	//第三步:省份校验
-	if ProvinceDict[area]==nil {
-		rdata["area"] = "全国"
-		rdata["city"] = ""
-		rdata["district"] = ""
-	}
-
-	return rdata
-}
-
-//更新日志
-func updateLogging(tmp map[string]interface{},rdata map[string]string,desc string) map[string]interface{} {
-	umap := make(map[string]interface{})
-	if tmp["modifycheck"] == nil {
-		umap["modifycheck"] = make(map[string]interface{})
-	} else {
-		umap["modifycheck"] = tmp["modifycheck"]
-	}
-	for rk, rv := range rdata {
-		umap[rk] = rv
-		umap["modifycheck"].(map[string]interface{})[rk] = fmt.Sprintf("%s~%s~%s",desc,qu.ObjToString(tmp[rk]),rv)
-	}
-	return umap
-}
-
-func copyUpdateData(tmp map[string]interface{},update_check *map[string]interface{}) {
-	for k,v := range tmp {
-		(*update_check)[k] = v
-	}
-}
-
-//拆分三级县
-func aliasDataDistrict(district string) []string {
-	arr :=[]string{}
-	if cityEndReg.MatchString(district) {
-		str := cityEndReg.FindString(district)
-		strings.TrimRight(district, str)
-		if str=="县"{
-			arr = append(arr,fmt.Sprintf("%s区",strings.TrimRight(district, str)))
-			arr = append(arr,fmt.Sprintf("%s市",strings.TrimRight(district, str)))
-		}else if str=="区"{
-			arr = append(arr,fmt.Sprintf("%s县",strings.TrimRight(district, str)))
-			arr = append(arr,fmt.Sprintf("%s市",strings.TrimRight(district, str)))
-		} else if str=="市"{
-			arr = append(arr,fmt.Sprintf("%s县",strings.TrimRight(district, str)))
-			arr = append(arr,fmt.Sprintf("%s区",strings.TrimRight(district, str)))
-		}else {
-
-		}
-	}else { //未找到 district- 区县市  例: district : 金水
-		arr = append(arr,fmt.Sprintf("%s区",district))
-		arr = append(arr,fmt.Sprintf("%s县",district))
-		arr = append(arr,fmt.Sprintf("%s市",district))
-	}
-	return arr
-}
-
-
-
-
-
-//初始化
-func InitCheckCity()  {
-	//初始化-城市配置
-	ProvinceDict = make(map[string][]CP,0)
-	CityDict = make(map[string][]CC,0)
-	DistrictDict = make(map[string][]CD,0)
-
-	q := map[string]interface{}{
-		"town_code":map[string]interface{}{
-			"$exists":0,
-		},
-	}
-	sess := ul.ExtMgo.GetMgoConn()
-	defer ul.ExtMgo.DestoryMongoConn(sess)
-	it := sess.DB(ul.ExtMgo.DbName).C(ul.CheckColl).Find(&q).Iter()
-	total  := 0
-	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total%1000 == 0 {
-			log.Debug("当前数量:", total)
-		}
-		district_code := qu.IntAll(tmp["district_code"])
-		city_code := qu.IntAll(tmp["city_code"])
-		if district_code > 0 {
-			province := qu.ObjToString(tmp["province"])
-			city := qu.ObjToString(tmp["city"])
-			district := qu.ObjToString(tmp["district"])
-			data := CD{province,city,district}
-			if DistrictDict[district]==nil {
-				DistrictDict[district] = []CD{data}
-			}else {
-				arr := DistrictDict[district]
-				arr = append(arr,data)
-				DistrictDict[district] = arr
-			}
-		}else {
-			if city_code>0 {
-				province := qu.ObjToString(tmp["province"])
-				city := qu.ObjToString(tmp["city"])
-				data := CC{province,city}
-				if CityDict[city]==nil {
-					CityDict[city] = []CC{data}
-				}else {
-					arr := CityDict[city]
-					arr = append(arr,data)
-					CityDict[city] = arr
-				}
-			}else {
-				province := qu.ObjToString(tmp["province"])
-				data := CP{province}
-				if ProvinceDict[province]==nil {
-					ProvinceDict[province] = []CP{data}
-				}else {
-					arr := ProvinceDict[province]
-					arr = append(arr,data)
-					ProvinceDict[province] = arr
-				}
-			}
-		}
-		tmp = make(map[string]interface{})
-	}
-	log.Debug(fmt.Sprintf("城市配置加载完毕...省~%d 市~%d 区~%d",len(ProvinceDict),len(CityDict),len(DistrictDict)))
-}

+ 133 - 96
extcity/src/ext/extInit.go

@@ -1,62 +1,119 @@
 package ext
 
 import (
+	log "github.com/donnie4w/go-logger/logger"
 	"github.com/go-ego/gse"
+	"github.com/sensitive"
 	qu "qfw/util"
 	ul "util"
 )
+
+var Ext *ExtractTask
+var DefaultRegions, AdjustmentRegions = []string{}, []string{}
+
 type ExtractTask struct {
-	ProvinceMap          map[string]string    //省全称简称(key:浙江省 val:浙江)
-	ProvinceBriefMap     map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
-	CityMap              map[string]string    //市全称简称(key:杭州市 val:杭州)
-	CityBriefMap         map[string]*City     //市简称对应的市信息(key:杭州 val:&City{})
-	CityFullMap          map[string]*City     //市全称对应的市信息(key:杭州市 val:&City{})
-	DistrictCityMap      map[string]*City
-	NewDistrictCityMap   map[string][]*City            //区或县全称对应的city(全国有相同名称的区或县,这里对应的city用slice)
-	DistrictSimAndAll    map[string]string             //区或县(key:简称 val:全称)
-	NewDistrictSimAndAll map[string][]map[string]*City //区或县(key:简称 val:  相同简称的区全称:所在市)
-	StreetDistrictMap    map[string]*District          //街道对应的区或县
-	NewStreetDistrictMap map[string][]*District        //街道全称对应的区或县
-	CommunityDistrictMap map[string][]*District        //村、居委会对应的区或县
-	ProvinceAllGet       *DFA                       //省全称
-	ProvinceSimGet       *DFA                       //省简称
-	CityAllGet           *DFA                       //市全称
-	CitySimGet           *DFA                       //市简称
-	DistrictAllGet       *DFA                       //区或县全称
-	DistrictSimGet       *DFA                       //区或县简称
-	StreetGet            *DFA                       //街道
+	SiteCityMap       map[string]*SiteCity          //站点对应的省市区
+	ProvinceMap       map[string]string             //省全称简称(key:浙江省 val:浙江)
+	ProvinceBriefMap  map[string]*Province          //省简称对应的省信息(key:浙江 val:&Province{})
+	CityMap           map[string]string             //市全称简称(key:杭州市 val:杭州)
+	CityBriefMap      map[string]*City              //市简称对应的市信息(key:杭州 val:&City{})
+	CityFullMap       map[string]*City              //市全称对应的市信息(key:杭州市 val:&City{})
+	DistrictCityMap   map[string][]*City            //区或县全称对应的city(全国有相同名称的区或县,这里对应的city用slice)
+	DistrictSimAndAll map[string][]map[string]*City //区或县简称对应的city(全国有相同名称的区或县,这里对应的city用slice)
+	StreetDistrictMap map[string][]*District        //街道全称对应的区或县
 
-	Trie_Full_Province  *Trie       //省全称 省、直辖市、自治区
-	Trie_Full_City      *Trie       //市全称 地级市
-	Trie_Full_District  *Trie       //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
-	Trie_Full_Street    *Trie       //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
-	Trie_Full_Community *Trie       //村/委员会全称  村、居委会
-	Trie_Sim_Province   *Trie       //省简称
-	Trie_Sim_City       *Trie       //市简称
-	Trie_Sim_District   *Trie       //县简称
-	Trie_Fulls          []*Trie     //所有全称
-	Trie_Sims           []*Trie     //所有简称
-	Seg_PCD             *gse.Segmenter //分词
-	Seg_SV              *gse.Segmenter //分词
+	ProvinceAllGet *DFA //省全称
+	ProvinceSimGet *DFA //省简称
+	CityAllGet     *DFA //市全称
+	CitySimGet     *DFA //市简称
+	DistrictAllGet *DFA //区或县全称
+	DistrictSimGet *DFA //区或县简称
+	StreetGet      *DFA //街道
 
-}
+	XjbtCityArr           []map[string]interface{} //新疆兵团相关数据
+	SensitiveFullCity     *sensitive.Filter
+	SensitiveSimCity      *sensitive.Filter
+	SensitiveFullDistrict *sensitive.Filter
 
+	PostCodeMap map[string]*PostCode //邮编
+	AreaCodeMap map[string]*AreaCode //区号
 
+	Trie_Full_Province  *Trie          //省全称 省、直辖市、自治区
+	Trie_Full_City      *Trie          //市全称 地级市
+	Trie_Full_District  *Trie          //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
+	Trie_Full_Street    *Trie          //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
+	Trie_Full_Community *Trie          //村/委员会全称  村、居委会
+	Trie_Sim_Province   *Trie          //省简称
+	Trie_Sim_City       *Trie          //市简称
+	Trie_Sim_District   *Trie          //县简称
+	Trie_Fulls          []*Trie        //所有全称
+	Trie_Sims           []*Trie        //所有简称
+	Seg_PCD             *gse.Segmenter //分词
+	Seg_SV              *gse.Segmenter //分词
+}
 
+//加载所有
 func InitProvincesx() []map[string]interface{} {
 	defer qu.Catch()
-	provinces := make([]map[string]interface{}, 0)
-	provinces,_ = ul.ExtMgo.Find(ul.ExtColl,map[string]interface{}{
+	provinces, _ := ul.ExtMgo.Find("address_new_2020", map[string]interface{}{
 		"Remarks": nil,
-	},nil,nil)
+	}, nil, nil)
 	return provinces
 }
 
+// InitSite queries the "site" collection through ul.SiteMgo with an empty filter and returns the rows; the last argument names site, area, city, district, site_type and qy_area (presumably the field projection — confirm against Find).
+func InitSite() []map[string]interface{} {
+	defer qu.Catch()
+	query := map[string]interface{}{} // empty filter
+	list, _ := ul.SiteMgo.Find("site", query, nil, map[string]interface{}{ // the second result of Find is discarded
+		"site":      1,
+		"area":      1,
+		"city":      1,
+		"district":  1,
+		"site_type": 1,
+		"qy_area":   1,
+	})
+	return list
+}
+
+// InitXjbtCityInfo reads the "area_xjbt" collection (Xinjiang Corps mapping) through ul.ExtMgo and stores the rows, with their "_id" key removed, in e.XjbtCityArr.
+func (e *ExtractTask) InitXjbtCityInfo() {
+	defer qu.Catch()
+	query := map[string]interface{}{} // empty filter
+	list, _ := ul.ExtMgo.Find("area_xjbt", query, nil, nil) // the second result of Find is discarded
+	arr := []map[string]interface{}{}
+	for _, v := range list {
+		delete(v, "_id") // drop the "_id" key before keeping the row
+		arr = append(arr, v)
+	}
+	e.XjbtCityArr = arr
+}
 
+// InitUpdateSite rebuilds e.SiteCityMap from the rows returned by InitSite: one SiteCity per row, keyed by the row's "site" value.
+func (e *ExtractTask) InitUpdateSite() {
+	defer qu.Catch()
+	e.SiteCityMap = make(map[string]*SiteCity)
+	for _, v := range InitSite() {
+		site := qu.ObjToString(v["site"])
+		s := &SiteCity{
+			P: qu.ObjToString(v["area"]),
+			C: qu.ObjToString(v["city"]),
+			D: qu.ObjToString(v["district"]),
+			T: qu.ObjToString(v["site_type"]),
+			Q: qu.ObjToString(v["qy_area"]),
+		}
+		e.SiteCityMap[site] = s // a later row with the same site value replaces an earlier one
+	}
+	log.Debug("有效站点数量:", len(e.SiteCityMap))
+}
 
 func (e *ExtractTask) InitCityInfo() {
 	defer qu.Catch()
 	e.InitVar() //初始化变量
+	//新疆兵团数据
+	e.InitXjbtCityInfo()
+	//site站点信息
+	e.InitUpdateSite()
 	//初始化省信息
 	alldata := InitProvincesx()
 	fnx := make([]map[string]interface{}, 0)
@@ -125,14 +182,16 @@ func (e *ExtractTask) InitCityInfo() {
 			qc_city := qu.ObjToString(vcity["city"])
 			jc_city := qu.ObjToString(vcity["brief_city"])
 			e.Trie_Full_City.AddWords(qc_city) //加入市全称Trie(k:杭州市)
+			e.SensitiveFullCity.AddWord(qc_city)
 			c := &City{}
-			c.Name = qc_city //市全称:杭州市
+			c.Name = qc_city           //市全称:杭州市
+			e.CityFullMap[qc_city] = c //杭州市:市信息{}
 			if jc_city != "" {
 				c.Brief = jc_city                 //市简称:杭州
 				e.Trie_Sim_City.AddWords(c.Brief) //加入市简称Trie(k:杭州)
-				e.CityMap[qc_city] = c.Brief      //杭州市:杭州
-				e.CityBriefMap[c.Brief] = c       //杭州:市信息{}
-				e.CityFullMap[qc_city] = c        //杭州市:市信息{}
+				e.SensitiveSimCity.AddWord(c.Brief)
+				e.CityMap[qc_city] = c.Brief //杭州市:杭州
+				e.CityBriefMap[c.Brief] = c  //杭州:市信息{}
 			}
 			c.P = p
 			if city_alias, ok := vcity["city_alias"].([]interface{}); ok {
@@ -170,40 +229,41 @@ func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
 		d.Name = qc_district
 		d.C = c
 		e.Trie_Full_District.AddWords(qc_district) //加入区或县全称Trie
+		e.SensitiveFullDistrict.AddWord(qc_district)
 		if jc_district != "" {
 			e.Trie_Sim_District.AddWords(jc_district) //加入区或县简称Trie
 			//初始化城市简称
 			c := e.CityBriefMap[jc_city]
-			dfullarr := e.NewDistrictSimAndAll[jc_district]
+			dfullarr := e.DistrictSimAndAll[jc_district]
 			dfullcity := map[string]*City{qc_district: c}
 			if len(dfullarr) == 0 {
 				tmparr := []map[string]*City{dfullcity}
-				e.NewDistrictSimAndAll[jc_district] = tmparr
+				e.DistrictSimAndAll[jc_district] = tmparr
 			} else {
-				e.NewDistrictSimAndAll[jc_district] = append(e.NewDistrictSimAndAll[jc_district], dfullcity)
+				e.DistrictSimAndAll[jc_district] = append(e.DistrictSimAndAll[jc_district], dfullcity)
 			}
 		}
-		ctmp := e.NewDistrictCityMap[qc_district]
+		ctmp := e.DistrictCityMap[qc_district]
 		if len(ctmp) == 0 {
 			tmpcarr := []*City{c}
-			e.NewDistrictCityMap[qc_district] = tmpcarr
+			e.DistrictCityMap[qc_district] = tmpcarr
 		} else {
-			e.NewDistrictCityMap[qc_district] = append(e.NewDistrictCityMap[qc_district], c)
+			e.DistrictCityMap[qc_district] = append(e.DistrictCityMap[qc_district], c)
 		}
 		if district_alias, ok := vdistricts["district_alias"].([]interface{}); ok {
 			for _, vdistrict_alias := range district_alias {
 				strvdistrict_alias := qu.ObjToString(vdistrict_alias)
 				e.Trie_Full_District.AddWords(strvdistrict_alias) //加入区或县全称Trie
-				ctmp := e.NewDistrictCityMap[strvdistrict_alias]
-				if len(ctmp) == 0 {
+				c_tmp := e.DistrictCityMap[strvdistrict_alias]
+				if len(c_tmp) == 0 {
 					tmpcarr := []*City{c}
-					e.NewDistrictCityMap[strvdistrict_alias] = tmpcarr
+					e.DistrictCityMap[strvdistrict_alias] = tmpcarr
 				} else {
-					e.NewDistrictCityMap[strvdistrict_alias] = append(e.NewDistrictCityMap[strvdistrict_alias], c)
+					e.DistrictCityMap[strvdistrict_alias] = append(e.DistrictCityMap[strvdistrict_alias], c)
 				}
 			}
 		}
-		//街道
+		//街道乡镇
 		towns := towns_maps[jc_province][qc_city][qc_district]
 		for _, vtown := range towns {
 			strvtown := qu.ObjToString(vtown["town"])
@@ -211,12 +271,12 @@ func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
 			s.Name = strvtown
 			s.D = d
 			e.Trie_Full_Street.AddWords(strvtown) //加入街道全称Trie
-			dtmp := e.NewStreetDistrictMap[strvtown]
+			dtmp := e.StreetDistrictMap[strvtown]
 			if len(dtmp) == 0 {
 				tmpdarr := []*District{d}
-				e.NewStreetDistrictMap[strvtown] = tmpdarr
+				e.StreetDistrictMap[strvtown] = tmpdarr
 			} else {
-				e.NewStreetDistrictMap[strvtown] = append(e.NewStreetDistrictMap[strvtown], d)
+				e.StreetDistrictMap[strvtown] = append(e.StreetDistrictMap[strvtown], d)
 			}
 			//村、居委会
 			//jwhs := jwhs_maps[jc_province][qc_city][qc_district][strvtown]
@@ -235,6 +295,7 @@ func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
 
 	}
 }
+
 func (t *Trie) AddWords(words ...string) {
 	cur := t
 	for _, v := range words {
@@ -266,11 +327,9 @@ func (t *Trie) Get(word string) bool {
 	return cur.y
 }
 
-
 func (e *ExtractTask) InitVar() {
 	defer qu.Catch()
-	//初始化Trie
-	//全称
+
 	e.Trie_Full_Province = &Trie{}
 	e.Trie_Full_City = &Trie{}
 	e.Trie_Full_District = &Trie{}
@@ -284,45 +343,23 @@ func (e *ExtractTask) InitVar() {
 	//初始化分词
 	e.Seg_PCD = &gse.Segmenter{}
 	e.Seg_SV = &gse.Segmenter{}
-	e.Seg_PCD.LoadDict("./pcd.txt")
-	e.Seg_SV.LoadDict("./sv.txt")
-
-	if e.ProvinceMap == nil {
-		e.ProvinceMap = make(map[string]string)
-	}
-	if e.CityMap == nil {
-		e.CityMap = make(map[string]string)
-	}
-	if e.DistrictSimAndAll == nil {
-		e.DistrictSimAndAll = make(map[string]string)
-	}
-	if e.NewDistrictSimAndAll == nil {
-		e.NewDistrictSimAndAll = make(map[string][]map[string]*City)
-	}
-
-	if e.CityBriefMap == nil {
-		e.CityBriefMap = make(map[string]*City)
-	}
-	if e.CityFullMap == nil {
-		e.CityFullMap = make(map[string]*City)
-	}
-	if e.ProvinceBriefMap == nil {
-		e.ProvinceBriefMap = make(map[string]*Province)
-	}
-	if e.NewDistrictCityMap == nil {
-		e.NewDistrictCityMap = make(map[string][]*City)
-	}
-
-	if e.NewStreetDistrictMap == nil {
-		e.NewStreetDistrictMap = make(map[string][]*District)
-	}
-	if e.CommunityDistrictMap == nil {
-		e.CommunityDistrictMap = make(map[string][]*District)
-	}
+	e.Seg_PCD.LoadDict("./res/pcd.txt")
+	e.Seg_SV.LoadDict("./res/sv.txt")
 
+	//初始化城市相关
+	e.SiteCityMap = make(map[string]*SiteCity)
+	e.ProvinceMap = make(map[string]string)
+	e.CityMap = make(map[string]string)
+	e.DistrictSimAndAll = make(map[string][]map[string]*City)
+	e.CityBriefMap = make(map[string]*City)
+	e.CityFullMap = make(map[string]*City)
+	e.ProvinceBriefMap = make(map[string]*Province)
+	e.DistrictCityMap = make(map[string][]*City)
+	e.StreetDistrictMap = make(map[string][]*District)
+	//新疆兵团-数组
+	e.XjbtCityArr = make([]map[string]interface{}, 0)
+	//敏感词-筛选
+	e.SensitiveFullCity = sensitive.New()
+	e.SensitiveSimCity = sensitive.New()
+	e.SensitiveFullDistrict = sensitive.New()
 }
-
-
-
-
-

+ 517 - 0
extcity/src/ext/extRegion.go

@@ -0,0 +1,517 @@
+package ext
+
+import (
+	qu "qfw/util"
+	"strings"
+)
+
+// ExtractRegionInfo derives area/city/district for the record in *tmp and writes them back under "area", "city" and "district", together with the trace under "regions_log". When isLog is false the trace stays empty.
+func (e *ExtractTask) ExtractRegionInfo(tmp *map[string]interface{}, isLog bool) {
+	defer qu.Catch()
+	// Trace entries collected during extraction.
+	logRecordInfo := []map[string]interface{}{}
+	f_area, f_city, f_district := "", "", ""
+	all_regions := map[string]map[string]map[string]string{}
+	// jsondata precondition. NOTE(review): jsondata is created empty here and never filled from tmp, so the lookup below cannot add a region — confirm whether it should come from the record.
+	jsondata := map[string]interface{}{}
+	e.GetRegionByTentativeJsonData(jsondata, &all_regions)
+	if isLog && len(all_regions) > 0 {
+		valueArr := []string{}
+		valueArr = append(valueArr, qu.ObjToString(jsondata["area_city_district"]))
+		LogProcessRecordingForTentative("jsondata", valueArr, all_regions, &logRecordInfo)
+	}
+	b := ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
+	if b {
+		CompleteRegionInfo(&f_area, &f_city, &f_district)
+		// Final assignment (early exit).
+		(*tmp)["area"] = f_area
+		(*tmp)["city"] = f_city
+		(*tmp)["district"] = f_district
+		(*tmp)["regions_log"] = logRecordInfo
+		return
+	}
+	// Field groups to scan, in order; each entry is a comma-separated list of keys.
+	RegionFieldsArr := DefaultRegions
+	// The buyer field is special: for sites accepted by IsConsecutionRegion the adjusted group order is used instead.
+	if e.IsConsecutionRegion(qu.ObjToString((*tmp)["site"])) {
+		RegionFieldsArr = AdjustmentRegions
+	}
+	for _, v := range RegionFieldsArr {
+		keyArr := strings.Split(v, ",")
+		isExists, textValues, field_regions, old_regions, new_regions := e.GetRegionByGroupInfo(keyArr, *tmp)
+		if isExists { // at least one field of the group had a non-empty value
+			AnalysisIsUniqueInfo(new_regions, &all_regions)
+			if isLog { // trace logging
+				LogProcessRecordingForGroupInfo(strings.Join(keyArr, "_"), textValues, field_regions, old_regions, all_regions, &logRecordInfo)
+			}
+			b = ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
+			if b {
+				CompleteRegionInfo(&f_area, &f_city, &f_district)
+				// Final assignment (early exit once area, city and district are all known).
+				(*tmp)["area"] = f_area
+				(*tmp)["city"] = f_city
+				(*tmp)["district"] = f_district
+				(*tmp)["regions_log"] = logRecordInfo
+				return
+			}
+		}
+	}
+	// No early exit: take whatever levels the accumulated regions still determine.
+	ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
+	// Light clean-up and completion of the region.
+	CompleteRegionInfo(&f_area, &f_city, &f_district)
+	// Fields used by the supplement steps below.
+	projectname := qu.ObjToString((*tmp)["projectname"])
+	buyer := qu.ObjToString((*tmp)["buyer"])
+	site := qu.ObjToString((*tmp)["site"])
+	// Supplement for Xinjiang Corps buyers (buyer matches XjbtReg) while the city is still empty.
+	if XjbtReg.MatchString(buyer) && f_city == "" {
+		if a, c, d, ok := e.NewVerifyXjCorpsInfo(buyer); ok {
+			f_area, f_city, f_district = a, c, d
+			if isLog {
+				LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+					"sup_xjbt": f_area + "~" + f_city + "~" + f_district,
+				})
+			}
+		}
+	}
+	// Special-link supplement rules, each tried only while the city is still empty.
+	if f_city == "" {
+		e.LinkSpecialRuleFullStep(projectname, &f_area, &f_city, &f_district)
+	}
+	if f_city == "" {
+		e.LinkSpecialRuleBriefStep(projectname, &f_area, &f_city, &f_district)
+	}
+	if f_city == "" {
+		e.LinkSpecialRuleBriefStep(buyer, &f_area, &f_city, &f_district)
+	}
+	if isLog {
+		LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+			"sup_link": f_area + "~" + f_city + "~" + f_district,
+		})
+	}
+
+	// Supplement from the body text ("detail") when the area is missing/"全国" or the city is empty.
+	if f_area == "全国" || f_area == "" || f_city == "" {
+		if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["detail"]), &f_area, &f_city, &f_district); b {
+			if isLog {
+				LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+					"sup_detail": f_area + "~" + f_city + "~" + f_district,
+				})
+			}
+		}
+	}
+	// Last resort: take the area from the site's Q value in SiteCityMap.
+	if f_area == "全国" || f_area == "" {
+		if sc := e.SiteCityMap[site]; sc != nil && sc.Q != "" {
+			f_area = sc.Q
+			if isLog {
+				LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+					"sup_site": f_area + "~" + f_city + "~" + f_district,
+				})
+			}
+		}
+	}
+	// Clean the result once more before storing it.
+	CompleteRegionInfo(&f_area, &f_city, &f_district)
+	// Final assignment.
+	(*tmp)["area"] = f_area
+	(*tmp)["city"] = f_city
+	(*tmp)["district"] = f_district
+	(*tmp)["regions_log"] = logRecordInfo
+}
+
+//对组进行分析处理
+func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]interface{}) (bool, []string, map[string]interface{}, map[string]map[string]map[string]string, map[string]map[string]map[string]string) {
+	old_regions := map[string]map[string]map[string]string{}
+	isExists := false
+	textArr := []string{}
+	field_regions := map[string]interface{}{}
+	for _, key := range keyArr {
+		text := ""
+		if key == "site_area" || key == "site_city" {
+			text = qu.ObjToString(tmp["site"])
+		} else if key == "buyer_filiale" {
+			text = GetFilialeByBuyerInfo(qu.ObjToString(tmp["buyer"]))
+		} else if key == "projectname" {
+			text = CleanRegionProjectNameInfo(qu.ObjToString(tmp[key]), qu.ObjToString(tmp["buyer"]))
+		} else {
+			text = qu.ObjToString(tmp[key])
+		}
+		textArr = append(textArr, text)
+		if text != "" {
+			isExists = true
+		} else {
+			continue //无值不用提取
+		}
+		valuesArr := []map[string]interface{}{}
+		if key == "buyerzipcode" {
+			valuesArr = e.GetRegionByPostCode(text, &old_regions)
+		} else if key == "buyertel" {
+			valuesArr = e.GetRegionByTelNumber(text, &old_regions)
+		} else if key == "site_area" {
+			valuesArr = e.GetRegionBySite(text, &old_regions, 1)
+		} else if key == "site_city" {
+			valuesArr = e.GetRegionBySite(text, &old_regions, 2)
+		} else if key == "buyer_filiale" {
+			valuesArr = e.GetRegionFromText(text, &old_regions, false, false, 2)
+		} else {
+			isAddress, isBrief := false, false
+			if key == "projectaddr" || key == "addressing" || key == "bidopenaddress" || key == "buyeraddr" {
+				isAddress = true
+			}
+			valuesArr = e.GetRegionFromText(text, &old_regions, isAddress, isBrief, 2)
+		}
+		field_regions[key] = valuesArr
+	}
+	//校验当前组的合理性
+	new_regions := ReasonableGroupRegionInfo(old_regions)
+
+	return isExists, textArr, field_regions, old_regions, new_regions
+}
+
+// GetRegionByPostCode looks text up in e.PostCodeMap. On a hit it records the
+// entry's province and city in regions and returns them as a single-element
+// list; the district is filled in only when the entry lists exactly one
+// district. An unknown code returns an empty list and leaves regions alone.
+func (e *ExtractTask) GetRegionByPostCode(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
+	hits := []map[string]interface{}{}
+	pc := e.PostCodeMap[text]
+	if pc == nil {
+		return hits
+	}
+	district := ""
+	if len(pc.D) == 1 {
+		district = pc.D[0]
+	}
+	UpdateRegionsInfo(pc.P, pc.C, district, regions)
+	hits = append(hits, map[string]interface{}{"area": pc.P, "city": pc.C, "district": district})
+	return hits
+}
+
+// GetRegionByTelNumber resolves a landline number to a region through its
+// area-code prefix. Only texts of at least 11 bytes that start with "0" are
+// considered. The 4-character prefix is tried first, then the 3-character one;
+// the first prefix found in e.AreaCodeMap is recorded in regions and returned.
+// The city is filled in only when the area code maps to exactly one city.
+func (e *ExtractTask) GetRegionByTelNumber(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
+	hits := []map[string]interface{}{}
+	if len(text) < 11 || !strings.HasPrefix(text, "0") {
+		return hits
+	}
+	for n := 4; n >= 3; n-- {
+		ac := e.AreaCodeMap[text[:n]]
+		if ac == nil {
+			continue // try the shorter prefix
+		}
+		city := ""
+		if len(ac.C) == 1 {
+			city = ac.C[0]
+		}
+		UpdateRegionsInfo(ac.P, city, "", regions)
+		hits = append(hits, map[string]interface{}{"area": ac.P, "city": city, "district": ""})
+		break
+	}
+	return hits
+}
+
+// GetRegionByTentativeJsonData makes a tentative region guess from jsondata["area_city_district"]: the string is run through GetRegionFromText, and only when that yields exactly one area is the area (plus its city and district, each only if unique) stored into *all_regions.
+func (e *ExtractTask) GetRegionByTentativeJsonData(jsondata map[string]interface{}, all_regions *map[string]map[string]map[string]string) {
+	area, city, district := "", "", ""
+	regions := map[string]map[string]map[string]string{}
+	if jsondata != nil {
+		if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
+			e.GetRegionFromText(a_c_d, &regions, false, false, 1)
+		}
+	}
+	if len(regions) == 1 { // exactly one area was found
+		for k, v := range regions {
+			area = k
+			if len(v) == 1 { // exactly one city under that area
+				for k1, v1 := range v {
+					city = k1
+					if len(v1) == 1 { // exactly one district under that city
+						for k2, _ := range v1 {
+							district = k2
+						}
+					} else {
+						break
+					}
+				}
+			} else {
+				break
+			}
+		}
+	}
+	if area != "" { // assemble the area -> city -> district structure
+		city_info := map[string]map[string]string{}
+		district_info := map[string]string{}
+		if city != "" {
+			if district != "" {
+				district_info[district] = district
+			}
+			city_info[city] = district_info
+		}
+		(*all_regions)[area] = city_info // replaces any entry already stored for this area
+	}
+}
+
+// StandardizedegionInfo normalises area/city/district in place against the lookup maps: the area is mapped through ProvinceMap, a brief city name becomes its full name, a brief district name becomes its full name, and values that cannot be validated are cleared.
+func (e *ExtractTask) StandardizedegionInfo(area *string, city *string, district *string) {
+	// The four municipalities get their city filled in from the area.
+	if *area == "北京" {
+		*city = "北京市"
+	} else if *area == "天津" {
+		*city = "天津市"
+	} else if *area == "上海" {
+		*city = "上海市"
+	} else if *area == "重庆" {
+		*city = "重庆市"
+	}
+	// Validation: an empty level clears every level below it.
+	if *area == "" {
+		*city = ""
+		*district = ""
+	} else {
+		if province := e.ProvinceMap[*area]; province != "" { // area is a ProvinceMap key: replace it with the mapped value
+			*area = province
+		}
+		if *city == "" {
+			*district = ""
+		} else {
+			if csMap := e.CityBriefMap[*city]; csMap != nil { // city is a known brief name
+				if csMap.P.Brief == *area && csMap.Name != "" { // it belongs to this area: switch to the full name
+					*city = csMap.Name
+				} else {
+					*city = ""
+					*district = ""
+				}
+			} else {
+				if e.CityMap[*city] == "" { // not a CityMap key either: discard city and district
+					*city = ""
+					*district = ""
+				}
+			}
+			if *district != "" {
+				citysArr := e.DistrictSimAndAll[*district]
+				if len(citysArr) == 1 { // brief district name with a single candidate: take the map key as the district
+					full_city := citysArr[0]
+					for d, _ := range full_city {
+						*district = d
+					}
+				} else if len(citysArr) > 1 { // ambiguous brief name
+					*district = ""
+				} else if len(citysArr) == 0 { // not a brief name: keep it only if it is a DistrictCityMap key
+					fullArr := e.DistrictCityMap[*district]
+					if len(fullArr) == 0 {
+						*district = ""
+					}
+				} else {
+
+				}
+			}
+		}
+	}
+}
+
+// GetRegionBySite takes the region from the site's SiteCityMap entry; from selects the level: 1 records the area only, 2 records area and city (and only when a city is known).
+func (e *ExtractTask) GetRegionBySite(site string, regions *map[string]map[string]map[string]string, from int) []map[string]interface{} {
+	regionArr := []map[string]interface{}{}
+	area, city, district := "", "", ""
+	if scMap := e.SiteCityMap[site]; scMap != nil {
+		if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" { // ignore empty, "全国" and "null" areas
+			area = scMap.P
+		}
+		if scMap.C != "" && scMap.C != "null" && area != "" { // a city is only taken together with an area
+			city = scMap.C
+		}
+	}
+	e.StandardizedegionInfo(&area, &city, &district) // district is always "" here; the site's D value is not read
+	if from == 1 && area != "" && area != "全国" {
+		UpdateRegionsInfo(area, "", "", regions)
+		regionArr = append(regionArr, map[string]interface{}{"area": area, "city": "", "district": ""})
+
+	}
+	if from == 2 && area != "" && area != "全国" && city != "" {
+		UpdateRegionsInfo(area, city, "", regions)
+		regionArr = append(regionArr, map[string]interface{}{"area": area, "city": city, "district": ""})
+	}
+
+	return regionArr
+}
+
+// NewVerifyXjCorpsInfo looks up the region of a Xinjiang Production and
+// Construction Corps buyer in e.XjbtCityArr.
+//
+// "新疆兵团" in buyer is first expanded to "新疆生产建设兵团". The first entry
+// whose non-empty "name" or "alias" occurs in buyer supplies area, city and
+// district. If that entry carries a "list" of sub-units, the first sub-unit
+// whose non-empty "name" directly follows the entry's name or alias in buyer
+// overrides them. When no entry matches, ok is false and the strings are empty.
+//
+// Empty names and aliases are skipped on purpose: strings.Contains(s, "") is
+// always true, so an entry without an alias would otherwise match every buyer,
+// and a sub-unit without a name would always override its parent.
+func (e *ExtractTask) NewVerifyXjCorpsInfo(buyer string) (new_a, new_c, new_d string, ok bool) {
+	buyer = strings.ReplaceAll(buyer, "新疆兵团", "新疆生产建设兵团")
+	// mentions reports whether buyer contains prefix+suffix, for a non-empty prefix.
+	mentions := func(prefix, suffix string) bool {
+		return prefix != "" && strings.Contains(buyer, prefix+suffix)
+	}
+	for _, info := range e.XjbtCityArr {
+		name := qu.ObjToString(info["name"])
+		alias := qu.ObjToString(info["alias"])
+		if !mentions(name, "") && !mentions(alias, "") {
+			continue
+		}
+		new_a = qu.ObjToString(info["area"])
+		new_c = qu.ObjToString(info["city"])
+		new_d = qu.ObjToString(info["district"])
+		// A matching sub-unit is more specific than the entry itself.
+		if res, isList := info["list"].([]interface{}); isList {
+			for _, c := range qu.ObjArrToMapArr(res) {
+				c_name := qu.ObjToString(c["name"])
+				if c_name == "" {
+					continue
+				}
+				if mentions(name, c_name) || mentions(alias, c_name) {
+					new_a = qu.ObjToString(c["area"])
+					new_c = qu.ObjToString(c["city"])
+					new_d = qu.ObjToString(c["district"])
+					break
+				}
+			}
+		}
+		return new_a, new_c, new_d, true
+	}
+	return "", "", "", false
+}
+
+// NewVerifySensitiveInfo tries to fill area/city/district from the body text; it runs four detectors in order (full city name, full district name, brief city name, landline area code) and returns true as soon as one of them updates the outputs.
+func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *string, district *string) bool {
+	detail = SensitiveReg.ReplaceAllString(detail, "")
+	detail = TextAfterRemoveTable(detail)
+	detail = CleanDetailReg1.ReplaceAllString(detail, "")
+	// Full city name: used only when FindAll returns exactly one element and the city fits the current area (or the area is unset/"全国").
+	fullCityArr := e.SensitiveFullCity.FindAll(detail)
+	if len(fullCityArr) == 1 {
+		for _, v := range fullCityArr {
+			if cityMap := e.CityFullMap[v]; cityMap != nil {
+				if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
+					*area = cityMap.P.Brief
+					*city = cityMap.Name
+					return true
+				}
+			}
+		}
+	}
+	// Full district name: used only when FindAll returns exactly one element and that name belongs to a single city.
+	fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
+	if len(fullDistrictArr) == 1 {
+		for _, v := range fullDistrictArr {
+			if citys := e.DistrictCityMap[v]; len(citys) == 1 {
+				if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
+					*area = citys[0].P.Brief
+					*city = citys[0].Name
+					*district = v
+					return true
+				}
+			}
+		}
+	}
+	// Brief city name: used only when FindAll returns exactly one element; the city is not set when the brief name is contained in the area name itself.
+	simCityArr := e.SensitiveSimCity.FindAll(detail)
+	if len(simCityArr) == 1 {
+		for _, v := range simCityArr {
+			if cityMap := e.CityBriefMap[v]; cityMap != nil {
+				if *area == "" || *area == "全国" {
+					*area = cityMap.P.Brief
+					if !strings.Contains(*area, v) {
+						*city = cityMap.Name
+					}
+					return true
+				}
+				if cityMap.P.Brief == *area && !strings.Contains(*area, v) {
+					*area = cityMap.P.Brief
+					*city = cityMap.Name
+					return true
+				}
+			}
+		}
+	}
+
+	// Probable landline numbers: when the area is still unknown and all matches share one area code, take the area of that code.
+	if *area == "" || *area == "全国" {
+		fixedTelArr := FixedTelReg.FindAllString(detail, -1)
+		if len(fixedTelArr) > 0 {
+			codeArr := resetFixedTelInfo(fixedTelArr)
+			if len(codeArr) == 1 {
+				for _, v := range codeArr {
+					if ac := e.AreaCodeMap[v]; ac != nil {
+						*area = ac.P
+						return true
+					}
+				}
+			}
+		}
+	}
+
+	return false
+}
+
+// resetFixedTelInfo returns the distinct area codes of the given landline
+// numbers, in order of first appearance. The area code is the part of a number
+// before its first "-" (the whole number when it has no "-"). Empty input
+// strings are skipped.
+//
+// Seen codes are tracked in a boolean set, so every code, including an empty
+// one, is reported at most once.
+func resetFixedTelInfo(telArr []string) []string {
+	codeArr := []string{}
+	seen := map[string]bool{}
+	for _, tel := range telArr {
+		if tel == "" {
+			continue
+		}
+		code := tel
+		if i := strings.Index(tel, "-"); i >= 0 {
+			code = tel[:i]
+		}
+		if seen[code] {
+			continue
+		}
+		seen[code] = true
+		codeArr = append(codeArr, code)
+	}
+	return codeArr
+}
+
+//初步确认~站点
+//func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
+//	area, city, district := "", "", ""
+//	site, _ := (*j.Data)["site"].(string)
+//	if scMap := e.SiteCityMap[site]; scMap != nil {
+//		if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
+//			area = scMap.P
+//		}
+//		if scMap.C != "" && scMap.C != "null" && area != "" {
+//			city = scMap.C
+//		}
+//		//if scMap.D != "" && scMap.D != "null" && city != "" {
+//		//	district = scMap.D
+//		//}
+//	}
+//
+//	//对省市区进行标准化校验~简称全程的问题
+//	e.StandardizedegionInfo(&area, &city, &district)
+//
+//	//取出唯一数据
+//	j_area, j_city, j_district := "", "", ""
+//	is_adjust := false
+//	if len(*all_regions) == 1 { //有值~只进行补充操作
+//		for k, v := range *all_regions {
+//			j_area = k
+//			for k1, v1 := range v {
+//				j_city = k1
+//				for k2, _ := range v1 {
+//					j_district = k2
+//				}
+//			}
+//		}
+//		if j_area == area && area != "" {
+//			if city != "" {
+//				if j_city == "" {
+//					is_adjust = true
+//				} else if j_city == city {
+//					if district != "" && j_district == "" {
+//						is_adjust = true
+//					}
+//				}
+//			}
+//		}
+//	} else {
+//		is_adjust = true
+//	}
+//	if is_adjust && area != "" { //进行调整
+//		city_info := map[string]map[string]string{}
+//		district_info := map[string]string{}
+//		if city != "" {
+//			if district != "" {
+//				district_info[district] = district
+//			}
+//			city_info[city] = district_info
+//		}
+//		(*all_regions)[area] = city_info
+//	}
+//}

+ 730 - 0
extcity/src/ext/extWay.go

@@ -0,0 +1,730 @@
+package ext
+
+import (
+	"github.com/PuerkitoBio/goquery"
+	qu "qfw/util"
+	"regexp"
+	"strings"
+)
+
+var FilialeReg1 = regexp.MustCompile("(.{1,3})分(公司|院|校|行)$") // 1-3 characters followed by 分公司/分院/分校/分行 at the end of the string
+var FilialeReg2 = regexp.MustCompile(".*[((](.*)[))].*") // group 1 is the text enclosed in parentheses
+var FilialeReg3 = regexp.MustCompile(".*(集团|公司|大学)(.*)(公司|院|所|校)") // group 2 is the text between 集团/公司/大学 and a later 公司/院/所/校
+
+var CleanRegionReg1 = regexp.MustCompile(".*公司") // greedy: everything up to and including 公司
+
+var FixedTelReg = regexp.MustCompile("0[0-9]{2,3}\\-[2-9][0-9]{6,7}") // 0 plus 2-3 digits, a hyphen, then 7-8 digits whose first digit is 2-9
+
+var CleanDetailReg1 = regexp.MustCompile("(北京时间)") // the literal phrase 北京时间
+var XjbtReg *regexp.Regexp = regexp.MustCompile("^(新疆生产建设兵团|新疆兵团)") // leading 新疆生产建设兵团 or 新疆兵团
+var SensitiveReg = regexp.MustCompile("(上一[条篇]|下一[条篇])[::].*") // 上一条/上一篇/下一条/下一篇, a colon, and the rest of the line
+
+// GetFilialeByBuyerInfo extracts a special-case fragment from a buyer name.
+//
+// The three Filiale patterns are tried in order and the first one that
+// matches decides the result:
+//   - FilialeReg1: the matched text itself is returned;
+//   - FilialeReg2: buyer with the match replaced by capture group 1;
+//   - FilialeReg3: buyer with the match replaced by capture group 2.
+//
+// An empty string is returned when no pattern matches.
+func GetFilialeByBuyerInfo(buyer string) string {
+	switch {
+	case FilialeReg1.MatchString(buyer):
+		return FilialeReg1.FindString(buyer)
+	case FilialeReg2.MatchString(buyer):
+		return FilialeReg2.ReplaceAllString(buyer, "${1}")
+	case FilialeReg3.MatchString(buyer):
+		return FilialeReg3.ReplaceAllString(buyer, "${2}")
+	default:
+		return ""
+	}
+}
+
+// ConfirmUniqueRegionInfo performs the final confirmation of a single region.
+//
+// When regions holds exactly one province, *area is set to it; if that
+// province holds exactly one city, *city is set; if that city holds exactly
+// one district, *district is set. The outputs are left untouched when
+// regions does not hold exactly one province.
+//
+// It reports whether *area, *city and *district are all non-empty afterwards.
+func ConfirmUniqueRegionInfo(regions map[string]map[string]map[string]string, area *string, city *string, district *string) bool {
+	if len(regions) != 1 {
+		return false
+	}
+	for province, cities := range regions {
+		*area = province
+		if len(cities) != 1 {
+			continue
+		}
+		for cityName, districts := range cities {
+			*city = cityName
+			if len(districts) != 1 {
+				continue
+			}
+			for districtName := range districts {
+				*district = districtName
+			}
+		}
+	}
+	return *area != "" && *city != "" && *district != ""
+}
+
+// CompleteRegionInfo fills in the derived parts of a region triple in place.
+//
+// For the areas 北京, 天津, 上海 and 重庆 the city is forced to the matching
+// "…市" name. An empty area becomes 全国 with city and district cleared.
+func CompleteRegionInfo(area *string, city *string, district *string) {
+	switch *area {
+	case "北京":
+		*city = "北京市"
+		// Special case (北京朝阳中西医结合急诊抢救中心: 5a84079740d2d9bbe88bad90).
+		if *district == "北京朝阳" {
+			*district = "朝阳区"
+		}
+	case "天津":
+		*city = "天津市"
+	case "上海":
+		*city = "上海市"
+	case "重庆":
+		*city = "重庆市"
+	case "":
+		*area = "全国"
+		*city = ""
+		*district = ""
+	}
+}
+
+// takeRegionsFromWords resolves text through the full-name tries and then the short-name tries, passing every area/city/district record it builds to SplicingRegionsInfo; three-level short names are not used for city extraction for now, and the district short-name lookup only runs when isBrief is true.
+func (e *ExtractTask) takeRegionsFromWords(text string, isAddress bool, isBrief bool, regionsArr *[]map[string]string) {
+	// Full-name matching: only the first trie that contains text is used (see the break below).
+	for pos_full, trie_full := range e.Trie_Fulls {
+		if trie_full.Get(text) {
+			infoArr := []map[string]string{}
+			if pos_full == 0 { // index 0: text is resolved through ProvinceMap
+				if province := e.ProvinceMap[text]; province != "" {
+					infoArr = append(infoArr, map[string]string{"area": province, "city": "", "district": ""})
+					SplicingRegionsInfo(isAddress, regionsArr, infoArr)
+				}
+			} else if pos_full == 1 { // index 1: text is resolved through CityFullMap
+				if data := e.CityFullMap[text]; data != nil {
+					if data.P.Brief != "" && data.Name != "" {
+						infoArr = append(infoArr, map[string]string{"area": data.P.Brief, "city": data.Name, "district": ""})
+						SplicingRegionsInfo(isAddress, regionsArr, infoArr)
+					}
+				}
+			} else if pos_full == 2 { // index 2: text is a district; one record per city in DistrictCityMap
+				citys := e.DistrictCityMap[text]
+				for _, c := range citys {
+					if c.P.Brief != "" && c.Name != "" && text != "" {
+						infoArr = append(infoArr, map[string]string{"area": c.P.Brief, "city": c.Name, "district": text})
+					}
+				}
+				SplicingRegionsInfo(isAddress, regionsArr, infoArr)
+			}
+			break
+		}
+	}
+	// Short-name matching: again only the first trie that contains text is used.
+	for pos_sim, trie_sim := range e.Trie_Sims {
+		if trie_sim.Get(text) {
+			infoArr := []map[string]string{}
+			if pos_sim == 0 { // index 0: text itself is used as the area
+				if text != "" {
+					infoArr = append(infoArr, map[string]string{"area": text, "city": "", "district": ""})
+					SplicingRegionsInfo(isAddress, regionsArr, infoArr)
+				}
+			} else if pos_sim == 1 { // index 1: text is resolved through CityBriefMap
+				if csMap := e.CityBriefMap[text]; csMap != nil {
+					if csMap.P.Brief != "" && csMap.Name != "" {
+						infoArr = append(infoArr, map[string]string{"area": csMap.P.Brief, "city": csMap.Name, "district": ""})
+						SplicingRegionsInfo(isAddress, regionsArr, infoArr)
+					}
+				}
+			} else if pos_sim == 2 { // index 2: text is resolved through DistrictSimAndAll, only when isBrief is set
+				if isBrief {
+					citysArr := e.DistrictSimAndAll[text]
+					for _, full_citys := range citysArr {
+						for district, c := range full_citys {
+							if c == nil || c.P == nil || c.Name == "" {
+								continue
+							}
+							if c.P.Brief != "" && c.Name != "" && district != "" {
+								infoArr = append(infoArr, map[string]string{"area": c.P.Brief, "city": c.Name, "district": district})
+							}
+						}
+						SplicingRegionsInfo(isAddress, regionsArr, infoArr) // NOTE(review): runs once per citysArr element while infoArr keeps accumulating, so earlier records are passed again — confirm this is intended
+					}
+				}
+			}
+			break
+		}
+	}
+}
+
+// SplicingRegionsInfo decides which records of infoArr are appended to
+// *regionsArr.
+//
+// For non-address text, or while *regionsArr is still empty, every record is
+// appended. For address text with records already collected, a record is
+// appended only when some collected record has the same area and either no
+// city or the same city.
+func SplicingRegionsInfo(isAddress bool, regionsArr *[]map[string]string, infoArr []map[string]string) {
+	if !isAddress || len(*regionsArr) == 0 {
+		*regionsArr = append(*regionsArr, infoArr...)
+		return
+	}
+	for _, info := range infoArr {
+		area := qu.ObjToString(info["area"])
+		city := qu.ObjToString(info["city"])
+		// Records appended earlier in this call take part in the comparison too.
+		for _, known := range *regionsArr {
+			if qu.ObjToString(known["area"]) != area {
+				continue
+			}
+			knownCity := qu.ObjToString(known["city"])
+			if knownCity == "" || knownCity == city {
+				*regionsArr = append(*regionsArr, info)
+				break
+			}
+		}
+	}
+}
+
+// GetRegionFromText extracts regions from text.
+//
+// from selects the segmenter: 1 (jsondata text) uses Seg_PCD, 2 (other text)
+// uses Seg_SV; any other value leaves the word list empty. The segmented
+// words are cleaned, resolved one by one, merged into *regions, and also
+// returned as a list of area/city/district records. The result is an empty,
+// non-nil slice when nothing is found.
+func (e *ExtractTask) GetRegionFromText(text string, regions *map[string]map[string]map[string]string, isAddress bool, isBrief bool, from int) []map[string]interface{} {
+	regionValues := []map[string]interface{}{}
+	if text == "" {
+		return regionValues
+	}
+	wordsArr := []string{}
+	switch from {
+	case 1:
+		wordsArr = e.Seg_PCD.Cut(text, true)
+	case 2:
+		wordsArr = e.Seg_SV.Cut(text, true)
+	}
+	// Clean the word list before lookup.
+	wordsArr = CleanRegionTextWords(wordsArr)
+	regionsArr := []map[string]string{}
+	for i := range wordsArr {
+		e.takeRegionsFromWords(wordsArr[i], isAddress, isBrief, &regionsArr)
+	}
+	for _, found := range regionsArr {
+		area, city, district := qu.ObjToString(found["area"]), qu.ObjToString(found["city"]), qu.ObjToString(found["district"])
+		UpdateRegionsInfo(area, city, district, regions)
+		regionValues = append(regionValues, map[string]interface{}{"area": area, "city": city, "district": district})
+	}
+	return regionValues
+}
+
+// UpdateRegionsInfo merges one area/city/district triple into *regions.
+//
+// A missing area is created. A non-empty city is created under the area when
+// missing, and a non-empty district is recorded under that city (keyed and
+// valued by its own name). district is ignored when city is empty.
+func UpdateRegionsInfo(area, city, district string, regions *map[string]map[string]map[string]string) {
+	cities := (*regions)[area]
+	if cities == nil {
+		cities = map[string]map[string]string{}
+		(*regions)[area] = cities // newly added area
+	}
+	if city == "" {
+		return
+	}
+	districts := cities[city]
+	if districts == nil {
+		districts = map[string]string{}
+		cities[city] = districts
+	}
+	if district != "" {
+		districts[district] = district
+	}
+}
+
+// ReasonableGroupRegionInfo sanity-checks a merged group of regions.
+//
+// Rules, in order:
+//   - zero or more than two provinces: the group is invalid, an empty map is
+//     returned;
+//   - more than three cities in total: each province keeps its cities only if
+//     it has exactly one; if the kept cities still carry more than three
+//     districts, each city keeps its districts only if it has exactly one;
+//   - more than three districts in total: each city keeps its districts only
+//     if it has exactly one;
+//   - otherwise datas is returned unchanged.
+//
+// Kept city/district maps are the same map values as in datas, not copies.
+func ReasonableGroupRegionInfo(datas map[string]map[string]map[string]string) map[string]map[string]map[string]string {
+	if n := len(datas); n == 0 || n > 2 { // province count out of range
+		return map[string]map[string]map[string]string{}
+	}
+
+	// keepSingleDistrict rebuilds src so that a city keeps its district map
+	// only when that map has exactly one entry.
+	keepSingleDistrict := func(src map[string]map[string]map[string]string) map[string]map[string]map[string]string {
+		out := map[string]map[string]map[string]string{}
+		for province, cities := range src {
+			kept := map[string]map[string]string{}
+			for cityName, districts := range cities {
+				if len(districts) == 1 {
+					kept[cityName] = districts
+				} else {
+					kept[cityName] = map[string]string{}
+				}
+			}
+			out[province] = kept
+		}
+		return out
+	}
+
+	cityTotal, districtTotal := 0, 0
+	for _, cities := range datas {
+		cityTotal += len(cities)
+		for _, districts := range cities {
+			districtTotal += len(districts)
+		}
+	}
+
+	if cityTotal > 3 {
+		singleCity := map[string]map[string]map[string]string{}
+		for province, cities := range datas {
+			if len(cities) == 1 {
+				singleCity[province] = cities
+			} else {
+				singleCity[province] = map[string]map[string]string{}
+			}
+		}
+		// Recount the districts that survived the city pruning.
+		remaining := 0
+		for _, cities := range singleCity {
+			for _, districts := range cities {
+				remaining += len(districts)
+			}
+		}
+		if remaining > 3 {
+			return keepSingleDistrict(singleCity)
+		}
+		return singleCity
+	}
+
+	if districtTotal > 3 {
+		return keepSingleDistrict(datas)
+	}
+	return datas
+}
+
+// AnalysisIsUniqueInfo compares a new group of regions against *all_regions
+// to find records that supplement or narrow it.
+//
+// An empty regions is ignored; an empty *all_regions is simply replaced by
+// regions. Otherwise both sides are flattened into records, every record of
+// regions whose area already exists in *all_regions is screened (at most
+// three passes, each pass dropping the most specific level that failed to
+// match), and, if anything was accepted, *all_regions is rebuilt from the
+// accepted records only.
+func AnalysisIsUniqueInfo(regions map[string]map[string]map[string]string, all_regions *map[string]map[string]map[string]string) {
+	if len(regions) == 0 {
+		return
+	}
+	if len(*all_regions) == 0 {
+		*all_regions = regions
+		return
+	}
+	candidates := splitRegionsInfos(regions)  // records under test
+	existing := splitRegionsInfos(*all_regions) // records already collected
+	accepted := []map[string]string{}         // records kept for the rebuild
+	for _, info := range candidates {
+		if (*all_regions)[qu.ObjToString(info["area"])] == nil {
+			continue
+		}
+		// Retry with a downgraded record while the screening returns one.
+		pending := info
+		for attempt := 0; attempt < 3 && pending != nil; attempt++ {
+			pending = ScreenOutReasonableRegionInfo(pending, &existing, &accepted)
+		}
+	}
+	if len(accepted) == 0 {
+		return
+	}
+	// Rebuild everything from the accepted records.
+	rebuilt := map[string]map[string]map[string]string{}
+	for _, rec := range accepted {
+		UpdateRegionsInfo(qu.ObjToString(rec["area"]), qu.ObjToString(rec["city"]), qu.ObjToString(rec["district"]), &rebuilt)
+	}
+	*all_regions = rebuilt
+}
+
+// ScreenOutReasonableRegionInfo applies the selection rules to one record.
+//
+// info is appended to *new_regions_infosArr once for every record of
+// *regions_infosArr that it equals or supplements:
+//   - a three-level info supplements a record of the same area whose
+//     district is empty and whose city is either the same or empty;
+//   - a two-level info supplements a record of the same area whose city is
+//     empty.
+//
+// When nothing matched, a downgraded copy of info (district dropped for a
+// three-level info, city dropped for a two-level info) is returned so the
+// caller can retry; otherwise nil is returned.
+func ScreenOutReasonableRegionInfo(info map[string]string, regions_infosArr *[]map[string]string, new_regions_infosArr *[]map[string]string) map[string]string {
+	area := qu.ObjToString(info["area"])
+	city := qu.ObjToString(info["city"])
+	district := qu.ObjToString(info["district"])
+	hasCity := area != "" && city != ""
+
+	matched := false
+	for _, src := range *regions_infosArr {
+		srcArea := qu.ObjToString(src["area"])
+		srcCity := qu.ObjToString(src["city"])
+		srcDistrict := qu.ObjToString(src["district"])
+		if srcArea != area {
+			continue
+		}
+		accept := false
+		switch {
+		case srcCity == city && srcDistrict == district: // identical record
+			accept = true
+		case hasCity && district != "": // three levels supplementing two or one
+			accept = srcDistrict == "" && (srcCity == city || srcCity == "")
+		case hasCity: // two levels supplementing one
+			accept = srcCity == ""
+		}
+		if accept {
+			matched = true
+			*new_regions_infosArr = append(*new_regions_infosArr, info)
+		}
+	}
+
+	// Nothing matched: drop the last level so the caller can compare again.
+	if matched || !hasCity {
+		return nil
+	}
+	if district != "" {
+		return map[string]string{"area": area, "city": city, "district": ""}
+	}
+	return map[string]string{"area": area, "city": "", "district": ""}
+}
+
+// splitRegionsInfos flattens the nested province/city/district map into a
+// list of records with the keys "area", "city" and "district", which is
+// easier to merge and select from. Levels that are absent are reported as
+// empty strings. The result is never nil; its order follows map iteration
+// and is therefore unspecified.
+func splitRegionsInfos(infos map[string]map[string]map[string]string) []map[string]string {
+	infosArr := []map[string]string{}
+	for area, cities := range infos {
+		if len(cities) == 0 {
+			infosArr = append(infosArr, map[string]string{"area": area, "city": "", "district": ""})
+			continue
+		}
+		for city, districts := range cities {
+			if len(districts) == 0 {
+				infosArr = append(infosArr, map[string]string{"area": area, "city": city, "district": ""})
+				continue
+			}
+			for district := range districts {
+				infosArr = append(infosArr, map[string]string{"area": area, "city": city, "district": district})
+			}
+		}
+	}
+	return infosArr
+}
+
+// LogProcessRecordingForGroupInfo appends a group-level process record to
+// *logRecordInfo.
+//
+// The record holds valueArr under key+"_value", the flattened groupInfos
+// under key+"_group" and the flattened finallyInfos under "finally_region";
+// every entry of fieldInfos is then copied in and wins on a key clash.
+func LogProcessRecordingForGroupInfo(key string, valueArr []string, fieldInfos map[string]interface{}, groupInfos map[string]map[string]map[string]string, finallyInfos map[string]map[string]map[string]string, logRecordInfo *[]map[string]interface{}) {
+	record := make(map[string]interface{}, len(fieldInfos)+3)
+	record[key+"_value"] = valueArr
+	record[key+"_group"] = splitRegionsInfos(groupInfos)
+	record["finally_region"] = splitRegionsInfos(finallyInfos)
+	for field, value := range fieldInfos {
+		record[field] = value
+	}
+	*logRecordInfo = append(*logRecordInfo, record)
+}
+
+// LogProcessRecordingForSupplement appends data, a process record of the
+// second (supplement) link, to *logRecordInfo as is.
+func LogProcessRecordingForSupplement(logRecordInfo *[]map[string]interface{}, data map[string]interface{}) {
+	records := *logRecordInfo
+	*logRecordInfo = append(records, data)
+}
+
+// LogProcessRecordingForTentative appends a tentative-stage process record
+// to *logRecordInfo: valueArr under key+"_value" and the flattened
+// finallyInfos under "finally_region".
+func LogProcessRecordingForTentative(key string, valueArr interface{}, finallyInfos map[string]map[string]map[string]string, logRecordInfo *[]map[string]interface{}) {
+	record := map[string]interface{}{}
+	record[key+"_value"] = valueArr
+	record["finally_region"] = splitRegionsInfos(finallyInfos)
+	*logRecordInfo = append(*logRecordInfo, record)
+}
+
+// IsConsecutionRegion reports whether the region logic order should be
+// restructured for site: true when site has a SiteCityMap entry whose T is
+// one of the listed site types, false otherwise (including unknown sites).
+func (e *ExtractTask) IsConsecutionRegion(site string) bool {
+	entry := e.SiteCityMap[site]
+	if entry == nil {
+		return false
+	}
+	switch entry.T {
+	case "学校", "军队", "政府采购", "公共资源", "人民政府", "政府门户", "在线审批平台":
+		return true
+	}
+	return false
+}
+
+// CleanRegionProjectNameInfo cleans a project name: every occurrence of
+// buyer (when non-empty) is removed, then everything matched by
+// CleanRegionReg1 is removed. An empty projectname is returned unchanged.
+func CleanRegionProjectNameInfo(projectname string, buyer string) string {
+	if projectname == "" {
+		return projectname
+	}
+	cleaned := projectname
+	if buyer != "" {
+		cleaned = strings.ReplaceAll(projectname, buyer, "")
+	}
+	// ReplaceAllString leaves the text as is when the pattern does not match.
+	return CleanRegionReg1.ReplaceAllString(cleaned, "")
+}
+
+// CleanRegionTextWords cleans a segmented word list.
+//
+// Lists of at most one word are returned unchanged. Otherwise, if a "路" or
+// "街道" token occurs anywhere after the first position, the first such token
+// and the word right before it are dropped and the rest is returned as is.
+// If there is no such token, every "城区" token is dropped instead.
+func CleanRegionTextWords(wordsArr []string) []string {
+	if len(wordsArr) <= 1 {
+		return wordsArr
+	}
+	cleaned := []string{}
+
+	// Position of the first road/street token; 0 means "none found".
+	roadIdx := 0
+	for i := 1; i < len(wordsArr); i++ {
+		if w := wordsArr[i]; w == "路" || w == "街道" {
+			roadIdx = i
+			break
+		}
+	}
+	if roadIdx > 0 {
+		for i, w := range wordsArr {
+			if i != roadIdx && i != roadIdx-1 {
+				cleaned = append(cleaned, w)
+			}
+		}
+		return cleaned
+	}
+
+	// Drop the special token 城区.
+	for _, w := range wordsArr {
+		if w != "城区" {
+			cleaned = append(cleaned, w)
+		}
+	}
+	return cleaned
+}
+
+// LinkSpecialRuleFullStep is the full-name link supplement.
+//
+// text is segmented with Seg_SV; every word found in the full-name trie at
+// index 3 is resolved through StreetDistrictMap and the resulting
+// province/city/district triples are collected. If anything was collected,
+// the add rule is applied when *area is empty or 全国, otherwise the
+// supplement rule.
+func (e *ExtractTask) LinkSpecialRuleFullStep(text string, area *string, city *string, district *string) {
+	regions := map[string]map[string]map[string]string{}
+	for _, word := range e.Seg_SV.Cut(text, true) {
+		for pos, trie := range e.Trie_Fulls {
+			if pos != 3 || !trie.Get(word) {
+				continue
+			}
+			for _, d := range e.StreetDistrictMap[word] {
+				UpdateRegionsInfo(d.C.P.Brief, d.C.Name, d.Name, &regions)
+			}
+		}
+	}
+	if len(regions) == 0 {
+		return
+	}
+	if *area == "" || *area == "全国" {
+		LinkAddedRules(regions, area, city, district) // add rule
+		return
+	}
+	LinkSuppleRules(regions, area, city, district) // supplement rule
+}
+
+// LinkSpecialRuleBriefStep is the short-name link supplement.
+//
+// text is segmented with Seg_SV; every word found in the short-name trie at
+// index 2 is resolved through DistrictSimAndAll and the usable
+// province/city/district triples are collected. If anything was collected,
+// the add rule is applied when *area is empty or 全国, otherwise the
+// supplement rule.
+func (e *ExtractTask) LinkSpecialRuleBriefStep(text string, area *string, city *string, district *string) {
+	regions := map[string]map[string]map[string]string{}
+	for _, word := range e.Seg_SV.Cut(text, true) {
+		for pos, trie := range e.Trie_Sims {
+			if pos != 2 || !trie.Get(word) {
+				continue
+			}
+			for _, fullCities := range e.DistrictSimAndAll[word] {
+				for d, c := range fullCities {
+					// Skip entries without a usable city or province.
+					if c == nil || c.P == nil || c.Name == "" {
+						continue
+					}
+					if c.P.Brief == "" || d == "" {
+						continue
+					}
+					UpdateRegionsInfo(c.P.Brief, c.Name, d, &regions)
+				}
+			}
+		}
+	}
+	if len(regions) == 0 {
+		return
+	}
+	if *area == "" || *area == "全国" {
+		LinkAddedRules(regions, area, city, district) // add rule
+		return
+	}
+	LinkSuppleRules(regions, area, city, district) // supplement rule
+}
+
+// LinkAddedRules is the add rule: when regions holds exactly one province it
+// is written to *area; its city is written to *city only if it is the only
+// city, and that city's district to *district only if it is the only
+// district. Nothing is written when regions does not hold exactly one
+// province.
+func LinkAddedRules(regions map[string]map[string]map[string]string, area *string, city *string, district *string) {
+	if len(regions) != 1 {
+		return
+	}
+	for province, cities := range regions {
+		*area = province
+		if len(cities) != 1 {
+			continue
+		}
+		for cityName, districts := range cities {
+			*city = cityName
+			if len(districts) != 1 {
+				continue
+			}
+			for districtName := range districts {
+				*district = districtName
+			}
+		}
+	}
+}
+
+// LinkSuppleRules is the supplement rule: only the province equal to *area
+// is considered. If it holds exactly one city, *city is set to it, and if
+// that city holds exactly one district, *district is set as well. *area is
+// never changed.
+func LinkSuppleRules(regions map[string]map[string]map[string]string, area *string, city *string, district *string) {
+	cities, found := regions[*area]
+	if !found || len(cities) != 1 {
+		return
+	}
+	for cityName, districts := range cities {
+		*city = cityName
+		if len(districts) != 1 {
+			continue
+		}
+		for districtName := range districts {
+			*district = districtName
+		}
+	}
+
+}
+
+// TextAfterRemoveTable parses con as HTML, removes every <table> element
+// and returns the text content of what remains.
+//
+// If con cannot be parsed, an empty string is returned instead of
+// dereferencing the missing document.
+func TextAfterRemoveTable(con string) string {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(con))
+	if err != nil || doc == nil {
+		// No document to work on: report "no text" rather than panic.
+		return ""
+	}
+	doc.Find("table").Remove()
+	return doc.Text()
+}
+
+//同组合并的地域数据
+//func MergeGroupRegionInfo(datas_1, datas_2 map[string]map[string]map[string]string) map[string]map[string]map[string]string {
+//	regions := map[string]map[string]map[string]string{}
+//	if len(datas_1) > 0 && len(datas_2) == 0 {
+//		return datas_1
+//	}
+//	if len(datas_2) > 0 && len(datas_1) == 0 {
+//		return datas_2
+//	}
+//	for k, v := range datas_1 {
+//		area, city, district := "", "", ""
+//		area = k
+//		if len(v) > 0 {
+//			for k1, v1 := range v {
+//				city = k1
+//				if len(v1) > 0 {
+//					for k2, _ := range v1 {
+//						district = k2
+//						UpdateRegionsInfo(area, city, district, &regions)
+//					}
+//				} else {
+//					UpdateRegionsInfo(area, city, district, &regions)
+//				}
+//			}
+//		} else {
+//			UpdateRegionsInfo(area, city, district, &regions)
+//		}
+//	}
+//
+//	for k, v := range datas_2 {
+//		area, city, district := "", "", ""
+//		area = k
+//		if len(v) > 0 {
+//			for k1, v1 := range v {
+//				city = k1
+//				if len(v1) > 0 {
+//					for k2, _ := range v1 {
+//						district = k2
+//						UpdateRegionsInfo(area, city, district, &regions)
+//					}
+//				} else {
+//					UpdateRegionsInfo(area, city, district, &regions)
+//				}
+//			}
+//		} else {
+//			UpdateRegionsInfo(area, city, district, &regions)
+//		}
+//	}
+//	return regions
+//}
+
+//临时修复校验~~~
+//func (e *ExtractTask) temporaryRepairRegionSite() {
+//	sess := ju.Site_Mgo.GetMgoConn()
+//	defer ju.Site_Mgo.DestoryMongoConn(sess)
+//	q, total, isok := map[string]interface{}{}, 0, 0
+//	it := sess.DB(ju.Site_Mgo.DbName).C("site").Find(&q).Sort("_id").Select(map[string]interface{}{
+//		"area":     1,
+//		"city":     1,
+//		"district": 1,
+//	}).Iter()
+//	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+//		if total%10000 == 0 {
+//			log.Debug("cur index ", total, "~", isok)
+//		}
+//		area := qu.ObjToString(tmp["area"])
+//		city := qu.ObjToString(tmp["city"])
+//		district := qu.ObjToString(tmp["district"])
+//		isUpdate := false
+//		if area == "" {
+//			isUpdate = true
+//			area = "全国"
+//			city = ""
+//			district = ""
+//		} else {
+//			if province := e.ProvinceMap[area]; province != "" {
+//				isUpdate = true
+//				area = province
+//			}
+//			if city == "" {
+//				district = ""
+//			} else {
+//				if csMap := e.CityBriefMap[city]; csMap != nil {
+//					if csMap.P.Brief == area && csMap.Name != "" {
+//						isUpdate = true
+//						city = csMap.Name
+//					}
+//				} else { //市区~为省份的情况
+//					if e.CityMap[city] == "" {
+//						fmt.Println("市异常~", tmp["_id"])
+//					}
+//				}
+//				if district != "" {
+//					citysArr := e.DistrictSimAndAll[district]
+//					if len(citysArr) == 1 {
+//						full_city := citysArr[0]
+//						for d, _ := range full_city {
+//							isUpdate = true
+//							district = d
+//						}
+//					} else if len(citysArr) == 0 {
+//						fullArr := e.DistrictCityMap[district]
+//						if len(fullArr) == 0 {
+//							fmt.Println("县异常~", tmp["_id"])
+//						}
+//					} else {
+//
+//					}
+//				}
+//			}
+//		}
+//		if isUpdate {
+//			isok++
+//			ju.Site_Mgo.UpdateById("site", BsonTOStringId(tmp["_id"]), map[string]interface{}{
+//				"$set": map[string]interface{}{
+//					"area":     area,
+//					"city":     city,
+//					"district": district,
+//				},
+//			})
+//		}
+//		tmp = make(map[string]interface{})
+//	}
+//	log.Debug("监测修复完毕~ ", total, "~", isok)
+//
+//}

+ 0 - 784
extcity/src/ext/extcity.go

@@ -1,784 +0,0 @@
-package ext
-
-import (
-	log "github.com/donnie4w/go-logger/logger"
-	qu "qfw/util"
-	"regexp"
-	"sync"
-)
-type Job struct { // Job holds the score tables filled in while one record is being scored
-	FullAreaScore     map[string]float64                // scores of provinces matched by full name
-	FullCityScore     map[string]float64                // scores of cities matched by full name
-	FullDistrictScore map[string]float64                // scores of districts matched by full name
-	SimAreaScore      map[string]float64                // scores of provinces matched by short name
-	SimCityScore      map[string]float64                // scores of cities matched by short name
-	SimDistrictScore  map[string]float64                // scores of districts matched by short name
-}
-var AgencyReg = []*regexp.Regexp{ // patterns whose matches NewGetCityByDetail removes from the detail text before segmenting it
-	regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"), // an agency/winning-supplier keyword plus up to 30 following characters (newlines included)
-	regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"), // 2-15 characters followed by (招标)代理, 咨询 or 政府采购
-}
-// SortMap is a map that also records its keys in insertion order.
-type SortMap struct {
-	Index map[string]int // key -> position in Keys at the time the key was added
-	Keys  []string // keys in insertion order
-	Map   map[string]interface{} // key -> value
-	Lock  sync.Mutex // held by AddKey, ReplaceKey and RemoveKey
-}
-func NewSortMap() *SortMap { // NewSortMap returns an empty SortMap with all three containers allocated
-	return &SortMap{
-		Index: map[string]int{},
-		Keys:  []string{},
-		Map:   map[string]interface{}{},
-	}
-}
-func (s *SortMap) AddKey(key string, val interface{}) { // AddKey stores val under key, appending key to Keys the first time it is seen
-	// Value check (disabled).
-	//	if v, ok := val.(string); ok && NullVal.ReplaceAllString(u.TrimLRSpace(v, ""), "") == "" {
-	//		return
-	//	}
-	s.Lock.Lock()
-	defer s.Lock.Unlock()
-	// Duplicate check. NOTE(review): a key previously stored with a nil value also passes this test and would be appended to Keys again — confirm nil values never occur.
-	if s.Map[key] == nil {
-		s.Index[key] = len(s.Keys)
-		s.Keys = append(s.Keys, key)
-	}
-	s.Map[key] = val
-}
-// ReplaceKey puts key/val at the position recorded for replacekey and removes replacekey from Index and Map.
-func (s *SortMap) ReplaceKey(key string, val interface{}, replacekey string) {
-	s.Lock.Lock()
-	defer s.Lock.Unlock()
-	// Position of the key being replaced. NOTE(review): an unknown replacekey yields 0, so the first key would be overwritten (and an empty Keys would make the slice expression below panic) — confirm callers only pass existing keys.
-	v := s.Index[replacekey]
-	s.Index[key] = v
-	delete(s.Index, replacekey)
-	s.Keys = append(s.Keys[:v], append([]string{key}, s.Keys[v+1:]...)...)
-	delete(s.Map, replacekey)
-	s.Map[key] = val
-}
-// RemoveKey deletes key from Map and Index and removes every occurrence of it from Keys.
-func (s *SortMap) RemoveKey(key string) {
-	s.Lock.Lock()
-	defer s.Lock.Unlock()
-	delete(s.Map, key)
-	//pos := s.Index[key]
-	delete(s.Index, key)
-	s.Keys = removeslice(s.Keys, key) // NOTE(review): Index entries of the keys that follow are not shifted here — confirm nothing relies on them after a removal
-}
-func removeslice(slice []string, elem interface{}) []string { // removeslice returns slice without any element equal to elem; the backing array of slice is reused
-	if len(slice) == 0 {
-		return slice
-	}
-	for i, v := range slice {
-		if v == elem { // string compared with interface{}: equal only when elem holds the same string
-			slice = append(slice[:i], slice[i+1:]...)
-			return removeslice(slice, elem) // start over on the shortened slice to catch further occurrences
-		}
-	}
-	return slice
-}
-
-
-// NewExtractCityField extracts a region from the values of data (each converted with qu.ObjToString) and returns the non-empty parts under the keys "area", "city" and "district".
-func (e *ExtractTask) NewExtractCityField(data map[string]interface{})( map[string]interface{}) {
-	defer qu.Catch()
-	// Initialise the score tables.
-	j := &Job{
-		map[string]float64{},
-		map[string]float64{},
-		map[string]float64{},
-		map[string]float64{},
-		map[string]float64{},
-		map[string]float64{},
-	}
-	pscore := make(map[string]float64)
-	cscore := make(map[string]float64)
-	dscore := make(map[string]float64)
-	sm := NewSortMap()
-	for k,v := range data{ // NOTE(review): data is a map, so the order of sm.Keys (which NewGetCityByOthers weights by position) is not fixed between runs — confirm
-		sm.AddKey(k, qu.ObjToString(v))
-	}
-	e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
-	MergeFullSimScore(j)// merge the full-name and short-name scores
-	MergeScores(j, &pscore, &cscore, &dscore)
-
-	finishP := HighestScoreArr(j.FullAreaScore) // provinces with the highest score
-	e.RemoveCD(finishP, j)
-	finishC := HighestScoreArr(j.FullCityScore)
-	finishD := HighestScoreArr(j.FullDistrictScore)
-	arearesult := ""
-	cityresult := ""
-	districtresult := ""
-	tmpcity := []string{}
-	if len(finishP) == 1 { // a single top-scoring province
-		arearesult = finishP[0] // use it directly as the result
-		cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
-		cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
-	} else if len(finishP) > 1 { // several provinces share the top score
-		if len(finishC) == 1 {
-			cityresult = finishC[0]
-			if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
-				arearesult = cfMap.P.Brief
-				tmpcity = append(tmpcity, cityresult)
-				cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
-			}
-		} else { // several cities as well (multiple provinces and cities)
-			arearesult = "全国"
-		}
-	}
-	if cityresult != "" && cityresult == districtresult {
-		districtresult = ""
-	}
-	// Areas whose city is fixed to the matching "…市" name.
-	if arearesult == "北京" {
-		cityresult = "北京市"
-		if districtresult == "北京朝阳" { // special case (北京朝阳中西医结合急诊抢救中心: 5a84079740d2d9bbe88bad90)
-			districtresult = "朝阳区"
-		}
-	} else if arearesult == "天津" {
-		cityresult = "天津市"
-	} else if arearesult == "上海" {
-		cityresult = "上海市"
-	} else if arearesult == "重庆" {
-		cityresult = "重庆市"
-	}
-	if arearesult == "" {
-		arearesult = "全国"
-	}
-	resultTmp := map[string]interface{}{}
-	if arearesult!="" {
-		resultTmp["area"] = arearesult
-	}
-	if cityresult!="" {
-		resultTmp["city"] = cityresult
-	}
-	if districtresult!="" {
-		resultTmp["district"] = districtresult
-	}
-	return resultTmp
-}
-// NewExtractCityDetail extracts a region from the single field data["detail"] and returns the non-empty parts under the keys "area", "city" and "district".
-func (e *ExtractTask) NewExtractCityDetail(data map[string]interface{})( map[string]interface{}) {
-	defer qu.Catch()
-	// Initialise the score tables.
-	j := &Job{
-		map[string]float64{},
-		map[string]float64{},
-		map[string]float64{},
-		map[string]float64{},
-		map[string]float64{},
-		map[string]float64{},
-	}
-	detail := qu.ObjToString(data["detail"])
-	e.NewGetCityByDetail(j,detail)
-	MergeFullSimScore(j) // merge the full-name and short-name scores of detail
-	finishP := HighestScoreArr(j.FullAreaScore) // provinces with the highest score
-	e.RemoveCD(finishP, j)                      // drop cities (and districts) that belong to interfering provinces. 5d2bd4aba5cb26b9b769d18e
-	// Collect the results.
-	finishC := HighestScoreArr(j.FullCityScore)
-	finishD := HighestScoreArr(j.FullDistrictScore)
-	arearesult := ""
-	cityresult := ""
-	districtresult := ""
-	tmpcity := []string{}
-	if len(finishP) == 1 { // a single top-scoring province
-		arearesult = finishP[0] // use it directly as the result
-		cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
-		cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
-	} else if len(finishP) > 1 { // several provinces share the top score
-		if len(finishC) == 1 {
-			cityresult = finishC[0]
-			if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
-				arearesult = cfMap.P.Brief
-				tmpcity = append(tmpcity, cityresult)
-				cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
-			}
-		} else { // several cities as well (multiple provinces and cities)
-			arearesult = "全国"
-		}
-	}
-	if cityresult != "" && cityresult == districtresult {
-		districtresult = ""
-	}
-	// Areas whose city is fixed to the matching "…市" name.
-	if arearesult == "北京" {
-		cityresult = "北京市"
-		if districtresult == "北京朝阳" { // special case (北京朝阳中西医结合急诊抢救中心: 5a84079740d2d9bbe88bad90)
-			districtresult = "朝阳区"
-		}
-	} else if arearesult == "天津" {
-		cityresult = "天津市"
-	} else if arearesult == "上海" {
-		cityresult = "上海市"
-	} else if arearesult == "重庆" {
-		cityresult = "重庆市"
-	}
-	if arearesult == "" {
-		arearesult = "全国"
-	}
-	resultTmp := map[string]interface{}{}
-	if arearesult!="" {
-		resultTmp["area"] = arearesult
-	}
-	if cityresult!="" {
-		resultTmp["city"] = cityresult
-	}
-	if districtresult!="" {
-		resultTmp["district"] = districtresult
-	}
-	return resultTmp
-}
-
-
-
-
-// NewGetCityByDetail scores the regions mentioned in detail: AgencyReg matches are removed, the text is segmented with Seg_SV, and every word longer than one rune is looked up in the full-name and short-name tries; each distinct province/city/district is scored at most once per name kind (score 1).
-func (e *ExtractTask) NewGetCityByDetail(j *Job,detail string) {
-	repeatP_full := map[string]bool{}
-	repeatC_full := map[string]bool{}
-	repeatD_full := map[string]bool{}
-	repeatP_sim := map[string]bool{}
-	repeatC_sim := map[string]bool{}
-	repeatD_sim := map[string]bool{}
-	for _, reg := range AgencyReg {
-		detail = reg.ReplaceAllString(detail, "")
-	}
-	for _, text := range e.Seg_SV.Cut(detail, true) {
-		if len([]rune(text)) > 1 {
-			// Full-name matching.
-			for pos_full, trie_full := range e.Trie_Fulls {
-				if trie_full.Get(text) {
-					if pos_full == 0 { // province full name
-						if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { // take the short name
-							PCDScore(j, "province", tmpPbrief, 1, true)
-							repeatP_full[tmpPbrief] = true
-							break
-						}
-					} else if pos_full == 1 { // city full name
-						if cfMap := e.CityFullMap[text]; cfMap != nil {
-							if !repeatP_full[cfMap.P.Brief] {
-								PCDScore(j, "province", cfMap.P.Brief, 1, true)
-								repeatP_full[cfMap.P.Brief] = true
-							}
-							if !repeatC_full[cfMap.Name] {
-								PCDScore(j, "city", cfMap.Name, 1, true)
-								repeatC_full[cfMap.Name] = true
-							}
-							break
-						}
-					} else if pos_full == 2 { // district full name
-						citys := e.NewDistrictCityMap[text]
-						if len(citys) > 0 {
-							if !repeatD_full[text] {
-								PCDScore(j, "district", text, 1, true)
-								repeatD_full[text] = true
-							}
-							for _, c := range citys {
-								if !repeatC_full[c.Name] {
-									PCDScore(j, "city", c.Name, 1, true)
-									repeatC_full[c.Name] = true
-								}
-								if !repeatP_full[c.P.Brief] {
-									PCDScore(j, "province", c.P.Brief, 1, true)
-									repeatP_full[c.P.Brief] = true
-								}
-							}
-							break
-						}
-					} else if pos_full == 3 { // street full name
-						districts := e.NewStreetDistrictMap[text]
-						if len(districts) == 1 {
-							DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
-						}
-					} else if pos_full == 4 { // neighbourhood committee full name
-						districts := e.CommunityDistrictMap[text]
-						if len(districts) == 1 {
-							DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
-						}
-					}
-				}
-			}
-			//qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
-			// Short-name matching.
-			for pos_sim, trie_sim := range e.Trie_Sims {
-				if trie_sim.Get(text) {
-					if pos_sim == 0 && !repeatP_sim[text] { // province short name
-						PCDScore(j, "province", text, 1, false)
-						repeatP_sim[text] = true
-						break
-					} else if pos_sim == 1 { // city short name
-						if cbMap := e.CityBriefMap[text]; cbMap != nil {
-							if !repeatP_sim[cbMap.P.Brief] {
-								PCDScore(j, "province", cbMap.P.Brief, 1, false)
-								repeatP_sim[cbMap.P.Brief] = true
-							}
-							if !repeatC_sim[cbMap.Name] {
-								PCDScore(j, "city", cbMap.Name, 1, false)
-								repeatC_sim[cbMap.Name] = true
-							}
-							break
-						}
-					} else if pos_sim == 2 { // district short name
-						dfull_citys := e.NewDistrictSimAndAll[text]
-						if len(dfull_citys) == 1 {
-							for _, dfull_city := range dfull_citys {
-								for dfull, ctmp := range dfull_city { // dfull: the full name behind the short name
-									if !repeatD_sim[dfull] {
-										PCDScore(j, "district", dfull, 1, false)
-										repeatD_sim[dfull] = true
-									}
-									if ctmp == nil {
-										continue
-									}
-									if !repeatC_sim[ctmp.Name] {
-										PCDScore(j, "city", ctmp.Name, 1, false)
-										repeatC_sim[ctmp.Name] = true
-									}
-									if !repeatP_sim[ctmp.P.Brief] {
-										PCDScore(j, "province", ctmp.P.Brief, 1, false)
-										repeatP_sim[ctmp.P.Brief] = true
-									}
-								}
-							}
-						}
-					}
-				}
-			}
-			//qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
-		}
-	}
-}
-func (e *ExtractTask) NewGetCityByOthers(j *Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
-	/* NewGetCityByOthers scores region candidates found in every field listed in sm.Keys:
-		1. Segment the field value into words.
-		2. Match full names (province, city, district, street, community) and score them.
-		3. Match short names (province, city, district) and score them.
-	*/
-	ts := 0.5
-	for i, from := range sm.Keys { // fields: buyer;buyeraddr;title;projectname
-		if i > 1 { // from the third field on, the per-field bonus drops from 0.5 to 0.2
-			ts = 0.2
-		}
-		p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" // reset the province/city/district matched so far for every field
-		str, _ := sm.Map[from].(string)
-		jbText := e.Seg_SV.Cut(str, true)
-		for _, text := range jbText {
-			if len([]rune(text)) == 1 { // single-character tokens are never matched
-				continue
-			}
-			// full-name matching
-			//qu.Debug("text------", text)
-			for pos_full, trie_full := range e.Trie_Fulls {
-				if trie_full.Get(text) {
-					if pos_full == 0 && p_full == "" { // province full name
-						if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { // map to the province short name
-							p_full = tmpPbrief
-							PCDScore(j, "province", p_full, 4+ts, true)
-							break
-						}
-					} else if pos_full == 1 && c_full == "" { // city full name
-						if cfMap := e.CityFullMap[text]; cfMap != nil {
-							tmpPbrief := cfMap.P.Brief
-							if p_full == "" {
-								p_full = tmpPbrief
-								c_full = cfMap.Name
-								PCDScore(j, "province", p_full, 4+ts, true)
-								PCDScore(j, "city", c_full, 4+ts, true)
-								break
-							} else if p_full == tmpPbrief {
-								c_full = cfMap.Name
-								PCDScore(j, "province", tmpPbrief, 4+ts, true) //
-								PCDScore(j, "city", c_full, 4+ts, true)
-								break
-							} else if p_full != "" && p_full != tmpPbrief {
-								// the city belongs to a different province than the one already found: ignore it
-							}
-						}
-					} else if pos_full == 2 && d_full == "" { // district full name
-						repeatPb := map[string]bool{}
-						isOk := false
-						districtOk := false
-						citys := e.NewDistrictCityMap[text]
-						for _, c := range citys {
-							tmpPbrief := c.P.Brief
-							if p_full == tmpPbrief { // same province as the one already found
-								d_full = text
-								if c_full == "" {
-									c_full = c.Name
-									PCDScore(j, "city", c_full, 4+ts, true)
-									PCDScore(j, "province", tmpPbrief, 4+ts, true) //
-								}
-								isOk = true
-								districtOk = true
-							} else if p_full == "" { // no province found yet
-								districtOk = true
-								if len(citys) == 1 { // the district maps to exactly one city
-									p_full = tmpPbrief
-									c_full = c.Name
-									d_full = text
-									PCDScore(j, "province", p_full, 4+ts, true)
-									PCDScore(j, "city", c_full, 4+ts, true)
-									isOk = true
-								} else { // several candidate cities: score only, do not fix p_full/c_full/d_full
-									if !repeatPb[tmpPbrief] {
-										PCDScore(j, "province", tmpPbrief, 2+ts, true)
-										repeatPb[tmpPbrief] = true
-									}
-									//PCDScore(j, "province", tmpPbrief, 2, true)
-									PCDScore(j, "city", c.Name, 2+ts, true)
-								}
-							} else if p_full != "" && p_full != tmpPbrief { // conflicting province: penalise it as noise
-								if !repeatPb[tmpPbrief] {
-									PCDScore(j, "province", tmpPbrief, -5, true)
-									repeatPb[tmpPbrief] = true
-								}
-								//PCDScore(j, "province", tmpPbrief, -5, true)
-								PCDScore(j, "city", c.Name, -5, true)
-							}
-						}
-						if districtOk {
-							PCDScore(j, "district", text, 4+ts, true)
-						} else {
-							PCDScore(j, "district", text, -5, true)
-						}
-						if isOk {
-							break
-						}
-					} else if pos_full == 3 { // street full name
-						districts := e.NewStreetDistrictMap[text]
-						if len(districts) == 1 { // the street name is unique
-							DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
-						}
-					} else if pos_full == 4 { // community (neighbourhood committee) full name
-						districts := e.CommunityDistrictMap[text]
-						if len(districts) == 1 { // the community name is unique
-							DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
-						}
-					}
-				}
-			}
-			//qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
-			// short-name matching
-			for pos_sim, trie_sim := range e.Trie_Sims {
-				if trie_sim.Get(text) {
-					if pos_sim == 0 && p_sim == "" { // province short name
-						p_sim = text
-						PCDScore(j, "province", p_sim, 3+ts, false)
-						break
-					} else if pos_sim == 1 { // city short name
-						if cbMap := e.CityBriefMap[text]; cbMap != nil {
-							tmpPbrief := cbMap.P.Brief
-							if p_sim == "" {
-								score := 2.0 + ts
-								if tmpPbrief == p_full {
-									score += 1.0
-								}
-								p_sim = tmpPbrief
-								c_sim = cbMap.Brief
-								PCDScore(j, "province", p_sim, score, false)
-								PCDScore(j, "city", cbMap.Name, score, false)
-								break
-							} else if p_sim == tmpPbrief {
-								c_sim = cbMap.Brief
-								PCDScore(j, "city", cbMap.Name, 3+ts, false)
-								PCDScore(j, "province", tmpPbrief, 3+ts, false)
-								break
-							} else if p_sim != "" && p_sim != tmpPbrief { // the later city replaces the earlier short-name province, e.g. 上海宝冶集团有限公司南京分公司, 北京朝阳中西医结合急诊抢救中心
-								delete(j.SimAreaScore, p_sim)
-								c_sim = text      //
-								p_sim = tmpPbrief //
-								PCDScore(j, "province", tmpPbrief, 3+ts, false)
-								PCDScore(j, "city", cbMap.Name, 3+ts, false)
-							}
-						}
-					} else if pos_sim == 2 && d_sim == "" { // district short name
-						repeatPb := map[string]bool{}
-						repeatDb := map[string]bool{}
-						dfull_citys := e.NewDistrictSimAndAll[text]
-						for _, dfull_city := range dfull_citys {
-							for dfull, c := range dfull_city { // dfull: the full name that the short name stands for
-								if c == nil || c.P == nil {
-									continue
-								}
-								tmpPbrief := c.P.Brief
-								if p_sim == tmpPbrief { // same province as the one already found
-									d_sim = text
-									PCDScore(j, "district", dfull, 2+ts, false)
-									if c_sim == "" {
-										c_sim = c.Brief
-										PCDScore(j, "city", c.Name, 2+ts, false)
-									}
-									PCDScore(j, "province", tmpPbrief, 2+ts, false) //
-								} else if p_sim == "" {
-									if !repeatDb[dfull] {
-										PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
-										repeatDb[dfull] = true
-									}
-									if len(dfull_citys) == 1 {
-										PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
-										PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
-									} else {
-										if !repeatPb[tmpPbrief] {
-											PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
-											repeatPb[tmpPbrief] = true
-										}
-										PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
-									}
-								} else if p_sim != "" && p_sim != tmpPbrief {
-									if !repeatPb[tmpPbrief] {
-										PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
-										repeatPb[tmpPbrief] = true
-									}
-									PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
-									PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
-								}
-							}
-						}
-					}
-				}
-			}
-			//qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
-		}
-	}
-}
-
-
-
-
-
-func MergeScores(j *Job, pscore, cscore, dscore *map[string]float64) { // MergeScores adds the *pscore/*cscore/*dscore entries onto j's FullAreaScore/FullCityScore/FullDistrictScore.
-	if len(j.FullAreaScore) > 0 { // nothing at all is merged when j has no full-name province score
-		for pt, ps := range *pscore {
-			j.FullAreaScore[pt] = j.FullAreaScore[pt] + ps
-		}
-		for ct, cs := range *cscore {
-			j.FullCityScore[ct] = j.FullCityScore[ct] + cs
-		}
-		for dt, ds := range *dscore {
-			j.FullDistrictScore[dt] = j.FullDistrictScore[dt] + ds
-		}
-	}
-}
-func MergeFullSimScore(j *Job) { // MergeFullSimScore folds j's short-name (Sim*) scores into its full-name (Full*) score maps.
-	if len(j.FullAreaScore) == 0 { // no full-name province: take over the short-name province map itself
-		j.FullAreaScore = j.SimAreaScore
-	} else {
-		for p_text, p_score := range j.FullAreaScore { // only provinces already in FullAreaScore receive their short-name score
-			j.FullAreaScore[p_text] = j.SimAreaScore[p_text] + p_score
-		}
-	}
-	for c_text, c_score := range j.SimCityScore { // every short-name city score is added, creating the entry if needed
-		j.FullCityScore[c_text] = j.FullCityScore[c_text] + c_score
-	}
-	for d_text, d_score := range j.SimDistrictScore { // same for districts
-		j.FullDistrictScore[d_text] = j.FullDistrictScore[d_text] + d_score
-	}
-	//	if len(j.FullCityScore) == 0 {
-	//		j.FullCityScore = j.SimCityScore
-	//	} else {
-	//		for c_text, c_score := range j.FullCityScore {
-	//			j.FullCityScore[c_text] = j.SimCityScore[c_text] + c_score
-	//		}
-	//	}
-	//	if len(j.FullDistrictScore) == 0 {
-	//		j.FullDistrictScore = j.SimDistrictScore
-	//	} else {
-	//		for d_text, d_score := range j.FullDistrictScore {
-	//			j.FullDistrictScore[d_text] = j.SimDistrictScore[d_text] + d_score
-	//		}
-	//	}
-}
-func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) { // NewGetCity returns the chosen city and tmpcity extended with the matching candidates.
-	for _, c := range finishC { // keep the top-scored cities whose province short name equals area
-		if cfMap := e.CityFullMap[c]; cfMap != nil {
-			if cfMap.P.Brief == area {
-				//				city = c
-				//				break
-				tmpcity = append(tmpcity, c)
-			}
-		}
-	}
-	if len(tmpcity) == 1 { // city is only decided here when exactly one candidate remains
-		city = tmpcity[0]
-	}
-	return city, tmpcity
-}
-func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) { // NewGetDistrict returns (city, district); the inputs are returned unchanged when no candidate fits.
-	for _, d := range finishD { // top-scored districts, checked against province area
-		citys := e.NewDistrictCityMap[d]
-		for _, c := range citys {
-			if len(tmpcity) == 0 { // no city candidate: the first city of d inside area decides both city and district
-				if c.P.Brief == area {
-					city = c.Name
-					district = d
-					return city, district
-				}
-			} else if len(tmpcity) == 1 { // one city candidate: d must belong to that city and to area
-				if c.Name == city && c.P.Brief == area {
-					district = d
-					return city, district
-				}
-			} else {                         // several city candidates
-				for _, tc := range tmpcity { // pick the city through the district, only when d is the single top-scored district
-					if tc == c.Name && len(finishD) == 1 {
-						city = c.Name
-						district = d
-						return city, district
-					}
-				}
-			}
-		}
-	}
-	return city, district
-}
-func HighestScoreArr(m map[string]float64) []string { // HighestScoreArr returns the non-empty keys of m tied for the highest score; nil when no such key scores >= 0.
-	result := make(map[float64][]string) // score -> keys seen with that score; only the current best bucket is kept
-	tmpscore := 0.0 // best score so far; starting at 0 means negative scores never qualify
-	for str, score := range m {
-		if str != "" && tmpscore <= score {
-			if result[tmpscore] != nil && tmpscore != score { // a strictly better score supersedes the previous bucket
-				delete(result, tmpscore)
-			}
-			if r := result[score]; r != nil {
-				r = append(r, str)
-				result[score] = r
-			} else {
-				result[score] = []string{str}
-			}
-			tmpscore = score
-		}
-	}
-	return result[tmpscore] // element order follows map iteration order, which Go leaves unspecified
-}
-func (e *ExtractTask) RemoveCD(finishP []string, j *Job) { // RemoveCD drops full-name city/district scores whose province is not listed in finishP.
-	if len(j.FullDistrictScore) > 0 {
-		for d, _ := range j.FullDistrictScore {
-			tmpCitys := e.NewDistrictCityMap[d]
-			for _, c := range tmpCitys {
-				if j.FullCityScore[c.Name] != 0 { // only cities that currently carry a non-zero score are examined
-					tmpPb := c.P.Brief
-					//if j.FullAreaScore[tmpPb] != 0 {
-					flag := false // becomes true when the city's province is one of finishP
-					for _, p := range finishP {
-						if tmpPb == p {
-							flag = true
-							break
-						}
-					}
-					if !flag { // province not selected: remove both the city and the district score
-						delete(j.FullCityScore, c.Name)
-						delete(j.FullDistrictScore, d)
-					}
-					//}
-				}
-			}
-		}
-	}
-	if len(j.FullCityScore) > 0 {
-		for tmpcity, _ := range j.FullCityScore {
-			c := e.CityFullMap[tmpcity]
-			if c == nil { // city name missing from CityFullMap: log it and keep the score untouched
-				log.Debug("行政区划错误数据:", tmpcity)
-				continue
-			}
-			tmpPb := c.P.Brief
-			//if j.FullAreaScore[tmpPb] != 0 {
-			flag := false // becomes true when the city's province is one of finishP
-			for _, p := range finishP {
-				if tmpPb == p {
-					flag = true
-					break
-				}
-			}
-			if !flag {
-				delete(j.FullCityScore, tmpcity)
-			}
-			//}
-		}
-	}
-
-}
-// PCDScore adds score to text's entry in j's province/city/district map selected by stype; isfull picks the Full* maps, otherwise the Sim* maps. An empty text is ignored.
-func PCDScore(j *Job, stype, text string, score float64, isfull bool) {
-	defer qu.Catch() // presumably recovers panics (e.g. a write to a nil score map) — TODO confirm against qfw/util
-	if text != "" {
-		if stype == "district" {
-			tmpdistrict := make(map[string]float64) // placeholder; always replaced by one of j's maps below
-			if isfull {
-				tmpdistrict = j.FullDistrictScore
-			} else {
-				tmpdistrict = j.SimDistrictScore
-			}
-			scoretmp := tmpdistrict[text]
-			tmpdistrict[text] = scoretmp + score
-		} else if stype == "city" {
-			tmpcity := make(map[string]float64) // placeholder; always replaced by one of j's maps below
-			if isfull {
-				tmpcity = j.FullCityScore
-			} else {
-				tmpcity = j.SimCityScore
-			}
-			scoretmp := tmpcity[text]
-			tmpcity[text] = scoretmp + score
-		} else if stype == "province" {
-			tmpprovince := make(map[string]float64) // placeholder; always replaced by one of j's maps below
-			if isfull {
-				tmpprovince = j.FullAreaScore
-			} else {
-				tmpprovince = j.SimAreaScore
-			}
-			scoretmp := tmpprovince[text]
-			tmpprovince[text] = scoretmp + score
-		}
-	}
-}
-// DealMultipleDistrict scores the province, city and district of a street/community match; it acts only when districts holds exactly one entry. With a known pbrief the match must lie in that province (otherwise nothing is scored); with an empty pbrief the optional repeatP/repeatC/repeatD sets (nil = no de-duplication) limit each name to one score.
-func DealMultipleDistrict(e *ExtractTask, j *Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
-	if len(districts) == 1 {
-		district := districts[0]
-		city := district.C.Name
-		tmpPbrief := district.C.P.Brief
-		if pbrief != "" && tmpPbrief == pbrief {
-			PCDScore(j, "province", tmpPbrief, score, true)
-			PCDScore(j, "city", city, score, true)
-			PCDScore(j, "district", district.Name, score, true)
-		} else if pbrief == "" {
-			if repeatP != nil && !(*repeatP)[tmpPbrief] {
-				PCDScore(j, "province", tmpPbrief, score, true)
-				(*repeatP)[tmpPbrief] = true
-			} else if repeatP == nil {
-				PCDScore(j, "province", tmpPbrief, score, true)
-			}
-			if repeatC != nil && !(*repeatC)[city] {
-				PCDScore(j, "city", city, score, true)
-				(*repeatC)[city] = true
-			} else if repeatC == nil {
-				PCDScore(j, "city", city, score, true)
-			}
-			if repeatD != nil && !(*repeatD)[district.Name] { // look up the same key that is recorded below (was the province key, so the check never matched the stored entry)
-				PCDScore(j, "district", district.Name, score, true)
-				(*repeatD)[district.Name] = true
-			} else if repeatD == nil {
-				PCDScore(j, "district", district.Name, score, true)
-			}
-		}
-	}
-}
-// PCDScoreByDistrictSim adds score to t's entry in *ds, *cs or *ps for stype "d", "c" or "p" (scores derived from district short-name matches); an empty t is ignored.
-func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
-	defer qu.Catch() // presumably recovers panics — TODO confirm against qfw/util
-	if t != "" {
-		if stype == "d" {
-			tmpscore := (*ds)[t]
-			(*ds)[t] = tmpscore + score
-		} else if stype == "c" {
-			tmpscore := (*cs)[t]
-			(*cs)[t] = tmpscore + score
-		} else if stype == "p" {
-			tmpscore := (*ps)[t]
-			(*ps)[t] = tmpscore + score
-		}
-	}
-}

+ 3 - 2
extcity/src/ext/struct.go

@@ -10,7 +10,6 @@ type Trie struct {
 	c map[rune]*Trie
 }
 
-
 //省
 type Province struct {
 	Name    string
@@ -70,4 +69,6 @@ type SiteCity struct {
 	P string //省简称
 	C string //市全称
 	D string //区全称
-}
+	T string //站点类型
+	Q string //企业地域
+}

+ 9 - 87
extcity/src/main.go

@@ -1,101 +1,23 @@
 package main
 
 import (
-	"encoding/json"
 	"ext"
 	log "github.com/donnie4w/go-logger/logger"
 	"net/http"
-	qu "qfw/util"
-	"sync"
+	"service"
 	"time"
 	ul "util"
 )
 
-var e *ext.ExtractTask
-var datalock sync.Mutex
-func init()  {
-	log.Debug("配置文件准备完毕...")
+func init() {
 	ul.InitExt()
 }
-
-func main()  {
-	e = &ext.ExtractTask{}
-	e.InitCityInfo()  		//初始化-抽取城市
-	log.Debug("城市抽取加载完毕...")
-	ext.InitCheckCity()  	//初始化-校验城市
-
-	//调试
-	ext_res := map[string]interface{}{}
-	ext_res = e.NewExtractCityDetail(map[string]interface{}{
-		"detail":"东风路街道",
-	})
-	log.Debug(ext_res)
-
-	//初始化接口
-	initNetwork()
-	log.Debug("开始监听~",ul.Port)
+func main() {
+	ext.Ext = &ext.ExtractTask{}
+	ext.Ext.InitCityInfo() //初始化-抽取城市
+	log.Debug("地域模型数据加载完毕~~~")
+	service.InitRegionsService()
+	log.Debug("开始监听~", ul.Port)
 	http.ListenAndServe(ul.Port, nil)
-	time.Sleep(99999*time.Hour)
+	time.Sleep(99999 * time.Hour)
 }
-
-// initNetwork registers the /getcity handler, which extracts region info from the "detail" form value and writes it back as JSON.
-func initNetwork() {
-	http.HandleFunc("/getcity", func(w http.ResponseWriter, r *http.Request) {
-		datalock.Lock()
-		defer datalock.Unlock() // released even if extraction panics; net/http recovers handler panics, so a trailing Unlock would leave the mutex held for every later request
-		detail := r.FormValue("detail")
-		ext_res := map[string]interface{}{} // an empty detail yields an empty JSON object
-		if detail != "" {
-			ext_res = e.NewExtractCityDetail(map[string]interface{}{
-				"detail": detail,
-			})
-		}
-		res, _ := json.Marshal(ext_res)
-		w.Write(res)
-	})
-}
-
-
-
-// fromSourceDataExtCity walks every document of collection ul.SaveColl, extracts region info from the fields named in ul.SF, writes it back, then stores the corrections produced by ext.GetCheckDataCity.
-func fromSourceDataExtCity()  {
-	q := map[string]interface{}{}
-	sess := ul.SaveMgo.GetMgoConn()
-	defer ul.SaveMgo.DestoryMongoConn(sess)
-	iter := sess.DB(ul.SaveMgo.DbName).C(ul.SaveColl).Find(&q).Iter()
-	total,isok,ischeck:=0,0,0
-	for tmp := map[string]interface{}{}; iter.Next(&tmp); total++{
-		if total%10000==0 {
-			log.Debug("cur index ",total,isok,ischeck)
-		}
-		tmpid := qu.ObjToString(tmp["_id"])
-		source_field := map[string]interface{}{}
-		for k,_ := range ul.SF {
-			source_field[k] = qu.ObjToString(tmp[k])
-		}
-		if len(source_field)>0 {
-			ext_res := e.NewExtractCityField(source_field)
-			isok++
-			ul.SaveMgo.UpdateByStringId(ul.SaveColl,tmpid, map[string]interface{}{
-				"$set": ext_res,
-			})
-			if len(ext_res)>0 { // verify the extracted region
-				update_check := make(map[string]interface{},0)
-				ext.GetCheckDataCity(ext_res,&update_check)
-				if len(update_check)>0 {
-					ischeck++
-					ul.SaveMgo.UpdateByStringId(ul.SaveColl,tmpid, map[string]interface{}{
-						"$set": update_check,
-					})
-				}
-			}
-		}
-		tmp = map[string]interface{}{}
-	}
-	log.Debug("is over ",total,isok,ischeck)
-}
-
-
-
-
-

+ 3 - 8
extcity/src/config.json → extcity/src/res/config.json

@@ -2,18 +2,13 @@
    "port": ":9991",
    "ext_mgodb": {
      "addr": "127.0.0.1:27017",
-     "db": "mixdata",
-     "ext_coll": "address_new_2020",
-     "check_coll": "address_jy_2022",
-     "pool": 10,
+     "db": "extract_service",
      "user": "",
      "password": ""
    },
-  "save_mgodb": {
-    "addr": "120.0.0.1:27017",
+  "site_mgodb": {
+    "addr": "127.0.0.1:27017",
     "db": "zhengkun",
-    "coll": "xxxxxx",
-    "pool": 10,
     "user": "",
     "password": ""
   },

+ 0 - 1
extcity/src/pcd.txt → extcity/src/res/pcd.txt

@@ -2095,7 +2095,6 @@
 色达县 4 n
 雷山县 4 n
 新兴县 4 n
-城区 4 n
 石阡县 4 n
 厦门市 4 n
 开化县 4 n

+ 26 - 0
extcity/src/res/regions.json

@@ -0,0 +1,26 @@
+{
+    "default_regions": [
+        "site_area",
+        "addressing,projectaddr",
+        "projectname",
+        "buyer_filiale",
+        "approvedepartment",
+        "buyeraddr",
+        "site_city",
+        "buyer",
+        "title",
+        "buyerzipcode,buyertel,bidopenaddress"
+    ],
+    "adjustment_regions" : [
+        "site_area",
+        "addressing,projectaddr",
+        "projectname",
+        "buyer_filiale",
+        "approvedepartment",
+        "buyer",
+        "buyeraddr",
+        "site_city",
+        "title",
+        "buyerzipcode,buyertel,bidopenaddress"
+    ]
+}

+ 4 - 1
extcity/src/sv.txt → extcity/src/res/sv.txt

@@ -3332,6 +3332,8 @@
 渭源县 4 n
 播州区 4 n
 禄劝彝族苗族自治县 4 n
+复合肥 4 n
+南阳坡 4 n
 蚌山 3 n
 诸城 3 n
 伽师 3 n
@@ -409394,4 +409396,5 @@ IT家园社区居委会 4 n
 冀南新区 4 n
 任泽区 4 n
 西洞庭管理区 4 n
-羊山新区 4 n
+羊山新区 4 n
+西咸新区 4 n

+ 32 - 0
extcity/src/service/service.go

@@ -0,0 +1,32 @@
+package service
+
+import (
+	"encoding/json"
+	"ext"
+	"net/http"
+	qu "qfw/util"
+	"sync"
+)
+
+var datalock sync.Mutex
+
+func InitRegionsService() { // InitRegionsService registers the /service/region handler, which answers with the extracted area/city/district as JSON.
+	http.HandleFunc("/service/region", func(w http.ResponseWriter, r *http.Request) {
+		datalock.Lock()
+		defer datalock.Unlock() // released even if extraction panics; net/http recovers handler panics, so a trailing Unlock would leave the mutex held for every later request
+		tmp := map[string]interface{}{
+			"detail": "金水区",
+		}
+		// NOTE(review): the input is still a hard-coded sample and r is never read; the request-parsing model has yet to be built.
+		ext.Ext.ExtractRegionInfo(&tmp, true)
+		area := qu.ObjToString(tmp["area"])
+		city := qu.ObjToString(tmp["city"])
+		district := qu.ObjToString(tmp["district"])
+		res, _ := json.Marshal(map[string]interface{}{ // marshalling a map of plain strings cannot fail
+			"area":     area,
+			"city":     city,
+			"district": district,
+		})
+		w.Write(res)
+	})
+}

+ 21 - 25
extcity/src/util/init.go

@@ -6,16 +6,14 @@ import (
 )
 
 var (
-	Sysconfig 		map[string]interface{}
-	ExtMgo,SaveMgo  *MongodbSim //抽取初始化-相关
-	ExtColl,CheckColl,SaveColl		string
-	SF 	map[string]interface{}
-	Port			string
+	Sysconfig       map[string]interface{}
+	ExtMgo, SiteMgo *MongodbSim //抽取初始化-相关
+	SF              map[string]interface{}
+	Port            string
 )
 
-
 func InitExt() {
-	qu.ReadConfig(&Sysconfig)
+	qu.ReadConfig("./res/config.json", &Sysconfig)
 	log.Println(Sysconfig)
 	if len(Sysconfig) == 0 {
 		log.Fatalln("init config err")
@@ -24,29 +22,27 @@ func InitExt() {
 	Port = qu.ObjToString(Sysconfig["port"])
 	initMgo()
 }
-func initMgo () {
-	saveconf := Sysconfig["save_mgodb"].(map[string]interface{})
-	SaveColl = qu.ObjToString(saveconf["coll"])
-	SaveMgo = &MongodbSim{
-		MongodbAddr: saveconf["addr"].(string),
-		DbName:      saveconf["db"].(string),
-		Size:        qu.IntAllDef(saveconf["pool"], 5),
-		UserName: 	saveconf["user"].(string),
-		PassWord: 	saveconf["password"].(string),
-	}
-	SaveMgo.InitPool()
 
-	//初始化mgo
+func initMgo() {
+	//地域相关
 	extconf := Sysconfig["ext_mgodb"].(map[string]interface{})
-	ExtColl = qu.ObjToString(extconf["ext_coll"])
-	CheckColl = qu.ObjToString(extconf["check_coll"])
 	ExtMgo = &MongodbSim{
 		MongodbAddr: extconf["addr"].(string),
 		DbName:      extconf["db"].(string),
-		Size:        qu.IntAllDef(extconf["pool"], 5),
-		UserName: 	extconf["user"].(string),
-		PassWord: 	extconf["password"].(string),
+		Size:        5,
+		UserName:    extconf["user"].(string),
+		PassWord:    extconf["password"].(string),
 	}
 	ExtMgo.InitPool()
-}
 
+	//站点相关
+	siteconf := Sysconfig["site_mgodb"].(map[string]interface{})
+	SiteMgo = &MongodbSim{
+		MongodbAddr: siteconf["addr"].(string),
+		DbName:      siteconf["db"].(string),
+		Size:        5,
+		UserName:    siteconf["user"].(string),
+		PassWord:    siteconf["password"].(string),
+	}
+	SiteMgo.InitPool()
+}

+ 4 - 1
src/jy/extract/extract.go

@@ -2298,6 +2298,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					if v.Score > -1 {
 						ffield[v.Field] = v.Value
 						if tmp[v.Field] == nil {
+							if v.Field == "addressing" {
+								break
+							}
 							if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Value.(float64) > 100 && v.Value.(float64) < 50000000000 {
 								tmp[v.Field] = v.Value
 								fieldSource[v.Field] = map[string]interface{}{
@@ -2369,7 +2372,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//城市抽取
 		if e.IsExtractCity {
 			//e.NewExtractCity(j, &tmp) //旧版
-			e.ExtractRegionInfo(j, &tmp, true)
+			e.ExtractRegionInfo(j, &tmp, false)
 		}
 		//品牌抽取
 		if ju.IsBrandGoods {

+ 0 - 3
src/jy/extract/extractcity.go

@@ -60,6 +60,3 @@ var AgencyReg = []*regexp.Regexp{
 	regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"),
 	regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"),
 }
-
-var xjbtReg *regexp.Regexp = regexp.MustCompile("^(新疆生产建设兵团|新疆兵团)")
-var sensitiveReg = regexp.MustCompile("(上一[条篇]|下一[条篇])[::].*")

+ 37 - 11
src/jy/extract/extractcity_new.go

@@ -66,9 +66,14 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
 	buyer := qu.ObjToString((*tmp)["buyer"])
 	site := qu.ObjToString((*tmp)["site"])
 	//新疆兵团补充地域~
-	if xjbtReg.MatchString(buyer) && f_city == "" {
+	if XjbtReg.MatchString(buyer) && f_city == "" {
 		if a, c, d, ok := e.NewVerifyXjCorpsInfo(buyer); ok {
 			f_area, f_city, f_district = a, c, d
+			if isLog {
+				LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+					"sup_xjbt": f_area + "~" + f_city + "~" + f_district,
+				})
+			}
 		}
 	}
 	//此时进行特殊链路新增、补充原则
@@ -81,16 +86,31 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
 	if f_city == "" {
 		e.LinkSpecialRuleBriefStep(buyer, &f_area, &f_city, &f_district)
 	}
+	if isLog {
+		LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+			"sup_link": f_area + "~" + f_city + "~" + f_district,
+		})
+	}
+
 	//正文补充地域~
 	if f_area == "全国" || f_area == "" || f_city == "" {
 		if b := e.NewVerifySensitiveInfo(qu.ObjToString((*j.Data)["detail"]), &f_area, &f_city, &f_district); b {
-			(*tmp)["is_sensitive"] = 1
+			if isLog {
+				LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+					"sup_detail": f_area + "~" + f_city + "~" + f_district,
+				})
+			}
 		}
 	}
 	//最终站点补充
 	if f_area == "全国" || f_area == "" {
 		if sc := e.SiteCityMap[site]; sc != nil && sc.Q != "" {
 			f_area = sc.Q
+			if isLog {
+				LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
+					"sup_site": f_area + "~" + f_city + "~" + f_district,
+				})
+			}
 		}
 	}
 	//最终在清洗一遍数据
@@ -353,8 +373,11 @@ func (e *ExtractTask) NewVerifyXjCorpsInfo(buyer string) (new_a, new_c, new_d st
 
 //敏感词识别
 func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *string, district *string) bool {
-	detail = sensitiveReg.ReplaceAllString(detail, "")
+	detail = SensitiveReg.ReplaceAllString(detail, "")
 	detail = TextAfterRemoveTable(detail)
+
+	detail = CleanDetailReg1.ReplaceAllString(detail, "")
+
 	//全称城市
 	fullCityArr := e.SensitiveFullCity.FindAll(detail)
 	if len(fullCityArr) == 1 {
@@ -404,18 +427,21 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	}
 
 	//疑似固话提取~
-	fixedTelArr := FixedTelReg.FindAllString(detail, -1)
-	if len(fixedTelArr) > 0 {
-		codeArr := resetFixedTelInfo(fixedTelArr)
-		if len(codeArr) == 1 {
-			for _, v := range codeArr {
-				if ac := e.AreaCodeMap[v]; ac != nil {
-					*area = ac.P
-					return true
+	if *area == "" || *area == "全国" {
+		fixedTelArr := FixedTelReg.FindAllString(detail, -1)
+		if len(fixedTelArr) > 0 {
+			codeArr := resetFixedTelInfo(fixedTelArr)
+			if len(codeArr) == 1 {
+				for _, v := range codeArr {
+					if ac := e.AreaCodeMap[v]; ac != nil {
+						*area = ac.P
+						return true
+					}
 				}
 			}
 		}
 	}
+
 	return false
 }
 

+ 2 - 2
src/jy/extract/extractcity_old.go

@@ -163,7 +163,7 @@ func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{
 	(*resulttmp)["district"] = districtresult
 
 	//校验-映射新疆兵团
-	if xjbtReg.MatchString(buyer) && cityresult == "" {
+	if XjbtReg.MatchString(buyer) && cityresult == "" {
 		a, c, d, ok := e.CheckingXjbtCity(buyer)
 		if ok {
 			(*resulttmp)["area"] = a
@@ -1026,7 +1026,7 @@ func (e *ExtractTask) CheckingXjbtCity(buyer string) (new_a, new_c, new_d string
 //敏感词识别~~~旧版
 func (e *ExtractTask) SensitiveCityData(detail string, area string) string {
 	//采用正文
-	detail = sensitiveReg.ReplaceAllString(detail, "")
+	detail = SensitiveReg.ReplaceAllString(detail, "")
 	//删除表格相关-文本
 	detail = TextAfterRemoveTable(detail)
 

+ 10 - 0
src/jy/extract/extractcity_way.go

@@ -14,6 +14,11 @@ var CleanRegionReg1 = regexp.MustCompile(".*公司")
 
 var FixedTelReg = regexp.MustCompile("0[0-9]{2,3}\\-[2-9][0-9]{6,7}")
 
+var CleanDetailReg1 = regexp.MustCompile("(北京时间)")
+
+var XjbtReg *regexp.Regexp = regexp.MustCompile("^(新疆生产建设兵团|新疆兵团)")
+var SensitiveReg = regexp.MustCompile("(上一[条篇]|下一[条篇])[::].*")
+
 //取特殊类数据
 func GetFilialeByBuyerInfo(buyer string) string {
 	if FilialeReg1.MatchString(buyer) {
@@ -419,6 +424,11 @@ func LogProcessRecordingForGroupInfo(key string, valueArr []string, fieldInfos m
 	*logRecordInfo = append(*logRecordInfo, data)
 }
 
+// LogProcessRecordingForSupplement appends data to *logRecordInfo; it is the process-log helper for the second (supplementary) chain.
+func LogProcessRecordingForSupplement(logRecordInfo *[]map[string]interface{}, data map[string]interface{}) {
+	*logRecordInfo = append(*logRecordInfo, data)
+}
+
 //日志流程记录~初步
 func LogProcessRecordingForTentative(key string, valueArr interface{}, finallyInfos map[string]map[string]map[string]string, logRecordInfo *[]map[string]interface{}) {
 	finallyArr := splitRegionsInfos(finallyInfos)

+ 0 - 91
src/jy/heart/heart.go

@@ -1,91 +0,0 @@
-package heart
-
-import (
-	"fmt"
-	"github.com/cron"
-	jyu "jy/util"
-	"net/http"
-	"net/url"
-	"os/exec"
-	qu "qfw/util"
-	"strconv"
-	"strings"
-)
-
-// HeartMonitor starts a cron job that calls heart on the schedule "5 */10 * * * ?" (presumably second 5 of every tenth minute, assuming a seconds-first cron format — TODO confirm).
-func HeartMonitor()  {
-	c := cron.New()
-	spec := "5 */10 * * * ?"
-	c.AddFunc(spec, func() {
-		heart()
-	})
-	c.Start()
-}
-func heart() { // heart sends one heartbeat GET to the monitor service, passing the configured udpport as the "extract" query value.
-	u, _ := url.Parse("http://monitor.spdata.jianyu360.com") // constant URL; the local debugging endpoint was http://127.0.0.1:7811
-	q := u.Query()
-	q.Set("extract", qu.ObjToString(jyu.Config["udpport"]))
-	u.RawQuery = q.Encode()
-	if resp, err := http.Get(u.String()); err != nil {
-		fmt.Println("心跳监测~异常")
-	} else {
-		resp.Body.Close() // the response used to be discarded, leaking its body and connection on every heartbeat
-	}
-}
-
-
-
-
-
-
-/****************分割线******************/
-/****************分割线******************/
-/****************分割线******************/
-/****************暂不使用******************/
-
-
-// checkProRunning reports whether `ps ux` lists a process whose line matches serverName; serverName is spliced into an awk pattern unescaped.
-func checkProRunning(serverName string) (bool, error) {
-	cmd := `ps ux | awk '/` + serverName + `/ && !/awk/ {print $2}'`
-	pid, err := runInLinux(cmd)
-	if err != nil {
-		return false, err
-	}
-	return pid != "", nil
-}
-// getPid returns the second `ps ux` column (the process ID) of the lines matching serverName, as trimmed text; several matches yield several lines.
-func getPid(serverName string) (string, error) {
-	cmd := `ps ux | awk '/` + serverName + `/ && !/awk/ {print $2}'`
-	pid, err := runInLinux(cmd)
-	return pid , err
-}
-// runInLinux runs cmd through `/bin/sh -c` and returns its standard output with surrounding whitespace trimmed.
-func runInLinux(cmd string) (string, error) {
-	result, err := exec.Command("/bin/sh", "-c", cmd).Output()
-	if err != nil {
-		return "", err
-	}
-	return strings.TrimSpace(string(result)), err
-}
-
-func isProcessExist(appName string) (bool, string, int) { // isProcessExist looks for appName in Windows `tasklist` output and returns (found, appName, integer parsed from the token after the name — presumably the PID column); -1 when not found.
-	appary := make(map[string]int)
-	cmd := exec.Command("cmd", "/C", "tasklist")
-	output, _ := cmd.Output() // on failure output is empty, so the "System" marker below is simply not found
-	//fmt.Printf("fields: %v\n", output)
-	n := strings.Index(string(output), "System")
-	if n == -1 {
-		// No "System" marker in the output: slicing with n == -1 would panic, so report "not found".
-		return false, appName, -1
-	}
-	data := string(output)[n:]
-	fields := strings.Fields(data)
-	for k, v := range fields {
-		if v == appName && k+1 < len(fields) { // guard fields[k+1] when the name is the last token
-			appary[appName], _ = strconv.Atoi(fields[k+1])
-
-			return true, appName, appary[appName]
-		}
-	}
-	return false, appName, -1
-}

+ 0 - 2
src/main.go

@@ -26,11 +26,9 @@ func init() {
 	qu.ReadConfig("./res/pricenumber.json", &u.PriceNumberConfig)
 	//初始化util
 	u.UtilInit()
-
 }
 
 func main() {
-
 	extract.ExtractUdpUpdateMachine() //节点上传~构建
 	extract.ExtractUdp()              //udp通知抽取
 	go Router.Run(":" + qu.ObjToString(u.Config["port"]))