Browse Source

city抽取

maxiaoshan 6 years ago
parent
commit
e472d2e0c8

+ 19 - 13
src/jy/extract/extract.go

@@ -29,7 +29,7 @@ var (
 	ClearTaskList map[string]*ClearTask                  //清理任务列表
 	saveLimit     = 200                                  //抽取日志批量保存
 	PageSize      = 5000                                 //查询分页
-	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1}`
+	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -51,7 +51,9 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ext.InitClearFn()
 	if ext.IsExtractCity { //版本上控制是否开始城市抽取
 		//初始化城市DFA信息
-		ext.InitDFA()
+		ext.InitCityDFA()
+		ext.InitAreaCode()
+		ext.InitPostCode()
 	}
 	//质量审核
 	ext.InitAuditFields()
@@ -124,7 +126,9 @@ func StartExtractTaskId(taskId string) bool {
 	ext.InitClearFn()
 	if ext.IsExtractCity { //版本上控制是否开始城市抽取
 		//初始化城市DFA信息
-		ext.InitDFA()
+		ext.InitCityDFA()
+		ext.InitAreaCode()
+		ext.InitPostCode()
 	}
 	//质量审核
 	ext.InitAuditFields()
@@ -181,7 +185,7 @@ func RunExtractTask(taskId string) {
 				continue
 			}
 			_id := qu.BsonIdToSId(v["_id"])
-			log.Debug(_id)
+			//log.Debug(_id)
 			if !ext.IsRun {
 				break
 			}
@@ -253,6 +257,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 		Data:      &doc,
 		City:      qu.ObjToString(doc["city"]),
 		Province:  qu.ObjToString(doc["area"]),
+		Jsondata:  qu.ObjToMap(doc["jsondata"]),
 		Result:    map[string][]*ju.ExtField{},
 		BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
 		RuleBlock: e.RuleBlock,
@@ -267,6 +272,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 			Data:       &doc,
 			City:       qu.ObjToString(doc["city"]),
 			Province:   qu.ObjToString(doc["area"]),
+			Jsondata:   qu.ObjToMap(doc["jsondata"]),
 			Result:     map[string][]*ju.ExtField{},
 			BuyerAddr:  qu.ObjToString(doc["buyeraddr"]),
 			RuleBlock:  e.RuleBlock,
@@ -1118,19 +1124,19 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				tmp[k] = v
 			}
 		}
-
 		//质量审核
 		if ok, _ := ju.Config["qualityaudit"].(bool); ok {
 			e.QualityAudit(tmp)
 		}
 		if e.IsExtractCity { //城市抽取
-			b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
-			// log.Debug("省份---", p, "城市---", c, "区---", d)
-			tmp["district"] = d
-			if b {
-				tmp["city"] = c
-				tmp["area"] = p
-			}
+			e.ExtractCity(j, tmp, _id)
+			//			b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
+			//			// log.Debug("省份---", p, "城市---", c, "区---", d)
+			//			tmp["district"] = d
+			//			if b {
+			//				tmp["city"] = c
+			//				tmp["area"] = p
+			//			}
 		}
 		//品牌抽取
 		if ju.IsBrandGoods {
@@ -1194,7 +1200,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 
 //去重冗余字段
 func delFiled(k string) bool {
-	return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo"
+	return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
 func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string, map[string][]*ju.SortObject) {

+ 272 - 112
src/jy/extract/extractInit.go

@@ -83,17 +83,25 @@ type ExtractTask struct {
 	CidRuleMap    map[string][]map[string]interface{} //规则
 	AuditFields   []string                            //需要审核的字段名称
 
-	ProvinceMap       map[string]string
-	CityBrief         map[string]*City     //市简称(只加载一次即可)
-	ProvinceBrief     map[string]*Province //省简称(只加载一次)
-	AreaToCity        map[string][]*City   //市,省全称简称(两个文件共用)
-	DistrictCityMap   map[string]*City
-	StreetDistrictMap map[string]*District
-	AreaGet           *ju.DFA //市全称
-	AreaDistrict      *ju.DFA //区或县
-	AreaProvinceGet   *ju.DFA //省
-	AreaSimGet        *ju.DFA //市简称
-	AreaStreet        *ju.DFA //街道
+	ProvinceMap       map[string]string    //省全称简称(key:浙江省 val:浙江)
+	ProvinceBriefMap  map[string]*Province //省简称对应的省信息(key:浙江 val:&Province{})
+	CityMap           map[string]string    //市全称简称(key:杭州市 val:杭州)
+	CityBriefMap      map[string]*City     //市简称对应的市信息(key:杭州 val:&City{})
+	CityFullMap       map[string]*City     //市全称对应的市信息(key:杭州市 val:&City{})
+	DistrictCityMap   map[string]*City     //区或县对应的city
+	DistrictSimAndAll map[string]string    //区或县(key:简称 val:全称)
+	StreetDistrictMap map[string]*District //街道对应的区或县
+
+	ProvinceAllGet *ju.DFA //省全称
+	ProvinceSimGet *ju.DFA //省简称
+	CityAllGet     *ju.DFA //市全称
+	CitySimGet     *ju.DFA //市简称
+	DistrictAllGet *ju.DFA //区或县全称
+	DistrictSimGet *ju.DFA //区或县简称
+	StreetGet      *ju.DFA //街道
+
+	PostCodeMap map[string]*PostCode //邮编
+	AreaCodeMap map[string]*AreaCode //区号
 
 	InfoType []map[string]interface{}
 }
@@ -339,8 +347,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
-							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
+						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -377,8 +385,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
-							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
+						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -422,8 +430,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
-							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
-							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
+						tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+						tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -631,24 +639,33 @@ func InitCityAll(version string) map[string]map[string]interface{} {
 }
 
 //初始化城市省份敏感词
-func (e *ExtractTask) InitDFA() {
+func (e *ExtractTask) InitCityDFA() {
 	defer qu.Catch()
-	e.AreaGet = &ju.DFA{}
-	e.AreaDistrict = &ju.DFA{}
-	e.AreaProvinceGet = &ju.DFA{}
-	e.AreaStreet = &ju.DFA{}
+	e.CityAllGet = &ju.DFA{}
+	e.CitySimGet = &ju.DFA{}
+	e.DistrictAllGet = &ju.DFA{}
+	e.DistrictSimGet = &ju.DFA{}
+	e.ProvinceAllGet = &ju.DFA{}
+	e.ProvinceSimGet = &ju.DFA{}
+	e.StreetGet = &ju.DFA{}
 	//初始化map
 	if e.ProvinceMap == nil {
 		e.ProvinceMap = make(map[string]string)
 	}
-	if e.CityBrief == nil {
-		e.CityBrief = make(map[string]*City)
+	if e.CityMap == nil {
+		e.CityMap = make(map[string]string)
+	}
+	if e.DistrictSimAndAll == nil {
+		e.DistrictSimAndAll = make(map[string]string)
 	}
-	if e.ProvinceBrief == nil {
-		e.ProvinceBrief = make(map[string]*Province)
+	if e.CityBriefMap == nil {
+		e.CityBriefMap = make(map[string]*City)
 	}
-	if e.AreaToCity == nil {
-		e.AreaToCity = make(map[string][]*City)
+	if e.CityFullMap == nil {
+		e.CityFullMap = make(map[string]*City)
+	}
+	if e.ProvinceBriefMap == nil {
+		e.ProvinceBriefMap = make(map[string]*Province)
 	}
 	if e.DistrictCityMap == nil {
 		e.DistrictCityMap = make(map[string]*City)
@@ -661,60 +678,57 @@ func (e *ExtractTask) InitDFA() {
 	for k, v := range fn1 {
 		for _, p := range v.([]interface{}) {
 			p1, _ := p.(string)
-			e.AreaProvinceGet.AddWord(p1)
-			e.ProvinceMap[p1] = k
+			e.ProvinceAllGet.AddWord(p1) //华中科技大学
+			e.ProvinceMap[p1] = k        //华中科技大学:湖北
 		}
 	}
 
 	//初始化城市全称
 	fn2 := InitCityAll(e.TaskInfo.Version)
 	for k, v := range fn2 {
-		e.AreaProvinceGet.AddWord(k) //省全称
+		//加载省信息
+		e.ProvinceAllGet.AddWord(k) //加入省全称dfa(k:浙江省)
 		p := &Province{}
-		p.Name = k
-		p.Brief = v["brief"].(string)
-		e.ProvinceMap[k] = p.Brief
-		//
-		e.ProvinceBrief[p.Brief] = p
-		p.Cap = v["captial"].(string)
+		p.Name = k                        //省全称:浙江省
+		p.Brief = v["brief"].(string)     //省简称:浙江
+		e.ProvinceSimGet.AddWord(p.Brief) //加入省简称dfa(k:浙江)
+		e.ProvinceMap[k] = p.Brief        //浙江省:浙江
+		e.ProvinceBriefMap[p.Brief] = p   //浙江:省信息{}
+		p.Cap = v["captial"].(string)     //省会(杭州)
+
+		//加载市信息
 		city, _ := v["city"].(map[string]interface{})
 		for k1, v1 := range city {
+			e.CityAllGet.AddWord(k1) //加入市全称dfa(k:杭州市)
 			v1m, _ := v1.(map[string]interface{})
 			c := &City{}
-			c.Name = k1
-			//			if v1m["brief"] == nil {
-			//			}
-			c.Brief = v1m["brief"].(string)
-			//
-			e.CityBrief[c.Brief] = c
+			c.Name = k1                     //市全称:杭州市
+			c.Brief = v1m["brief"].(string) //市简称:杭州
+			e.CitySimGet.AddWord(c.Brief)   //加入市简称dfa(k:杭州)
+			e.CityMap[k1] = c.Brief         //杭州市:杭州
+			e.CityBriefMap[c.Brief] = c     //杭州:市信息{}
+			e.CityFullMap[k1] = c           //杭州市:市信息{}
 			c.P = p
-			if c.Brief == p.Cap {
-				p.Captial = c
+			if c.Name == p.Cap {
+				p.Captial = c //加载province中的省会市信息{}
 			}
-			//加入到城市map中
-			//
-			cs := e.AreaToCity[k1]
-			e.AreaGet.AddWord(k1) //市全称
-			if cs != nil {
-				cs = append(cs, c)
-			} else {
-				cs = []*City{c}
-			}
-			e.AreaToCity[k1] = cs
+
 			//区县
 			districtmap := v1m["area"].(map[string]interface{}) //区或县
 			for district, streetarr := range districtmap {
 				d := &District{}
 				d.Name = district
 				d.C = c
-				e.AreaDistrict.AddWord(district) //加入区或县敏感词
+				//省直辖市,河南济源市没有区一级,目前区一级写的还是济源市
+				//匹配时,如果匹配到区,拿区和市比对,相同则代表是省直辖市,不要区一级?
+				e.DistrictAllGet.AddWord(district) //加入区或县全称dfa
 				ctmp := e.DistrictCityMap[district]
 				if ctmp == nil {
 					e.DistrictCityMap[district] = c
 				}
 				//街道
 				for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
-					e.AreaStreet.AddWord(s) //加入街道敏感词
+					e.StreetGet.AddWord(s) //加入街道敏感词
 					dtmp := e.StreetDistrictMap[s]
 					if dtmp == nil {
 						e.StreetDistrictMap[s] = d
@@ -725,70 +739,216 @@ func (e *ExtractTask) InitDFA() {
 	}
 	//初始化城市简称
 	fn3 := InitCitySim(e.TaskInfo.Version)
-	e.AreaSimGet = &ju.DFA{}
-	for k, v := range fn3 {
-		pb := v["brief"].(string)
-		p := e.ProvinceBrief[pb]
-		//加载
-		for _, ss := range []string{k, pb} { //省全称和省简称
-			cs := e.AreaToCity[ss]
-			if cs != nil {
-				cs = append(cs, p.Captial)
-			} else {
-				cs = []*City{p.Captial}
-			}
-			e.AreaToCity[ss] = cs
-			e.AreaSimGet.AddWord(ss)
-		}
+	for _, v := range fn3 {
 		city, _ := v["city"].(map[string]interface{})
-		for k1, v1 := range city {
+		for _, v1 := range city {
 			v1m, _ := v1.(map[string]interface{})
-			if v1m["brief"] == nil {
-			}
-			cb := v1m["brief"].(string)
-			c := e.AreaToCity[k1][0]
-			//加入到城市map中
-			for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州  浙江杭州
-				e.AreaSimGet.AddWord(ss)
-				cs := e.AreaToCity[ss]
-				if cs != nil {
-					cs = append(cs, c)
-				} else {
-					cs = []*City{c}
+			cb := v1m["brief"].(string)                 //市简称
+			arr := v1m["area"].(map[string]interface{}) //区或县简称
+			for districtsim, districtall := range arr {
+				e.DistrictSimAndAll[districtsim] = districtall.(string)
+				d := &District{}
+				d.Name = districtsim
+				d.C = e.CityBriefMap[cb]
+				e.DistrictSimGet.AddWord(districtsim) //加入区或县简称敏感词
+				ctmp := e.DistrictCityMap[districtsim]
+				if ctmp == nil {
+					e.DistrictCityMap[districtsim] = e.CityBriefMap[cb]
 				}
-				e.AreaToCity[ss] = cs
 			}
-			arr := v1m["area"].([]interface{})
-			for _, k2 := range arr {
-				s := k2.(string)
-				for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
-					cs := e.AreaToCity[ss]
-					e.AreaSimGet.AddWord(ss)
-					if cs != nil {
-						cs = append(cs, c)
-					} else {
-						cs = []*City{c}
-					}
-					e.AreaToCity[ss] = cs
+		}
+	}
+}
 
-					//只加入简称
-					if n == 0 {
-						d := &District{}
-						d.Name = ss
-						d.C = c
-						e.AreaDistrict.AddWord(ss) //加入区或县简称敏感词
-						ctmp := e.DistrictCityMap[ss]
-						if ctmp == nil {
-							e.DistrictCityMap[ss] = c
-						}
-					}
+// InitPostCode loads every record of the "postcode" collection into
+// e.PostCodeMap, keyed by the record's "code" value. The map is created on
+// first use and otherwise reused, so repeated calls overwrite entries in place.
+//
+// The function is best-effort: a missing result set leaves the map as it is,
+// and a record whose "district" value is not an array is stored with an empty
+// district list instead of aborting the whole load.
+func (e *ExtractTask) InitPostCode() {
+	defer qu.Catch()
+	if e.PostCodeMap == nil {
+		e.PostCodeMap = make(map[string]*PostCode)
+	}
+	list, _ := db.Mgo.Find("postcode", nil, nil, nil, false, -1, -1)
+	if list == nil {
+		// Nothing came back from the query; dereferencing would panic.
+		return
+	}
+	for _, l := range *list {
+		pc := &PostCode{
+			Code: qu.ObjToString(l["code"]),
+			P:    qu.ObjToString(l["province"]),
+			C:    qu.ObjToString(l["city"]),
+		}
+		// Comma-ok assertion: an unchecked assertion here would panic on the
+		// first malformed record and silently drop every record after it.
+		if districts, ok := l["district"].([]interface{}); ok {
+			pc.D = qu.ObjArrToStringArr(districts)
+		}
+		e.PostCodeMap[pc.Code] = pc
+	}
+}
 
-				}
-			}
-		}
+// InitAreaCode loads every record of the "areacode" collection into
+// e.AreaCodeMap, keyed by the record's "code" value. The map is created on
+// first use and otherwise reused.
+//
+// The function is best-effort: a missing result set leaves the map as it is,
+// and a record whose "city" value is not an array is stored with an empty
+// city list instead of aborting the whole load.
+func (e *ExtractTask) InitAreaCode() {
+	defer qu.Catch()
+	if e.AreaCodeMap == nil {
+		e.AreaCodeMap = make(map[string]*AreaCode)
+	}
+	list, _ := db.Mgo.Find("areacode", nil, nil, nil, false, -1, -1)
+	if list == nil {
+		// Nothing came back from the query; dereferencing would panic.
+		return
+	}
+	for _, l := range *list {
+		ac := &AreaCode{
+			Code: qu.ObjToString(l["code"]),
+			P:    qu.ObjToString(l["province"]),
+		}
+		// Comma-ok assertion: an unchecked assertion here would panic on the
+		// first malformed record and silently drop every record after it.
+		if cities, ok := l["city"].([]interface{}); ok {
+			ac.C = qu.ObjArrToStringArr(cities)
+		}
+		e.AreaCodeMap[ac.Code] = ac
 	}
 }
 
+//初始化城市省份敏感词
+//func (e *ExtractTask) InitCityDFA() {
+//	defer qu.Catch()
+//	e.CityAllGet = &ju.DFA{}
+//	e.DistrictGet = &ju.DFA{}
+//	e.AreaProvinceGet = &ju.DFA{}
+//	e.StreetGet = &ju.DFA{}
+//	//初始化map
+//	if e.ProvinceMap == nil {
+//		e.ProvinceMap = make(map[string]string)
+//	}
+//	if e.CityBriefMap == nil {
+//		e.CityBriefMap = make(map[string]*City)
+//	}
+//	if e.ProvinceBriefMap == nil {
+//		e.ProvinceBriefMap = make(map[string]*Province)
+//	}
+//	if e.AreaToCityMap == nil {
+//		e.AreaToCityMap = make(map[string][]*City)
+//	}
+//	if e.DistrictCityMap == nil {
+//		e.DistrictCityMap = make(map[string]*City)
+//	}
+//	if e.StreetDistrictMap == nil {
+//		e.StreetDistrictMap = make(map[string]*District)
+//	}
+//	//初始化省
+//	fn1 := InitProvince(e.TaskInfo.Version)
+//	for k, v := range fn1 {
+//		for _, p := range v.([]interface{}) {
+//			p1, _ := p.(string)
+//			e.AreaProvinceGet.AddWord(p1) //华中科技大学
+//			e.ProvinceMap[p1] = k         //华中科技大学:湖北
+//		}
+//	}
+
+//	//初始化城市全称
+//	fn2 := InitCityAll(e.TaskInfo.Version)
+//	for k, v := range fn2 {
+//		e.AreaProvinceGet.AddWord(k) //加入省全称dfa(k:浙江省)
+//		p := &Province{}
+//		p.Name = k                      //省全称
+//		p.Brief = v["brief"].(string)   //省简称
+//		e.ProvinceMap[k] = p.Brief      //浙江省:浙江
+//		e.ProvinceBriefMap[p.Brief] = p //浙江:省信息
+//		p.Cap = v["captial"].(string)   //省会(杭州)
+//		city, _ := v["city"].(map[string]interface{})
+//		//
+//		for k1, v1 := range city {
+//			v1m, _ := v1.(map[string]interface{})
+//			c := &City{}
+//			c.Name = k1
+//			c.Brief = v1m["brief"].(string)
+//			e.CityBriefMap[c.Brief] = c
+//			c.P = p
+//			if c.Brief == p.Cap {
+//				p.Captial = c
+//			}
+//			//加入到城市map中
+//			//
+//			cs := e.AreaToCityMap[k1]
+//			e.CityAllGet.AddWord(k1) //市全称
+//			if cs != nil {
+//				cs = append(cs, c)
+//			} else {
+//				cs = []*City{c}
+//			}
+//			e.AreaToCityMap[k1] = cs
+//			//区县
+//			districtmap := v1m["area"].(map[string]interface{}) //区或县
+//			for district, streetarr := range districtmap {
+//				d := &District{}
+//				d.Name = district
+//				d.C = c
+//				e.DistrictGet.AddWord(district) //加入区或县敏感词
+//				ctmp := e.DistrictCityMap[district]
+//				if ctmp == nil {
+//					e.DistrictCityMap[district] = c
+//				}
+//				//街道
+//				for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
+//					e.StreetGet.AddWord(s) //加入街道敏感词
+//					dtmp := e.StreetDistrictMap[s]
+//					if dtmp == nil {
+//						e.StreetDistrictMap[s] = d
+//					}
+//				}
+//			}
+//		}
+//	}
+//	//初始化城市简称
+//	fn3 := InitCitySim(e.TaskInfo.Version)
+//	e.CitySimGet = &ju.DFA{}
+//	for k, v := range fn3 {
+//		pb := v["brief"].(string)
+//		p := e.ProvinceBriefMap[pb]
+//		//加载
+//		for _, ss := range []string{k, pb} { //省全称和省简称
+//			cs := e.AreaToCityMap[ss]
+//			if cs != nil {
+//				cs = append(cs, p.Captial)
+//			} else {
+//				cs = []*City{p.Captial}
+//			}
+//			e.AreaToCityMap[ss] = cs
+//			e.CitySimGet.AddWord(ss)
+//		}
+//		city, _ := v["city"].(map[string]interface{})
+//		for k1, v1 := range city {
+//			v1m, _ := v1.(map[string]interface{})
+//			if v1m["brief"] == nil {
+//			}
+//			cb := v1m["brief"].(string)
+//			c := e.AreaToCityMap[k1][0]
+//			//加入到城市map中
+//			for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州  浙江杭州
+//				e.CitySimGet.AddWord(ss)
+//				cs := e.AreaToCityMap[ss]
+//				if cs != nil {
+//					cs = append(cs, c)
+//				} else {
+//					cs = []*City{c}
+//				}
+//				e.AreaToCityMap[ss] = cs
+//			}
+//			arr := v1m["area"].([]interface{})
+//			for _, k2 := range arr {
+//				s := k2.(string)
+//				for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
+//					cs := e.AreaToCityMap[ss]
+//					e.CitySimGet.AddWord(ss)
+//					if cs != nil {
+//						cs = append(cs, c)
+//					} else {
+//						cs = []*City{c}
+//					}
+//					e.AreaToCityMap[ss] = cs
+
+//					//只加入简称
+//					if n == 0 {
+//						d := &District{}
+//						d.Name = ss
+//						d.C = c
+//						e.DistrictGet.AddWord(ss) //加入区或县简称敏感词
+//						ctmp := e.DistrictCityMap[ss]
+//						if ctmp == nil {
+//							e.DistrictCityMap[ss] = c
+//						}
+//					}
+
+//				}
+//			}
+//		}
+//	}
+//}
+
 //保存抽取详情数据
 func (e *ExtractTask) ResultSave(init bool) {
 	defer qu.Catch()

+ 767 - 199
src/jy/extract/extractcity.go

@@ -1,9 +1,8 @@
 package extract
 
 import (
-	"fmt"
+	. "jy/pretreated"
 	ju "jy/util"
-	"log"
 	qu "qfw/util"
 	"strings"
 )
@@ -35,245 +34,814 @@ type Street struct {
 	D    *District
 }
 
+// PostCode is one entry of the postal-code lookup table stored in
+// ExtractTask.PostCodeMap.
+type PostCode struct {
+	Code string   // postal code; InitPostCode uses it as the map key
+	P    string   // province, read from the record's "province" field
+	C    string   // city, read from the record's "city" field
+	D    []string // districts, read from the record's "district" array
+}
+
+// AreaCode is one entry of the telephone area-code lookup table stored in
+// ExtractTask.AreaCodeMap.
+type AreaCode struct {
+	Code string   // area code; InitAreaCode uses it as the map key
+	P    string   // province, read from the record's "province" field
+	C    []string // cities, read from the record's "city" array
+}
+
 var SortField []string
 
 func init() {
 	qu.ReadConfig("./extractcity.json", &SortField)
 }
-func (e *ExtractTask) TransmitData(resulttmp map[string]interface{}, id string) (bres bool, p, c, d string) {
+
+// ExtractCity determines province / city / district for one document and
+// writes the outcome into resulttmp.
+//
+// It runs the high-precision sources in a fixed order (jsondata, buyer,
+// postal code, area code), then the low-precision text matching over
+// buyeraddr / buyer / title / projectname. Each source adds to the score maps
+// on j (AreaScore, CityScore, DistrictScore), and the final answer is chosen
+// from the top-scoring entries.
+//
+// Keys written to resulttmp:
+//   - "highprecity", "lowprecity": per-source results, kept for inspection
+//   - "area1", "city1", "district1": the final result
+//
+// The id parameter is not used by this function body.
+func (e *ExtractTask) ExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
+	/*
+		High-precision sources:
+			1. crawler-supplied jsondata
+			2. buyer (purchasing unit) library
+			3. postal code
+			4. landline area code
+		Low-precision sources (the abbreviation library is skipped once the full-name library matched):
+			1. city full-name library (buyeraddr; title, projectname)
+			2. city abbreviation library (buyeraddr; title, projectname)
+	*/
+	defer qu.Catch()
+	// Make sure the score maps exist before any source writes to them.
+	if j.AreaScore == nil {
+		j.AreaScore = make(map[string]int)
+	}
+	if j.CityScore == nil {
+		j.CityScore = make(map[string]int)
+	}
+	if j.DistrictScore == nil {
+		j.DistrictScore = make(map[string]int)
+	}
+	sm := NewSortMap()
+	// High-precision extraction.
+	// Per-source results; slots 0-3 are filled by index below.
+	area1 := make([]map[string]string, 4)
+	city1 := make([]map[string]string, 4)
+	district1 := make([]map[string]string, 4)
+
+	// Source 1: jsondata.
+	p0, c0, d0, p, c, d := e.GetCityByJsonData(j)
+	// The slices already have length 4, so these appends land at index 4,
+	// after the four per-source slots.
+	area1 = append(area1, map[string]string{"a_c_d": p})
+	city1 = append(city1, map[string]string{"a_c_d": c})
+	district1 = append(district1, map[string]string{"a_c_d": d})
+	area1[0] = map[string]string{"jsondata": p0}
+	city1[0] = map[string]string{"jsondata": c0}
+	district1[0] = map[string]string{"jsondata": d0}
+	//qu.Debug("=====jsondata打分---", j.AreaScore, j.CityScore, j.DistrictScore)
+	// Source 2: buyer library.
+	buyer, _ := resulttmp["buyer"].(string)
+	p1, c1, d1 := e.GetCityByBuyer(j, buyer)
+	//qu.Debug("buyer	p--", p1, "c--", c1, "d--", d1)
+	area1[1] = map[string]string{"buyer": p1}
+	city1[1] = map[string]string{"buyer": c1}
+	district1[1] = map[string]string{"buyer": d1}
+	//qu.Debug("=====采购单位库打分---", j.AreaScore, j.CityScore, j.DistrictScore)
+	// Source 3: postal code.
+	buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
+	p2, c2, d2 := e.GetCityByPostCode(j, buyerzipcode)
+	//qu.Debug("postcode	p--", p2, "c--", c2, "d--", d2)
+	area1[2] = map[string]string{"postcode": p2}
+	city1[2] = map[string]string{"postcode": c2}
+	district1[2] = map[string]string{"postcode": d2}
+	//qu.Debug("=====postcode邮编打分---", j.AreaScore, j.CityScore, j.DistrictScore)
+	// Source 4: landline area code taken from the buyer's phone number.
+	buyertel, _ := resulttmp["buyertel"].(string)
+	p3, c3, d3 := e.GetCityByAreaCode(j, buyertel)
+	//qu.Debug("areacode	p--", p3, "c--", c3, "d--", d3, buyertel)
+	area1[3] = map[string]string{"areacode": p3}
+	city1[3] = map[string]string{"areacode": c3}
+	district1[3] = map[string]string{"areacode": d3}
+	//qu.Debug("=====areacode固话区号打分---", j.AreaScore, j.CityScore, j.DistrictScore)
+	HighPreCity := make(map[string]interface{})
+	HighPreCity["area"] = area1
+	HighPreCity["city"] = city1
+	HighPreCity["district"] = district1
+	// Low-precision extraction from free-text fields:
+	// buyeraddr, buyer, title, projectname (in this order).
+	buyeraddr, _ := resulttmp["buyeraddr"].(string)
+	title, _ := resulttmp["title"].(string)
+	projectname, _ := resulttmp["projectname"].(string)
+	//qu.Debug(buyeraddr, "--", buyer, "--", title, "--", projectname)
+	sm.AddKey("buyeraddr", buyeraddr)
+	sm.AddKey("buyer", buyer)
+	sm.AddKey("title", title)
+	sm.AddKey("projectname", projectname)
+	area2, city2, district2 := e.GetCityByOthers(j, sm)
+	LowPreCity := make(map[string]interface{})
+	LowPreCity["area"] = area2
+	LowPreCity["city"] = city2
+	LowPreCity["district"] = district2
+	resulttmp["highprecity"] = HighPreCity
+	resulttmp["lowprecity"] = LowPreCity
+	//qu.Debug("最终打分---", j.AreaScore, j.CityScore, j.DistrictScore)
+	// Final selection.
+	// NOTE(review): HighestScoreArr is defined elsewhere; presumably it
+	// returns every key tied for the highest score - confirm.
+	finishP := HighestScoreArr(j.AreaScore)
+	finishC := HighestScoreArr(j.CityScore)
+	finishD := HighestScoreArr(j.DistrictScore)
+
+	//	area, _ := resulttmp["area"].(string)
+	//	city, _ := resulttmp["city"].(string)
+	//	district, _ := resulttmp["district"].(string)
+	//  qu.Debug("之前结果结果===", area, city, district)
+	arearesult := ""
+	cityresult := ""
+	districtresult := ""
+
+	if len(finishP) == 1 { // exactly one top-scoring province
+		arearesult = finishP[0] // take it as the province
+		cityresult = GetCity(arearesult, cityresult, e, finishC)
+		districtresult = GetDistrict(arearesult, districtresult, e, finishD)
+	} else if len(finishP) > 1 { // several provinces share the top score
+		if len(finishC) == 1 {
+			// A single top city decides: derive the province from it.
+			cityresult = finishC[0]
+			if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
+				arearesult = cfMap.P.Brief
+				districtresult = GetDistrict(arearesult, districtresult, e, finishD)
+			}
+		} else { // zero or several top cities: fall back to the first province
+			arearesult = finishP[0] // take it as the province
+			cityresult = GetCity(arearesult, cityresult, e, finishC)
+			districtresult = GetDistrict(arearesult, districtresult, e, finishD)
+		}
+	}
+	//qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
+	if arearesult == "" {
+		// No province found at all: use the nationwide marker.
+		arearesult = "全国"
+	} else if cityresult == "" {
+		// Province known but no city: use the province's Cap value (its capital).
+		if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
+			cityresult = pbMap.Cap
+		}
+	}
+
+	//qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
+	resulttmp["area1"] = arearesult
+	resulttmp["city1"] = cityresult
+	resulttmp["district1"] = districtresult
+}
+// GetCityByJsonData derives location candidates from the crawler-supplied
+// jsondata attached to j.
+//
+// Returns:
+//   - province, city, district: taken from the jsondata keys "area", "city"
+//     and "district", then validated against e.ProvinceBriefMap,
+//     e.CityFullMap and e.CityBriefMap. A city given by its short name is
+//     replaced by its full name; a city that contradicts a valid province is
+//     cleared; when the province is missing or unknown it is taken from the
+//     city, and if that also fails both are cleared.
+//   - p, c, d: the result of matching the "area_city_district" value through
+//     GetPCDByAreaDFA, GetPCDByCityDFA and GetPCDByDistrictDFA, in that order.
+//
+// Accepted values are added to j's score maps with weight 5 via PCDScore.
+// A nil j.Jsondata is treated the same as an empty one.
+func (e *ExtractTask) GetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
 	defer qu.Catch()
-	province := fmt.Sprint(resulttmp["area"])
-	city := fmt.Sprint(resulttmp["city"])
-	fieldval := make([]string, 0)
-	for _, f := range SortField { //
-		val := resulttmp[f]
-		if val == nil {
-			fieldval = append(fieldval, "")
+	// Check the pointer as well as the map: documents without jsondata would
+	// otherwise panic on the dereference.
+	if j.Jsondata != nil && *j.Jsondata != nil { // read province and city from jsondata
+		jsondata := *j.Jsondata
+		if acd, ok := jsondata["area_city_district"].(string); ok && acd != "" {
+			flag := false
+			p, flag = GetPCDByAreaDFA(p, acd, e, j, flag)
+			if !flag {
+				p, c, flag = GetPCDByCityDFA(p, c, acd, e, j, flag)
+			}
+			if !flag {
+				// Results go to p, c, d. Assigning them to p, city, c put the
+				// district into c and lost the city, because city is
+				// overwritten just below.
+				// NOTE(review): assumes GetPCDByDistrictDFA returns
+				// (province, city, district) in that order - confirm.
+				p, c, d = GetPCDByDistrictDFA(p, c, d, acd, e, j)
+			}
+		}
+
+		city, _ = jsondata["city"].(string)         // city, full or short name
+		province, _ = jsondata["area"].(string)     // province short name
+		district, _ = jsondata["district"].(string) // district full name
+	}
+	PCDScore(j, "district", district, 5) // score the district (no-op when empty)
+	bp := false
+	if province != "" {
+		if e.ProvinceBriefMap[province] != nil { // is the crawler's province a known one? (e.g. not "全国")
+			bp = true // province is valid
+		}
+	}
+	pbrief := ""
+	if city != "" {
+		cityfullmap := e.CityFullMap[city] // is it a known full city name?
+		if cityfullmap != nil {
+			pbrief = cityfullmap.P.Brief // province short name of that city
+		} else {
+			citybriefmap := e.CityBriefMap[city] // is it a known short city name?
+			if citybriefmap != nil {
+				city = citybriefmap.Name // replace the short name with the full name
+				pbrief = citybriefmap.P.Brief
+			}
+		}
+	}
+	if bp {
+		if pbrief == province { // crawler's province and city agree
+			PCDScore(j, "city", city, 5)
+		} else { // they disagree: city is empty or wrong, so drop it
+			city = ""
+		}
+		PCDScore(j, "province", province, 5)
+	} else { // province missing or unknown: take it from the city instead
+		if pbrief != "" {
+			province = pbrief
+			PCDScore(j, "province", province, 5)
+			PCDScore(j, "city", city, 5)
 		} else {
-			fieldval = append(fieldval, fmt.Sprint(val))
+			province = ""
+			city = ""
 		}
 	}
-	//qu.Debug("fieldval========", fieldval)
-	bres, c, p = e.ExtractProvinceCity(province, city, id, fieldval) //抽取省和市
-	//qu.Debug("b--------", bres, "p---------", p, "c-------------", c)
-	bres, p, c, d = e.ExtractDistrict(fieldval, bres, c, p, id) //抽取区或县
-	//qu.Debug("bres========", bres, "p===========", p, "c=========", c, "d=============", d)
 	return
-}
 
-//抽取区或县(从配置的字段信息中抽取区或县)
-func (e *ExtractTask) ExtractDistrict(field []string, bres bool, c, p, id string) (bool, string, string, string) {
-	d := ""
-	for _, str := range field {
-		for pos, GET := range []*ju.DFA{e.AreaDistrict, e.AreaStreet} { //先匹配区或县再匹配街道
+}
+// GetCityByBuyer is the hook for resolving province / city / district from the
+// buyer name. It is currently a stub: it ignores both arguments, adds nothing
+// to j's scores and always returns three empty strings.
+func (e *ExtractTask) GetCityByBuyer(j *ju.Job, buyer string) (province, city, district string) {
+	defer qu.Catch()
+	return
+}
+// GetCityByPostCode looks postcode up in e.PostCodeMap. On a hit it returns
+// the entry's province and city and adds 5 points to each on j. A district is
+// returned and scored only when the entry lists exactly one, since an
+// ambiguous district list is ignored. An unknown postcode yields three empty
+// strings and leaves the scores untouched.
+func (e *ExtractTask) GetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
+	defer qu.Catch()
+	entry := e.PostCodeMap[postcode]
+	if entry == nil {
+		return
+	}
+	province, city = entry.P, entry.C
+	if len(entry.D) == 1 {
+		district = entry.D[0]
+		PCDScore(j, "district", district, 5)
+	}
+	PCDScore(j, "province", province, 5)
+	PCDScore(j, "city", city, 5)
+	return
+}
+// GetCityByAreaCode infers province and city from the leading area code of a
+// phone number. Numbers shorter than 11 bytes are ignored.
+//
+// For numbers starting with "0" the 4-character prefix is looked up in
+// e.AreaCodeMap first, then the 3-character prefix; the first hit wins. The
+// province is scored with 5 points; the city is returned and scored only when
+// the area code maps to exactly one city. Numbers starting with "853" are
+// treated as Macao. district is never set by this function.
+func (e *ExtractTask) GetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
+	defer qu.Catch()
+	if len(buyertel) < 11 {
+		return
+	}
+	// Every area code except Macao's 853 starts with 0.
+	if !strings.HasPrefix(buyertel, "0") {
+		if buyertel[:3] == "853" { // Macao
+			province, city = "澳门", "澳门"
+			PCDScore(j, "province", province, 5)
+			PCDScore(j, "city", city, 5)
+		}
+		return
+	}
+	// Try the longer prefix first, then fall back to the shorter one.
+	for size := 4; size >= 3; size-- {
+		ac := e.AreaCodeMap[buyertel[:size]]
+		if ac == nil {
+			continue
+		}
+		province = ac.P
+		if len(ac.C) == 1 { // an area code shared by several cities is ambiguous
+			city = ac.C[0]
+			PCDScore(j, "city", city, 5)
+		}
+		PCDScore(j, "province", province, 5)
+		break
+	}
+	return
+}
+func (e *ExtractTask) GetCityByOthers(j *ju.Job, sm *SortMap) ([]map[string]string, []map[string]string, []map[string]string) {
+	//存储每个流程的抽取结果
+	area2 := []map[string]string{}
+	city2 := []map[string]string{}
+	district2 := []map[string]string{}
+	isExtP := false
+	isExtC := false
+	for _, from := range sm.Keys { //buyeraddr;title;projectname
+		str, _ := sm.Map[from].(string)
+		//分别记录buyeraddr;title;projectname全称匹配的打分情况
+		pscore1 := make(map[string]int)
+		cscore1 := make(map[string]int)
+		dscore1 := make(map[string]int)
+		//优先province,city,district,street全称匹配
+		for pos, GET := range []*ju.DFA{e.ProvinceAllGet, e.CityAllGet, e.DistrictAllGet, e.StreetGet} {
 			word := GET.CheckSensitiveWord(str)
 			if word != "" {
-				if pos == 0 { //区或县匹配
-					//log.Println("县直接匹配到====", word)
-					lock.Lock()
-					city := e.DistrictCityMap[word]
-					lock.Unlock()
-					//log.Println("city================", city)
-					if city != nil {
-						d = word
-						ctmp := city.Brief
-						ptmp := city.P.Brief
-						//log.Println("ctmpptmp================", ptmp, ctmp, bres)
-						if !bres { //城市省份没有抽到,通过区或县定位市和省
-							c = ctmp
-							p = ptmp
-							bres = true
-						} else { //对比抽到的城市省份是否一致
-							if c != ctmp || p != ptmp {
-								//log.Println("str---", str, "====", word)
-								c = ctmp
-								p = ptmp
-							}
-						}
+				if pos == 0 { //province
+					pbrief := e.ProvinceMap[word] //取province简称
+					OtherScore("p", []string{pbrief}, &pscore1, &cscore1, &dscore1)
+				} else if pos == 1 { //city
+					p := ""
+					cityfullmap := e.CityFullMap[word]
+					if cityfullmap != nil {
+						p = cityfullmap.P.Brief //取province简称
+					}
+					OtherScore("c", []string{p, word}, &pscore1, &cscore1, &dscore1)
+				} else if pos == 2 { //district
+					p, c := "", ""
+					dcitymap := e.DistrictCityMap[word] //区对应的city
+					if dcitymap != nil {
+						c = dcitymap.Name    //city全称
+						p = dcitymap.P.Brief //province简称
+					}
+					tmpArr := []string{p, c, word}
+					if word == c { //河南济源市
+						tmpArr = []string{p, c}
 					}
-				} else { //街道匹配
-					//log.Println("匹配到街道====", word)
-					lock.Lock()
-					district := e.StreetDistrictMap[word]
-					lock.Unlock()
-					//log.Println("district================", district)
-					if district != nil {
-						d = district.Name
-						ctmp := district.C.Brief
-						ptmp := district.C.P.Brief
-						//log.Println("districtptmp================", ctmp, ptmp)
-						if !bres { //城市省份没有抽到,通过区或县定位市和省
-							c = ctmp
-							p = ptmp
-							bres = true
-						} else { //对比抽到的城市省份是否一致
-							if c != ctmp || p != ptmp {
-								c = ctmp
-								p = ptmp
-							}
+					OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
+				} else if pos == 3 { //street
+					p, c, d := "", "", ""
+					sdmap := e.StreetDistrictMap[word] //对应的区
+					if sdmap != nil {
+						d = sdmap.Name
+						c = sdmap.C.Name
+						p = sdmap.C.P.Brief
+					}
+					tmpArr := []string{p, c, d}
+					if c == d { //河南济源市
+						tmpArr = []string{p, c}
+					}
+					OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
+				}
+			}
+		}
+
+		//取最高分的province,city,district
+		ph1 := HighestScore(pscore1)
+		if ph1 != "" {
+			isExtP = true
+		}
+		ch1 := HighestScore(cscore1)
+		if ch1 != "" {
+			isExtC = true
+		}
+		dh1 := HighestScore(dscore1)
+		if dh1 != "" {
+			isExtP = true
+			isExtC = true
+		}
+		area2 = append(area2, map[string]string{from + "_all": ph1})
+		city2 = append(city2, map[string]string{from + "_all": ch1})
+		district2 = append(district2, map[string]string{from + "_all": dh1})
+		//buyeraddr,title,projectname匹配对应的结果加入最终得分
+		if from == "buyeraddr" || from == "buyer" { //全称匹配,buyeraddr和buyer3分,title和projectname2分
+			PCDScore(j, "province", ph1, 3)
+			PCDScore(j, "city", ch1, 3)
+			PCDScore(j, "district", dh1, 3)
+		} else {
+			PCDScore(j, "province", ph1, 2)
+			PCDScore(j, "city", ch1, 2)
+			PCDScore(j, "district", dh1, 2)
+		}
+	}
+	//判断全称是否抽出了province和city,一个未抽出走简称抽取
+	if !isExtP || !isExtC {
+		for _, from := range sm.Keys { //buyeraddr;title;projectname
+			str, _ := sm.Map[from].(string)
+			pscore2 := make(map[string]int)
+			cscore2 := make(map[string]int)
+			dscore2 := make(map[string]int)
+			for pos, GET := range []*ju.DFA{e.ProvinceSimGet, e.CitySimGet, e.DistrictSimGet} {
+				word := GET.CheckSensitiveWord(str)
+				if word != "" {
+					if pos == 0 { //province
+						OtherScore("p", []string{word}, &pscore2, &cscore2, &dscore2)
+					} else if pos == 1 { //city
+						p, c := "", ""
+						citybriefmap := e.CityBriefMap[word]
+						if citybriefmap != nil {
+							p = citybriefmap.P.Brief
+							c = citybriefmap.Name
 						}
+						OtherScore("c", []string{p, c}, &pscore2, &cscore2, &dscore2)
+					} else if pos == 2 { //district
+						p, c := "", ""
+						d := e.DistrictSimAndAll[word]
+						dcitymap := e.DistrictCityMap[word]
+						if dcitymap != nil {
+							c = dcitymap.Name
+							p = dcitymap.P.Brief
+						}
+						OtherScore("d", []string{p, c, d}, &pscore2, &cscore2, &dscore2)
 					}
 				}
-				return bres, p, c, d
 			}
+			//取最高分的province,city,district
+			ph2 := HighestScore(pscore2)
+			ch2 := HighestScore(cscore2)
+			dh2 := HighestScore(dscore2)
+			area2 = append(area2, map[string]string{from + "_sim": ph2})
+			city2 = append(city2, map[string]string{from + "_sim": ch2})
+			district2 = append(district2, map[string]string{from + "_sim": dh2})
+			//buyeraddr,title,projectname匹配对应的结果加入最终得分
+			if from == "buyeraddr" {
+				PCDScore(j, "province", ph2, 2)
+				PCDScore(j, "city", ch2, 2)
+				PCDScore(j, "district", dh2, 2)
+			} else {
+				PCDScore(j, "province", ph2, 1)
+				PCDScore(j, "city", ch2, 1)
+				PCDScore(j, "district", dh2, 1)
+			}
+		}
+	}
+
+	return area2, city2, district2
+}
+
+// PCDScore adds score to the running total kept for text in one of the job's
+// score maps: stype "province" updates j.AreaScore, "city" updates
+// j.CityScore and "district" updates j.DistrictScore. An empty text or any
+// other stype leaves the job untouched.
+// NOTE(review): assumes the three score maps on j were initialised by the
+// caller; a write to a nil map would panic — confirm where they are created.
+func PCDScore(j *ju.Job, stype, text string, score int) {
+	defer qu.Catch()
+	if text != "" {
+		if stype == "district" {
+			scoretmp := j.DistrictScore[text]
+			j.DistrictScore[text] = scoretmp + score
+		} else if stype == "city" {
+			scoretmp := j.CityScore[text]
+			j.CityScore[text] = scoretmp + score
+		} else if stype == "province" {
+			scoretmp := j.AreaScore[text]
+			j.AreaScore[text] = scoretmp + score
 		}
 	}
-	return bres, p, c, d
 }
 
-//抽取城市、省份
-func (e *ExtractTask) ExtractProvinceCity(province, city, id string, text []string) (bres bool, c, p string) {
+// OtherScore bumps the per-level tallies for one matched place by 1. text is
+// read positionally: index 0 is counted in *ps (province), index 1 in *cs
+// (city) and index 2 in *ds (district). Empty entries and any index above 2
+// are ignored. stype is not read by the body.
+func OtherScore(stype string, text []string, ps, cs, ds *map[string]int) {
 	defer qu.Catch()
-	bc := true //是否继续抽取
-	if city != "" {
-		lock.Lock()
-		citybrief := e.CityBrief[city]
-		//log.Println("citybrief========", citybrief)
-		lock.Unlock()
-		if citybrief == nil { //简称不存在
-			log.Println("city err:", city, id)
-		} else { //简称存在
-			lock.Lock()
-			pbrief := e.CityBrief[city].P.Brief
-			//log.Println("pbrief========", pbrief)
-			lock.Unlock()
-			if province != pbrief { //省份不配对
-				log.Println("province err:", city, province, id)
+	for i, t := range text {
+		if t != "" {
+			if i == 0 { //p
+				tmpscore := (*ps)[t]
+				(*ps)[t] = tmpscore + 1
+			} else if i == 1 { //c
+				tmpscore := (*cs)[t]
+				(*cs)[t] = tmpscore + 1
+			} else if i == 2 { //d
+				tmpscore := (*ds)[t]
+				(*ds)[t] = tmpscore + 1
+			}
+		}
+
+	}
+}
+
+// HighestScore returns the non-empty key of m with the largest strictly
+// positive score, or "" when m has no such key (nil/empty map, only empty
+// keys, or only scores <= 0).
+//
+// Go randomises map iteration order, so when several keys share the top score
+// a plain "first seen wins" scan returns a different key from run to run.
+// Ties are therefore broken by taking the lexicographically smallest key,
+// which keeps the result reproducible for identical input.
+func HighestScore(m map[string]int) string {
+	result := ""
+	best := 0
+	for str, score := range m {
+		if str == "" || score <= 0 {
+			continue
+		}
+		if score > best || (score == best && str < result) {
+			result = str
+			best = score
+		}
+	}
+	return result
+}
+
+// HighestScoreArr returns every non-empty key of m that shares the highest
+// score. Keys with a negative score are never selected (the running best
+// starts at 0), and the result is nil when nothing qualifies. The order of the
+// returned keys follows map iteration and is not stable between calls.
+func HighestScoreArr(m map[string]int) []string {
+	result := make(map[int][]string)
+	tmpscore := 0
+	for str, score := range m {
+		if str != "" && tmpscore <= score {
+			// A strictly higher score supersedes the bucket of the previous best.
+			if result[tmpscore] != nil && tmpscore != score {
+				delete(result, tmpscore)
+			}
+			if r := result[score]; r != nil {
+				r = append(r, str)
+				result[score] = r
 			} else {
-				bc = false
-				//城市省份都正确
+				result[score] = []string{str}
 			}
+			tmpscore = score
 		}
 	}
-	//有省份
-	bp := false
-	lock.Lock()
-	provincebrief := e.ProvinceBrief[province]
-	//log.Println("provincebrief========", provincebrief)
-	lock.Unlock()
-	if provincebrief != nil { //省份简称正确
-		bp = true
-	} else { //没有省份,先识别省份
-		for _, str := range text { //没有省的简称,从配置的字段信息中抽取省
-			word := e.AreaProvinceGet.CheckSensitiveWord(str) //省全称DFA中匹配
-			if word != "" {
-				lock.Lock()
-				province = e.ProvinceMap[word]
-				lock.Unlock()
-				bp = true
+	return result[tmpscore]
+}
+
+// GetCity returns the first entry of finishC that is present in
+// e.CityFullMap and whose province brief (P.Brief) equals area. When no entry
+// qualifies, the city argument is returned unchanged.
+func GetCity(area, city string, e *ExtractTask, finishC []string) string {
+	for _, c := range finishC { // pick the top-scored city that belongs to the given province
+		if cfMap := e.CityFullMap[c]; cfMap != nil {
+			if cfMap.P.Brief == area {
+				city = c
 				break
 			}
 		}
 	}
-	//匹配城市
-	if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
-		for pos, GET := range []*ju.DFA{e.AreaGet, e.AreaSimGet} { //AreaGet市全称,AreaSimGet省全称和简称
-			ws := make([]string, 5)
-			for n, str := range text {
-				if str != "" {
-					word := GET.CheckSensitiveWord(str)
-					if pos == 1 { //用简称 后辍为路、集团替换
-						str1 := strings.Replace(str, word+"路", "", 1)
-						if str1 != str {
-							word = GET.CheckSensitiveWord(str1)
-						}
-					}
-					ws[n] = word
-					if word != "" {
-						lock.Lock()
-						res := e.AreaToCity[word]
-						lock.Unlock()
-						if len(res) == 1 {
-							//判断省份
-							if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回(!bp:省的简称)
-								bres = true
-								c = res[0].Brief
-								p = res[0].P.Brief
-								break
-							} else { //不一致时。。暂时不处理
-							}
-						} else { //多个时(出现这种情况是多个省中的市,市名相同。现在的配置文件中已经将市名,县名重复的全部去掉)
-						}
-					}
-				}
+	return city
+}
+
+// GetDistrict scans finishD in order and returns the first district that is
+// known to e.DistrictCityMap and whose owning province brief equals area.
+// If no candidate qualifies, the incoming district value is handed back.
+func GetDistrict(area, district string, e *ExtractTask, finishD []string) string {
+	for idx := 0; idx < len(finishD); idx++ {
+		candidate := finishD[idx]
+		owner := e.DistrictCityMap[candidate]
+		if owner == nil || owner.P.Brief != area {
+			continue
+		}
+		return candidate
+	}
+	return district
+}
+
+// GetPCDByAreaDFA looks for a province in acd with e.ProvinceSimGet. When the
+// matched word resolves in e.ProvinceBriefMap, province is replaced by that
+// entry's Brief and credited 5 points through PCDScore; flag becomes true
+// when acd is exactly the province's Brief or Name. Without a match both
+// province and flag are returned as passed in.
+// NOTE(review): acd is presumably the raw area_city_district value — confirm
+// against the caller.
+func GetPCDByAreaDFA(province, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, bool) {
+	if word := e.ProvinceSimGet.CheckSensitiveWord(acd); word != "" { // look for a province
+		if pbMap := e.ProvinceBriefMap[word]; pbMap != nil {
+			province = pbMap.Brief
+			if province == acd || pbMap.Name == acd {
+				flag = true
 			}
-			if !bres { //没有匹配到
-				mc := map[string]int{}
-				for _, w := range ws {
-					lock.Lock()
-					res := e.AreaToCity[w]
-					lock.Unlock()
-					for _, ct := range res {
-						if ct == nil {
-							continue
+			PCDScore(j, "province", province, 5)
+		}
+	}
+	return province, flag
+}
+
+// GetPCDByCityDFA looks for a city in acd. It tries e.CityAllGet (resolved
+// through e.CityFullMap) first and e.CitySimGet (resolved through
+// e.CityBriefMap) second, and stops at the first word that resolves.
+//
+// If province is already set, city is replaced only when the match belongs to
+// that province. If province is empty, both city and province are taken from
+// the match and province is credited 5 points. flag becomes true when acd is
+// exactly province+city (using either the province Brief or Name) or, in the
+// province-less case, exactly the city name.
+// Note that PCDScore for "city" runs with whatever city holds at that point,
+// so a match from a different province still credits the incoming city value
+// when that value is non-empty.
+func GetPCDByCityDFA(province, city, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, string, bool) {
+	for pos, GET := range []*ju.DFA{e.CityAllGet, e.CitySimGet} { // look for a city
+		if word := GET.CheckSensitiveWord(acd); word != "" {
+			if pos == 0 { // full name
+				if cfMap := e.CityFullMap[word]; cfMap != nil {
+					if province != "" && cfMap.P.Brief == province { // acd carries province information
+						city = cfMap.Name
+						if acd == province+city || acd == cfMap.P.Name+city {
+							flag = true
 						}
-						if bp { //有省份
-							if ct.P != nil && ct.P.Brief == province {
-								mc[ct.Brief]++
-							}
-						} else { //没有省份
-							mc[ct.Brief]++
+					} else if province == "" { // acd carries city only, or city and district
+						city = cfMap.Name
+						province = cfMap.P.Brief
+						PCDScore(j, "province", province, 5)
+						if acd == city {
+							flag = true
 						}
 					}
+					PCDScore(j, "city", city, 5)
+					break
 				}
-				//计算mc中最大值且大于1
-				max := 1
-				v := ""
-				for mk, mv := range mc {
-					if mv > max {
-						v = mk
-					}
-				}
-				if v != "" {
-					bres = true
-					lock.Lock()
-					ctb := e.CityBrief[v]
-					lock.Unlock()
-					c = ctb.Brief
-					p = ctb.P.Brief
-				} else if len(mc) > 0 {
-					//取级别更大的
-					v := ""
-					for mk, _ := range mc {
-						lock.Lock()
-						cb := e.CityBrief[mk]
-						lock.Unlock()
-						if cb.P.Cap == mk {
-							bres = true
-							c = cb.Brief
-							p = cb.P.Brief
-							break
-						} else {
-							v = mk
+			} else { // short name
+				if cbMap := e.CityBriefMap[word]; cbMap != nil {
+					if province != "" && cbMap.P.Brief == province {
+						city = cbMap.Name
+						if acd == province+city || acd == cbMap.P.Name+city {
+							flag = true
+						}
+					} else if province == "" {
+						city = cbMap.Name
+						province = cbMap.P.Brief
+						PCDScore(j, "province", province, 5)
+						if acd == city {
+							flag = true
 						}
 					}
-					if !bres {
-						bres = true
-						lock.Lock()
-						cbb := e.CityBrief[v]
-						c = cbb.Brief
-						p = cbb.P.Brief
-						lock.Unlock()
-					}
+					PCDScore(j, "city", city, 5)
+					break
 				}
 			}
-			if bres {
-				break
-			}
 		}
-	} else {
-		return
 	}
-	if !bres {
-		//取默认省会
-		lock.Lock()
-		pbp := e.ProvinceBrief[province]
-		lock.Unlock()
-		if pbp != nil {
-			bres = true
-			c = pbp.Cap
-			p = province
+	return province, city, flag
+}
+// GetPCDByDistrictDFA looks for a district in acd, trying e.DistrictAllGet
+// and then e.DistrictSimGet, and stops at the first word that resolves in
+// e.DistrictCityMap. Depending on what is already known it fills in district
+// only, district and city, or all three levels; each value filled in here is
+// credited 5 points through PCDScore. The "district" credit runs with
+// whatever district holds at that point, so when none of the three branches
+// applies the incoming district value is what gets credited.
+func GetPCDByDistrictDFA(province, city, district, acd string, e *ExtractTask, j *ju.Job) (string, string, string) {
+	// The area_city_district field never stores a district on its own
+	// (possible shapes: province+city, province, city, province+district,
+	// province+city+district).
+	for _, GET := range []*ju.DFA{e.DistrictAllGet, e.DistrictSimGet} { // look for a district
+		if word := GET.CheckSensitiveWord(acd); word != "" {
+			if dcMap := e.DistrictCityMap[word]; dcMap != nil {
+				if city != "" && dcMap.Name == city { // province and city are known
+					district = word
+				} else if city == "" && dcMap.P.Brief == province { // only province is known
+					district = word
+					city = dcMap.Name
+					PCDScore(j, "city", city, 5)
+				} else if province == "" { // neither province nor city is known
+					district = word
+					city = dcMap.Name
+					province = dcMap.P.Brief
+					PCDScore(j, "city", city, 5)
+					PCDScore(j, "province", province, 5)
+				}
+				PCDScore(j, "district", district, 5)
+				break
+			}
 		}
 	}
-	return
+
+	return province, city, district
 }
+
+//func (e *ExtractTask) TransmitData(resulttmp map[string]interface{}, id string) (bres bool, p, c, d string) {
+//	defer qu.Catch()
+//	province := fmt.Sprint(resulttmp["area"])
+//	city := fmt.Sprint(resulttmp["city"])
+//	fieldval := make([]string, 0)
+//	for _, f := range SortField { //
+//		val := resulttmp[f]
+//		if val == nil {
+//			fieldval = append(fieldval, "")
+//		} else {
+//			fieldval = append(fieldval, fmt.Sprint(val))
+//		}
+//	}
+//	//qu.Debug("fieldval========", fieldval)
+//	bres, c, p = e.ExtractProvinceCity(province, city, id, fieldval) //抽取省和市
+//	//qu.Debug("b--------", bres, "p---------", p, "c-------------", c)
+//	bres, p, c, d = e.ExtractDistrict(fieldval, bres, c, p, id) //抽取区或县
+//	//qu.Debug("bres========", bres, "p===========", p, "c=========", c, "d=============", d)
+//	return
+//}
+
+//抽取城市、省份
+//func (e *ExtractTask) ExtractProvinceCity(province, city, id string, text []string) (bres bool, c, p string) {
+//	defer qu.Catch()
+//	bc := true //是否继续抽取
+//	if city != "" {
+//		lock.Lock()
+//		citybriefmap := e.CityBriefMap[city]
+//		//log.Println("citybriefmap========", citybriefmap)
+//		lock.Unlock()
+//		if citybriefmap == nil { //简称不存在
+//			log.Println("city err:", city, id)
+//		} else { //简称存在
+//			lock.Lock()
+//			pbrief := e.CityBriefMap[city].P.Brief
+//			//log.Println("pbrief========", pbrief)
+//			lock.Unlock()
+//			if province != pbrief { //省份不配对
+//				log.Println("province err:", city, province, id)
+//			} else {
+//				bc = false
+//				//城市省份都正确
+//			}
+//		}
+//	}
+//	//有省份
+//	bp := false
+//	lock.Lock()
+//	provincebriefmap := e.ProvinceBriefMap[province]
+//	//log.Println("provincebriefmap========", provincebriefmap)
+//	lock.Unlock()
+//	if provincebriefmap != nil { //省份简称正确
+//		bp = true
+//	} else { //没有省份,先识别省份
+//		for _, str := range text { //没有省的简称,从配置的字段信息中抽取省
+//			word := e.ProvinceAllGet.CheckSensitiveWord(str) //省全称DFA中匹配
+//			if word != "" {
+//				lock.Lock()
+//				province = e.ProvinceMap[word]
+//				lock.Unlock()
+//				bp = true
+//				break
+//			}
+//		}
+//	}
+//	//匹配城市
+//	if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
+//		for pos, GET := range []*ju.DFA{e.CityAllGet, e.CitySimGet} { //AreaGet市全称,AreaSimGet省全称和简称
+//			ws := make([]string, 5)
+//			for n, str := range text {
+//				if str != "" {
+//					word := GET.CheckSensitiveWord(str)
+//					if pos == 1 { //用简称 后辍为路、集团替换
+//						str1 := strings.Replace(str, word+"路", "", 1)
+//						if str1 != str {
+//							word = GET.CheckSensitiveWord(str1)
+//						}
+//					}
+//					ws[n] = word
+//					if word != "" {
+//						lock.Lock()
+//						res := e.AreaToCityMap[word]
+//						lock.Unlock()
+//						if len(res) == 1 {
+//							//判断省份
+//							if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回(!bp:省的简称)
+//								bres = true
+//								c = res[0].Brief
+//								p = res[0].P.Brief
+//								break
+//							} else { //不一致时。。暂时不处理
+//							}
+//						} else { //多个时(出现这种情况是多个省中的市,市名相同。现在的配置文件中已经将市名,县名重复的全部去掉)
+//						}
+//					}
+//				}
+//			}
+//			if !bres { //没有匹配到
+//				mc := map[string]int{}
+//				for _, w := range ws {
+//					lock.Lock()
+//					res := e.AreaToCityMap[w]
+//					lock.Unlock()
+//					for _, ct := range res {
+//						if ct == nil {
+//							continue
+//						}
+//						if bp { //有省份
+//							if ct.P != nil && ct.P.Brief == province {
+//								mc[ct.Brief]++
+//							}
+//						} else { //没有省份
+//							mc[ct.Brief]++
+//						}
+//					}
+//				}
+//				//计算mc中最大值且大于1
+//				max := 1
+//				v := ""
+//				for mk, mv := range mc {
+//					if mv > max {
+//						v = mk
+//					}
+//				}
+//				if v != "" {
+//					bres = true
+//					lock.Lock()
+//					ctb := e.CityBriefMap[v]
+//					lock.Unlock()
+//					c = ctb.Brief
+//					p = ctb.P.Brief
+//				} else if len(mc) > 0 {
+//					//取级别更大的
+//					v := ""
+//					for mk, _ := range mc {
+//						lock.Lock()
+//						cb := e.CityBriefMap[mk]
+//						lock.Unlock()
+//						if cb.P.Cap == mk {
+//							bres = true
+//							c = cb.Brief
+//							p = cb.P.Brief
+//							break
+//						} else {
+//							v = mk
+//						}
+//					}
+//					if !bres {
+//						bres = true
+//						lock.Lock()
+//						cbb := e.CityBriefMap[v]
+//						c = cbb.Brief
+//						p = cbb.P.Brief
+//						lock.Unlock()
+//					}
+//				}
+//			}
+//			if bres {
+//				break
+//			}
+//		}
+//	} else {
+//		return
+//	}
+//	if !bres {
+//		//取默认省会
+//		lock.Lock()
+//		pbp := e.ProvinceBriefMap[province]
+//		lock.Unlock()
+//		if pbp != nil {
+//			bres = true
+//			c = pbp.Cap
+//			p = province
+//		}
+//	}
+//	return
+//}
+//抽取区或县(从配置的字段信息中抽取区或县)
+//func (e *ExtractTask) ExtractDistrict(field []string, bres bool, c, p, id string) (bool, string, string, string) {
+//	d := ""
+//	for _, str := range field {
+//		for pos, GET := range []*ju.DFA{e.DistrictGet, e.StreetGet} { //先匹配区或县再匹配街道
+//			word := GET.CheckSensitiveWord(str)
+//			if word != "" {
+//				if pos == 0 { //区或县匹配
+//					//log.Println("县直接匹配到====", word)
+//					lock.Lock()
+//					city := e.DistrictCityMap[word]
+//					lock.Unlock()
+//					//log.Println("city================", city)
+//					if city != nil {
+//						d = word
+//						ctmp := city.Brief
+//						ptmp := city.P.Brief
+//						//log.Println("ctmpptmp================", ptmp, ctmp, bres)
+//						if !bres { //城市省份没有抽到,通过区或县定位市和省
+//							c = ctmp
+//							p = ptmp
+//							bres = true
+//						} else { //对比抽到的城市省份是否一致
+//							if c != ctmp || p != ptmp {
+//								//log.Println("str---", str, "====", word)
+//								c = ctmp
+//								p = ptmp
+//							}
+//						}
+//					}
+//				} else { //街道匹配
+//					//log.Println("匹配到街道====", word)
+//					lock.Lock()
+//					district := e.StreetDistrictMap[word]
+//					lock.Unlock()
+//					//log.Println("district================", district)
+//					if district != nil {
+//						d = district.Name
+//						ctmp := district.C.Brief
+//						ptmp := district.C.P.Brief
+//						//log.Println("districtptmp================", ctmp, ptmp)
+//						if !bres { //城市省份没有抽到,通过区或县定位市和省
+//							c = ctmp
+//							p = ptmp
+//							bres = true
+//						} else { //对比抽到的城市省份是否一致
+//							if c != ctmp || p != ptmp {
+//								c = ctmp
+//								p = ptmp
+//							}
+//						}
+//					}
+//				}
+//				return bres, p, c, d
+//			}
+//		}
+//	}
+//	return bres, p, c, ""
+//}

+ 3 - 1
src/jy/extract/extractudp.go

@@ -108,7 +108,9 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 		ext.InitClearFn()
 		if ext.IsExtractCity { //版本上控制是否开始城市抽取
 			//初始化城市DFA信息
-			ext.InitDFA()
+			ext.InitCityDFA()
+			ext.InitAreaCode()
+			ext.InitPostCode()
 		}
 		//质量审核
 		ext.InitAuditFields()

+ 4 - 1
src/jy/pretreated/colonkv.go

@@ -44,6 +44,7 @@ var (
 	BracketsTextReg       = regexp.MustCompile("[((]([^((]+)[))]")
 	ContactBuyerTitleReg  = regexp.MustCompile("采购联系事项")
 	ContactAgencyTitleReg = regexp.MustCompile("招标联系事项")
+	ZipCode               = regexp.MustCompile("邮(政)?编(码)?")
 )
 
 //一行多个冒号kv处理
@@ -87,7 +88,7 @@ func (ce *ColonkvEntity) GetKvs(con, title string, from int) []*Kv {
 
 //处理正文
 func (ce *ColonkvEntity) processText(con string) string {
-	con = ce.divisionMoreKV(con)//一行多个冒号kv处理
+	con = ce.divisionMoreKV(con) //一行多个冒号kv处理
 	for {
 		tmp := con
 		con = ce.divisionMoreKV(con)
@@ -350,6 +351,8 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 						k = "地址"
 					} else if PersonReg.MatchString(v) || PhoneReg.MatchString(v) {
 						k = "联系方式"
+					} else if ZipCode.MatchString(v) {
+						k = "邮政编码"
 					}
 					k_length = len([]rune(k))
 				} else if strings.HasPrefix(strings.TrimSpace(v), buyer) || (prevNotEqual && buyerLenght >= prevLineLength-5 && buyerLenght <= prevLineLength && strings.Contains(prevLine, buyer)) {

+ 4 - 0
src/jy/util/article.go

@@ -16,6 +16,7 @@ type Job struct {
 	Href           string                            //原文链接
 	City           string                            //城市
 	Province       string                            //省份
+	Jsondata       *map[string]interface{}           //
 	Data           *map[string]interface{}           //数据库源数据
 	Block          []*Block                          //分块
 	Result         map[string][]*ExtField            //结果
@@ -30,6 +31,9 @@ type Job struct {
 	HasBrand       int                               //有品牌
 	HasGoods       int                               //有商品
 	IsFile         bool                              //有附件
+	AreaScore      map[string]int                    //province得分
+	CityScore      map[string]int                    //city得分
+	DistrictScore  map[string]int                    //district得分
 }
 
 type ExtField struct {

+ 1 - 1
src/jy/util/config.go

@@ -13,7 +13,7 @@ var FormatTextMap map[string][]map[string]interface{}
 
+// init calls loadFormatText and then LoadTagDb for ./res/blocktagdb; the
+// LoadTagDb call for ./res/tagdb is commented out.
 func init() {
 	loadFormatText()
-	LoadTagDb("./res/tagdb")
+	//LoadTagDb("./res/tagdb")
 	LoadTagDb("./res/blocktagdb")
 }
 

+ 35 - 1
src/main_test.go

@@ -29,10 +29,44 @@ func Test_han(t *testing.T) {
+// Test_task builds the Mgo handle with MgoFactory, starts one extraction test
+// task through extract.StartExtractTestTask and then sleeps for five seconds.
+// It makes no assertions.
+// NOTE(review): the sleep presumably gives background work time to finish —
+// confirm; the test also needs the hard-coded database address to be reachable.
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5c2b55f1a5cb26b9b7fac3c3", "1", "mxs_v2", "mxs_v2")
+	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5d25a292a5cb26b9b741292b", "1", "mxs_v1", "mxs_v2")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }
+
+// Test_Score checks HighestScoreArr on three inputs: an empty map, a map with
+// a single highest score, and a map where two keys tie for the highest score.
+// The tie case compares the result as a set because the order of the returned
+// keys follows map iteration.
+func Test_Score(t *testing.T) {
+	if got := HighestScoreArr(map[string]int{}); len(got) != 0 {
+		t.Errorf("empty map: got %v, want no entries", got)
+	}
+
+	single := HighestScoreArr(map[string]int{
+		"河南": 10,
+		"浙江": 10,
+		"湖南": 5,
+		"XX": 11,
+		"YY": 5,
+	})
+	if len(single) != 1 || single[0] != "XX" {
+		t.Errorf("single maximum: got %v, want [XX]", single)
+	}
+
+	tied := HighestScoreArr(map[string]int{"河南": 10, "浙江": 10, "湖南": 5})
+	seen := map[string]bool{}
+	for _, name := range tied {
+		seen[name] = true
+	}
+	if len(tied) != 2 || !seen["河南"] || !seen["浙江"] {
+		t.Errorf("tie: got %v, want 河南 and 浙江 in any order", tied)
+	}
+}
+// HighestScoreArr collects every non-empty key of m that holds the highest
+// score seen (negative scores never qualify) and returns them; the result is
+// nil when nothing qualifies. Each visited entry and each accepted entry is
+// traced through log.Println.
+func HighestScoreArr(m map[string]int) []string {
+	byScore := make(map[int][]string)
+	top := 0
+	for name, val := range m {
+		log.Println(name, top, val)
+		if name == "" || top > val {
+			continue
+		}
+		log.Println(top, byScore[top] == nil)
+		// A strictly higher score replaces the bucket of the previous best.
+		if top != val && byScore[top] != nil {
+			delete(byScore, top)
+		}
+		byScore[val] = append(byScore[val], name)
+		top = val
+	}
+	return byScore[top]
+}
+
 func Test_extractcity(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	extract.InitDFA2()

+ 2 - 1
src/specialsymbols.json

@@ -54,7 +54,8 @@
             "agency": true,
             "agency": true,
             "buyertel": true,
-            "buyerperson": true
+            "buyerperson": true,
+			"buyerzipcode":true
         },
         "symbol": [
             ":",