fengweiqiang 5 年之前
父节点
当前提交
0dd0b5030c
共有 4 个文件被更改,包括 996 次插入12 次删除
  1. 38 9
      src/jy/extract/extract.go
  2. 956 0
      src/jy/extract/newextractcity.go
  3. 2 2
      src/jy/util/ossclient.go
  4. 0 1
      src/jy/util/util2.go

+ 38 - 9
src/jy/extract/extract.go

@@ -39,14 +39,14 @@ var (
 )
 
 func closeDb(ext *ExtractTask) {
-	if ext.TaskInfo.FDB != nil {
-		s := ext.TaskInfo.FDB.Get()
-		db.Mgo.Close(s)
-	}
-	if ext.TaskInfo.TDB != nil {
-		s := ext.TaskInfo.TDB.Get()
-		db.Mgo.Close(s)
-	}
+	//if ext.TaskInfo.FDB != nil {
+	//	s := ext.TaskInfo.FDB.Get()
+	//	db.Mgo.Close(s)
+	//}
+	//if ext.TaskInfo.TDB != nil {
+	//	s := ext.TaskInfo.TDB.Get()
+	//	db.Mgo.Close(s)
+	//}
 }
 
 //启动测试抽取
@@ -1853,7 +1853,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		}
 		if e.IsExtractCity { //城市抽取
 			//e.ExtractCity(j, tmp, _id)
-			e.NewExtractCity(j, tmp, _id)
+			e.NewExtractCity(j, &tmp, _id)
+
 			//			b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
 			//			// log.Debug("省份---", p, "城市---", c, "区---", d)
 			//			tmp["district"] = d
@@ -1971,6 +1972,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if bg, ok := tmp["bidamount"].(float64); ok && bg >= 500000000000 {
 			delete(tmp, "bidamount")
 		}
+		//检查字段
+		tmp = checkFields(tmp)
+		if tmp["projectname"] == nil || tmp["projectname"] == "" {
+			tmp["projectname"] = j.Title
+		}
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				/*	if len(e.SiteFields) <= 0 {
@@ -2036,6 +2042,29 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	})
 }
 
+// checkFields strips entries that must not be saved from an extraction result.
+//
+// It removes "contenthtml" and "detail" unconditionally, removes "bidamount",
+// "budget", "bidopentime" and "signaturedate" when their value is a string,
+// and finally removes every entry whose value is the empty string.
+// The map is modified in place and the same map is returned.
+func checkFields(tmp map[string]interface{}) map[string]interface{} {
+	for _, key := range []string{"contenthtml", "detail"} {
+		delete(tmp, key)
+	}
+	// These fields are dropped whenever they hold a string value.
+	for _, key := range []string{"bidamount", "budget", "bidopentime", "signaturedate"} {
+		if _, isString := tmp[key].(string); isString {
+			delete(tmp, key)
+		}
+	}
+	// Deleting while ranging over a map is permitted in Go.
+	for key, val := range tmp {
+		if s, isString := val.(string); isString && s == "" {
+			delete(tmp, key)
+		}
+	}
+	return tmp
+}
+
 //保存其他
 //kv、表格、块上的标签凡是新的标签都入库
 //val  type   times   firstid  createtime 判定field

+ 956 - 0
src/jy/extract/newextractcity.go

@@ -0,0 +1,956 @@
+package extract
+
+import (
+	. "jy/pretreated"
+	ju "jy/util"
+	qu "qfw/util"
+	"regexp"
+	"strings"
+
+	log "github.com/donnie4w/go-logger/logger"
+)
+
+var AgencyReg = []*regexp.Regexp{ // AgencyReg: every match is removed from the detail text in NewGetCityByDetail before place names are matched
+	regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"), // an agency (organization/person/unit/company) or winning-supplier keyword plus up to 30 following characters; (?s) lets '.' span newlines
+	regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"), // 2 to 15 characters followed by a (bidding) agency, consulting or government-procurement keyword
+}
+
+/*
+NewExtractCity determines the province ("area"), city and district of a job and
+writes them into (*resulttmp)["area"], (*resulttmp)["city"] and
+(*resulttmp)["district"].
+
+Candidates are scored into j's Full*Score / Sim*Score maps from, in order:
+jsondata, the site table, the buyer zip code, the buyer phone area code, the
+text fields (buyeraddr, buyer, title, projectname and, when present as strings,
+projectaddr and bidopenaddress) and, only if a province and a city were already
+scored, the detail text. The top-scoring candidates are then reconciled.
+"area" falls back to "全国" when no province could be chosen.
+
+The id parameter is not used inside this function. qu.Catch is deferred
+(presumably it recovers panics; it is defined elsewhere, confirm there).
+*/
+func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}, id string) {
+	/*
+		High-accuracy sources:
+			1. crawler data (jsondata)
+			2. buyer (purchasing unit) library
+			3. postal code
+			4. landline phone
+			5. site (todo)
+		Low-accuracy sources (when the full-name library matches, the short-name library is skipped):
+			1. city full-name library (buyeraddr; title, projectname)
+			2. city short-name library (buyeraddr; title, projectname)
+	*/
+	defer qu.Catch()
+
+	// initialize the score maps
+	if j.FullAreaScore == nil {
+		j.FullAreaScore = make(map[string]float64)
+	}
+	if j.FullCityScore == nil {
+		j.FullCityScore = make(map[string]float64)
+	}
+	if j.FullDistrictScore == nil {
+		j.FullDistrictScore = make(map[string]float64)
+	}
+	if j.SimAreaScore == nil {
+		j.SimAreaScore = make(map[string]float64)
+	}
+	if j.SimCityScore == nil {
+		j.SimCityScore = make(map[string]float64)
+	}
+	if j.SimDistrictScore == nil {
+		j.SimDistrictScore = make(map[string]float64)
+	}
+	// p/c/d scores produced by district (or county) short-name matches; they are merged only if full-name matching or p/c short-name matching produced a result, otherwise they are discarded
+	pscore := make(map[string]float64)
+	cscore := make(map[string]float64)
+	dscore := make(map[string]float64)
+
+	sm := NewSortMap()
+	// 1. extract from jsondata
+	e.NewGetCityByJsonData(j)
+	//qu.Debug("jsondata打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+	// 2. extract from the site library
+	e.NewGetCityBySite(j)
+	//qu.Debug("site打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+	// 3. extract from the buyer library (there is no buyer library yet)
+	//buyer, _ := resulttmp["buyer"].(string)
+	// 4. extract from the postal code
+	buyerzipcode := qu.ObjToString((*resulttmp)["buyerzipcode"])
+	e.NewGetCityByPostCode(j, buyerzipcode)
+	//qu.Debug("邮编打分后结果---", buyerzipcode, j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+	// 5. extract from the landline area code
+	buyertel := qu.ObjToString((*resulttmp)["buyertel"])
+	e.NewGetCityByAreaCode(j, buyertel)
+	//qu.Debug("固话打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+	// 6. collect the text fields to match: buyeraddr, buyer, title, projectname (plus optional address fields)
+	buyeraddr := qu.ObjToString((*resulttmp)["buyeraddr"])
+	title := qu.ObjToString((*resulttmp)["title"])
+	projectname := qu.ObjToString((*resulttmp)["projectname"])
+	buyer := qu.ObjToString((*resulttmp)["buyer"])
+	//qu.Debug("buyeraddr--", buyeraddr, "--buyer--", buyer, "--title--", title, "--projectname--", projectname)
+	sm.AddKey("buyeraddr", buyeraddr)
+	sm.AddKey("buyer", buyer)
+	sm.AddKey("title", title)
+	sm.AddKey("projectname", projectname)
+	if projectaddr, isok := (*resulttmp)["projectaddr"].(string); isok {
+		sm.AddKey("projectaddr", projectaddr)
+	}
+	if bidopenaddress, isok := (*resulttmp)["bidopenaddress"].(string); isok {
+		sm.AddKey("bidopenaddress", bidopenaddress)
+	}
+	// 7. extract from buyeraddr, buyer, title, projectname
+	e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
+	//qu.Debug("全称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+	//qu.Debug("简称打分后结果---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
+	// merge full-name and short-name scores
+	MergeFullSimScore(j) // merge the full/short-name scores of buyer, buyeraddr, title, projectname
+	//qu.Debug("全称简称合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+	// merge the district short-name scores
+	//qu.Debug("pcd=====", pscore, cscore, dscore)
+	MergeScores(j, &pscore, &cscore, &dscore) // merge the p/c/d scores obtained from district short-name matches
+	//qu.Debug("合并区简称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+
+	j.SimAreaScore = map[string]float64{}
+	j.SimCityScore = map[string]float64{}
+	j.SimDistrictScore = map[string]float64{}
+
+	// 8. extract from detail
+	if len(j.FullAreaScore) > 0 && len(j.FullCityScore) > 0 { // only consult detail when the steps above produced both a province and a city
+		e.NewGetCityByDetail(j)
+	}
+	//qu.Debug("detail打分后全称---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+	//qu.Debug("detail打分后简称---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
+	MergeFullSimScore(j) // merge the full/short-name scores coming from detail
+	//qu.Debug("detail合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+
+	finishP := HighestScoreArr(j.FullAreaScore) // provinces with the highest score
+	e.RemoveCD(finishP, j)                      // drop cities that belong to distractor provinces, together with their districts (sample id 5d2bd4aba5cb26b9b769d18e)
+	//qu.Debug("去除干扰项后的city和district得分---", finishP, j.FullCityScore, j.FullDistrictScore)
+	// pick the results
+	finishC := HighestScoreArr(j.FullCityScore)
+	finishD := HighestScoreArr(j.FullDistrictScore)
+	arearesult := ""
+	cityresult := ""
+	districtresult := ""
+	tmpcity := []string{}
+	if len(finishP) == 1 { // a single top-scoring province
+		arearesult = finishP[0] // assign the result directly
+		cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
+		cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
+	} else if len(finishP) > 1 { // several provinces share the top score
+		if len(finishC) == 1 {
+			cityresult = finishC[0]
+			if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
+				arearesult = cfMap.P.Brief
+				tmpcity = append(tmpcity, cityresult)
+				cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
+			}
+		} else { // not exactly one top city (multiple provinces and cities)
+			//arearesult = finishP[0] //抽取结果直接赋值
+			//cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
+			//cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
+			arearesult = "全国"
+		}
+	}
+	if cityresult != "" && cityresult == districtresult {
+		districtresult = ""
+	}
+	//qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
+	// municipalities directly under the central government
+	if arearesult == "北京" {
+		cityresult = "北京市"
+		if districtresult == "北京朝阳" { // special case (北京朝阳中西医结合急诊抢救中心: 5a84079740d2d9bbe88bad90)
+			districtresult = "朝阳区"
+		}
+	} else if arearesult == "天津" {
+		cityresult = "天津市"
+	} else if arearesult == "上海" {
+		cityresult = "上海市"
+	} else if arearesult == "重庆" {
+		cityresult = "重庆市"
+	}
+	if arearesult == "" {
+		arearesult = "全国"
+	} /* else if cityresult == "" {
+		if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
+			cityresult = pbMap.Cap
+			resulttmp["defaultpcap"] = true
+		}
+	}*/
+	//qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
+	(*resulttmp)["area"] = arearesult
+	(*resulttmp)["city"] = cityresult
+	(*resulttmp)["district"] = districtresult
+}
+
+/*
+NewGetCityByJsonData scores province/city/district candidates taken from the
+crawler-supplied j.Jsondata.
+
+When "area_city_district" is a non-empty string it is passed through
+GetByACDFullJb and then GetByACDSimJb. The "area", "city" and "district" string
+fields are then checked against e.ProvinceBriefMap, e.CityFullMap and
+e.CityBriefMap and scored through PCDScore with a score of 5.
+
+Returns: province, city and district are the values read from jsondata after
+validation (province and city are cleared to "" when they cannot be confirmed);
+p, c and d are the values returned by GetByACDFullJb.
+*/
+func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
+	defer qu.Catch()
+	if j.Jsondata != nil {
+		jsondata := *j.Jsondata
+		// read province and city from jsondata
+		if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
+			p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) // full-name matching
+			GetByACDSimJb(p, c, d, a_c_d, e, j)            // short-name matching
+		}
+		city, _ = jsondata["city"].(string)         // city full name or short name
+		province, _ = jsondata["area"].(string)     // province short name
+		district, _ = jsondata["district"].(string) // district full name
+	}
+	PCDScore(j, "district", district, 5, true) // score the district
+	bp := false
+	if province != "" {
+		if e.ProvinceBriefMap[province] != nil { // check whether the crawler's province is valid (全国 / nationwide)
+			bp = true // province is valid
+		}
+	}
+	pbrief := ""
+	if city != "" {
+		cityfullmap := e.CityFullMap[city] // check whether city is a valid full name
+		if cityfullmap != nil {
+			pbrief = cityfullmap.P.Brief // province short name
+		} else {
+			citybriefmap := e.CityBriefMap[city] // check whether city is a valid short name
+			if citybriefmap != nil {
+				city = citybriefmap.Name // replace the city short name with its full name
+				pbrief = citybriefmap.P.Brief
+			}
+		}
+	}
+	if bp {
+		if pbrief == province { // the crawler's province and city agree
+			PCDScore(j, "city", city, 5, true)
+		} else { // pbrief does not match province (city is empty or wrong here)
+			city = ""
+		}
+		PCDScore(j, "province", province, 5, true)
+	} else { // province wrong or empty: use the pbrief derived from city as the province
+		if pbrief != "" {
+			province = pbrief
+			PCDScore(j, "province", province, 5, true)
+			PCDScore(j, "city", city, 5, true)
+		} else {
+			province = ""
+			city = ""
+		}
+	}
+	return
+
+}
+
+/*
+GetByACDFullJb extracts place names from an area_city_district string using
+full names.
+
+a_c_d is segmented with e.Seg_PCD.Cut and each segment is looked up in the
+full-name province, city and district tries; hits are scored on j through
+PCDScore (+5, or -5 for cities/provinces that contradict the province already
+found). pbrief, city and district are the values known so far; the possibly
+updated values are returned in that order.
+*/
+func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
+	text := e.Seg_PCD.Cut(a_c_d, true)
+	repeatPb := map[string]bool{}
+	for _, full := range text {
+		if e.Trie_Full_Province.Get(full) { // a_c_d contains a province full name
+			if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
+				pbrief = tmpPbrief // province short name
+				PCDScore(j, "province", pbrief, 5, true)
+			}
+		} else if e.Trie_Full_City.Get(full) { // a_c_d contains a city full name
+			if cfMap := e.CityFullMap[full]; cfMap != nil {
+				tmpcity := cfMap.Name                    // city full name
+				tmpPbrief := cfMap.P.Brief               // province short name
+				if pbrief != "" && pbrief == tmpPbrief { // province short name already known
+					city = tmpcity
+					PCDScore(j, "city", city, 5, true)
+				} else if pbrief == "" {
+					city = tmpcity
+					pbrief = tmpPbrief
+					PCDScore(j, "city", city, 5, true)
+					PCDScore(j, "province", pbrief, 5, true)
+				}
+			}
+		} else if e.Trie_Full_District.Get(full) { // a_c_d contains a district full name (one district name may map to several cities)
+			carr := e.NewDistrictCityMap[full]
+			if len(carr) > 0 {
+				district = full
+				PCDScore(j, "district", district, 5, true)
+				for _, c := range carr {
+					tmpcity := c.Name      // city full name
+					tmpPbrief := c.P.Brief // province short name
+					if pbrief == "" {      // no province matched so far
+						PCDScore(j, "city", tmpcity, 5, true)
+						if !repeatPb[tmpPbrief] {
+							PCDScore(j, "province", tmpPbrief, 5, true)
+							repeatPb[tmpPbrief] = true
+						}
+					} else {                     // province already known
+						if pbrief != tmpPbrief { // this city of the district disagrees with the earlier match: treat it as a distractor
+							PCDScore(j, "city", tmpcity, -5, true)
+							PCDScore(j, "province", tmpPbrief, -5, true)
+						} else {            // agrees with the earlier match
+							if city == "" { // handles area_city_district values like 河南省二七区: no city matched in the two earlier steps, so it is supplied via the district
+								PCDScore(j, "city", tmpcity, 5, true)
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	return pbrief, city, district
+}
+
+// GetByACDSimJb extracts place names from an area_city_district string using
+// short names.
+//
+// a_c_d is segmented with e.Seg_PCD.Cut and each segment is looked up in the
+// short-name province, city and district tries; hits are scored on j through
+// PCDScore. pbrief, city and district carry what the full-name pass
+// (GetByACDFullJb) already resolved: a level that is already resolved is not
+// matched again. The parameters are only updated locally; nothing is returned.
+func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
+	text := e.Seg_PCD.Cut(a_c_d, true)
+	repeatPb := map[string]bool{} // provinces already credited through a district match
+	for _, sim := range text {
+		if pbrief == "" && e.Trie_Sim_Province.Get(sim) { // full-name pass found no definite province
+			if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
+				pbrief = pbMap.Brief
+				PCDScore(j, "province", pbrief, 5, true)
+			}
+		} else if city == "" && e.Trie_Sim_City.Get(sim) { // full-name pass found no definite city
+			if cbMap := e.CityBriefMap[sim]; cbMap != nil {
+				tmpcity := cbMap.Name
+				tmpPbrief := cbMap.P.Brief
+				if pbrief != "" && pbrief == tmpPbrief {
+					city = tmpcity
+					PCDScore(j, "city", city, 5, true)
+				} else if pbrief == "" {
+					city = tmpcity
+					pbrief = tmpPbrief
+					PCDScore(j, "city", city, 5, true)
+					PCDScore(j, "province", pbrief, 5, true)
+				}
+			}
+		} else if district == "" && e.Trie_Sim_District.Get(sim) { // full-name pass found no definite district
+			dfullarr := e.NewDistrictSimAndAll[sim]
+			if len(dfullarr) > 0 {
+				PCDScore(j, "district", sim, 5, true)
+				for _, dfullAndCity := range dfullarr { // every full name the district short name maps to
+					for _, c := range dfullAndCity {
+						// Skip entries without a city or without a province, as
+						// NewGetCityByOthers does, instead of dereferencing nil.
+						if c == nil || c.P == nil {
+							continue
+						}
+						tmpcity := c.Name      // city full name
+						tmpPbrief := c.P.Brief // province short name
+						if pbrief == "" {      // no province matched so far
+							PCDScore(j, "city", tmpcity, 5, true)
+							if !repeatPb[tmpPbrief] {
+								PCDScore(j, "province", tmpPbrief, 5, true)
+								repeatPb[tmpPbrief] = true
+							}
+						} else if pbrief != tmpPbrief { // contradicts the province already found: distractor
+							PCDScore(j, "city", tmpcity, -5, true)
+							PCDScore(j, "province", tmpPbrief, -5, true)
+						} else if city == "" { // same province, city still unknown (e.g. 河南省二七区): supply it from the district
+							PCDScore(j, "city", tmpcity, 5, true)
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+// NewGetCityBySite scores the province, city and district configured for the
+// job's site. The site name is read from (*j.Data)["site"] and looked up in
+// e.SiteCityMap; each non-empty value other than "null" (and, for the province,
+// other than "全国") is scored with 5 through PCDScore.
+func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
+	site, _ := (*j.Data)["site"].(string)
+	scMap := e.SiteCityMap[site]
+	if scMap == nil {
+		return
+	}
+	if p := scMap.P; p != "" && p != "全国" && p != "null" {
+		PCDScore(j, "province", p, 5, true)
+	}
+	if c := scMap.C; c != "" && c != "null" {
+		PCDScore(j, "city", c, 5, true)
+	}
+	if d := scMap.D; d != "" && d != "null" {
+		PCDScore(j, "district", d, 5, true)
+	}
+}
+
+// NewGetCityByPostCode scores the province, city and district(s) that
+// e.PostCodeMap associates with postcode.
+//
+// A postal code may map to several districts: each is scored with 3, or with 5
+// when there is exactly one non-empty district. Province and city are scored
+// with 5. The province and city results are set from the table entry; the
+// district result is never assigned here and is therefore always "".
+func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
+	defer qu.Catch()
+	pc := e.PostCodeMap[postcode]
+	if pc == nil {
+		return
+	}
+	province, city = pc.P, pc.C
+	candidates := pc.D
+	districtScore := 3.0
+	if len(candidates) == 1 && candidates[0] != "" {
+		districtScore = 5.0
+	}
+	for _, name := range candidates {
+		PCDScore(j, "district", name, districtScore, true)
+	}
+	PCDScore(j, "province", province, 5, true)
+	PCDScore(j, "city", city, 5, true)
+	return
+}
+
+// NewGetCityByAreaCode scores the province and city that the landline area code
+// at the start of buyertel maps to in e.AreaCodeMap.
+//
+// Only numbers of at least 11 bytes that start with "0" are considered. The
+// 4-character prefix is tried first, then the 3-character prefix; the first
+// prefix found in the table wins. The city is scored (and returned) only when
+// the area code maps to exactly one city. The district result is never set.
+func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
+	defer qu.Catch()
+	if len(buyertel) < 11 || !strings.HasPrefix(buyertel, "0") {
+		return
+	}
+	for width := 4; width >= 3; width-- {
+		ac := e.AreaCodeMap[buyertel[:width]]
+		if ac == nil {
+			continue
+		}
+		province = ac.P
+		if cities := ac.C; len(cities) == 1 { // ambiguous area codes (several cities) contribute no city
+			city = cities[0]
+			PCDScore(j, "city", city, 5, true)
+		}
+		PCDScore(j, "province", province, 5, true)
+		break
+	}
+	return
+}
+
+func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
+	/*
+		NewGetCityByOthers scores place names found in the text fields held by sm.
+
+		For every key in sm.Keys:
+		1. segment the field value into words (e.Seg_SV.Cut); single-rune words are skipped
+		2. match and score full names of province, city, district, street and residents' committee
+		   (e.Trie_Fulls positions 0..4), using PCDScore(..., true)
+		3. match and score short names of province, city and district
+		   (e.Trie_Sims positions 0..2), using PCDScore(..., false)
+
+		District short-name hits whose province is not the one already found by a
+		short-name match in the same field are accumulated in pscore/cscore/dscore
+		(via PCDScoreByDistrictSim) instead of on j.
+		ts is a bonus added to most scores: 0.5 for the first two keys, 0.2 afterwards.
+	*/
+	ts := 0.5
+	for i, from := range sm.Keys { // e.g. buyeraddr, buyer, title, projectname (whatever the caller added to sm)
+		if i > 1 { // fields after the first two get the smaller bonus
+			ts = 0.2
+		}
+		p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" // reset the per-field province/city/district state for every field
+		str, _ := sm.Map[from].(string)
+		jbText := e.Seg_SV.Cut(str, true)
+		for _, text := range jbText {
+			if len([]rune(text)) == 1 {
+				continue
+			}
+			// full-name matching
+			//qu.Debug("text------", text)
+			for pos_full, trie_full := range e.Trie_Fulls {
+				if trie_full.Get(text) {
+					if pos_full == 0 && p_full == "" { // province full name
+						if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { // look up the short name
+							p_full = tmpPbrief
+							PCDScore(j, "province", p_full, 4+ts, true)
+							break
+						}
+					} else if pos_full == 1 && c_full == "" { // city full name
+						if cfMap := e.CityFullMap[text]; cfMap != nil {
+							tmpPbrief := cfMap.P.Brief
+							if p_full == "" {
+								p_full = tmpPbrief
+								c_full = cfMap.Name
+								PCDScore(j, "province", p_full, 4+ts, true)
+								PCDScore(j, "city", c_full, 4+ts, true)
+								break
+							} else if p_full == tmpPbrief {
+								c_full = cfMap.Name
+								PCDScore(j, "province", tmpPbrief, 4+ts, true) //
+								PCDScore(j, "city", c_full, 4+ts, true)
+								break
+							} else if p_full != "" && p_full != tmpPbrief {
+								// leave city untouched
+							}
+						}
+					} else if pos_full == 2 && d_full == "" { // district full name
+						repeatPb := map[string]bool{}
+						isOk := false
+						districtOk := false
+						citys := e.NewDistrictCityMap[text]
+						for _, c := range citys {
+							tmpPbrief := c.P.Brief
+							if p_full == tmpPbrief { // same province
+								d_full = text
+								if c_full == "" {
+									c_full = c.Name
+									PCDScore(j, "city", c_full, 4+ts, true)
+									PCDScore(j, "province", tmpPbrief, 4+ts, true) //
+								}
+								isOk = true
+								districtOk = true
+							} else if p_full == "" { // no province yet
+								districtOk = true
+								if len(citys) == 1 { // maps to a single city
+									p_full = tmpPbrief
+									c_full = c.Name
+									d_full = text
+									PCDScore(j, "province", p_full, 4+ts, true)
+									PCDScore(j, "city", c_full, 4+ts, true)
+									isOk = true
+								} else { // several cities: score only, do not assign
+									if !repeatPb[tmpPbrief] {
+										PCDScore(j, "province", tmpPbrief, 2+ts, true)
+										repeatPb[tmpPbrief] = true
+									}
+									//PCDScore(j, "province", tmpPbrief, 2, true)
+									PCDScore(j, "city", c.Name, 2+ts, true)
+								}
+							} else if p_full != "" && p_full != tmpPbrief { // distractor: deduct points
+								if !repeatPb[tmpPbrief] {
+									PCDScore(j, "province", tmpPbrief, -5, true)
+									repeatPb[tmpPbrief] = true
+								}
+								//PCDScore(j, "province", tmpPbrief, -5, true)
+								PCDScore(j, "city", c.Name, -5, true)
+							}
+						}
+						if districtOk {
+							PCDScore(j, "district", text, 4+ts, true)
+						} else {
+							PCDScore(j, "district", text, -5, true)
+						}
+						if isOk {
+							break
+						}
+					} else if pos_full == 3 { // street full name
+						districts := e.NewStreetDistrictMap[text]
+						if len(districts) == 1 { // the street is unique
+							DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
+						}
+					} else if pos_full == 4 { // residents' committee full name
+						districts := e.CommunityDistrictMap[text]
+						if len(districts) == 1 { // the residents' committee is unique
+							DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
+						}
+					}
+				}
+			}
+			//qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
+			// short-name matching
+			for pos_sim, trie_sim := range e.Trie_Sims {
+				if trie_sim.Get(text) {
+					if pos_sim == 0 && p_sim == "" { // province short name
+						p_sim = text
+						PCDScore(j, "province", p_sim, 3+ts, false)
+						break
+					} else if pos_sim == 1 { // city short name
+						if cbMap := e.CityBriefMap[text]; cbMap != nil {
+							tmpPbrief := cbMap.P.Brief
+							if p_sim == "" {
+								score := 2.0 + ts
+								if tmpPbrief == p_full {
+									score += 1.0
+								}
+								p_sim = tmpPbrief
+								c_sim = cbMap.Brief
+								PCDScore(j, "province", p_sim, score, false)
+								PCDScore(j, "city", cbMap.Name, score, false)
+								break
+							} else if p_sim == tmpPbrief {
+								c_sim = cbMap.Brief
+								PCDScore(j, "city", cbMap.Name, 3+ts, false)
+								PCDScore(j, "province", tmpPbrief, 3+ts, false)
+								break
+							} else if p_sim != "" && p_sim != tmpPbrief { // e.g. 上海宝冶集团有限公司南京分公司, 北京朝阳中西医结合急诊抢救中心
+								delete(j.SimAreaScore, p_sim)
+								c_sim = text      //
+								p_sim = tmpPbrief //
+								PCDScore(j, "province", tmpPbrief, 3+ts, false)
+								PCDScore(j, "city", cbMap.Name, 3+ts, false)
+							}
+						}
+					} else if pos_sim == 2 && d_sim == "" { // district short name
+						repeatPb := map[string]bool{}
+						repeatDb := map[string]bool{}
+						dfull_citys := e.NewDistrictSimAndAll[text]
+						for _, dfull_city := range dfull_citys {
+							for dfull, c := range dfull_city { // dfull: the full name the short name corresponds to
+								if c == nil || c.P == nil {
+									continue
+								}
+								tmpPbrief := c.P.Brief
+								if p_sim == tmpPbrief { // same province
+									d_sim = text
+									PCDScore(j, "district", dfull, 2+ts, false)
+									if c_sim == "" {
+										c_sim = c.Brief
+										PCDScore(j, "city", c.Name, 2+ts, false)
+									}
+									PCDScore(j, "province", tmpPbrief, 2+ts, false) //
+								} else if p_sim == "" {
+									if !repeatDb[dfull] {
+										PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
+										repeatDb[dfull] = true
+									}
+									if len(dfull_citys) == 1 {
+										PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
+										PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
+									} else {
+										if !repeatPb[tmpPbrief] {
+											PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
+											repeatPb[tmpPbrief] = true
+										}
+										PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
+									}
+								} else if p_sim != "" && p_sim != tmpPbrief {
+									if !repeatPb[tmpPbrief] {
+										PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
+										repeatPb[tmpPbrief] = true
+									}
+									PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
+									PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
+								}
+							}
+						}
+					}
+				}
+			}
+			//qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
+		}
+	}
+}
+
+// NewGetCityByDetail scores place names found in the job's detail text
+// (j.Content).
+//
+// When the text is longer than 600 runes only its first and last 300 runes are
+// examined. Every match of the AgencyReg patterns is removed first. The rest is
+// segmented with e.Seg_SV.Cut; words longer than one rune are matched against
+// the full-name tries (province, city, district, street, residents' committee)
+// and the short-name tries (province, city, district). Each distinct name is
+// credited at most once per kind, with a score of 1, through PCDScore
+// (full-name hits with true, short-name hits with false).
+func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
+	// Names already credited, so repeated mentions do not add up.
+	repeatP_full := map[string]bool{}
+	repeatC_full := map[string]bool{}
+	repeatD_full := map[string]bool{}
+	repeatP_sim := map[string]bool{}
+	repeatC_sim := map[string]bool{}
+	repeatD_sim := map[string]bool{}
+	detailRune := []rune(j.Content)
+	detail := j.Content
+	if len(detailRune) > 600 {
+		start := detailRune[:300]
+		end := detailRune[len(detailRune)-300:]
+		detail = string(start) + string(end)
+	}
+	for _, reg := range AgencyReg {
+		detail = reg.ReplaceAllString(detail, "")
+	}
+	for _, text := range e.Seg_SV.Cut(detail, true) {
+		if len([]rune(text)) > 1 {
+			// full-name matching
+			for pos_full, trie_full := range e.Trie_Fulls {
+				if trie_full.Get(text) {
+					if pos_full == 0 { // province full name
+						if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { // look up the short name
+							PCDScore(j, "province", tmpPbrief, 1, true)
+							repeatP_full[tmpPbrief] = true
+							break
+						}
+					} else if pos_full == 1 { // city full name
+						if cfMap := e.CityFullMap[text]; cfMap != nil {
+							if !repeatP_full[cfMap.P.Brief] {
+								PCDScore(j, "province", cfMap.P.Brief, 1, true)
+								repeatP_full[cfMap.P.Brief] = true
+							}
+							if !repeatC_full[cfMap.Name] {
+								PCDScore(j, "city", cfMap.Name, 1, true)
+								repeatC_full[cfMap.Name] = true
+							}
+							break
+						}
+					} else if pos_full == 2 { // district full name
+						citys := e.NewDistrictCityMap[text]
+						if len(citys) > 0 {
+							if !repeatD_full[text] {
+								PCDScore(j, "district", text, 1, true)
+								repeatD_full[text] = true
+							}
+							for _, c := range citys {
+								if !repeatC_full[c.Name] {
+									PCDScore(j, "city", c.Name, 1, true)
+									repeatC_full[c.Name] = true
+								}
+								if !repeatP_full[c.P.Brief] {
+									PCDScore(j, "province", c.P.Brief, 1, true)
+									repeatP_full[c.P.Brief] = true
+								}
+							}
+							break
+						}
+					} else if pos_full == 3 { // street full name
+						districts := e.NewStreetDistrictMap[text]
+						if len(districts) == 1 {
+							DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
+						}
+					} else if pos_full == 4 { // residents' committee full name
+						districts := e.CommunityDistrictMap[text]
+						if len(districts) == 1 {
+							DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
+						}
+					}
+				}
+			}
+			// short-name matching
+			for pos_sim, trie_sim := range e.Trie_Sims {
+				if trie_sim.Get(text) {
+					if pos_sim == 0 && !repeatP_sim[text] { // province short name
+						PCDScore(j, "province", text, 1, false)
+						repeatP_sim[text] = true
+						break
+					} else if pos_sim == 1 { // city short name
+						if cbMap := e.CityBriefMap[text]; cbMap != nil {
+							if !repeatP_sim[cbMap.P.Brief] {
+								PCDScore(j, "province", cbMap.P.Brief, 1, false)
+								repeatP_sim[cbMap.P.Brief] = true
+							}
+							if !repeatC_sim[cbMap.Name] {
+								PCDScore(j, "city", cbMap.Name, 1, false)
+								repeatC_sim[cbMap.Name] = true
+							}
+							break
+						}
+					} else if pos_sim == 2 { // district short name
+						dfull_citys := e.NewDistrictSimAndAll[text]
+						if len(dfull_citys) == 1 {
+							for _, dfull_city := range dfull_citys {
+								for dfull, ctmp := range dfull_city { // dfull: the full name the short name corresponds to
+									if !repeatD_sim[dfull] {
+										PCDScore(j, "district", dfull, 1, false)
+										repeatD_sim[dfull] = true
+									}
+									// Entries without a city or without a province
+									// contribute the district only (same guard as in
+									// NewGetCityByOthers); dereferencing a nil P here
+									// would abort the whole extraction.
+									if ctmp == nil || ctmp.P == nil {
+										continue
+									}
+									if !repeatC_sim[ctmp.Name] {
+										PCDScore(j, "city", ctmp.Name, 1, false)
+										repeatC_sim[ctmp.Name] = true
+									}
+									if !repeatP_sim[ctmp.P.Brief] {
+										PCDScore(j, "province", ctmp.P.Brief, 1, false)
+										repeatP_sim[ctmp.P.Brief] = true
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+// DealMultipleDistrict scores the province, city and district that a street or
+// residents'-committee name resolves to.
+//
+// Nothing happens unless districts holds exactly one entry. When pbrief (the
+// province short name already known to the caller) is non-empty, the three
+// levels are scored only if the entry lies in that province. When pbrief is
+// empty, each level is scored; if the matching repeatP/repeatC/repeatD map is
+// non-nil the level is scored at most once per name and the name is recorded
+// in that map. The e parameter is not used.
+func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
+	if len(districts) != 1 {
+		return
+	}
+	district := districts[0]
+	city := district.C.Name
+	tmpPbrief := district.C.P.Brief
+	if pbrief != "" {
+		if tmpPbrief == pbrief {
+			PCDScore(j, "province", tmpPbrief, score, true)
+			PCDScore(j, "city", city, score, true)
+			PCDScore(j, "district", district.Name, score, true)
+		}
+		return
+	}
+	if repeatP == nil {
+		PCDScore(j, "province", tmpPbrief, score, true)
+	} else if !(*repeatP)[tmpPbrief] {
+		PCDScore(j, "province", tmpPbrief, score, true)
+		(*repeatP)[tmpPbrief] = true
+	}
+	if repeatC == nil {
+		PCDScore(j, "city", city, score, true)
+	} else if !(*repeatC)[city] {
+		PCDScore(j, "city", city, score, true)
+		(*repeatC)[city] = true
+	}
+	// The de-dup key must be the district name (the key that is stored below);
+	// looking up the province short name here never matched, so the district
+	// was scored again on every occurrence.
+	if repeatD == nil {
+		PCDScore(j, "district", district.Name, score, true)
+	} else if !(*repeatD)[district.Name] {
+		PCDScore(j, "district", district.Name, score, true)
+		(*repeatD)[district.Name] = true
+	}
+}
+
+// NewGetCity appends to tmpcity every top-scoring city in finishC that
+// e.CityFullMap places in the province area. If tmpcity then holds exactly one
+// city, that city is returned; otherwise the city argument is returned
+// unchanged. The (possibly extended) tmpcity slice is returned as well.
+func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
+	for _, candidate := range finishC {
+		cfMap := e.CityFullMap[candidate]
+		if cfMap == nil || cfMap.P.Brief != area {
+			continue
+		}
+		tmpcity = append(tmpcity, candidate)
+	}
+	if len(tmpcity) == 1 {
+		return tmpcity[0], tmpcity
+	}
+	return city, tmpcity
+}
+
+// NewGetDistrict picks the district (and possibly the city) from the
+// top-scoring districts in finishD, returning (city, district).
+//
+// For every city that e.NewDistrictCityMap lists for a candidate district:
+//   - no candidate city (tmpcity empty): the first city in province area wins;
+//   - one candidate city: the district is accepted if its city equals city and
+//     lies in province area;
+//   - several candidate cities: the district decides, but only when it is the
+//     single top-scoring district and its city is one of the candidates.
+//
+// If nothing matches, city and district are returned unchanged.
+func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
+	for _, d := range finishD {
+		for _, c := range e.NewDistrictCityMap[d] {
+			switch len(tmpcity) {
+			case 0:
+				if c.P.Brief == area {
+					return c.Name, d
+				}
+			case 1:
+				if c.Name == city && c.P.Brief == area {
+					return city, d
+				}
+			default:
+				for _, tc := range tmpcity {
+					if tc == c.Name && len(finishD) == 1 {
+						return c.Name, d
+					}
+				}
+			}
+		}
+	}
+	return city, district
+}
+
+// PCDScoreByDistrictSim adds score to the entry t of one of the side score maps
+// used for district (or county) short-name matches: stype "p" selects *ps
+// (province), "c" selects *cs (city) and "d" selects *ds (district). An empty t
+// or any other stype is ignored.
+func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
+	defer qu.Catch()
+	if t == "" {
+		return
+	}
+	switch stype {
+	case "d":
+		(*ds)[t] += score
+	case "c":
+		(*cs)[t] += score
+	case "p":
+		(*ps)[t] += score
+	}
+}
+
+// MergeScores adds the side scores collected from district short-name matches
+// (*pscore, *cscore, *dscore) to j.FullAreaScore, j.FullCityScore and
+// j.FullDistrictScore. The side scores are used only when j.FullAreaScore
+// already has at least one entry; otherwise they are ignored.
+func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
+	if len(j.FullAreaScore) == 0 {
+		return
+	}
+	for name, s := range *pscore {
+		j.FullAreaScore[name] += s
+	}
+	for name, s := range *cscore {
+		j.FullCityScore[name] += s
+	}
+	for name, s := range *dscore {
+		j.FullDistrictScore[name] += s
+	}
+}
+// MergeFullSimScore folds the short-name score maps of j into the full-name
+// score maps.
+//
+// Provinces: if j.FullAreaScore is empty it is replaced by j.SimAreaScore (the
+// two fields then refer to the same map); otherwise only provinces already
+// present in j.FullAreaScore receive their short-name score, and short-name-only
+// provinces are left out.
+// Cities and districts: every short-name score is added to the full-name map,
+// creating the entry when it does not exist yet.
+func MergeFullSimScore(j *ju.Job) {
+	if len(j.FullAreaScore) > 0 {
+		for name := range j.FullAreaScore {
+			j.FullAreaScore[name] += j.SimAreaScore[name]
+		}
+	} else {
+		j.FullAreaScore = j.SimAreaScore
+	}
+	for name, s := range j.SimCityScore {
+		j.FullCityScore[name] += s
+	}
+	for name, s := range j.SimDistrictScore {
+		j.FullDistrictScore[name] += s
+	}
+}
+
+// RemoveCD removes city and district candidates that do not belong to any of
+// the top-scoring provinces in finishP.
+//
+// First pass: for every scored district, each of its cities (per
+// e.NewDistrictCityMap) that currently has a non-zero city score and lies
+// outside finishP is deleted from j.FullCityScore, together with the district
+// itself. Second pass: every remaining city whose province (per e.CityFullMap)
+// is outside finishP is deleted; cities unknown to e.CityFullMap are logged and
+// kept.
+func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
+	// inTop reports whether the province short name is one of the winners.
+	inTop := func(pb string) bool {
+		for _, p := range finishP {
+			if p == pb {
+				return true
+			}
+		}
+		return false
+	}
+	// Deleting entries while ranging over a map is permitted in Go.
+	for d := range j.FullDistrictScore {
+		for _, c := range e.NewDistrictCityMap[d] {
+			if j.FullCityScore[c.Name] == 0 {
+				continue
+			}
+			if inTop(c.P.Brief) {
+				continue
+			}
+			delete(j.FullCityScore, c.Name)
+			delete(j.FullDistrictScore, d)
+		}
+	}
+	for name := range j.FullCityScore {
+		c := e.CityFullMap[name]
+		if c == nil {
+			log.Debug("行政区划错误数据:", name, j.SourceMid)
+			continue
+		}
+		if !inTop(c.P.Brief) {
+			delete(j.FullCityScore, name)
+		}
+	}
+
+}
+
+//province,city,district干扰项减分
+//func PCDSubtractScore(e *ExtractTask, j *ju.Job, stype, text string, score int) {
+//	defer qu.Catch()
+//	if text != "" {
+//		if stype == "city" {
+//			for cn, cscore := range j.CityScore {
+//				if cn != text {
+//					j.CityScore[cn] = cscore + score
+//					//错误的city减分后对应的province也减分
+//					for pb, pscore := range j.AreaScore {
+//						if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
+//							j.AreaScore[pb] = pscore + score
+//						}
+//					}
+//				}
+//			}
+//		} else if stype == "province" {
+//			for pb, pscore := range j.AreaScore {
+//				if pb != text {
+//					j.AreaScore[pb] = pscore + score
+//					//错误的province减分后对应的city也要减分
+//					for cn, cscore := range j.CityScore {
+//						if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
+//							j.CityScore[cn] = cscore + score
+//						}
+//					}
+//				}
+//			}
+//		}
+//		//		for name, tmpscore := range *whichMap {
+//		//			if name != text {
+//		//				(*whichMap)[name] = tmpscore + score
+//		//			}
+//		//		}
+//	}
+//}

+ 2 - 2
src/jy/util/ossclient.go

@@ -14,8 +14,8 @@ var (
 	ossEndpoint        = "oss-cn-beijing-internal.aliyuncs.com" //http://oss-cn-beijing.aliyuncs.com"
 	//ossEndpointTest    = "topjy.oss-cn-beijing.aliyuncs.com"
 	ossEndpointTest    = "oss-cn-beijing.aliyuncs.com"
-	ossAccessKeyId     = "LTAI4FvLSWN3Wz9F6dUxQGMR"
-	ossAccessKeySecret = "WnQpnNVEiRfZsz5hIqFSr0phayMo3U"
+	ossAccessKeyId     = "LTAI4G5x9aoZx8dDamQ7vfZi"
+	ossAccessKeySecret = "Bk98FsbPYXcJe72n1bG3Ssf73acuNh"
 	ossBucketName      = "topjy"
 	ossclient          *oss.Client
 )

+ 0 - 1
src/jy/util/util2.go

@@ -9,7 +9,6 @@ import (
 	"strconv"
 	"strings"
 	"sync"
-	"time"
 )
 
 const (