123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657 |
- package extract
import (
	"sort"
	"strings"

	. "jy/pretreated"
	ju "jy/util"
	qu "qfw/util"
)
// Administrative-division records and the lookup records (postcode, dialling
// code) used by the city extraction code.
type (
	// Province is a province-level division.
	Province struct {
		Name    string // name of the province
		Brief   string // short name of the province
		Cap     string // name of the capital city
		Captial *City  // capital city record (field name spelling kept as-is)
	}

	// City is a city; P points at the province it belongs to.
	City struct {
		Name  string // name of the city
		Brief string // short name of the city
		P     *Province
	}

	// District is a district or county; C points at the owning city.
	District struct {
		Name string
		C    *City
	}

	// Street is a street; D points at the owning district.
	Street struct {
		Name string
		D    *District
	}

	// Community is a village, community or neighbourhood committee;
	// S points at the owning street.
	Community struct {
		Name string
		S    *Street
	}

	// DistrictSimFull ties a district/county short name to its full name
	// and to the city it belongs to.
	DistrictSimFull struct {
		SimName  string
		FullName string
		C        *City
	}

	// PostCode is a postal code record.
	PostCode struct {
		Code string
		P    string   // province
		C    string   // city
		D    []string // districts sharing this code
	}

	// AreaCode is a telephone area (dialling) code record.
	AreaCode struct {
		Code string
		P    string   // province
		C    []string // cities sharing this code
	}
)
- //抽取city
- func (e *ExtractTask) ExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
- /*
- 高准确率:
- 1.爬虫数据jsondata
- 2.采购单位库
- 3.邮编
- 4.固话
- 5.site(todo)
- 低准确率:(全称库匹配到不走简称库)
- 1.city全称库(buyeraddr;title,projectname)
- 2.city简称库(buyeraddr;title,projectname)
- */
- defer qu.Catch()
- //初始化
- if j.FullAreaScore == nil {
- j.FullAreaScore = make(map[string]float64)
- }
- if j.FullCityScore == nil {
- j.FullCityScore = make(map[string]float64)
- }
- if j.FullDistrictScore == nil {
- j.FullDistrictScore = make(map[string]float64)
- }
- sm := NewSortMap()
- //高精度抽取city
- //存储每个流程的抽取结果
- area1 := make([]map[string]string, 4)
- city1 := make([]map[string]string, 4)
- district1 := make([]map[string]string, 4)
- //jsondata
- p0, c0, d0, p, c, d := e.GetCityByJsonData(j)
- area1 = append(area1, map[string]string{"a_c_d": p})
- city1 = append(city1, map[string]string{"a_c_d": c})
- district1 = append(district1, map[string]string{"a_c_d": d})
- area1[0] = map[string]string{"jsondata": p0}
- city1[0] = map[string]string{"jsondata": c0}
- district1[0] = map[string]string{"jsondata": d0}
- //qu.Debug("=====jsondata打分---", j.AreaScore, j.CityScore, j.DistrictScore)
- //采购单位库
- buyer, _ := resulttmp["buyer"].(string)
- p1, c1, d1 := e.GetCityByBuyer(j, buyer)
- //qu.Debug("buyer p--", p1, "c--", c1, "d--", d1)
- area1[1] = map[string]string{"buyer": p1}
- city1[1] = map[string]string{"buyer": c1}
- district1[1] = map[string]string{"buyer": d1}
- //qu.Debug("=====采购单位库打分---", j.AreaScore, j.CityScore, j.DistrictScore)
- //postcode邮编
- buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
- p2, c2, d2 := e.GetCityByPostCode(j, buyerzipcode)
- //qu.Debug("postcode p--", p2, "c--", c2, "d--", d2)
- area1[2] = map[string]string{"postcode": p2}
- city1[2] = map[string]string{"postcode": c2}
- district1[2] = map[string]string{"postcode": d2}
- //qu.Debug("=====postcode邮编打分---", j.AreaScore, j.CityScore, j.DistrictScore)
- //areacode固话区号
- buyertel, _ := resulttmp["buyertel"].(string)
- p3, c3, d3 := e.GetCityByAreaCode(j, buyertel)
- //qu.Debug("areacode p--", p3, "c--", c3, "d--", d3, buyertel)
- area1[3] = map[string]string{"areacode": p3}
- city1[3] = map[string]string{"areacode": c3}
- district1[3] = map[string]string{"areacode": d3}
- //qu.Debug("=====areacode固话区号打分---", j.AreaScore, j.CityScore, j.DistrictScore)
- HighPreCity := make(map[string]interface{})
- HighPreCity["area"] = area1
- HighPreCity["city"] = city1
- HighPreCity["district"] = district1
- //低精度抽取city
- //buyeraddr,title,projectname
- buyeraddr, _ := resulttmp["buyeraddr"].(string)
- title, _ := resulttmp["title"].(string)
- projectname, _ := resulttmp["projectname"].(string)
- //qu.Debug(buyeraddr, "--", buyer, "--", title, "--", projectname)
- sm.AddKey("buyeraddr", buyeraddr)
- sm.AddKey("buyer", buyer)
- sm.AddKey("title", title)
- sm.AddKey("projectname", projectname)
- area2, city2, district2 := e.GetCityByOthers(j, sm)
- LowPreCity := make(map[string]interface{})
- LowPreCity["area"] = area2
- LowPreCity["city"] = city2
- LowPreCity["district"] = district2
- // resulttmp["highprecity"] = HighPreCity
- // resulttmp["lowprecity"] = LowPreCity
- //qu.Debug("最终打分---", j.AreaScore, j.CityScore, j.DistrictScore)
- //最终抽取结果
- finishP := HighestScoreArr(j.FullAreaScore)
- finishC := HighestScoreArr(j.FullCityScore)
- finishD := HighestScoreArr(j.FullDistrictScore)
- // area, _ := resulttmp["area"].(string)
- // city, _ := resulttmp["city"].(string)
- // district, _ := resulttmp["district"].(string)
- // qu.Debug("之前结果结果===", area, city, district)
- arearesult := ""
- cityresult := ""
- districtresult := ""
- if len(finishP) == 1 { //最高分一个
- arearesult = finishP[0] //抽取结果直接赋值
- cityresult = GetCity(arearesult, cityresult, e, finishC)
- cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
- } else if len(finishP) > 1 { //province最高分多个
- if len(finishC) == 1 {
- cityresult = finishC[0]
- if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
- arearesult = cfMap.P.Brief
- cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
- }
- } else { //对应的city有多个(多个province和city)
- arearesult = finishP[0] //抽取结果直接赋值
- cityresult = GetCity(arearesult, cityresult, e, finishC)
- cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
- }
- }
- //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
- if arearesult == "" {
- arearesult = "全国"
- } else if cityresult == "" {
- if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
- cityresult = pbMap.Cap
- resulttmp["defaultpcap"] = true
- }
- }
- //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
- resulttmp["area"] = arearesult
- resulttmp["city"] = cityresult
- resulttmp["district"] = districtresult
- }
- func (e *ExtractTask) GetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
- defer qu.Catch()
- jsondata := *j.Jsondata
- if jsondata != nil { //jsondata中获取province和city
- if acd, ok := jsondata["area_city_district"].(string); ok && acd != "" {
- flag := false
- p, flag = GetPCDByAreaDFA(p, acd, e, j, flag)
- if !flag {
- p, c, flag = GetPCDByCityDFA(p, c, acd, e, j, flag)
- }
- if !flag {
- p, city, c = GetPCDByDistrictDFA(p, c, d, acd, e, j)
- }
- }
- city, _ = jsondata["city"].(string) //city全称或者简称
- province, _ = jsondata["area"].(string) //province简称
- district, _ = jsondata["district"].(string) //district全称
- }
- PCDScore(j, "district", district, 5, true) //district打分
- bp := false
- if province != "" {
- if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
- bp = true //省份正确
- }
- }
- pbrief := ""
- if city != "" {
- cityfullmap := e.CityFullMap[city] //判断city全称是否正确
- if cityfullmap != nil {
- pbrief = cityfullmap.P.Brief //province简称
- } else {
- citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
- if citybriefmap != nil {
- city = citybriefmap.Name //city简称替换为全称
- pbrief = citybriefmap.P.Brief
- }
- }
- }
- if bp {
- if pbrief == province { //爬虫的province和city匹配
- PCDScore(j, "city", city, 5, true)
- } else { //pbrief不匹配province(此时city为空或者错误)
- city = ""
- }
- PCDScore(j, "province", province, 5, true)
- } else { //省份错误或为空,取city的对应的pbrief为province
- if pbrief != "" {
- province = pbrief
- PCDScore(j, "province", province, 5, true)
- PCDScore(j, "city", city, 5, true)
- } else {
- province = ""
- city = ""
- }
- }
- return
- }
- func (e *ExtractTask) GetCityByBuyer(j *ju.Job, buyer string) (province, city, district string) {
- defer qu.Catch()
- return
- }
- func (e *ExtractTask) GetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
- defer qu.Catch()
- pc := e.PostCodeMap[postcode]
- if pc != nil {
- province = pc.P
- city = pc.C
- districtTmp := pc.D
- if len(districtTmp) == 1 { //对应多个district舍去
- district = districtTmp[0]
- PCDScore(j, "district", district, 5, true)
- }
- PCDScore(j, "province", province, 5, true)
- PCDScore(j, "city", city, 5, true)
- }
- return
- }
- func (e *ExtractTask) GetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
- defer qu.Catch()
- if len(buyertel) >= 11 {
- if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
- n := 4
- L:
- areacode := buyertel[:n]
- ac := e.AreaCodeMap[areacode]
- if ac != nil {
- province = ac.P
- citytmp := ac.C
- if len(citytmp) == 1 { //对应多个city舍去
- city = citytmp[0]
- PCDScore(j, "city", city, 5, true)
- }
- PCDScore(j, "province", province, 5, true)
- } else {
- n = n - 1
- if n >= 3 {
- goto L
- }
- }
- } else if buyertel[:3] == "853" { //澳门
- province = "澳门"
- city = "澳门"
- PCDScore(j, "province", province, 5, true)
- PCDScore(j, "city", city, 5, true)
- }
- }
- return
- }
- func (e *ExtractTask) GetCityByOthers(j *ju.Job, sm *SortMap) ([]map[string]string, []map[string]string, []map[string]string) {
- //存储每个流程的抽取结果
- area2 := []map[string]string{}
- city2 := []map[string]string{}
- district2 := []map[string]string{}
- isExtPC := false
- for _, from := range sm.Keys { //buyeraddr;title;projectname
- str, _ := sm.Map[from].(string)
- //分别记录buyeraddr;title;projectname全称匹配的打分情况
- pscore1 := make(map[string]int)
- cscore1 := make(map[string]int)
- dscore1 := make(map[string]int)
- //优先province,city,district,street全称匹配
- for pos, GET := range []*ju.DFA{e.ProvinceAllGet, e.CityAllGet, e.DistrictAllGet, e.StreetGet} {
- word := GET.CheckSensitiveWord(str)
- if word != "" {
- if pos == 0 { //province
- pbrief := e.ProvinceMap[word] //取province简称
- OtherScore("p", []string{pbrief}, &pscore1, &cscore1, &dscore1)
- } else if pos == 1 { //city
- p := ""
- cityfullmap := e.CityFullMap[word]
- if cityfullmap != nil {
- p = cityfullmap.P.Brief //取province简称
- }
- OtherScore("c", []string{p, word}, &pscore1, &cscore1, &dscore1)
- } else if pos == 2 { //district
- p, c := "", ""
- dcitymap := e.DistrictCityMap[word] //区对应的city
- if dcitymap != nil {
- c = dcitymap.Name //city全称
- p = dcitymap.P.Brief //province简称
- }
- tmpArr := []string{p, c, word}
- if word == c { //河南济源市
- tmpArr = []string{p, c}
- }
- OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
- } else if pos == 3 { //street
- p, c, d := "", "", ""
- sdmap := e.StreetDistrictMap[word] //对应的区
- if sdmap != nil {
- d = sdmap.Name
- c = sdmap.C.Name
- p = sdmap.C.P.Brief
- }
- tmpArr := []string{p, c, d}
- if c == d { //河南济源市
- tmpArr = []string{p, c}
- }
- OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
- }
- }
- }
- //取最高分的province,city,district
- ph1 := HighestScore(pscore1)
- ch1 := HighestScore(cscore1)
- dh1 := HighestScore(dscore1)
- isMatch := IsMatch(ph1, ch1, e) //最高分p和最高分c可能不对应
- if ch1 != "" && ph1 != "" && isMatch {
- isExtPC = true
- }
- //是否相互匹配
- area2 = append(area2, map[string]string{from + "_all": ph1})
- city2 = append(city2, map[string]string{from + "_all": ch1})
- district2 = append(district2, map[string]string{from + "_all": dh1})
- //buyeraddr,title,projectname匹配对应的结果加入最终得分
- if isMatch {
- if from == "buyeraddr" || from == "buyer" { //全称匹配,buyeraddr和buyer3分,title和projectname2分
- PCDScore(j, "province", ph1, 3, true)
- PCDScore(j, "city", ch1, 3, true)
- PCDScore(j, "district", dh1, 3, true)
- } else {
- PCDScore(j, "province", ph1, 2, true)
- PCDScore(j, "city", ch1, 2, true)
- PCDScore(j, "district", dh1, 2, true)
- }
- }
- }
- //判断全称是否抽出了province和city,一个未抽出走简称抽取
- if !isExtPC {
- for _, from := range sm.Keys { //buyeraddr;title;projectname
- str, _ := sm.Map[from].(string)
- pscore2 := make(map[string]int)
- cscore2 := make(map[string]int)
- dscore2 := make(map[string]int)
- for pos, GET := range []*ju.DFA{e.ProvinceSimGet, e.CitySimGet, e.DistrictSimGet} {
- word := GET.CheckSensitiveWord(str)
- if word != "" {
- if pos == 0 { //province
- OtherScore("p", []string{word}, &pscore2, &cscore2, &dscore2)
- } else if pos == 1 { //city
- p, c := "", ""
- citybriefmap := e.CityBriefMap[word]
- if citybriefmap != nil {
- p = citybriefmap.P.Brief
- c = citybriefmap.Name
- }
- OtherScore("c", []string{p, c}, &pscore2, &cscore2, &dscore2)
- } else if pos == 2 { //district
- p, c := "", ""
- d := e.DistrictSimAndAll[word]
- dcitymap := e.DistrictCityMap[word]
- if dcitymap != nil {
- c = dcitymap.Name
- p = dcitymap.P.Brief
- }
- OtherScore("d", []string{p, c, d}, &pscore2, &cscore2, &dscore2)
- }
- }
- }
- //取最高分的province,city,district
- ph2 := HighestScore(pscore2)
- ch2 := HighestScore(cscore2)
- dh2 := HighestScore(dscore2)
- area2 = append(area2, map[string]string{from + "_sim": ph2})
- city2 = append(city2, map[string]string{from + "_sim": ch2})
- district2 = append(district2, map[string]string{from + "_sim": dh2})
- //buyeraddr,title,projectname匹配对应的结果加入最终得分
- if from == "buyeraddr" {
- PCDScore(j, "province", ph2, 2, true)
- PCDScore(j, "city", ch2, 2, true)
- PCDScore(j, "district", dh2, 2, true)
- } else {
- PCDScore(j, "province", ph2, 1, true)
- PCDScore(j, "city", ch2, 1, true)
- PCDScore(j, "district", dh2, 1, true)
- }
- }
- }
- return area2, city2, district2
- }
- func IsMatch(p, c string, e *ExtractTask) bool {
- ism := false
- if p != "" && c == "" {
- return true
- }
- if cfMap := e.CityFullMap[c]; cfMap != nil {
- if cfMap.P.Brief == p {
- ism = true
- }
- }
- return ism
- }
- //计算province,city,district得分
- func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
- defer qu.Catch()
- if text != "" {
- if stype == "district" {
- tmpdistrict := make(map[string]float64)
- if isfull {
- tmpdistrict = j.FullDistrictScore
- } else {
- tmpdistrict = j.SimDistrictScore
- }
- scoretmp := tmpdistrict[text]
- tmpdistrict[text] = scoretmp + score
- } else if stype == "city" {
- tmpcity := make(map[string]float64)
- if isfull {
- tmpcity = j.FullCityScore
- } else {
- tmpcity = j.SimCityScore
- }
- scoretmp := tmpcity[text]
- tmpcity[text] = scoretmp + score
- } else if stype == "province" {
- tmpprovince := make(map[string]float64)
- if isfull {
- tmpprovince = j.FullAreaScore
- } else {
- tmpprovince = j.SimAreaScore
- }
- scoretmp := tmpprovince[text]
- tmpprovince[text] = scoretmp + score
- }
- }
- }
- func OtherScore(stype string, text []string, ps, cs, ds *map[string]int) {
- defer qu.Catch()
- for i, t := range text {
- if t != "" {
- if i == 0 { //p
- tmpscore := (*ps)[t]
- (*ps)[t] = tmpscore + 1
- } else if i == 1 { //c
- tmpscore := (*cs)[t]
- (*cs)[t] = tmpscore + 1
- } else if i == 2 { //d
- tmpscore := (*ds)[t]
- (*ds)[t] = tmpscore + 1
- }
- }
- }
- }
// HighestScore returns the key of m with the highest positive score, or ""
// when m has no non-empty key with a score above zero.
//
// When several keys share the highest score the lexicographically smallest
// key is returned. The previous version returned whichever tied key the map
// iteration happened to visit first, so the same input could give different
// answers from run to run.
func HighestScore(m map[string]int) string {
	result := ""
	best := 0
	for str, score := range m {
		if str == "" {
			continue
		}
		// result != "" implies best > 0, so zero scores can never tie in.
		if score > best || (score == best && result != "" && str < result) {
			result = str
			best = score
		}
	}
	return result
}
// HighestScoreArr returns every non-empty key of m that carries the highest
// score, considering only scores that are >= 0. It returns nil when no key
// qualifies.
//
// The result is sorted. The previous version returned the tied keys in map
// iteration order, which is random, so callers that take the first element
// could see a different winner on every run.
func HighestScoreArr(m map[string]float64) []string {
	var winners []string
	top := 0.0
	for str, score := range m {
		if str != "" && top <= score {
			if score > top {
				// A strictly better score invalidates everything kept so far.
				top = score
				winners = winners[:0]
			}
			winners = append(winners, str)
		}
	}
	sort.Strings(winners)
	return winners
}
- func GetCity(area, city string, e *ExtractTask, finishC []string) string {
- for _, c := range finishC { //取最高分与province匹配的city
- if cfMap := e.CityFullMap[c]; cfMap != nil {
- if cfMap.P.Brief == area {
- city = c
- break
- }
- }
- }
- return city
- }
- func GetDistrict(area, city, district string, e *ExtractTask, finishD []string) (string, string) {
- for _, d := range finishD { //取最高分与province匹配的district
- if dcMap := e.DistrictCityMap[d]; dcMap != nil {
- if dcMap.P.Brief == area {
- district = d
- tmpcity := dcMap.Name
- if city != tmpcity {
- if cfMap := e.CityFullMap[tmpcity]; cfMap != nil {
- if cfMap.P.Brief == area {
- city = tmpcity
- break
- }
- }
- }
- }
- }
- }
- return city, district
- }
- func GetPCDByAreaDFA(province, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, bool) {
- if word := e.ProvinceSimGet.CheckSensitiveWord(acd); word != "" { //取省
- if pbMap := e.ProvinceBriefMap[word]; pbMap != nil {
- province = pbMap.Brief
- if province == acd || pbMap.Name == acd { //用于判断area_city_district是否只有省份信息,flag为true就不在匹配area_city_district中的city和district
- flag = true
- }
- PCDScore(j, "province", province, 5, true)
- }
- }
- return province, flag
- }
- func GetPCDByCityDFA(province, city, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, string, bool) {
- for pos, GET := range []*ju.DFA{e.CityAllGet, e.CitySimGet} { //取市
- if word := GET.CheckSensitiveWord(acd); word != "" {
- if pos == 0 { //全称
- if cfMap := e.CityFullMap[word]; cfMap != nil {
- if province != "" && cfMap.P.Brief == province { //acd有province信息
- city = cfMap.Name
- if acd == province+city || acd == cfMap.P.Name+city {
- flag = true
- }
- } else if province == "" { //acd有city;city和district信息
- city = cfMap.Name
- province = cfMap.P.Brief
- PCDScore(j, "province", province, 5, true)
- if acd == city {
- flag = true
- }
- }
- PCDScore(j, "city", city, 5, true)
- break
- }
- } else { //简称
- if cbMap := e.CityBriefMap[word]; cbMap != nil {
- if province != "" && cbMap.P.Brief == province {
- city = cbMap.Name
- if acd == province+city || acd == cbMap.P.Name+city {
- flag = true
- }
- } else if province == "" {
- city = cbMap.Name
- province = cbMap.P.Brief
- PCDScore(j, "province", province, 5, true)
- if acd == city {
- flag = true
- }
- }
- PCDScore(j, "city", city, 5, true)
- break
- }
- }
- }
- }
- return province, city, flag
- }
- func GetPCDByDistrictDFA(province, city, district, acd string, e *ExtractTask, j *ju.Job) (string, string, string) {
- //area_city_district字段不会单独存区信息(省市,省,市,省区,省市区)
- for pos, GET := range []*ju.DFA{e.DistrictAllGet, e.DistrictSimGet} { //取区
- if word := GET.CheckSensitiveWord(acd); word != "" {
- if dcMap := e.DistrictCityMap[word]; dcMap != nil {
- district = word
- if pos == 1 { //简称换为全称
- district = e.DistrictSimAndAll[district]
- }
- if city == "" && dcMap.P.Brief == province { //只有province和district(are_city_district:河南省二七区)
- city = dcMap.Name
- PCDScore(j, "city", city, 5, true)
- } else if province == "" { //province和city都没有(are_city_district:二七区)
- city = dcMap.Name
- province = dcMap.P.Brief
- PCDScore(j, "city", city, 5, true)
- PCDScore(j, "province", province, 5, true)
- }
- PCDScore(j, "district", district, 5, true)
- break
- }
- }
- }
- return province, city, district
- }
|