123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609 |
- package extract
- import (
- . "jy/pretreated"
- ju "jy/util"
- qu "qfw/util"
- "strings"
- )
- // 标准化校验后存值
- func (e *ExtractTask) ExtractRegionClean(tmp *map[string]interface{}) {
- //标准化校验
- update_check := make(map[string]interface{}, 0)
- e.GetCheckFinallyRegionInfo(*tmp, &update_check)
- for k, v := range update_check {
- if k == "area" || k == "city" || k == "district" {
- (*tmp)[k] = v
- }
- if k == "modifycheck" && v != nil {
- (*tmp)[k] = v
- }
- }
- }
- // 抽取地域信息
- func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}, isLog bool) {
- defer qu.Catch()
- //日志记录
- logRecordInfo := []map[string]interface{}{}
- f_area, f_city, f_district := "", "", ""
- all_regions := map[string]map[string]map[string]string{}
- //jsondata ~ 前置条件
- e.GetRegionByTentativeJsonData(j, &all_regions)
- if isLog && len(all_regions) > 0 {
- valueArr := []string{}
- valueArr = append(valueArr, qu.ObjToString((*j.Jsondata)["area_city_district"]))
- LogProcessRecordingForTentative("jsondata", valueArr, all_regions, &logRecordInfo)
- }
- b := ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
- if b {
- CompleteRegionInfo(&f_area, &f_city, &f_district)
- //最终赋值
- (*tmp)["area"] = f_area
- (*tmp)["city"] = f_city
- (*tmp)["district"] = f_district
- (*tmp)["regions_log"] = logRecordInfo
- return
- }
- //是否三大运营商-前置条件2
- e.GetRegionByTentativeOperator(qu.ObjToString((*tmp)["winner"]), &all_regions)
- if isLog && len(all_regions) > 0 {
- valueArr := []string{}
- valueArr = append(valueArr, qu.ObjToString((*tmp)["winner"]))
- LogProcessRecordingForTentative("运营商", valueArr, all_regions, &logRecordInfo)
- }
- b = ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
- if b {
- CompleteRegionInfo(&f_area, &f_city, &f_district)
- //最终赋值
- (*tmp)["area"] = f_area
- (*tmp)["city"] = f_city
- (*tmp)["district"] = f_district
- (*tmp)["regions_log"] = logRecordInfo
- return
- }
- //字段可控
- RegionFieldsArr := ju.DefaultRegions
- //采购单位比较特殊~需要根据站点类型进行重新组合
- if e.IsConsecutionRegion(qu.ObjToString((*tmp)["site"])) {
- RegionFieldsArr = ju.AdjustmentRegions
- }
- for _, v := range RegionFieldsArr {
- keyArr := strings.Split(v, ",")
- isExists, textValues, field_regions, old_regions, new_regions := e.GetRegionByGroupInfo(keyArr, *tmp)
- if isExists { //是否存在抽取有效值
- AnalysisIsUniqueInfo(new_regions, &all_regions)
- if isLog { //日志记录
- LogProcessRecordingForGroupInfo(strings.Join(keyArr, "_"), textValues, field_regions, old_regions, all_regions, &logRecordInfo)
- }
- b = ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
- if b {
- CompleteRegionInfo(&f_area, &f_city, &f_district)
- //最终赋值
- (*tmp)["area"] = f_area
- (*tmp)["city"] = f_city
- (*tmp)["district"] = f_district
- (*tmp)["regions_log"] = logRecordInfo
- return
- }
- }
- }
- //未提前结束~筛选出~最终的
- ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
- //給地域做建议的清洗完善
- CompleteRegionInfo(&f_area, &f_city, &f_district)
- //用到的字段
- projectname := qu.ObjToString((*tmp)["projectname"])
- buyer := qu.ObjToString((*tmp)["buyer"])
- site := qu.ObjToString((*tmp)["site"])
- //新疆兵团补充地域~
- if XjbtReg.MatchString(buyer) && f_city == "" {
- if a, c, d, ok := e.NewVerifyXjCorpsInfo(buyer); ok {
- f_area, f_city, f_district = a, c, d
- if isLog {
- LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
- "sup_xjbt": f_area + "~" + f_city + "~" + f_district,
- })
- }
- }
- }
- //此时进行特殊链路新增、补充原则
- if f_city == "" {
- e.LinkSpecialRuleFullStep(projectname, &f_area, &f_city, &f_district)
- }
- //企业补充城市校验逻辑
- if buyer != "" && f_city == "" && (f_area == "全国" || f_area == "") {
- LinkSpecialQyxyStep(buyer, &f_area, &f_city, &f_district)
- if f_area != "" && f_area != "全国" && isLog {
- LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
- "sup_qy_link": f_area + "~" + f_city + "~" + f_district,
- })
- }
- }
- if f_city == "" {
- e.LinkSpecialRuleBriefStep(projectname, &f_area, &f_city, &f_district)
- }
- if f_city == "" {
- e.LinkSpecialRuleBriefStep(buyer, &f_area, &f_city, &f_district)
- }
- if isLog {
- LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
- "sup_link": f_area + "~" + f_city + "~" + f_district,
- })
- }
- //正文补充地域~
- if f_area == "全国" || f_area == "" || f_city == "" {
- if b := e.NewVerifySensitiveInfo(qu.ObjToString((*j.Data)["detail"]), &f_area, &f_city, &f_district); b {
- if isLog {
- LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
- "sup_detail": f_area + "~" + f_city + "~" + f_district,
- })
- }
- }
- }
- //最终站点补充
- if f_area == "全国" || f_area == "" {
- if sc := e.SiteCityMap[site]; sc != nil && sc.Q != "" {
- f_area = sc.Q
- if isLog {
- LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
- "sup_site": f_area + "~" + f_city + "~" + f_district,
- })
- }
- }
- }
- //最终在清洗一遍数据
- CompleteRegionInfo(&f_area, &f_city, &f_district)
- //最终赋值
- (*tmp)["area"] = f_area
- (*tmp)["city"] = f_city
- (*tmp)["district"] = f_district
- (*tmp)["regions_log"] = logRecordInfo
- }
- // 对组进行分析处理
- func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]interface{}) (bool, []string, map[string]interface{}, map[string]map[string]map[string]string, map[string]map[string]map[string]string) {
- old_regions := map[string]map[string]map[string]string{}
- isExists := false
- textArr := []string{}
- field_regions := map[string]interface{}{}
- for _, key := range keyArr {
- text := ""
- if key == "site_area" || key == "site_city" {
- text = qu.ObjToString(tmp["site"])
- } else if key == "buyer_filiale" {
- text = GetFilialeByBuyerInfo(qu.ObjToString(tmp["buyer"]))
- } else if key == "projectname" {
- text = CleanRegionProjectNameInfo(qu.ObjToString(tmp[key]), qu.ObjToString(tmp["buyer"]))
- } else {
- text = qu.ObjToString(tmp[key])
- }
- textArr = append(textArr, text)
- if text != "" {
- isExists = true
- } else {
- continue //无值不用提取
- }
- valuesArr := []map[string]interface{}{}
- if key == "buyerzipcode" {
- valuesArr = e.GetRegionByPostCode(text, &old_regions)
- } else if key == "buyertel" {
- valuesArr = e.GetRegionByTelNumber(text, &old_regions)
- } else if key == "site_area" {
- valuesArr = e.GetRegionBySite(text, &old_regions, 1)
- } else if key == "site_city" {
- valuesArr = e.GetRegionBySite(text, &old_regions, 2)
- } else if key == "buyer_filiale" {
- valuesArr = e.GetRegionFromText(text, &old_regions, false, false, 2)
- } else {
- isAddress, isBrief := false, false
- if key == "projectaddr" || key == "addressing" || key == "bidopenaddress" || key == "buyeraddr" {
- isAddress = true
- }
- valuesArr = e.GetRegionFromText(text, &old_regions, isAddress, isBrief, 2)
- }
- field_regions[key] = valuesArr
- }
- //校验当前组的合理性
- new_regions := ReasonableGroupRegionInfo(old_regions)
- return isExists, textArr, field_regions, old_regions, new_regions
- }
- // 邮政编号
- func (e *ExtractTask) GetRegionByPostCode(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
- regionsArr := []map[string]interface{}{}
- pc := e.PostCodeMap[text]
- if pc != nil {
- if len(pc.D) == 1 {
- UpdateRegionsInfo(pc.P, pc.C, pc.D[0], regions)
- regionsArr = append(regionsArr, map[string]interface{}{"area": pc.P, "city": pc.C, "district": pc.D[0]})
- } else {
- UpdateRegionsInfo(pc.P, pc.C, "", regions)
- regionsArr = append(regionsArr, map[string]interface{}{"area": pc.P, "city": pc.C, "district": ""})
- }
- }
- return regionsArr
- }
- // 固话号码
- func (e *ExtractTask) GetRegionByTelNumber(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
- regionsArr := []map[string]interface{}{}
- if len(text) >= 11 {
- if strings.HasPrefix(text, "0") { //区号除了澳门853其他都是以0开头
- n := 4
- L:
- areacode := text[:n]
- ac := e.AreaCodeMap[areacode]
- if ac != nil {
- if len(ac.C) == 1 {
- UpdateRegionsInfo(ac.P, ac.C[0], "", regions)
- regionsArr = append(regionsArr, map[string]interface{}{"area": ac.P, "city": ac.C[0], "district": ""})
- } else {
- UpdateRegionsInfo(ac.P, "", "", regions)
- regionsArr = append(regionsArr, map[string]interface{}{"area": ac.P, "city": "", "district": ""})
- }
- } else {
- n = n - 1
- if n >= 3 {
- goto L
- }
- }
- }
- }
- return regionsArr
- }
- // 初步确认~采集
- func (e *ExtractTask) GetRegionByTentativeJsonData(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
- area, city, district := "", "", ""
- regions := map[string]map[string]map[string]string{}
- if j.Jsondata != nil {
- jsondata := *j.Jsondata
- if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
- e.GetRegionFromText(a_c_d, ®ions, false, false, 1)
- }
- }
- if len(regions) == 1 {
- for k, v := range regions {
- area = k
- if len(v) == 1 {
- for k1, v1 := range v {
- city = k1
- if len(v1) == 1 {
- for k2, _ := range v1 {
- district = k2
- }
- } else {
- break
- }
- }
- } else {
- break
- }
- }
- }
- if area != "" { //组装结构
- city_info := map[string]map[string]string{}
- district_info := map[string]string{}
- if city != "" {
- if district != "" {
- district_info[district] = district
- }
- city_info[city] = district_info
- }
- (*all_regions)[area] = city_info
- }
- }
- // 简称全程标准化的校验~
- func (e *ExtractTask) StandardizedegionInfo(area *string, city *string, district *string) {
- //特殊市补充
- if *area == "北京" {
- *city = "北京市"
- } else if *area == "天津" {
- *city = "天津市"
- } else if *area == "上海" {
- *city = "上海市"
- } else if *area == "重庆" {
- *city = "重庆市"
- }
- //非空与空~是否标准校验
- if *area == "" {
- *city = ""
- *district = ""
- } else {
- if province := e.ProvinceMap[*area]; province != "" {
- *area = province
- }
- if *city == "" {
- *district = ""
- } else {
- if csMap := e.CityBriefMap[*city]; csMap != nil {
- if csMap.P.Brief == *area && csMap.Name != "" {
- *city = csMap.Name
- } else {
- *city = ""
- *district = ""
- }
- } else {
- if e.CityMap[*city] == "" {
- *city = ""
- *district = ""
- }
- }
- if *district != "" {
- citysArr := e.DistrictSimAndAll[*district]
- if len(citysArr) == 1 {
- full_city := citysArr[0]
- for d, _ := range full_city {
- *district = d
- }
- } else if len(citysArr) > 1 {
- *district = ""
- } else if len(citysArr) == 0 {
- fullArr := e.DistrictCityMap[*district]
- if len(fullArr) == 0 {
- *district = ""
- }
- } else {
- }
- }
- }
- }
- }
- // 站点取值 from 1-省 2-省市
- func (e *ExtractTask) GetRegionBySite(site string, regions *map[string]map[string]map[string]string, from int) []map[string]interface{} {
- regionArr := []map[string]interface{}{}
- area, city, district := "", "", ""
- if scMap := e.SiteCityMap[site]; scMap != nil {
- if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
- area = scMap.P
- }
- if scMap.C != "" && scMap.C != "null" && area != "" {
- city = scMap.C
- }
- }
- e.StandardizedegionInfo(&area, &city, &district)
- if from == 1 && area != "" && area != "全国" {
- UpdateRegionsInfo(area, "", "", regions)
- regionArr = append(regionArr, map[string]interface{}{"area": area, "city": "", "district": ""})
- }
- if from == 2 && area != "" && area != "全国" && city != "" {
- UpdateRegionsInfo(area, city, "", regions)
- regionArr = append(regionArr, map[string]interface{}{"area": area, "city": city, "district": ""})
- }
- return regionArr
- }
- // 新疆兵团
- func (e *ExtractTask) NewVerifyXjCorpsInfo(buyer string) (new_a, new_c, new_d string, ok bool) {
- buyer = strings.ReplaceAll(buyer, "新疆兵团", "新疆生产建设兵团")
- ok = false
- for _, info := range e.XjbtCityArr {
- name := qu.ObjToString(info["name"])
- alias := qu.ObjToString(info["alias"])
- if strings.Contains(buyer, name) || strings.Contains(buyer, alias) {
- new_a = qu.ObjToString(info["area"])
- new_c = qu.ObjToString(info["city"])
- new_d = qu.ObjToString(info["district"])
- ok = true
- list := ju.IsMarkInterfaceMap(info["list"])
- for _, c := range list {
- c_name := qu.ObjToString(c["name"])
- if strings.Contains(buyer, name+c_name) || strings.Contains(buyer, alias+c_name) {
- new_a = qu.ObjToString(c["area"])
- new_c = qu.ObjToString(c["city"])
- new_d = qu.ObjToString(c["district"])
- break
- }
- }
- break
- }
- }
- return new_a, new_c, new_d, ok
- }
- // 敏感词识别
- func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *string, district *string) bool {
- detail = SensitiveReg.ReplaceAllString(detail, "")
- detail = TextAfterRemoveTable(detail)
- detail = CleanDetailReg1.ReplaceAllString(detail, "")
- //全称城市
- fullCityArr := e.SensitiveFullCity.FindAll(detail)
- if len(fullCityArr) == 1 {
- for _, v := range fullCityArr {
- if cityMap := e.CityFullMap[v]; cityMap != nil {
- if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
- *area = cityMap.P.Brief
- *city = cityMap.Name
- return true
- }
- }
- }
- }
- //全称区县
- fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
- if len(fullDistrictArr) == 1 {
- for _, v := range fullDistrictArr {
- if citys := e.DistrictCityMap[v]; len(citys) == 1 {
- if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
- *area = citys[0].P.Brief
- *city = citys[0].Name
- *district = v
- return true
- }
- }
- }
- }
- //简称城市
- simCityArr := e.SensitiveSimCity.FindAll(detail)
- if len(simCityArr) == 1 {
- for _, v := range simCityArr {
- if cityMap := e.CityBriefMap[v]; cityMap != nil {
- if *area == "" || *area == "全国" {
- *area = cityMap.P.Brief
- if !strings.Contains(*area, v) {
- *city = cityMap.Name
- }
- return true
- }
- if cityMap.P.Brief == *area && !strings.Contains(*area, v) {
- *area = cityMap.P.Brief
- *city = cityMap.Name
- return true
- }
- }
- }
- }
- //疑似固话提取~
- if *area == "" || *area == "全国" {
- fixedTelArr := FixedTelReg.FindAllString(detail, -1)
- if len(fixedTelArr) > 0 {
- codeArr := resetFixedTelInfo(fixedTelArr)
- if len(codeArr) == 1 {
- for _, v := range codeArr {
- if ac := e.AreaCodeMap[v]; ac != nil {
- *area = ac.P
- return true
- }
- }
- }
- }
- }
- return false
- }
// resetFixedTelInfo returns the distinct area-code prefixes of the given
// landline numbers, in order of first appearance.
//
// The prefix of a number is everything before its first "-" (the whole string
// when there is no "-"). Empty numbers are skipped. The result is never nil.
//
// De-duplication uses a real set, so a number that starts with "-" (empty
// prefix) contributes a single "" entry rather than one entry per occurrence.
func resetFixedTelInfo(telArr []string) []string {
	codeArr := []string{}
	seen := make(map[string]bool, len(telArr))
	for _, tel := range telArr {
		if tel == "" {
			continue
		}
		code := tel
		if i := strings.Index(tel, "-"); i >= 0 {
			code = tel[:i]
		}
		if seen[code] {
			continue
		}
		seen[code] = true
		codeArr = append(codeArr, code)
	}
	return codeArr
}
- // 初步确认~运营商
- func (e *ExtractTask) GetRegionByTentativeOperator(winner string, all_regions *map[string]map[string]map[string]string) {
- area, city, district := "", "", ""
- regions := map[string]map[string]map[string]string{}
- if OperatorReg.MatchString(winner) {
- e.GetRegionFromText(winner, ®ions, false, false, 2)
- }
- if len(regions) == 1 {
- for k, v := range regions {
- area = k
- if len(v) == 1 {
- for k1, v1 := range v {
- city = k1
- if len(v1) == 1 {
- for k2, _ := range v1 {
- district = k2
- }
- } else {
- break
- }
- }
- } else {
- break
- }
- }
- }
- if area != "" { //组装结构
- //舍弃运营商的数据-area不一致
- if (*all_regions)[area] == nil && len((*all_regions)) > 0 {
- return
- }
- city_info := map[string]map[string]string{}
- if (*all_regions)[area] != nil {
- city_info = (*all_regions)[area]
- }
- district_info := map[string]string{}
- if city != "" {
- //舍弃运营商的数据-city不一致
- if city_info[city] == nil && len(city_info) > 0 {
- return
- }
- if district != "" {
- district_info[district] = district
- }
- city_info[city] = district_info
- }
- (*all_regions)[area] = city_info
- }
- }
- //初步确认~站点
- //func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
- // area, city, district := "", "", ""
- // site, _ := (*j.Data)["site"].(string)
- // if scMap := e.SiteCityMap[site]; scMap != nil {
- // if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
- // area = scMap.P
- // }
- // if scMap.C != "" && scMap.C != "null" && area != "" {
- // city = scMap.C
- // }
- // //if scMap.D != "" && scMap.D != "null" && city != "" {
- // // district = scMap.D
- // //}
- // }
- //
- // //对省市区进行标准化校验~简称全程的问题
- // e.StandardizedegionInfo(&area, &city, &district)
- //
- // //取出唯一数据
- // j_area, j_city, j_district := "", "", ""
- // is_adjust := false
- // if len(*all_regions) == 1 { //有值~只进行补充操作
- // for k, v := range *all_regions {
- // j_area = k
- // for k1, v1 := range v {
- // j_city = k1
- // for k2, _ := range v1 {
- // j_district = k2
- // }
- // }
- // }
- // if j_area == area && area != "" {
- // if city != "" {
- // if j_city == "" {
- // is_adjust = true
- // } else if j_city == city {
- // if district != "" && j_district == "" {
- // is_adjust = true
- // }
- // }
- // }
- // }
- // } else {
- // is_adjust = true
- // }
- // if is_adjust && area != "" { //进行调整
- // city_info := map[string]map[string]string{}
- // district_info := map[string]string{}
- // if city != "" {
- // if district != "" {
- // district_info[district] = district
- // }
- // city_info[city] = district_info
- // }
- // (*all_regions)[area] = city_info
- // }
- //}
|