|
@@ -1,7 +1,7 @@
|
|
|
package extract
|
|
|
|
|
|
import (
|
|
|
- . "jy/pretreated"
|
|
|
+ "jy/pretreated"
|
|
|
ju "jy/util"
|
|
|
qu "qfw/util"
|
|
|
"strings"
|
|
@@ -23,7 +23,7 @@ func (e *ExtractTask) ExtractRegionClean(tmp *map[string]interface{}) {
|
|
|
}
|
|
|
|
|
|
// 抽取地域信息
|
|
|
-func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}, isLog bool) {
|
|
|
+func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, jf *ju.Job, tmp *map[string]interface{}, isLog bool) {
|
|
|
defer qu.Catch()
|
|
|
//日志记录
|
|
|
logRecordInfo := []map[string]interface{}{}
|
|
@@ -89,6 +89,9 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
|
|
|
CompleteRegionInfo(&f_area, &f_city, &f_district)
|
|
|
//用到的字段
|
|
|
projectname := qu.ObjToString((*tmp)["projectname"])
|
|
|
+ if projectname == "" {
|
|
|
+ projectname = qu.ObjToString((*tmp)["title"])
|
|
|
+ }
|
|
|
buyer := qu.ObjToString((*tmp)["buyer"])
|
|
|
site := qu.ObjToString((*tmp)["site"])
|
|
|
//新疆兵团补充地域~
|
|
@@ -125,19 +128,26 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
|
|
|
})
|
|
|
}
|
|
|
|
|
|
- //正文补充地域~
|
|
|
+ //文本正文-提取补充
|
|
|
if f_area == "全国" || f_area == "" || f_city == "" {
|
|
|
- if b := e.NewVerifySensitiveInfo(qu.ObjToString((*j.Data)["detail"]), &f_area, &f_city, &f_district); b {
|
|
|
- if isLog {
|
|
|
- LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
- "sup_detail": f_area + "~" + f_city + "~" + f_district,
|
|
|
- })
|
|
|
- }
|
|
|
+ if b := e.NewVerifySensitiveInfo(j.Title+"\n"+j.Content, &f_area, &f_city, &f_district); b && isLog {
|
|
|
+ LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
+ "sup_detail": f_area + "~" + f_city + "~" + f_district,
|
|
|
+ })
|
|
|
}
|
|
|
}
|
|
|
- //代理机构抽省市
|
|
|
- if f_city == "" {
|
|
|
- keyArr := []string{"agencyaddr"}
|
|
|
+ //文本附件-提取补充
|
|
|
+ if (f_area == "全国" || f_area == "" || f_city == "") && jf != nil {
|
|
|
+ if b := e.NewVerifySensitiveInfo(jf.Title+"\n"+jf.ContentClean, &f_area, &f_city, &f_district); b && isLog {
|
|
|
+ LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
+ "sup_jfdetail": f_area + "~" + f_city + "~" + f_district,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ //疑似地址-提取补充-采用简称
|
|
|
+ if f_area == "全国" || f_area == "" || f_city == "" {
|
|
|
+ keyArr := []string{"brief_buyeraddr", "brief_agencyaddr"}
|
|
|
isExists, textValues, field_regions, old_regions, new_regions := e.GetRegionByGroupInfo(keyArr, *tmp)
|
|
|
if isExists { //是否存在抽取有效值
|
|
|
AnalysisIsUniqueInfo(new_regions, &all_regions)
|
|
@@ -155,7 +165,7 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- //pcd切词提取--区县
|
|
|
+ //PCD切词提取--区县
|
|
|
e.LinkSpecialRuleBriefStep2(projectname, &f_area, &f_city, &f_district)
|
|
|
e.LinkSpecialRuleBriefStep2(buyer, &f_area, &f_city, &f_district)
|
|
|
if isLog {
|
|
@@ -163,7 +173,48 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
|
|
|
"sup_link2": f_area + "~" + f_city + "~" + f_district,
|
|
|
})
|
|
|
}
|
|
|
-
|
|
|
+ //采用源码方式-提取补充
|
|
|
+ if f_area == "全国" || f_area == "" || f_city == "" {
|
|
|
+ if b := e.NewVerifySensitiveInfo(j.Title+"\n"+qu.ObjToString((*j.Data)["contenthtml"]), &f_area, &f_city, &f_district); b && isLog {
|
|
|
+ LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
+ "sup_contenthtml": f_area + "~" + f_city + "~" + f_district,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //清洗的标题-提取补充
|
|
|
+ if f_area == "全国" || f_area == "" || f_city == "" {
|
|
|
+ if new_title := CleanTitleReg1.ReplaceAllString(j.Title, ""); new_title != j.Title && new_title != "" {
|
|
|
+ if b := e.NewVerifySensitiveInfo(new_title, &f_area, &f_city, &f_district); b && isLog {
|
|
|
+ LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
+ "sup_title": f_area + "~" + f_city + "~" + f_district,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //采购单位地址-提取补充
|
|
|
+ if f_area == "全国" || f_area == "" {
|
|
|
+ if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["buyeraddr"]), &f_area, &f_city, &f_district); b && isLog {
|
|
|
+ LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
+ "sup_buyeraddr": f_area + "~" + f_city + "~" + f_district,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //中标单位地址-提取补充
|
|
|
+ if f_area == "全国" || f_area == "" {
|
|
|
+ if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["winneraddr"]), &f_area, &f_city, &f_district); b && isLog {
|
|
|
+ LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
+ "sup_winneraddr": f_area + "~" + f_city + "~" + f_district,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //中标单位-提取补充
|
|
|
+ if f_area == "全国" || f_area == "" {
|
|
|
+ if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["winner"]), &f_area, &f_city, &f_district); b && isLog {
|
|
|
+ LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
+ "sup_winner": f_area + "~" + f_city + "~" + f_district,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
//最终站点补充
|
|
|
if f_area == "全国" || f_area == "" {
|
|
|
if sc := e.SiteCityMap[site]; sc != nil && sc.Q != "" {
|
|
@@ -175,6 +226,14 @@ func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{},
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ //站点名称-提取补充
|
|
|
+ if f_area == "全国" || f_area == "" {
|
|
|
+ if b := e.NewVerifySensitiveInfo(qu.ObjToString((*tmp)["site"]), &f_area, &f_city, &f_district); b && isLog {
|
|
|
+ LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
|
|
|
+ "sup_sitename": f_area + "~" + f_city + "~" + f_district,
|
|
|
+ })
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
//最终在清洗一遍数据
|
|
|
CompleteRegionInfo(&f_area, &f_city, &f_district)
|
|
@@ -199,6 +258,9 @@ func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]inter
|
|
|
text = GetFilialeByBuyerInfo(qu.ObjToString(tmp["buyer"]))
|
|
|
} else if key == "projectname" {
|
|
|
text = CleanRegionProjectNameInfo(qu.ObjToString(tmp[key]), qu.ObjToString(tmp["buyer"]))
|
|
|
+ } else if key == "brief_buyeraddr" || key == "brief_agencyaddr" {
|
|
|
+ new_key := strings.ReplaceAll(key, "brief_", "")
|
|
|
+ text = qu.ObjToString(tmp[new_key])
|
|
|
} else {
|
|
|
text = qu.ObjToString(tmp[key])
|
|
|
}
|
|
@@ -221,9 +283,13 @@ func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]inter
|
|
|
valuesArr = e.GetRegionFromText(text, &old_regions, false, false, 2)
|
|
|
} else {
|
|
|
isAddress, isBrief := false, false
|
|
|
- if key == "projectaddr" || key == "addressing" || key == "bidopenaddress" || key == "buyeraddr" || key == "agencyaddr" {
|
|
|
+ if key == "projectaddr" || key == "addressing" || key == "bidopenaddress" || key == "buyeraddr" {
|
|
|
isAddress = true
|
|
|
}
|
|
|
+ if key == "brief_buyeraddr" || key == "brief_agencyaddr" {
|
|
|
+ isAddress = true
|
|
|
+ isBrief = true
|
|
|
+ }
|
|
|
valuesArr = e.GetRegionFromText(text, &old_regions, isAddress, isBrief, 2)
|
|
|
}
|
|
|
field_regions[key] = valuesArr
|
|
@@ -434,9 +500,25 @@ func (e *ExtractTask) NewVerifyXjCorpsInfo(buyer string) (new_a, new_c, new_d st
|
|
|
|
|
|
// 敏感词识别
|
|
|
func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *string, district *string) bool {
|
|
|
+ if detail == "" {
|
|
|
+ return false
|
|
|
+ }
|
|
|
detail = SensitiveReg.ReplaceAllString(detail, "")
|
|
|
- detail = TextAfterRemoveTable(detail)
|
|
|
detail = CleanDetailReg1.ReplaceAllString(detail, "")
|
|
|
+ detail = pretreated.HtmlToText(detail)
|
|
|
+ isChange := false
|
|
|
+ //全称省份
|
|
|
+ if *area == "" || *area == "全国" {
|
|
|
+ fullProvinceArr := e.SensitiveFullProvince.FindAll(detail)
|
|
|
+ if len(fullProvinceArr) == 1 {
|
|
|
+ for _, v := range fullProvinceArr {
|
|
|
+ if sim_province := e.ProvinceMap[v]; sim_province != "" {
|
|
|
+ *area = sim_province
|
|
|
+ isChange = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
//全称城市
|
|
|
fullCityArr := e.SensitiveFullCity.FindAll(detail)
|
|
|
if len(fullCityArr) == 1 {
|
|
@@ -484,7 +566,18 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+ //简称省份
|
|
|
+ if *area == "" || *area == "全国" {
|
|
|
+ simProvinceArr := e.SensitiveSimProvince.FindAll(detail)
|
|
|
+ if len(simProvinceArr) == 1 {
|
|
|
+ for _, v := range simProvinceArr {
|
|
|
+ if v != "" {
|
|
|
+ *area = v
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
//疑似固话提取~
|
|
|
if *area == "" || *area == "全国" {
|
|
|
fixedTelArr := FixedTelReg.FindAllString(detail, -1)
|
|
@@ -500,7 +593,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- return false
|
|
|
+ return isChange
|
|
|
}
|
|
|
|
|
|
func resetFixedTelInfo(telArr []string) []string {
|