Browse Source

新版本抽取城市~及备份

zhengkun 2 years ago
parent
commit
208a30af9f

+ 1 - 1
src/config.json

@@ -16,7 +16,7 @@
     "iscltlog": false,
     "iscltlog": false,
     "brandgoods": false,
     "brandgoods": false,
     "pricenumber":true,
     "pricenumber":true,
-    "inscribe": false,
+    "inscribe": true,
     "udpport": "6601",
     "udpport": "6601",
     "udptaskid": "60b493c2e138234cb4adb640",
     "udptaskid": "60b493c2e138234cb4adb640",
     "nextNode": [],
     "nextNode": [],

+ 57 - 30
src/jy/extract/extract.go

@@ -27,15 +27,15 @@ import (
 var (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 	JYUrl                                        = "https://www.jianyu360.com/article/content/%s.html"
 	JYUrl                                        = "https://www.jianyu360.com/article/content/%s.html"
-	cut                                          = ju.NewCut()                          //获取正文并清理
-	ExtLogs                                      map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList                                     map[string]*ExtractTask                //任务列表
-	ClearTaskList                                map[string]*ClearTask                  //清理任务列表
-	saveLimit                                    = 100                                  //抽取日志批量保存
-	PageSize                                     = 5000                                 //查询分页
-	Fields                                       = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
-	Fields2                                      = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
-	NiJianField                                  = []string{
+	cut                                          = ju.NewCut()                                 //获取正文并清理
+	ExtLogs                                      map[*TaskInfo][]map[string]interface{}        //抽取日志
+	TaskList                                     map[string]*ExtractTask                       //任务列表
+	ClearTaskList                                map[string]*ClearTask                         //清理任务列表
+	saveLimit                                                                           = 100  //抽取日志批量保存
+	PageSize                                                                            = 5000 //查询分页
+	Fields                                                                              = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
+	Fields2                                                                             = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
+	NiJianField                                                                         = []string{
 		"string#approvecode",
 		"string#approvecode",
 		"string#total_investment",
 		"string#total_investment",
 		"string#funds",
 		"string#funds",
@@ -80,6 +80,12 @@ var (
 		"ah_whsggzyjyfww_kbxx_cgxm":       true,
 		"ah_whsggzyjyfww_kbxx_cgxm":       true,
 		"ah_whsggzyjyfww_kbxx_gcxm":       true,
 		"ah_whsggzyjyfww_kbxx_gcxm":       true,
 	}
 	}
+
+	clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
+	sortStrReg    *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
+	clearStrReg   *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
+	clearbondReg  *regexp.Regexp = regexp.MustCompile("(无|不|否)") //保证金
+
 )
 )
 
 
 //启动测试抽取-、、、、结果追踪
 //启动测试抽取-、、、、结果追踪
@@ -309,7 +315,13 @@ func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
 	return (&ExtractTask{}).PreInfo(doc)
 	return (&ExtractTask{}).PreInfo(doc)
 }
 }
 
 
-var clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
+func CleanDetailText(detail string, summary string) string { // CleanDetailText strips HTML comments from detail, repairs it, then cuts labels and clears HTML with summary prepended.
+	detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "") // drop HTML comments; NOTE(review): the pattern is recompiled on every call, a package-level regexp would avoid that
+	detail = pretreated.RepairCon(detail) // the code this replaces described this step as repairing tables in the full text
+	detail = ju.CutLableStr(summary + "\n" + detail) // summary is prepended before label cutting
+	detail = cut.ClearHtml(summary + "\n" + detail) // NOTE(review): summary is prepended a second time here; confirm the duplication is intended
+	return detail
+}
 
 
 //信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人
 //信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人
 func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
 func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
@@ -320,25 +332,14 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		isextFile = doc["isextFile"].(bool)
 		isextFile = doc["isextFile"].(bool)
 	}
 	}
 	detail := ""
 	detail := ""
-	d1, _ := doc["detail"].(string)
-	d2, _ := doc["contenthtml"].(string)
+	summary := qu.ObjToString(doc["summary"])
+	d1 := CleanDetailText(qu.ObjToString(doc["detail"]), summary)
+	d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
 	if len(d1) >= len(d2) || d2 == "" {
 	if len(d1) >= len(d2) || d2 == "" {
 		detail = d1
 		detail = d1
 	} else {
 	} else {
 		detail = d2
 		detail = d2
 	}
 	}
-	detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
-
-	d3, _ := doc["summary"].(string)
-	//全文的需要修复表格
-	detail = pretreated.RepairCon(detail)
-	detail = ju.CutLableStr(d3 + "\n" + detail)
-	detail = cut.ClearHtml(d3 + "\n" + detail)
-
-	if len(detail) < 30 && len(d1) > len(detail) {
-		detail = d1
-	}
-
 	doc["detail"] = detail
 	doc["detail"] = detail
 	isClearnMoney := !clearMoneyReg.MatchString(detail)
 	isClearnMoney := !clearMoneyReg.MatchString(detail)
 	if isClearnMoney {
 	if isClearnMoney {
@@ -478,9 +479,6 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	return j, jf, isSite
 	return j, jf, isSite
 }
 }
 
 
-var sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
-var clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
-
 //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 func file2text(doc *map[string]interface{}) {
 func file2text(doc *map[string]interface{}) {
 	mnameone := map[string]bool{}
 	mnameone := map[string]bool{}
@@ -2372,7 +2370,10 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		}
 		}
 		//城市抽取
 		//城市抽取
 		if e.IsExtractCity {
 		if e.IsExtractCity {
-			e.NewExtractCity(j, &tmp, _id)
+			//e.NewExtractCity(j, &tmp)
+
+			e.ExtractRegionInfo(j, &tmp, true)
+
 		}
 		}
 		//品牌抽取
 		//品牌抽取
 		if ju.IsBrandGoods {
 		if ju.IsBrandGoods {
@@ -2602,22 +2603,49 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 	if _, ok := tmp["supervisorrate"].(string); ok {
 	if _, ok := tmp["supervisorrate"].(string); ok {
 		delete(tmp, "supervisorrate")
 		delete(tmp, "supervisorrate")
 	}
 	}
+
+	//快速过滤一遍特殊字段
 	for k, v := range tmp {
 	for k, v := range tmp {
 		if k == "qualifies" {
 		if k == "qualifies" {
 			continue
 			continue
 		}
 		}
-		if k == "contract_guarantee" || k == "bid_guarantee" {
+		if k == "contract_guarantee" || k == "bid_guarantee" ||
+			k == "is_acquire_tender" {
 			if len(fmt.Sprint(v)) > 0 {
 			if len(fmt.Sprint(v)) > 0 {
 				tmp[k] = true
 				tmp[k] = true
 			} else {
 			} else {
 				delete(tmp, k)
 				delete(tmp, k)
 			}
 			}
 		}
 		}
+		if k == "is_joint_bidding" || k == "is_payment_deposit" {
+			if fmt.Sprint(v) == "true" {
+				tmp[k] = true
+			} else {
+				delete(tmp, k)
+			}
+		}
 		if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 {
 		if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 {
 			delete(tmp, k)
 			delete(tmp, k)
 		}
 		}
 	}
 	}
 
 
+	//特殊字段~根据其他字段处理
+	bid_bond := qu.ObjToString(tmp["bid_bond"])
+	if bid_bond != "" && tmp["is_payment_deposit"] == nil {
+		if strings.Contains(bid_bond, "保证金") &&
+			!clearbondReg.MatchString(bid_bond) {
+			tmp["is_payment_deposit"] = true
+		}
+	}
+
+	//特殊字段~根据其他字段处理
+	bidopenaddress := qu.ObjToString(tmp["bidopenaddress"])
+	if bidopenaddress != "" && tmp["bidopen_shape"] == nil {
+		if utf8.RuneCountInString(bidopenaddress) > 5 {
+			tmp["bidopen_shape"] = "线下开标"
+		}
+	}
+
 	//项目周期-有效值
 	//项目周期-有效值
 	projectperiod := qu.ObjToString(tmp["projectperiod"])
 	projectperiod := qu.ObjToString(tmp["projectperiod"])
 	if projectperiod != "" {
 	if projectperiod != "" {
@@ -2681,7 +2709,6 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 	}
 	}
 	delete(tmp, "biddiscount_up")
 	delete(tmp, "biddiscount_up")
 	delete(tmp, "biddiscount_down")
 	delete(tmp, "biddiscount_down")
-	delete(tmp, "addressing")
 
 
 	//临时
 	//临时
 	//bidstarttime := qu.Int64All(tmp["bidstarttime"])
 	//bidstarttime := qu.Int64All(tmp["bidstarttime"])

+ 327 - 2
src/jy/extract/extractcity_new.go

@@ -1,14 +1,339 @@
 package extract
 package extract
 
 
 import (
 import (
+	. "jy/pretreated"
 	ju "jy/util"
 	ju "jy/util"
 	qu "qfw/util"
 	qu "qfw/util"
+	"strings"
 )
 )
 
 
 //抽取地域信息
 //抽取地域信息
-func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}) {
+func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}, isLog bool) { // resolves area/city/district for the job and writes them, together with regions_log, into *tmp
 	defer qu.Catch()
 	defer qu.Catch()
-	//
+	// Log records collected while resolving the region (stays empty when isLog is false).
+	logRecordInfo := []map[string]interface{}{}
+	f_area, f_city, f_district := "", "", ""
+	all_regions := map[string]map[string]map[string]string{}
+	// Tentative pass: region from jsondata.
+	e.GetRegionByTentativeJsonData(j, &all_regions)
+	// Tentative pass: region from the site mapping.
+	e.GetRegionByTentativeSite(j, &all_regions)
+	// Record the tentative result.
+	if isLog {
+		valueArr := []string{}
+		valueArr = append(valueArr, qu.ObjToString((*j.Jsondata)["area_city_district"]))
+		valueArr = append(valueArr, qu.ObjToString((*j.Data)["site"]))
+		LogProcessRecordingForTentative("jsondata_site", valueArr, all_regions, &logRecordInfo)
+	}
+	b := ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district) // true only when area, city and district are all resolved
+	if b {
+		CompleteRegionInfo(&f_area, &f_city, &f_district)
+		// Final assignment.
+		(*tmp)["area"] = f_area
+		(*tmp)["city"] = f_city
+		(*tmp)["district"] = f_district
+		(*tmp)["regions_log"] = logRecordInfo
+		return
+	}
+	// 638988a7911e1eb34509c209: the feature has a known defect.
+	// Controllable list of field groups; each entry is a comma-separated group processed in this order.
+	CityFieldsArr := []string{
+		"projectaddr,addressing",
+		"buyer,approvedepartment",
+		"buyerzipcode,buyertel",
+		"bidopenaddress,buyeraddr",
+		"title,projectname",
+	}
 
 
+	for _, v := range CityFieldsArr {
+		keyArr := strings.Split(v, ",")
+		// Temporary debugging aid: exposes the concrete values; groups with no non-empty value are skipped.
+		isContinue, textValues := TextGroupInfo(keyArr, *tmp)
+		if !isContinue {
+			continue
+		}
+		field_regions, old_regions, new_regions := e.GetRegionByGroupInfo(keyArr, *tmp)
+		AnalysisIsUniqueInfo(new_regions, &all_regions)
+		if isLog { // log record
+			LogProcessRecordingForGroupInfo(strings.Join(keyArr, "_"), textValues, field_regions, old_regions, all_regions, &logRecordInfo)
+		}
+		b = ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
+		if b {
+			CompleteRegionInfo(&f_area, &f_city, &f_district)
+			// Final assignment.
+			(*tmp)["area"] = f_area
+			(*tmp)["city"] = f_city
+			(*tmp)["district"] = f_district
+			(*tmp)["regions_log"] = logRecordInfo
+			return
+		}
+	}
+
+	// No early exit: pick the final result from what has been collected.
+	ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
+	// Clean up and complete the region.
+	CompleteRegionInfo(&f_area, &f_city, &f_district)
+
+	// Xinjiang Production and Construction Corps check, applied only when no city was resolved.
+	buyer := qu.ObjToString((*tmp)["buyer"])
+	if xjbtReg.MatchString(buyer) && f_city == "" {
+		if a, c, d, ok := e.CheckingXjbtCity(buyer); ok {
+			f_area = a
+			f_city = c
+			f_district = d
+		}
+	}
+
+	// Sensitive-word check: try to find a city in the detail text when the area is known but the city is not.
+	if f_area != "全国" && f_city == "" {
+		if sensitive_city := e.SensitiveCityData(qu.ObjToString((*j.Data)["detail"]), f_area); sensitive_city != "" {
+			f_city = sensitive_city
+			(*tmp)["is_sensitive"] = 1
+		}
+	}
+
+	// Final assignment.
+	(*tmp)["area"] = f_area
+	(*tmp)["city"] = f_city
+	(*tmp)["district"] = f_district
+	(*tmp)["regions_log"] = logRecordInfo
+}
+
+// GetRegionByGroupInfo extracts regions for each key of a field group and returns the per-field hits, the merged group regions, and the group regions after the plausibility check.
+func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]interface{}) (map[string]interface{}, map[string]map[string]map[string]string, map[string]map[string]map[string]string) {
+	old_regions := map[string]map[string]map[string]string{}
+	textArr := []string{}
+	field_regions := map[string]interface{}{}
+	for _, key := range keyArr {
+		text := qu.ObjToString(tmp[key])
+		textArr = append(textArr, text) // NOTE(review): textArr is filled but not read anywhere in this function
+		valuesArr := []map[string]interface{}{}
+		if key == "buyerzipcode" { // postal codes and phone numbers use lookup tables, every other field goes through text matching
+			valuesArr = e.GetRegionByPostCode(text, &old_regions)
+		} else if key == "buyertel" {
+			valuesArr = e.GetRegionByTelNumber(text, &old_regions)
+		} else {
+			valuesArr = e.GetRegionFromText(text, &old_regions, 2)
+		}
+		field_regions[key] = valuesArr
+	}
+	// Check that the regions collected for this group are plausible.
+	new_regions := ReasonableGroupRegionInfo(old_regions)
+
+	return field_regions, old_regions, new_regions
+}
+
+// GetRegionByPostCode looks text up in e.PostCodeMap, records the hit in regions, and returns it as a one-element list (empty when the code is unknown).
+func (e *ExtractTask) GetRegionByPostCode(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
+	regionsArr := []map[string]interface{}{}
+	pc := e.PostCodeMap[text]
+	if pc != nil {
+		if len(pc.D) == 1 { // the district is used only when the code maps to exactly one district
+			UpdateRegionsInfo(pc.P, pc.C, pc.D[0], regions)
+			regionsArr = append(regionsArr, map[string]interface{}{"area": pc.P, "city": pc.C, "district": pc.D[0]})
+		} else {
+			UpdateRegionsInfo(pc.P, pc.C, "", regions)
+			regionsArr = append(regionsArr, map[string]interface{}{"area": pc.P, "city": pc.C, "district": ""})
+		}
+	}
+	return regionsArr
+}
+
+// GetRegionByTelNumber resolves a landline number to a region by looking its leading 4 characters, then its leading 3, up in e.AreaCodeMap.
+func (e *ExtractTask) GetRegionByTelNumber(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
+	regionsArr := []map[string]interface{}{}
+	if len(text) >= 11 {
+		if strings.HasPrefix(text, "0") { // every area code except Macau's 853 starts with 0
+			n := 4
+		L:
+			areacode := text[:n]
+			ac := e.AreaCodeMap[areacode]
+			if ac != nil {
+				if len(ac.C) == 1 { // the city is used only when the area code maps to exactly one city
+					UpdateRegionsInfo(ac.P, ac.C[0], "", regions)
+					regionsArr = append(regionsArr, map[string]interface{}{"area": ac.P, "city": ac.C[0], "district": ""})
+				} else {
+					UpdateRegionsInfo(ac.P, "", "", regions)
+					regionsArr = append(regionsArr, map[string]interface{}{"area": ac.P, "city": "", "district": ""})
+				}
+			} else {
+				n = n - 1 // no hit: retry with a shorter prefix, down to 3 characters
+				if n >= 3 {
+					goto L
+				}
+			}
+		}
+	}
+	return regionsArr
 }
 }
 
 
+// GetRegionByTentativeJsonData makes a tentative region guess from jsondata["area_city_district"] and stores it in all_regions when exactly one area was found.
+func (e *ExtractTask) GetRegionByTentativeJsonData(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
+	area, city, district := "", "", ""
+	regions := map[string]map[string]map[string]string{}
+	if j.Jsondata != nil {
+		jsondata := *j.Jsondata
+		if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
+			e.GetRegionFromText(a_c_d, &regions, 1)
+		}
+	}
+	if len(regions) == 1 { // only an unambiguous area is accepted; city and district are taken only while each level is unique
+		for k, v := range regions {
+			area = k
+			if len(v) == 1 {
+				for k1, v1 := range v {
+					city = k1
+					if len(v1) == 1 {
+						for k2, _ := range v1 {
+							district = k2
+						}
+					} else {
+						break
+					}
+				}
+			} else {
+				break
+			}
+		}
+	}
+	if area != "" { // assemble the nested area -> city -> district structure
+		city_info := map[string]map[string]string{}
+		district_info := map[string]string{}
+		if city != "" {
+			if district != "" {
+				district_info[district] = district
+			}
+			city_info[city] = district_info
+		}
+		(*all_regions)[area] = city_info
+	}
+}
+
+// GetRegionByTentativeSite makes a tentative region guess from e.SiteCityMap for the job's site and uses it to fill in or supplement all_regions.
+func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
+	area, city, district := "", "", ""
+	site, _ := (*j.Data)["site"].(string)
+	if scMap := e.SiteCityMap[site]; scMap != nil {
+		if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" { // empty, nationwide and the literal "null" are treated as no value
+			area = scMap.P
+		}
+		if scMap.C != "" && scMap.C != "null" && area != "" {
+			city = scMap.C
+		}
+		if scMap.D != "" && scMap.D != "null" && city != "" {
+			district = scMap.D
+		}
+	}
+	// Read the existing entry out of all_regions.
+	j_area, j_city, j_district := "", "", ""
+	is_adjust := false
+	if len(*all_regions) == 1 { // a value already exists: the site data may only supplement it
+		for k, v := range *all_regions {
+			j_area = k
+			for k1, v1 := range v {
+				j_city = k1
+				for k2, _ := range v1 {
+					j_district = k2
+				}
+			}
+		}
+		if j_area == area && area != "" {
+			if city != "" {
+				if j_city == "" { // same area, existing entry has no city: the site adds one
+					is_adjust = true
+				} else if j_city == city {
+					if district != "" && j_district == "" { // same city, existing entry has no district: the site adds one
+						is_adjust = true
+					}
+				}
+			}
+		}
+	} else {
+		is_adjust = true
+	}
+	if is_adjust && area != "" { // apply the adjustment
+		city_info := map[string]map[string]string{}
+		district_info := map[string]string{}
+		if city != "" {
+			if district != "" {
+				district_info[district] = district
+			}
+			city_info[city] = district_info
+		}
+		(*all_regions)[area] = city_info
+	}
+}
+
+// CheckingXjbtCity maps a Xinjiang Production and Construction Corps buyer name to area/city/district via e.XjbtCityArr; ok reports whether an entry matched.
+func (e *ExtractTask) CheckingXjbtCity(buyer string) (new_a, new_c, new_d string, ok bool) {
+	buyer = strings.ReplaceAll(buyer, "新疆兵团", "新疆生产建设兵团") // expand the short form to the full name before matching
+	ok = false
+	for _, info := range e.XjbtCityArr {
+		name := qu.ObjToString(info["name"])
+		alias := qu.ObjToString(info["alias"])
+		if strings.Contains(buyer, name) || strings.Contains(buyer, alias) { // NOTE(review): strings.Contains returns true for an empty substring, so an entry with an empty name or alias matches every buyer
+			new_a = qu.ObjToString(info["area"])
+			new_c = qu.ObjToString(info["city"])
+			new_d = qu.ObjToString(info["district"])
+			ok = true
+			if res, ok := info["list"].([]interface{}); ok { // this ok shadows the named result inside the if only
+				list := qu.ObjArrToMapArr(res)
+				for _, c := range list {
+					c_name := qu.ObjToString(c["name"])
+					if strings.Contains(buyer, name+c_name) || strings.Contains(buyer, alias+c_name) { // a matching sub-entry overrides the parent entry's region
+						new_a = qu.ObjToString(c["area"])
+						new_c = qu.ObjToString(c["city"])
+						new_d = qu.ObjToString(c["district"])
+						break
+					}
+				}
+			}
+			break
+		}
+	}
+	return new_a, new_c, new_d, ok
+}
+
+// SensitiveCityData scans the detail text for city names and returns the first one whose province brief equals area, or "" when none qualifies.
+func (e *ExtractTask) SensitiveCityData(detail string, area string) string {
+	// Work on the body text.
+	detail = sensitiveReg.ReplaceAllString(detail, "")
+	// Remove table-related text.
+	detail = TextAfterRemoveTable(detail)
+
+	sim_arr := e.SensitiveSimCity.FindAll(detail)
+	full_arr := e.SensitiveFullCity.FindAll(detail)
+	if len(full_arr) < 3 { // full names are tried first, and only when fewer than three were found
+		for _, v := range full_arr {
+			if cityMap := e.CityFullMap[v]; cityMap != nil {
+				if cityMap.P.Brief == area {
+					return cityMap.Name
+				}
+			}
+		}
+	}
+	if len(sim_arr) < 3 { // short names follow the same rule, and are skipped when the short name is contained in area
+		for _, v := range sim_arr {
+			if cityMap := e.CityBriefMap[v]; cityMap != nil {
+				if cityMap.P.Brief == area && !strings.Contains(area, v) {
+					return cityMap.Name
+				}
+			}
+		}
+	}
+	return ""
+}
+
+// TextGroupInfo is a temporary debugging helper: it returns the string values of keyArr in tmp (in key order) and whether any of them is non-empty.
+func TextGroupInfo(keyArr []string, tmp map[string]interface{}) (bool, []string) {
+	isvalid := false
+	dataArr := []string{}
+	for _, v := range keyArr {
+		text := qu.ObjToString(tmp[v])
+		if text != "" {
+			isvalid = true
+		}
+		dataArr = append(dataArr, qu.ObjToString(tmp[v])) // empty values are kept so the result stays aligned with keyArr
+	}
+	return isvalid, dataArr
+}

+ 1 - 66
src/jy/extract/extractcity_old.go

@@ -10,7 +10,7 @@ import (
 )
 )
 
 
 //抽取city
 //抽取city
-func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}, id string) {
+func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}) {
 	/*
 	/*
 		高准确率:
 		高准确率:
 			1.爬虫数据jsondata
 			1.爬虫数据jsondata
@@ -992,68 +992,3 @@ func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
 		}
 		}
 	}
 	}
 }
 }
-
-//-新疆兵团映射-
-func (e *ExtractTask) CheckingXjbtCity(buyer string) (new_a, new_c, new_d string, ok bool) {
-	buyer = strings.ReplaceAll(buyer, "新疆兵团", "新疆生产建设兵团")
-	ok = false
-	for _, info := range e.XjbtCityArr {
-		name := qu.ObjToString(info["name"])
-		alias := qu.ObjToString(info["alias"])
-		if strings.Contains(buyer, name) || strings.Contains(buyer, alias) {
-			new_a = qu.ObjToString(info["area"])
-			new_c = qu.ObjToString(info["city"])
-			new_d = qu.ObjToString(info["district"])
-			ok = true
-			if res, ok := info["list"].([]interface{}); ok {
-				list := qu.ObjArrToMapArr(res)
-				for _, c := range list {
-					c_name := qu.ObjToString(c["name"])
-					if strings.Contains(buyer, name+c_name) || strings.Contains(buyer, alias+c_name) {
-						new_a = qu.ObjToString(c["area"])
-						new_c = qu.ObjToString(c["city"])
-						new_d = qu.ObjToString(c["district"])
-						break
-					}
-				}
-			}
-			break
-		}
-	}
-	return new_a, new_c, new_d, ok
-}
-
-//敏感词识别~~~
-func (e *ExtractTask) SensitiveCityData(detail string, area string) string {
-	//采用正文
-	detail = sensitiveReg.ReplaceAllString(detail, "")
-	//删除表格相关-文本
-	detail = TextAfterRemoveTable(detail)
-
-	sim_arr := e.SensitiveSimCity.FindAll(detail)
-	full_arr := e.SensitiveFullCity.FindAll(detail)
-	if len(full_arr) < 3 {
-		for _, v := range full_arr {
-			if cityMap := e.CityFullMap[v]; cityMap != nil {
-				if cityMap.P.Brief == area {
-					return cityMap.Name
-				}
-			}
-		}
-	}
-	if len(sim_arr) < 3 {
-		for _, v := range sim_arr {
-			if cityMap := e.CityBriefMap[v]; cityMap != nil {
-				if cityMap.P.Brief == area && !strings.Contains(area, v) {
-					return cityMap.Name
-				}
-			}
-		}
-	}
-	//if len(new_city)==1 { //仅有一个有效城市
-	//	for _,v := range new_city{
-	//		return v
-	//	}
-	//}
-	return ""
-}

+ 404 - 0
src/jy/extract/extractcity_way.go

@@ -0,0 +1,404 @@
+package extract
+
+import (
+	qu "qfw/util"
+)
+
+// ConfirmUniqueRegionInfo writes the single area (and the city/district while each level is unique) from regions into the out-params; it returns true only when all three are non-empty.
+func ConfirmUniqueRegionInfo(regions map[string]map[string]map[string]string, area *string, city *string, district *string) bool {
+	if len(regions) > 1 || len(regions) == 0 { // NOTE(review): the out-params are left untouched here, so values written by an earlier call persist
+		return false
+	}
+	for k, v := range regions {
+		*area = k
+		if len(v) == 1 {
+			for k1, v1 := range v {
+				*city = k1
+				if len(v1) == 1 {
+					for k2, _ := range v1 {
+						*district = k2
+					}
+				}
+			}
+		}
+	}
+	if *area != "" && *city != "" && *district != "" {
+		return true
+	}
+	return false
+}
+
+// CompleteRegionInfo normalizes the result in place: the four municipalities get their city name forced, and an empty area becomes "全国" with city and district cleared.
+func CompleteRegionInfo(area *string, city *string, district *string) {
+	if *area == "北京" {
+		*city = "北京市"
+		if *district == "北京朝阳" { // special case: 北京朝阳中西医结合急诊抢救中心 (5a84079740d2d9bbe88bad90)
+			*district = "朝阳区"
+		}
+	} else if *area == "天津" {
+		*city = "天津市"
+	} else if *area == "上海" {
+		*city = "上海市"
+	} else if *area == "重庆" {
+		*city = "重庆市"
+	}
+	if *area == "" {
+		*area = "全国"
+		*city = ""
+		*district = ""
+	}
+}
+
+// takeRegionsFromWords returns every region (as area/city/district maps) that the single word text matches in the full-name and short-name tries.
+func (e *ExtractTask) takeRegionsFromWords(text string) []map[string]string {
+	regions := []map[string]string{}
+	// Full-name matching; the trie index selects the level handled below (0 province, 1 city, 2 district).
+	for pos_full, trie_full := range e.Trie_Fulls {
+		if trie_full.Get(text) {
+			if pos_full == 0 {
+				if province := e.ProvinceMap[text]; province != "" {
+					regions = append(regions, map[string]string{"area": province, "city": "", "district": ""})
+				}
+			} else if pos_full == 1 {
+				if data := e.CityFullMap[text]; data != nil {
+					if data.P.Brief != "" && data.Name != "" {
+						regions = append(regions, map[string]string{"area": data.P.Brief, "city": data.Name, "district": ""})
+					}
+				}
+			} else if pos_full == 2 {
+				citys := e.DistrictCityMap[text] // one district name may belong to several cities; each yields its own entry
+				for _, c := range citys {
+					if c.P.Brief != "" && c.Name != "" && text != "" {
+						regions = append(regions, map[string]string{"area": c.P.Brief, "city": c.Name, "district": text})
+					}
+				}
+			}
+		}
+	}
+	// Short-name matching, with the same index-to-level convention.
+	for pos_sim, trie_sim := range e.Trie_Sims {
+		if trie_sim.Get(text) {
+			if pos_sim == 0 {
+				if text != "" {
+					regions = append(regions, map[string]string{"area": text, "city": "", "district": ""})
+				}
+			} else if pos_sim == 1 {
+				if csMap := e.CityBriefMap[text]; csMap != nil {
+					if csMap.P.Brief != "" && csMap.Name != "" {
+						regions = append(regions, map[string]string{"area": csMap.P.Brief, "city": csMap.Name, "district": ""})
+					}
+				}
+			} else if pos_sim == 2 {
+				citysArr := e.DistrictSimAndAll[text]
+				for _, full_citys := range citysArr {
+					for district, c := range full_citys { // the map key is used as the district value of the entry
+						if c == nil || c.P == nil || c.Name == "" {
+							continue
+						}
+						if c.P.Brief != "" && c.Name != "" && district != "" {
+							regions = append(regions, map[string]string{"area": c.P.Brief, "city": c.Name, "district": district})
+						}
+					}
+				}
+			}
+		}
+	}
+	return regions
+}
+
+// GetRegionFromText segments text, records every matched region in regions and returns the hits; from selects the segmenter: 1 for jsondata text, 2 for other text.
+func (e *ExtractTask) GetRegionFromText(text string, regions *map[string]map[string]map[string]string, from int) []map[string]interface{} {
+	regionValues := []map[string]interface{}{}
+	if text == "" {
+		return regionValues
+	}
+	wordsArr := []string{}
+	if from == 1 {
+		wordsArr = e.Seg_PCD.Cut(text, true)
+	} else if from == 2 {
+		wordsArr = e.Seg_SV.Cut(text, true)
+	} // any other value of from leaves wordsArr empty, so nothing is matched
+	for _, word := range wordsArr {
+		regionArr := e.takeRegionsFromWords(word)
+		for _, v := range regionArr {
+			area := qu.ObjToString(v["area"])
+			city := qu.ObjToString(v["city"])
+			district := qu.ObjToString(v["district"])
+			UpdateRegionsInfo(area, city, district, regions)
+			regionValues = append(regionValues, map[string]interface{}{"area": area, "city": city, "district": district})
+		}
+	}
+	return regionValues
+}
+
+// UpdateRegionsInfo merges one area/city/district triple into the nested regions map (area -> city -> district -> district), creating missing levels as needed.
+func UpdateRegionsInfo(area, city, district string, regions *map[string]map[string]map[string]string) {
+	if (*regions)[area] == nil {
+		city_info := map[string]map[string]string{}
+		district_info := map[string]string{}
+		if city != "" {
+			if district != "" {
+				district_info[district] = district
+			}
+			city_info[city] = district_info
+		}
+		(*regions)[area] = city_info // new area entry
+	} else {
+		city_info := (*regions)[area]
+		if city != "" { // an existing area is only extended; with an empty city there is nothing to add
+			district_info := map[string]string{}
+			if city_info[city] != nil {
+				district_info = city_info[city]
+			}
+			if district != "" {
+				district_info[district] = district
+			}
+			city_info[city] = district_info
+			(*regions)[area] = city_info
+		}
+	}
+}
+
+// ReasonableGroupRegionInfo checks a merged group's regions for plausibility: too many areas yields an empty result, and too many cities or districts causes the ambiguous levels to be dropped.
+func ReasonableGroupRegionInfo(datas map[string]map[string]map[string]string) map[string]map[string]map[string]string {
+	if len(datas) > 2 || len(datas) == 0 { // too many provinces (or none): invalid
+		return map[string]map[string]map[string]string{}
+	}
+	uncity, undistrict := 0, 0 // total number of cities and of districts across all areas
+	for _, v := range datas {
+		uncity += len(v)
+		for _, v1 := range v {
+			undistrict += len(v1)
+		}
+	}
+
+	if uncity > 3 {
+		regions_1 := map[string]map[string]map[string]string{}
+		for k, v := range datas {
+			city_info := map[string]map[string]string{}
+			if len(v) == 1 { // keep the city level only for areas with exactly one city
+				city_info = v
+			}
+			regions_1[k] = city_info
+		}
+		// Count the districts that remain after pruning cities.
+		uncity_district := 0
+		for _, v := range regions_1 {
+			for _, v1 := range v {
+				uncity_district += len(v1)
+			}
+		}
+		if uncity_district > 3 {
+			regions_2 := map[string]map[string]map[string]string{}
+			for k, v := range regions_1 {
+				city_info := map[string]map[string]string{}
+				for k1, v1 := range v {
+					district_info := map[string]string{}
+					if len(v1) == 1 { // keep the district level only for cities with exactly one district
+						district_info = v1
+					}
+					city_info[k1] = district_info
+				}
+				regions_2[k] = city_info
+			}
+			return regions_2
+		}
+		return regions_1
+	}
+
+	if undistrict > 3 {
+		new_regions := map[string]map[string]map[string]string{}
+		for k, v := range datas {
+			city_info := map[string]map[string]string{}
+			for k1, v1 := range v {
+				district_info := map[string]string{}
+				if len(v1) == 1 { // keep the district level only for cities with exactly one district
+					district_info = v1
+				}
+				city_info[k1] = district_info
+			}
+			new_regions[k] = city_info
+		}
+		return new_regions
+	}
+	return datas
+}
+
+// AnalysisIsUniqueInfo compares a group's regions with all_regions and rebuilds all_regions from the entries that confirm or refine it; with no match all_regions is left unchanged.
+func AnalysisIsUniqueInfo(regions map[string]map[string]map[string]string, all_regions *map[string]map[string]map[string]string) {
+	if len(regions) == 0 {
+		return
+	}
+	if len(*all_regions) == 0 { // nothing known yet: adopt the group's regions as they are
+		*all_regions = regions
+		return
+	}
+	regionsArr := splitRegionsInfos(regions)          // target data
+	all_regionsArr := splitRegionsInfos(*all_regions) // source data
+	new_all_regionsArr := []map[string]string{}       // new data
+	for _, info := range regionsArr {
+		area := qu.ObjToString(info["area"])
+		if (*all_regions)[area] == nil { // areas not already present are never added
+			continue
+		}
+		unmatchInfo1 := ScreenOutReasonableRegionInfo(info, &all_regionsArr, &new_all_regionsArr)
+		if unmatchInfo1 != nil { // downgraded match: at most two levels
+			unmatchInfo2 := ScreenOutReasonableRegionInfo(unmatchInfo1, &all_regionsArr, &new_all_regionsArr)
+			if unmatchInfo2 != nil { // downgraded match: at most one level
+				ScreenOutReasonableRegionInfo(unmatchInfo2, &all_regionsArr, &new_all_regionsArr)
+			}
+		}
+	}
+	// Rebuild all region information from the latest set of valid entries.
+	reset_regions_infos := map[string]map[string]map[string]string{}
+	if len(new_all_regionsArr) > 0 {
+		for _, v := range new_all_regionsArr {
+			area := qu.ObjToString(v["area"])
+			city := qu.ObjToString(v["city"])
+			district := qu.ObjToString(v["district"])
+			UpdateRegionsInfo(area, city, district, &reset_regions_infos)
+		}
+		*all_regions = reset_regions_infos
+	}
+}
+
+// ScreenOutReasonableRegionInfo appends info to new_regions_infosArr for each existing entry it equals or refines; if none matched it returns info with its last level dropped, otherwise nil.
+func ScreenOutReasonableRegionInfo(info map[string]string, regions_infosArr *[]map[string]string, new_regions_infosArr *[]map[string]string) map[string]string {
+	area := qu.ObjToString(info["area"])
+	city := qu.ObjToString(info["city"])
+	district := qu.ObjToString(info["district"])
+	is_Exist := false
+	for _, s := range *regions_infosArr {
+		s_area := qu.ObjToString(s["area"])
+		s_city := qu.ObjToString(s["city"])
+		s_district := qu.ObjToString(s["district"])
+		if s_area == area && s_city == city && s_district == district { // exact match
+			is_Exist = true
+			*new_regions_infosArr = append(*new_regions_infosArr, info)
+		} else {
+			// Check whether info supplements (refines) the existing entry.
+			if area != "" && city != "" && district != "" { // three levels supplementing a two-level or one-level entry
+				if s_area == area && s_city == city && s_district == "" {
+					is_Exist = true
+					*new_regions_infosArr = append(*new_regions_infosArr, info)
+				} else if s_area == area && s_city == "" && s_district == "" {
+					is_Exist = true
+					*new_regions_infosArr = append(*new_regions_infosArr, info)
+				}
+			} else if area != "" && city != "" && district == "" { // two levels supplementing a one-level entry
+				if s_area == area && s_city == "" {
+					is_Exist = true
+					*new_regions_infosArr = append(*new_regions_infosArr, info)
+				}
+			}
+		}
+	}
+
+	// No match or supplement: for two- and three-level info, drop the last level so the caller can run another comparison round.
+	if !is_Exist {
+		if area != "" && city != "" && district != "" {
+			return map[string]string{"area": area, "city": city, "district": ""}
+		}
+		if area != "" && city != "" && district == "" {
+			return map[string]string{"area": area, "city": "", "district": ""}
+		}
+	}
+	return nil
+}
+
+// splitRegionsInfos flattens the nested regions map into area/city/district entries (missing levels become ""), which makes merging and selection easier.
+func splitRegionsInfos(infos map[string]map[string]map[string]string) []map[string]string {
+	infosArr := []map[string]string{}
+	for k, v := range infos {
+		if len(v) > 0 {
+			for k1, v1 := range v {
+				if len(v1) > 0 {
+					for k2, _ := range v1 {
+						infosArr = append(infosArr, map[string]string{"area": k, "city": k1, "district": k2})
+					}
+				} else {
+					infosArr = append(infosArr, map[string]string{"area": k, "city": k1, "district": ""}) // city without districts
+				}
+			}
+		} else {
+			infosArr = append(infosArr, map[string]string{"area": k, "city": "", "district": ""}) // area without cities
+		}
+	}
+	return infosArr
+}
+
+// LogProcessRecordingForGroupInfo appends a group-level log record (raw values, group regions, per-field hits and the current overall regions) to logRecordInfo.
+func LogProcessRecordingForGroupInfo(key string, valueArr []string, fieldInfos map[string]interface{}, groupInfos map[string]map[string]map[string]string, finallyInfos map[string]map[string]map[string]string, logRecordInfo *[]map[string]interface{}) {
+	groupArr := splitRegionsInfos(groupInfos)
+	finalluArr := splitRegionsInfos(finallyInfos)
+	data := map[string]interface{}{
+		key + "_value":   valueArr,
+		key + "_group":   groupArr,
+		"finally_region": finalluArr,
+	}
+	for k, v := range fieldInfos { // per-field hits are stored under the field's own name
+		data[k] = v
+	}
+	*logRecordInfo = append(*logRecordInfo, data)
+}
+
+// LogProcessRecordingForTentative appends a log record for the tentative stage (raw values plus the current overall regions) to logRecordInfo.
+func LogProcessRecordingForTentative(key string, valueArr interface{}, finallyInfos map[string]map[string]map[string]string, logRecordInfo *[]map[string]interface{}) {
+	finallyArr := splitRegionsInfos(finallyInfos)
+	data := map[string]interface{}{
+		key + "_value":   valueArr,
+		"finally_region": finallyArr,
+	}
+	*logRecordInfo = append(*logRecordInfo, data)
+}
+
+//同组合并的地域数据
+//func MergeGroupRegionInfo(datas_1, datas_2 map[string]map[string]map[string]string) map[string]map[string]map[string]string {
+//	regions := map[string]map[string]map[string]string{}
+//	if len(datas_1) > 0 && len(datas_2) == 0 {
+//		return datas_1
+//	}
+//	if len(datas_2) > 0 && len(datas_1) == 0 {
+//		return datas_2
+//	}
+//	for k, v := range datas_1 {
+//		area, city, district := "", "", ""
+//		area = k
+//		if len(v) > 0 {
+//			for k1, v1 := range v {
+//				city = k1
+//				if len(v1) > 0 {
+//					for k2, _ := range v1 {
+//						district = k2
+//						UpdateRegionsInfo(area, city, district, &regions)
+//					}
+//				} else {
+//					UpdateRegionsInfo(area, city, district, &regions)
+//				}
+//			}
+//		} else {
+//			UpdateRegionsInfo(area, city, district, &regions)
+//		}
+//	}
+//
+//	for k, v := range datas_2 {
+//		area, city, district := "", "", ""
+//		area = k
+//		if len(v) > 0 {
+//			for k1, v1 := range v {
+//				city = k1
+//				if len(v1) > 0 {
+//					for k2, _ := range v1 {
+//						district = k2
+//						UpdateRegionsInfo(area, city, district, &regions)
+//					}
+//				} else {
+//					UpdateRegionsInfo(area, city, district, &regions)
+//				}
+//			}
+//		} else {
+//			UpdateRegionsInfo(area, city, district, &regions)
+//		}
+//	}
+//	return regions
+//}

+ 1 - 0
src/main.go

@@ -25,6 +25,7 @@ func init() {
 	qu.ReadConfig("./res/pricenumber.json", &u.PriceNumberConfig)
 	qu.ReadConfig("./res/pricenumber.json", &u.PriceNumberConfig)
 	//初始化util
 	//初始化util
 	u.UtilInit()
 	u.UtilInit()
+
 }
 }
 
 
 func main() {
 func main() {

+ 1 - 1
udps/main.go

@@ -19,7 +19,7 @@ func main() {
 	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
 	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
 	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")
 	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")
-	flag.IntVar(&p, "p", 11109, "端口")
+	flag.IntVar(&p, "p", 6601, "端口")
 	flag.IntVar(&tmptime, "tmptime", 0, "时间查询")
 	flag.IntVar(&tmptime, "tmptime", 0, "时间查询")
 	flag.StringVar(&tmpkey, "tmpkey", "", "时间字段")
 	flag.StringVar(&tmpkey, "tmpkey", "", "时间字段")