zhengkun 2 года назад
Родитель
Коммит
50dd0522f0

+ 4 - 2
src/config.json

@@ -3,10 +3,12 @@
     "mgodb": "127.0.0.1:27017",
     "dbsize": 3,
     "dbname": "extract_local",
-    "dbname_addrs": "extract_service",
-    "dbname_addrs_c": "address_new_2020",
     "site_addr": "127.0.0.1:27017",
     "site_dbname": "extract_local",
+    "qyxy_addr": "127.0.0.1:27017",
+    "qyxy_dbname": "extract_service",
+    "qyxy_username": "",
+    "qyxy_password": "",
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
     "ffield": true,

+ 7 - 7
src/jy/clear/clear.go

@@ -7,7 +7,7 @@ import (
 )
 
 //方法清单
-var clearfns = make(map[string]func(data []interface{},spidercode ... string) []interface{})
+var clearfns = make(map[string]func(data []interface{}, spidercode ...string) []interface{})
 var lock sync.RWMutex
 
 func init() {
@@ -29,24 +29,24 @@ func init() {
 	BindFn("clearBuyerPerson", ClearBuyerPerson) //处理较长采购联系人
 	BindFn("clearNumber", ClearNumber)           //一般用于处理抽取联系人后带有电话的情况
 	BindFn("clearEndSymblo", ClearEndSymblo)     //去除尾部特殊符号
-	BindFn("chiToInt", ChiToFloat)			      //中文转数字(费率、折扣率)
+	BindFn("chiToInt", ChiToFloat)               //中文转数字(费率、折扣率)
 }
 
 //绑定清理方法
-func BindFn(fnname string, fn func(data []interface{},spidercode ...string) []interface{}) {
+func BindFn(fnname string, fn func(data []interface{}, spidercode ...string) []interface{}) {
 	lock.Lock()
 	clearfns[fnname] = fn
 	lock.Unlock()
 }
 
 //执行清理动作,如果调用的清理方法不存在,则不做处理
-func DoClearFn(clear []string, data []interface{},spidercode ...string) []interface{} {
+func DoClearFn(clear []string, data []interface{}, spidercode ...string) []interface{} {
 	if len(clear) == 0 {
 		return data
 	}
 	for _, fnname := range clear {
 		if v, ok := clearfns[fnname]; ok {
-			data = v(data,spidercode...)
+			data = v(data, spidercode...)
 		}
 	}
 	return data
@@ -55,13 +55,13 @@ func DoClearFn(clear []string, data []interface{},spidercode ...string) []interf
 //取手机号
 var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,5})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—-\\-])+\\d{2,}[×―—-\\-]+(\\d{3,})+|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
 
-func GetPhone(data []interface{},spidercode ...string) []interface{} {
+func GetPhone(data []interface{}, spidercode ...string) []interface{} {
 	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))
 	return data
 }
 
 //去除数字
-func ClearNumber(data []interface{},spidercode ...string) []interface{} {
+func ClearNumber(data []interface{}, spidercode ...string) []interface{} {
 	data[0] = clearNum.ReplaceAllString(fmt.Sprint(data[0]), "")
 	return data
 }

+ 3 - 3
src/jy/clear/getratecurrency.go

@@ -8,17 +8,17 @@ import (
 
 var currency *regexp.Regexp
 var encyitem = map[string]string{
-	"$": "美元", "$": "美元", "美元": "美元",
+	"$": "美元", "$": "美元", "美元": "美元", "美金": "美元",
 	//待续
 }
 
 func init() {
 	//提取币种
-	currency, _ = regexp.Compile(`[$|$|美元]+`)
+	currency, _ = regexp.Compile(`[$|$|美元|美金]+`)
 }
 
 //获取币种
-func GetCurrency(data []interface{},spidercode ...string) []interface{} {
+func GetCurrency(data []interface{}, spidercode ...string) []interface{} {
 	val := "人民币"
 	currency.ReplaceAllStringFunc(fmt.Sprint(data[0]), func(key string) string {
 		v := encyitem[key]

+ 31 - 43
src/jy/extract/extract.go

@@ -1923,8 +1923,8 @@ var unPackageWinnerReg = regexp.MustCompile("(重新招标)")
 var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体])$")
 
 //落款单位抽取
-var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))\n([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")
-var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))")
+var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{2,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会|体]))\n([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")
+var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{2,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会|体]))([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))")
 
 var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
 
@@ -2124,10 +2124,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					}
 					//中标单位~含字母判断~对比企业库
 					if (v.Field == "winner" || v.Field == "buyer") && letter_entity.MatchString(qu.ObjToString(v.SourceValue)) {
-						qyxy_data := make([]map[string]interface{}, 0)
-						ju.QyxySess.Find(map[string]interface{}{
+						qyxy_data := ju.Qyxy_Mgo.FindOne("qyxy_std", map[string]interface{}{
 							"company_name": qu.ObjToString(v.SourceValue),
-						}).All(&qyxy_data)
+						})
 						if qyxy_data != nil && len(qyxy_data) > 0 {
 							tmp[v.Field] = v.SourceValue
 						}
@@ -2319,10 +2318,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 								}
 								//中标单位~含字母判断~对比企业库
 								if (v.Field == "winner" || v.Field == "buyer") && letter_entity.MatchString(qu.ObjToString(v.SourceValue)) {
-									qyxy_data := make([]map[string]interface{}, 0)
-									ju.QyxySess.Find(map[string]interface{}{
+									qyxy_data := ju.Qyxy_Mgo.FindOne("qyxy_std", map[string]interface{}{
 										"company_name": qu.ObjToString(v.SourceValue),
-									}).All(&qyxy_data)
+									})
 									if qyxy_data != nil && len(qyxy_data) > 0 {
 										tmp[v.Field] = v.SourceValue
 									}
@@ -2372,6 +2370,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if e.IsExtractCity {
 			//e.NewExtractCity(j, &tmp) //旧版
 			e.ExtractRegionInfo(j, &tmp, false)
+			e.ExtractRegionClean(&tmp)
 		}
 		//品牌抽取
 		if ju.IsBrandGoods {
@@ -2576,11 +2575,6 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 		}
 	}
 
-	//金额比例异常-
-	if _, ok := tmp["bidamount"].(string); ok {
-		delete(tmp, "bidamount")
-	}
-
 	/*
 		else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/10 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) {
 			//比例限制打开
@@ -2592,21 +2586,32 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 		}
 	*/
 
+	//异常金额类型清洗-
+	if _, ok := tmp["bidamount"].(string); ok {
+		delete(tmp, "bidamount")
+	}
 	if _, ok := tmp["budget"].(string); ok {
 		delete(tmp, "budget")
 	}
-	if _, ok := tmp["unitprice"].(string); ok {
-		delete(tmp, "unitprice")
-	}
-	if _, ok := tmp["bidopentime"].(string); ok {
-		delete(tmp, "bidopentime")
-	}
-	if _, ok := tmp["signaturedate"].(string); ok {
-		delete(tmp, "signaturedate")
+
+	//budget bidamount 阈值限定
+	if bg, ok := tmp["budget"].(float64); ok && bg >= 1000000000 {
+		tmp["budget_threshold"] = bg
+		delete(tmp, "budget")
 	}
-	if _, ok := tmp["supervisorrate"].(string); ok {
-		delete(tmp, "supervisorrate")
+	if bg, ok := tmp["bidamount"].(float64); ok && bg >= 1000000000 {
+		tmp["bidamount_threshold"] = bg
+		delete(tmp, "bidamount")
 	}
+	//暂时弃用
+	//if bg, ok := tmp["bidamount"].(float64); ok && bg >= 50000000000 {
+	//	code := qu.ObjToString(tmp["spidercode"])
+	//	if bg >= 50000000000 && code != "xz_xzzzqjzscjgycxxxpt_zbtzs" &&
+	//		code != "js_jsszbtbw_zbhxrgs" {
+	//		tmp["bidamount_max_err"] = bg
+	//		delete(tmp, "bidamount")
+	//	}
+	//}
 
 	//快速过滤一遍特殊字段
 	for k, v := range tmp {
@@ -2633,7 +2638,7 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 		}
 	}
 
-	//特殊字段~根其他字段处理
+	//特殊字段~根其他字段处理
 	bid_bond := qu.ObjToString(tmp["bid_bond"])
 	if bid_bond != "" && tmp["is_payment_deposit"] == nil {
 		if strings.Contains(bid_bond, "保证金") &&
@@ -2641,8 +2646,7 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 			tmp["is_payment_deposit"] = true
 		}
 	}
-
-	//特殊字段~根绝其他字段处理
+	//特殊字段~根据其他字段处理
 	bidopenaddress := qu.ObjToString(tmp["bidopenaddress"])
 	if bidopenaddress != "" && tmp["bidopen_shape"] == nil {
 		if utf8.RuneCountInString(bidopenaddress) > 5 {
@@ -2679,21 +2683,7 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 			tmp["s_winner"] = strwin
 		}
 	}
-	//budget bidamount
-	if bg, ok := tmp["budget"].(float64); ok {
-		if bg >= 50000000000 {
-			tmp["budget_max_err"] = bg
-			delete(tmp, "budget")
-		}
-	}
-	if bg, ok := tmp["bidamount"].(float64); ok && bg >= 50000000000 {
-		code := qu.ObjToString(tmp["spidercode"])
-		if bg >= 50000000000 && code != "xz_xzzzqjzscjgycxxxpt_zbtzs" &&
-			code != "js_jsszbtbw_zbhxrgs" {
-			tmp["bidamount_max_err"] = bg
-			delete(tmp, "bidamount")
-		}
-	}
+
 	//投标方式-
 	bidway := qu.IntAll(tmp["bidway"])
 	if bidway == 1 {
@@ -2742,8 +2732,6 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 		}
 	}
 
-	//return tmp
-
 	//针对拟建单位~需要验证~各种字段优先级
 	if qu.ObjToString(tmp["toptype"]) == "拟建" &&
 		qu.ObjToString(tmp["subtype"]) == "拟建" {

+ 98 - 32
src/jy/extract/extractInit.go

@@ -2,6 +2,7 @@
 package extract
 
 import (
+	"fmt"
 	"github.com/sensitive"
 	"gopkg.in/mgo.v2/bson"
 	db "jy/mongodbutil"
@@ -112,7 +113,12 @@ type ExtractTask struct {
 	DistrictSimGet *ju.DFA //区或县简称
 	StreetGet      *ju.DFA //街道
 
-	XjbtCityArr           []map[string]interface{} //新疆兵团相关数据
+	XjbtCityArr []map[string]interface{} //新疆兵团相关数据
+	//标准化地域信息
+	S_ProvinceDict map[string][]S_Province //标准省份-map
+	S_CityDict     map[string][]S_City     //标准城市-map
+	S_DistrictDict map[string][]S_District //标准区县-map
+
 	SensitiveFullCity     *sensitive.Filter
 	SensitiveSimCity      *sensitive.Filter
 	SensitiveFullDistrict *sensitive.Filter
@@ -995,10 +1001,9 @@ func InitProvince(version string) map[string]interface{} {
 //加载所有
 func InitProvincesx() []map[string]interface{} {
 	defer qu.Catch()
-	provinces := make([]map[string]interface{}, 0)
-	ju.AddrsSess.Find(map[string]interface{}{
+	provinces, _ := ju.Qyxy_Mgo.Find("address_new_2020", map[string]interface{}{
 		"Remarks": nil,
-	}).All(&provinces)
+	}, nil, nil)
 	return provinces
 }
 
@@ -1033,6 +1038,61 @@ func (e *ExtractTask) InitXjbtCityInfo() {
 	e.XjbtCityArr = arr
 }
 
+// InitRegionInfo rebuilds the three standardized region dictionaries from the
+// "address_jy_2022" collection.
+//
+// Only documents without a town_code field are read. Each document is filed
+// under the most specific level it carries: a positive district_code goes into
+// S_DistrictDict (keyed by district name); otherwise a positive city_code goes
+// into S_CityDict (keyed by city name); otherwise the document is stored in
+// S_ProvinceDict (keyed by province name). A name that occurs several times
+// keeps every occurrence, in query order.
+//
+// The error returned by Find is discarded, so a failed query is not reported.
+func (e *ExtractTask) InitRegionInfo() {
+	defer qu.Catch()
+	e.S_ProvinceDict = make(map[string][]S_Province, 0)
+	e.S_CityDict = make(map[string][]S_City, 0)
+	e.S_DistrictDict = make(map[string][]S_District, 0)
+
+	rows, _ := ju.Qyxy_Mgo.Find("address_jy_2022", map[string]interface{}{
+		"town_code": map[string]interface{}{
+			"$exists": 0,
+		},
+	}, nil, nil)
+	for _, row := range rows {
+		districtCode := qu.IntAll(row["district_code"])
+		cityCode := qu.IntAll(row["city_code"])
+		province := qu.ObjToString(row["province"])
+		// append on a missing key starts from a nil slice, so no existence
+		// check is needed before adding an entry.
+		switch {
+		case districtCode > 0:
+			city := qu.ObjToString(row["city"])
+			district := qu.ObjToString(row["district"])
+			e.S_DistrictDict[district] = append(e.S_DistrictDict[district], S_District{province, city, district})
+		case cityCode > 0:
+			city := qu.ObjToString(row["city"])
+			e.S_CityDict[city] = append(e.S_CityDict[city], S_City{province, city})
+		default:
+			e.S_ProvinceDict[province] = append(e.S_ProvinceDict[province], S_Province{province})
+		}
+	}
+	log.Debug(fmt.Sprintf("城市配置加载完毕...省~%d 市~%d 区~%d", len(e.S_ProvinceDict), len(e.S_CityDict), len(e.S_DistrictDict)))
+}
+
 //站点加载...
 func (e *ExtractTask) InitUpdateSite() {
 	defer qu.Catch()
@@ -1056,15 +1116,16 @@ func (e *ExtractTask) InitCityInfo() {
 	e.InitVar() //初始化变量
 	//新疆兵团数据
 	e.InitXjbtCityInfo()
+	//标准地域信息
+	e.InitRegionInfo()
 	//site站点信息
 	e.InitUpdateSite()
 	//初始化省信息
 	fn1 := InitProvince(e.TaskInfo.Version)
 	for k, v := range fn1 {
-		for _, p := range v.([]interface{}) {
-			p1, _ := p.(string)
-			e.Trie_Full_Province.AddWords(p1) //华中科技大学
-			e.ProvinceMap[p1] = k             //华中科技大学:湖北
+		for _, p := range ju.ConvertInterface(v) {
+			e.Trie_Full_Province.AddWords(p) //华中科技大学
+			e.ProvinceMap[p] = k             //华中科技大学:湖北
 		}
 	}
 	alldata := InitProvincesx()
@@ -1123,10 +1184,9 @@ func (e *ExtractTask) InitCityInfo() {
 		e.Trie_Sim_Province.AddWords(jc_province) //加入省简称Trie(k:浙江)
 		e.ProvinceMap[all_province] = jc_province //浙江省:浙江
 		e.ProvinceBriefMap[jc_province] = p       //浙江:省信息{}
-		if province_alias, ok := provinces["province_alias"].([]interface{}); ok {
-			for _, vprovince_alias := range province_alias {
-				e.ProvinceBriefMap[qu.ObjToString(vprovince_alias)] = p
-			}
+		province_alias := ju.ConvertInterface(provinces["province_alias"])
+		for _, vprovince_alias := range province_alias {
+			e.ProvinceBriefMap[qu.ObjToString(vprovince_alias)] = p
 		}
 		//加载市信息
 		citys := citys_maps[jc_province]
@@ -1147,17 +1207,17 @@ func (e *ExtractTask) InitCityInfo() {
 				e.CityBriefMap[c.Brief] = c  //杭州:市信息{}
 			}
 			c.P = p
-			if city_alias, ok := vcity["city_alias"].([]interface{}); ok {
-				for _, vcity_alias := range city_alias {
-					strvcity_alias := qu.ObjToString(vcity_alias)
-					if isok[jc_province+"_"+strvcity_alias] {
-						continue
-					}
-					e.CityBriefMap[strvcity_alias] = c
-					e.initDistricts(jc_province, strvcity_alias, c, jc_city, districts_maps, towns_maps, jwhs_maps)
-					isok[jc_province+"_"+strvcity_alias] = true
+			city_alias := ju.ConvertInterface(vcity["city_alias"])
+			for _, vcity_alias := range city_alias {
+				strvcity_alias := qu.ObjToString(vcity_alias)
+				if isok[jc_province+"_"+strvcity_alias] {
+					continue
 				}
+				e.CityBriefMap[strvcity_alias] = c
+				e.initDistricts(jc_province, strvcity_alias, c, jc_city, districts_maps, towns_maps, jwhs_maps)
+				isok[jc_province+"_"+strvcity_alias] = true
 			}
+
 			if isok[jc_province+"_"+qc_city] {
 				continue
 			}
@@ -1203,17 +1263,17 @@ func (e *ExtractTask) initDistricts(jc_province string, qc_city string, c *City,
 		} else {
 			e.DistrictCityMap[qc_district] = append(e.DistrictCityMap[qc_district], c)
 		}
-		if district_alias, ok := vdistricts["district_alias"].([]interface{}); ok {
-			for _, vdistrict_alias := range district_alias {
-				strvdistrict_alias := qu.ObjToString(vdistrict_alias)
-				e.Trie_Full_District.AddWords(strvdistrict_alias) //加入区或县全称Trie
-				c_tmp := e.DistrictCityMap[strvdistrict_alias]
-				if len(c_tmp) == 0 {
-					tmpcarr := []*City{c}
-					e.DistrictCityMap[strvdistrict_alias] = tmpcarr
-				} else {
-					e.DistrictCityMap[strvdistrict_alias] = append(e.DistrictCityMap[strvdistrict_alias], c)
-				}
+
+		district_alias := ju.ConvertInterface(vdistricts["district_alias"])
+		for _, vdistrict_alias := range district_alias {
+			strvdistrict_alias := qu.ObjToString(vdistrict_alias)
+			e.Trie_Full_District.AddWords(strvdistrict_alias) //加入区或县全称Trie
+			c_tmp := e.DistrictCityMap[strvdistrict_alias]
+			if len(c_tmp) == 0 {
+				tmpcarr := []*City{c}
+				e.DistrictCityMap[strvdistrict_alias] = tmpcarr
+			} else {
+				e.DistrictCityMap[strvdistrict_alias] = append(e.DistrictCityMap[strvdistrict_alias], c)
 			}
 		}
 		//街道乡镇
@@ -1281,6 +1341,12 @@ func (e *ExtractTask) InitVar() {
 	e.StreetDistrictMap = make(map[string][]*District)
 	//新疆兵团-数组
 	e.XjbtCityArr = make([]map[string]interface{}, 0)
+
+	//标准化地域信息
+	e.S_ProvinceDict = make(map[string][]S_Province, 0)
+	e.S_CityDict = make(map[string][]S_City, 0)
+	e.S_DistrictDict = make(map[string][]S_District, 0)
+
 	//敏感词-筛选
 	e.SensitiveFullCity = sensitive.New()
 	e.SensitiveSimCity = sensitive.New()

+ 14 - 0
src/jy/extract/extractcity.go

@@ -60,3 +60,17 @@ var AgencyReg = []*regexp.Regexp{
 	regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"),
 	regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"),
 }
+
+// Standardized region records, one type per administrative level.
+// Field order is significant: the loader builds these with positional
+// composite literals (province, city, district).
+type (
+	// S_Province is a province-level entry.
+	S_Province struct {
+		P_Name string // province name
+	}
+
+	// S_City is a city-level entry together with its province.
+	S_City struct {
+		P_Name string // province name
+		C_Name string // city name
+	}
+
+	// S_District is a district-level entry together with its province and city.
+	S_District struct {
+		P_Name string // province name
+		C_Name string // city name
+		D_Name string // district name
+	}
+)

+ 287 - 0
src/jy/extract/extractcity_clean.go

@@ -0,0 +1,287 @@
+package extract
+
+import (
+	"fmt"
+	ju "jy/util"
+	qu "qfw/util"
+	"regexp"
+	"strings"
+)
+
+var (
+	// cityEndReg matches a trailing administrative-level suffix: 区, 县 or 市.
+	cityEndReg = regexp.MustCompile("(区|县|市)$")
+	// ErrBuyerReg matches buyer names that start with "成都东部新区".
+	ErrBuyerReg = regexp.MustCompile("^(成都东部新区)")
+)
+
+// GetCheckFinallyRegionInfo decides the final area/city/district corrections for
+// one extracted record and writes them into update_check.
+//
+// tmp is the extracted record; its "area", "city", "district" and "buyer" fields
+// are read. update_check receives the corrected fields and, where a correction
+// was logged, a "modifycheck" entry describing each change.
+//
+// Order of decisions:
+//  1. A buyer matching ErrBuyerReg whose area is "浙江" is forced to 四川 / 成都市
+//     with an empty district.
+//  2. When all three levels are present (and area is not "全国"), or there is no
+//     buyer, only the standardized dictionaries are consulted.
+//  3. Otherwise the enterprise table fills the gaps first and the merged result
+//     is validated against the standardized dictionaries; if the enterprise
+//     table yields nothing, step 2's check is applied instead.
+func (e *ExtractTask) GetCheckFinallyRegionInfo(tmp map[string]interface{}, update_check *map[string]interface{}) {
+	area, city := qu.ObjToString(tmp["area"]), qu.ObjToString(tmp["city"])
+	district, buyer := qu.ObjToString(tmp["district"]), qu.ObjToString(tmp["buyer"])
+
+	// Hard-coded override for a known mis-assignment.
+	if buyer != "" && ErrBuyerReg.MatchString(buyer) && area == "浙江" {
+		(*update_check)["area"] = "四川"
+		(*update_check)["city"] = "成都市"
+		(*update_check)["district"] = ""
+		return
+	}
+
+	// standardOnly validates the extracted triple as-is and records any fixes.
+	standardOnly := func() {
+		if fixes := e.StandardCheckCity(area, city, district); len(fixes) > 0 {
+			copyUpdateData(updateLogging(tmp, fixes, "标准信息"), update_check)
+		}
+	}
+
+	complete := district != "" && city != "" && area != "" && area != "全国"
+	if complete || buyer == "" {
+		standardOnly()
+		return
+	}
+
+	// Fill missing levels from the enterprise table.
+	entFixes := cityMarshal(tmp)
+	if len(entFixes) == 0 {
+		standardOnly()
+		return
+	}
+
+	mergedArea, mergedCity, mergedDistrict := area, city, district
+	if entFixes["area"] != "" {
+		mergedArea = qu.ObjToString(entFixes["area"])
+	}
+	if entFixes["city"] != "" {
+		mergedCity = qu.ObjToString(entFixes["city"])
+	}
+	if entFixes["district"] != "" {
+		mergedDistrict = qu.ObjToString(entFixes["district"])
+	}
+
+	umap := updateLogging(tmp, entFixes, "企业信息")
+	// Corrections from the standardized check override the enterprise values.
+	for rk, rv := range e.StandardCheckCity(mergedArea, mergedCity, mergedDistrict) {
+		umap[rk] = rv
+		umap["modifycheck"].(map[string]interface{})[rk] = fmt.Sprintf("企业标准信息~%s~%s", qu.ObjToString(tmp[rk]), rv)
+	}
+	copyUpdateData(umap, update_check)
+}
+
+// cityMarshal proposes region values for a record by looking its buyer up in the
+// enterprise table ("qyxy_std", matched on company_name).
+//
+// data is the extracted record; its "buyer", "area", "city" and "district"
+// fields are read. The returned map contains only the levels to fill in
+// ("area", "city", "district"); it is empty when the buyer is not found, when
+// the enterprise document has fewer than two fields, or when nothing applies.
+//
+// Two sources are tried in order: the "address" document whose code equals the
+// first six characters of company_code, then the enterprise document's own
+// company_area / company_city / company_district fields. Existing values of the
+// record are never contradicted: a lower level is filled only when the levels
+// above it are empty or agree with the source.
+func cityMarshal(data map[string]interface{}) map[string]string {
+	buyer := qu.ObjToString(data["buyer"])
+	bidarea := qu.ObjToString(data["area"])
+	bidcity := qu.ObjToString(data["city"])
+	biddistrict := qu.ObjToString(data["district"])
+	rdata := make(map[string]string)
+	query_name := map[string]interface{}{
+		"company_name": buyer,
+	}
+	tmp := ju.Qyxy_Mgo.FindOne("qyxy_std", query_name)
+	if tmp == nil || len(tmp) < 2 {
+		return rdata
+	}
+	// Source 1: region derived from the six-character prefix of company_code.
+	company_code := fmt.Sprint(tmp["company_code"])
+	if len(company_code) > 5 {
+		province_city_district := ju.Qyxy_Mgo.FindOne("address", map[string]interface{}{
+			"code": company_code[:6],
+		})
+		// fmt.Sprint renders a missing/nil Remarks as "<nil>", so only an explicit
+		// empty string or one of the two markers below takes the skip branch.
+		// NOTE(review): confirm that an empty-string Remarks is really meant to be skipped.
+		remarks := fmt.Sprint((province_city_district)["Remarks"])
+		if remarks == "" || remarks == "废除" || remarks == "已作废" {
+			// Address entry is flagged (abolished / invalidated): ignore it.
+		} else if province_city_district != nil {
+			codeprovince := qu.ObjToString((province_city_district)["province"])
+			codecity := qu.ObjToString((province_city_district)["city"])
+			codedistrict := qu.ObjToString((province_city_district)["district"])
+			if bidarea == "" || bidarea == "全国" {
+				// No usable province on the record: take the whole chain, skipping
+				// a level whose name merely repeats the level above it.
+				if codeprovince != "" {
+					rdata["area"] = codeprovince
+					if codecity != "" && codecity != codeprovince {
+						rdata["city"] = codecity
+						if codedistrict != "" && codedistrict != codecity {
+							rdata["district"] = codedistrict
+						}
+					}
+				}
+			} else if bidcity == "" && codecity != "" && bidarea == codeprovince {
+				// Province agrees, city missing: fill city and, if distinct, district.
+				if codecity != bidarea {
+					rdata["city"] = codecity
+					if codedistrict != "" && codecity != codedistrict {
+						rdata["district"] = codedistrict
+					}
+				}
+			} else if biddistrict == "" && codedistrict != "" && bidarea == codeprovince && codecity == bidcity {
+				// Province and city agree, district missing: fill district only.
+				rdata["district"] = codedistrict
+			}
+
+			// A city or district from the code is final. A province-only result
+			// falls through, so the enterprise fields below may still refine it.
+			if rdata["city"] != "" || rdata["district"] != "" {
+				return rdata
+			}
+		}
+	}
+
+	// Source 2: region fields stored on the enterprise document itself.
+	entprovince := qu.ObjToString(tmp["company_area"])
+	entcity := qu.ObjToString(tmp["company_city"])
+	entdistrict := qu.ObjToString(tmp["company_district"])
+
+	// Special case: Hong Kong / Macau / Taiwan on the record are treated like an
+	// unset province and may be replaced by the enterprise's province.
+	if bidarea == "" || bidarea == "香港" || bidarea == "澳门" || bidarea == "台湾" || bidarea == "全国" {
+		if entprovince != "" {
+			rdata["area"] = entprovince
+			if entcity != "" && entcity != entprovince {
+				rdata["city"] = entcity
+				if entdistrict != "" && entdistrict != entcity {
+					rdata["district"] = entdistrict
+				}
+			}
+		}
+	} else if bidcity == "" && entcity != "" && entprovince == bidarea {
+		// Unlike the code-based branch above, the city is not compared with the
+		// province name here before it is accepted.
+		rdata["city"] = entcity
+		if entdistrict != "" && entcity != entdistrict {
+			rdata["district"] = entdistrict
+		}
+	} else if biddistrict == "" && entdistrict != "" && entprovince == bidarea && bidcity == entcity {
+		rdata["district"] = entdistrict
+	}
+
+	return rdata
+}
+
+// StandardCheckCity validates an extracted (area, city, district) triple against
+// the standardized region dictionaries and returns the fields that must change.
+//
+// The result maps "area", "city" and/or "district" to their corrected value; an
+// empty string means the field should be cleared, and an empty map means the
+// triple is accepted as-is. Hong Kong, Macau and Taiwan, and a bare "全国" with
+// no city or district, are never corrected.
+//
+// The check runs from the most specific level upwards:
+//  1. District: accepted if a dictionary entry agrees on city and province. An
+//     unknown name is retried with the other level suffixes (区/县/市). A known
+//     name that is unique nationwide overrides city and province; an ambiguous
+//     one is cleared.
+//  2. City: same idea against the city dictionary. A "city" that is not a known
+//     city is looked up as a district name before being cleared.
+//  3. Province: an unknown province resets everything to "全国".
+func (e *ExtractTask) StandardCheckCity(area string, city string, district string) map[string]string {
+
+	rdata := make(map[string]string, 0)
+	if area == "香港" || area == "澳门" || area == "台湾" || (area == "全国" && (city == "" && district == "")) {
+		return rdata
+	}
+	// Step 1: district check.
+	if district != "" {
+		districtArr := e.S_DistrictDict[district]
+		if districtArr == nil {
+			// Unknown district name: try the same stem with the other suffixes.
+			for _, alias_district := range aliasDataDistrict(district) {
+				for _, v := range e.S_DistrictDict[alias_district] {
+					if city == v.C_Name && area == v.P_Name {
+						rdata["district"] = alias_district
+						return rdata
+					}
+				}
+			}
+			rdata["district"] = ""
+		} else {
+			for _, v := range districtArr {
+				if city == v.C_Name && area == v.P_Name {
+					// Exact match on all three levels: nothing to correct.
+					return rdata
+				}
+			}
+			if len(districtArr) == 1 {
+				// The district name is unique, so it determines city and province.
+				rdata["area"] = districtArr[0].P_Name
+				rdata["city"] = districtArr[0].C_Name
+				rdata["district"] = districtArr[0].D_Name
+				return rdata
+			}
+			// Ambiguous district name that disagrees with city/province: drop it.
+			rdata["district"] = ""
+		}
+	}
+
+	// Step 2: the district check did not settle it, so check the city.
+	if city != "" {
+		cityArr := e.S_CityDict[city]
+		if cityArr == nil {
+			// Treat the "city" as a district name and match on all three levels.
+			for _, v := range e.S_DistrictDict[city] {
+				if city == v.C_Name && area == v.P_Name {
+					// Use the entry that actually matched; with several districts
+					// of the same name, the first entry may belong to another
+					// province or city.
+					rdata["area"] = v.P_Name
+					rdata["city"] = v.C_Name
+					rdata["district"] = v.D_Name
+					return rdata
+				}
+			}
+			rdata["city"] = ""
+		} else {
+			for _, v := range cityArr {
+				if area == v.P_Name {
+					// City and province agree; rdata may still carry a cleared
+					// district from step 1.
+					return rdata
+				}
+			}
+			if len(cityArr) == 1 {
+				// The city name is unique, so it determines the province.
+				rdata["area"] = cityArr[0].P_Name
+				rdata["city"] = cityArr[0].C_Name
+				rdata["district"] = ""
+				return rdata
+			}
+			rdata["city"] = ""
+		}
+	}
+
+	// Step 3: province check.
+	if e.S_ProvinceDict[area] == nil {
+		rdata["area"] = "全国"
+		rdata["city"] = ""
+		rdata["district"] = ""
+	}
+
+	return rdata
+}
+
+// updateLogging builds the update document for a set of region corrections.
+//
+// Every key of rdata is copied into the result, and a "<desc>~<old>~<new>" line
+// is recorded for it under "modifycheck". When tmp already carries a
+// "modifycheck" value it is reused (and therefore modified in place); it must
+// then be a map[string]interface{}, otherwise the first recorded change panics.
+func updateLogging(tmp map[string]interface{}, rdata map[string]string, desc string) map[string]interface{} {
+	umap := map[string]interface{}{"modifycheck": tmp["modifycheck"]}
+	if umap["modifycheck"] == nil {
+		umap["modifycheck"] = make(map[string]interface{})
+	}
+	for field, value := range rdata {
+		umap[field] = value
+		// The assertion stays inside the loop so an empty rdata never evaluates it.
+		changes := umap["modifycheck"].(map[string]interface{})
+		changes[field] = fmt.Sprintf("%s~%s~%s", desc, qu.ObjToString(tmp[field]), value)
+	}
+	return umap
+}
+
+// copyUpdateData copies every entry of tmp into the map that update_check points
+// to, overwriting existing keys. update_check is not dereferenced when tmp is
+// empty.
+func copyUpdateData(tmp map[string]interface{}, update_check *map[string]interface{}) {
+	if len(tmp) == 0 {
+		return
+	}
+	dst := *update_check
+	for key, value := range tmp {
+		dst[key] = value
+	}
+}
+
+// aliasDataDistrict returns alternative spellings of a district name that differ
+// only in the administrative-level suffix (区, 县, 市).
+//
+// When district ends in one of the three suffixes, the result is the stem
+// combined with the other two suffixes. When it carries no suffix, the result is
+// the name with each suffix appended. Callers take the first candidate that
+// matches, so the order of the returned slice is significant.
+func aliasDataDistrict(district string) []string {
+	suffix := cityEndReg.FindString(district)
+	if suffix == "" {
+		// No level suffix at all, e.g. district "金水".
+		return []string{district + "区", district + "县", district + "市"}
+	}
+	// TrimSuffix removes exactly one occurrence. TrimRight would treat the
+	// suffix as a character set and strip every trailing repeat of that rune.
+	stem := strings.TrimSuffix(district, suffix)
+	switch suffix {
+	case "县":
+		return []string{stem + "区", stem + "市"}
+	case "区":
+		return []string{stem + "县", stem + "市"}
+	case "市":
+		return []string{stem + "县", stem + "区"}
+	}
+	return []string{}
+}

+ 20 - 10
src/jy/extract/extractcity_new.go

@@ -7,6 +7,18 @@ import (
 	"strings"
 )
 
+// ExtractRegionClean runs the standardized region check on an extracted record
+// and writes the resulting corrections back into it.
+//
+// Only the "area", "city" and "district" corrections are copied into *tmp; any
+// other key produced by the check is not copied.
+func (e *ExtractTask) ExtractRegionClean(tmp *map[string]interface{}) {
+	record := *tmp
+	fixes := make(map[string]interface{}, 0)
+	e.GetCheckFinallyRegionInfo(record, &fixes)
+	for _, field := range []string{"area", "city", "district"} {
+		if value, ok := fixes[field]; ok {
+			record[field] = value
+		}
+	}
+}
+
 //抽取地域信息
 func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}, isLog bool) {
 	defer qu.Catch()
@@ -362,16 +374,14 @@ func (e *ExtractTask) NewVerifyXjCorpsInfo(buyer string) (new_a, new_c, new_d st
 			new_c = qu.ObjToString(info["city"])
 			new_d = qu.ObjToString(info["district"])
 			ok = true
-			if res, ok := info["list"].([]interface{}); ok {
-				list := qu.ObjArrToMapArr(res)
-				for _, c := range list {
-					c_name := qu.ObjToString(c["name"])
-					if strings.Contains(buyer, name+c_name) || strings.Contains(buyer, alias+c_name) {
-						new_a = qu.ObjToString(c["area"])
-						new_c = qu.ObjToString(c["city"])
-						new_d = qu.ObjToString(c["district"])
-						break
-					}
+			list := ju.IsMarkInterfaceMap(info["list"])
+			for _, c := range list {
+				c_name := qu.ObjToString(c["name"])
+				if strings.Contains(buyer, name+c_name) || strings.Contains(buyer, alias+c_name) {
+					new_a = qu.ObjToString(c["area"])
+					new_c = qu.ObjToString(c["city"])
+					new_d = qu.ObjToString(c["district"])
+					break
 				}
 			}
 			break

+ 2 - 4
src/jy/extract/extractcity_way.go

@@ -593,15 +593,13 @@ func LinkSuppleRules(regions map[string]map[string]map[string]string, area *stri
 			}
 		}
 	}
-
 }
 
 //链路补充~企业校验步骤
 func LinkSpecialQyxyStep(buyer string, area *string, city *string, district *string) {
-	qyxy_arr := make([]map[string]interface{}, 0)
-	ju.QyxySess.Find(map[string]interface{}{
+	qyxy_arr, _ := ju.Qyxy_Mgo.Find("qyxy_std", map[string]interface{}{
 		"company_name": buyer,
-	}).All(&qyxy_arr)
+	}, nil, nil)
 	if qyxy_arr != nil && len(qyxy_arr) > 0 {
 		for _, v := range qyxy_arr {
 			qy_area := qu.ObjToString(v["company_area"])

+ 7 - 8
src/jy/extract/extractudp.go

@@ -33,7 +33,7 @@ func ExtractUdpUpdateMachine() {
 
 //udp通知抽取
 func ExtractUdp() {
-	nextNodes = qu.ObjArrToMapArr(ju.Config["nextNode"].([]interface{}))
+	nextNodes = ju.IsMarkInterfaceMap(ju.Config["nextNode"])
 	Udpclient = mu.UdpClient{Local: ":" + qu.ObjToString(ju.Config["udpport"]), BufSize: 1024}
 	Udpclient.Listen(processUdpMsg)
 }
@@ -69,6 +69,8 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 			} else if stype == "heart_extract" {
 				skey, _ := rep["skey"].(string)
 				Udpclient.WriteUdp([]byte(skey), mu.OP_NOOP, ra)
+			} else if stype == "update_rule" {
+				ju.IsUpdateRule = true
 			} else {
 				sid, _ := rep["gtid"].(string)
 				eid, _ := rep["lteid"].(string)
@@ -123,7 +125,8 @@ var ext *ExtractTask
 //根据id区间抽取-udp模式
 func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 	defer qu.Catch()
-	if ext == nil {
+	if ext == nil || ju.IsUpdateRule {
+		ju.IsUpdateRule = false
 		ext = &ExtractTask{}
 		ext.Id = qu.ObjToString(ju.Config["udptaskid"])
 		ext.InitTaskInfo()
@@ -157,15 +160,11 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 		ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
 		ext.ResultSave(true)
 		ext.BidSave(true)
-		ext.IsRun = true
 		ext.InitFile()
+		ext.IsRun = true
+		ext.BidTotal = 0
 	} else {
 		ext.BidTotal = 0
-		if ju.IsUpdateSite && ext.IsExtractCity {
-			log.Debug()
-			ext.InitUpdateSite()
-			ju.IsUpdateSite = false
-		}
 	}
 	index := 0
 	if len(instanceId) > 0 { //分布式抽取进度

+ 26 - 0
src/jy/pretreated/analystep.go

@@ -462,6 +462,15 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job, i
 			tablePackage[v] = blockPackage
 		}
 	}
+
+	//校验分包出来的候选人是否合理
+	if !isorderwiner && len(job.Winnerorder) > 0 {
+		if !verifyPackageWinnerOrder(job.Winnerorder) {
+			isorderwiner = true
+			job.Winnerorder = []map[string]interface{}{}
+		}
+	}
+
 	//处理中标人排序
 	if isorderwiner {
 		tmpWins := make(map[string]int)
@@ -601,6 +610,23 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job, i
 	}
 }
 
+// verifyPackageWinnerOrder reports whether a candidate list looks plausible.
+//
+// The list is accepted when each entry's "sort" value equals its 1-based
+// position and no non-empty "entname" occurs twice. Entries with an empty
+// "entname" are never counted as duplicates of each other.
+func verifyPackageWinnerOrder(wins []map[string]interface{}) bool {
+	seen := make(map[string]bool)
+	for pos, win := range wins {
+		if qu.IntAll(win["sort"]) != pos+1 {
+			return false
+		}
+		name := qu.ObjToString(win["entname"])
+		if name == "" {
+			continue
+		}
+		if seen[name] {
+			return false
+		}
+		seen[name] = true
+	}
+	return true
+}
+
 //一行多列 一列多行,按照分块逻辑处理
 //ration==1 遍历所有tabs,ration!=1 tabs只有一个
 func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {

+ 3 - 1
src/jy/pretreated/tablev2.go

@@ -252,7 +252,9 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite, tag str
 				}
 			}
 			if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
-				td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
+				if len(ts.WinnerOrder) != 3 { //覆盖候选人~方法调整
+					td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
+				}
 			}
 			if sonts.IsMultiPackage {
 				td.TR.Table.BPackage = true

+ 26 - 9
src/jy/util/util.go

@@ -4,7 +4,6 @@ import (
 	"fmt"
 	"github.com/cron"
 	"go.mongodb.org/mongo-driver/bson/primitive"
-	"gopkg.in/mgo.v2"
 	. "jy/mongodbutil"
 	qu "qfw/util"
 	"regexp"
@@ -43,10 +42,9 @@ var BrandGet *DFA     //品牌
 var IsBrandGoods bool //是否开启品牌抽取
 
 var SaveResult, FieldsFind, IsSaveTag, SaveBlock, QualityAudit, Ffield, Inscribe bool
-var AddrsSess, QyxySess *mgo.Collection
-var Site_Mgo *MongodbSim
+var Site_Mgo, Qyxy_Mgo *MongodbSim
 
-var IsUpdateSite bool
+var IsUpdateRule bool
 var DefaultRegions, AdjustmentRegions = []string{}, []string{}
 
 func init() {
@@ -58,10 +56,8 @@ func UtilInit() {
 	addr := qu.ObjToString(Config["mgodb"])
 	dbname := qu.ObjToString(Config["dbname"])
 	Mgo = MgoFactory(initCap, initCap*3, 120, addr, dbname)
-	AddrsSess = Mgo.Get().DB(qu.ObjToString(Config["dbname_addrs"])).C(qu.ObjToString(Config["dbname_addrs_c"]))
-	QyxySess = Mgo.Get().DB(qu.ObjToString(Config["dbname_addrs"])).C("qyxy_std")
 
-	//站点爬虫库
+	//站点爬虫库~
 	Site_Mgo = &MongodbSim{
 		MongodbAddr: qu.ObjToString(Config["site_addr"]),
 		DbName:      qu.ObjToString(Config["site_dbname"]),
@@ -71,6 +67,15 @@ func UtilInit() {
 	}
 	Site_Mgo.InitPool()
 
+	Qyxy_Mgo = &MongodbSim{
+		MongodbAddr: qu.ObjToString(Config["qyxy_addr"]),
+		DbName:      qu.ObjToString(Config["qyxy_dbname"]),
+		Size:        5,
+		UserName:    qu.ObjToString(Config["qyxy_username"]),
+		Password:    qu.ObjToString(Config["qyxy_password"]),
+	}
+	Qyxy_Mgo.InitPool()
+
 	SaveResult, _ = Config["saveresult"].(bool)
 	FieldsFind, _ = Config["fieldsfind"].(bool)
 	IsSaveTag, _ = Config["iscltlog"].(bool)
@@ -87,10 +92,10 @@ func UtilInit() {
 	AdjustmentRegions = ConvertInterface(RegionsConfig["adjustment_regions"])
 
 	//定时更新站点信息
-	IsUpdateSite = false
+	IsUpdateRule = false
 	c := cron.New()
 	c.AddFunc("0 0 8 * * ?", func() {
-		IsUpdateSite = true
+		IsUpdateRule = true
 	})
 	c.Start()
 
@@ -249,3 +254,15 @@ func ConvertInterface(t interface{}) []string {
 	}
 	return p_list
 }
+
+// IsMarkInterfaceMap converts t into a slice of maps via qu.ObjArrToMapArr
+// when t holds a primitive.A or a []interface{}. For any other dynamic type
+// (including a nil interface) it returns an empty, non-nil slice.
+func IsMarkInterfaceMap(t interface{}) []map[string]interface{} {
+	switch items := t.(type) {
+	case primitive.A:
+		return qu.ObjArrToMapArr(items)
+	case []interface{}:
+		return qu.ObjArrToMapArr(items)
+	}
+	return []map[string]interface{}{}
+}

+ 9 - 21
src/main.go

@@ -33,35 +33,23 @@ func main() {
 	extract.ExtractUdp()              //udp通知抽取
 	go Router.Run(":" + qu.ObjToString(u.Config["port"]))
 	go log.Debug("启动..", qu.ObjToString(u.Config["port"]))
+
 	go func() {
 		http.ListenAndServe("localhost:10000", nil)
 	}()
+
 	lock := make(chan bool)
 	<-lock
 }
 
 // testMain is a scratch routine for verifying rules: it runs one regular expression against a hard-coded sample text and logs the result.
 func testMain() {
-	var FilialeReg1 = regexp.MustCompile("(.{1,3})分(公司|院|校|行)$")
-	var FilialeReg2 = regexp.MustCompile(".*[((](.*)[))].*")
-	var FilialeReg3 = regexp.MustCompile(".*(集团|公司)(.*)公司")
+	new_detail := `4、成交单位于成交通知书发出后30日内与用户单位签订合同。
+上海大学
+2023-01-04
+点击下载《成交公告》`
+	var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{2,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会|体]))\n([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)") // Shape: newline, optional whitespace, a name starting with a CJK character and ending in one of the listed suffixes, newline, optional whitespace, then a date (N年N月N日 or N-N-N). NOTE(review): [会|体] is a character class, so it also accepts a literal "|"; [会体] was probably intended.
+	new_str := inscribe_entity_1.FindString(new_detail) // Leftmost match, or "" when nothing matches; on this sample that is the 上海大学 / 2023-01-04 line pair.
+	log.Debug(new_str) // The match is only logged, never asserted.
 
-	/*
-		1.**分公司、**分院、**分校、**分行前三个字符
-		2.括号里的字符
-		3.**集团**公司中间的字符、**公司**公司中间的字符
-	*/
-	buyer := "**公司河北公司"
-	if FilialeReg1.MatchString(buyer) {
-		new_str := FilialeReg1.FindString(buyer)
-		log.Debug(new_str)
-	}
-	if FilialeReg2.MatchString(buyer) {
-		new_str := FilialeReg2.ReplaceAllString(buyer, "${1}")
-		log.Debug(new_str)
-	}
-	if FilialeReg3.MatchString(buyer) {
-		new_str := FilialeReg3.ReplaceAllString(buyer, "${2}")
-		log.Debug(new_str)
-	}
 }

+ 4 - 18
src/mark

@@ -4,26 +4,12 @@
     "mgodb": "SJZY_RWExt_Other:SJZY%40E3X4t5O8th@172.17.145.163:27083",
     "dbsize": 3,
     "dbname": "extract_2021",
-    "dbname_addrs": "mixdata",
-    "dbname_addrs_c": "address_new_2020",
     "site_addr": "172.17.4.87:27080",
     "site_dbname": "editor",
-    "redis": "qyk_redis=172.17.4.87:1479",
-    "elasticsearch": "http://172.17.145.170:9800",
-    "elasticsearch_index": "winner_enterprise",
-    "elasticsearch_type": "winnerent",
-    "elasticsearch_db": "winner_enterprise",
-    "elasticsearch_buyer_index": "buyer_enterprise",
-    "elasticsearch_buyer_type": "buyerent",
-    "elasticsearch_buyer_db": "buyer_enterprise",
-    "elasticsearch_agency_index": "agency_enterprise",
-    "elasticsearch_agency_type": "agencyent",
-    "elasticsearch_agency_db": "agency_enterprise",
-    "redis_qyk": "qyk_redis",
-    "redis_winner_db": "1",
-    "redis_buyer_db": "2",
-    "redis_agency_db": "3",
-    "elasticPoolSize": 10,
+    "qyxy_addr": "172.17.145.163:27083,172.17.4.187:27082",
+    "qyxy_dbname": "mixdata",
+    "qyxy_username": "zhengkun",
+    "qyxy_password": "zk@123123",
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
     "saveresult": false,

+ 1 - 1
udps/main.go

@@ -27,7 +27,7 @@ func main() {
 	flag.StringVar(&id2, "lteid", "92446f91923488e1724735de", "lteid")
 
 	flag.StringVar(&ids, "ids", "", "id1,id2")
-	flag.StringVar(&stype, "stype", "biddingall", "stype,传递类型")
+	flag.StringVar(&stype, "stype", "", "stype,传递类型")
 	flag.StringVar(&bkey, "bkey", "", "bkey,加上此参数表示不生关键词和摘要")
 	flag.StringVar(&q, "q", "", "q查询语句\"{'':''}\",有q就不要gtid,lteid")
 	flag.StringVar(&param, "param", "", "param,生信息发布或其他索引时用双引号套单引号\"{'mgoaddr':'','d':'','c':'','index':'','type':''}\"")