Browse Source

字段规则-单位-备份

zhengkun 3 years ago
parent
commit
e8f83326b9

+ 9 - 4
data_quality/src/main.go

@@ -93,10 +93,15 @@ func startFieldScoreTask(mapInfo map[string]interface{}) {
 		}
 		//更新集合
 		update_dict := make(map[string]interface{},0)
-		b_s := buyerFieldScore(tmp)
-		w_s := winnerFieldScore(tmp)
-		update_dict["buyer"] = b_s
-		update_dict["winner"] = w_s
+
+		//subtype := qu.ObjToString(tmp["subtype"])
+		//b_score := buyerFieldScore(tmp)
+		//update_dict["buyer"] = b_score
+		//if subtype=="中标"||subtype=="成交"||subtype=="合同" {
+		//	w_score := winnerFieldScore(tmp)
+		//	update_dict["winner"] = w_score
+		//}
+
 
 		updateFieldScore = append(updateFieldScore, []map[string]interface{}{
 			map[string]interface{}{

+ 4 - 3
data_quality/src/method.go

@@ -9,8 +9,8 @@ import (
 
 // specHeadReg reports a suspicious beginning of an organisation name: either
 // one or two ASCII letters followed by at least six characters in the range
 // U+4E00–U+9FA5, or one of the listed generic words (某部, 州, 自治区, ...)
 // appearing at the very start of the string.
 var specHeadReg = regexp.MustCompile(
 	"^([a-zA-Z]{1,2}[\u4e00-\u9fa5]{6,}|某部|州|自治区|自治州|街道|名称|省|市|县|区|业绩|资格|中标|项目|预算单位)",
 )
 // unHanHeadReg matches when the first character of the string lies in the
 // range U+4E00–U+9FA5. Despite the "un" prefix it matches the positive case;
 // the scoring code negates the result to detect a non-Chinese first character.
 var unHanHeadReg = regexp.MustCompile(
 	"^([\u4e00-\u9fa5])",
 )
-var unConReg *regexp.Regexp = regexp.MustCompile("(园|政府|集团|公司|有限|合伙|企|院|学|局|处)")
-var unEndReg *regexp.Regexp = regexp.MustCompile("^.*(公司|学(校)?|博物馆|联合社|合作社|监狱|办公厅|电视台|集团|机构|企业|办公室|委员会|联社|厂|场|院|所|店|中心|局|站|城|处|行|科|部|队|联合(会|体)|工作室)$")
+var unConReg *regexp.Regexp = regexp.MustCompile("(园|政府|集团|公司|有限|合伙|企|院|学|局|处|校)")
+var unEndReg *regexp.Regexp = regexp.MustCompile("^.*(公司|学(校)?|博物馆|联合社|合作社|监狱|办公厅|电视台|集团|机构|企业|办公室|委员会|实验室|联社|厂|场|院|所|店|小|台|中心|局|站|城|馆|厅|处|行|科|部|队|联合(会|体)|工作室)$")
 // unenableReg1 matches a whole name (anchored at both ends) that is one of:
 //   - one or two characters in U+4E00–U+9FA5 followed by 责任/有限/有限股份/
 //     有限责任/实业 and then 公司;
 //   - anything ending in 某部, 先生, 女士 or 小姐;
 //   - exactly 工程技术处.
 var unenableReg1 = regexp.MustCompile(
 	"^([\u4e00-\u9fa5]{1,2}(责任|有限|有限股份|有限责任|实业)公司|.*(某部|先生|女士|小姐)|工程技术处)$",
 )
 // unenableReg2 matches names containing any token that marks the value as
 // unusable: question marks (ASCII or full-width), placeholder/markup residue
 // (#, xxxx, **, %, &#, &nbsp, --, &, `), or words such as 单位, 万元, 测试测试,
 // 注销, 法人, 美元 and the others listed in the alternation. The match is
 // unanchored, so the token may appear anywhere in the name.
 //
 // Two defects of the previous literal are fixed here:
 //   - a bare "?" alternative is not a valid regular expression (a repetition
 //     operator with nothing to repeat) and makes MustCompile panic at package
 //     initialisation; the full-width question mark "？" is used instead;
 //   - the text `\" +\n\t\"` was a pasted string-concatenation artifact that
 //     became part of the pattern, so the 法人 alternative could only match
 //     after a quote, spaces, a newline, a tab and another quote. It is now a
 //     plain 法人 alternative.
 //
 // The pattern stays an interpreted string because it contains a backtick.
 var unenableReg2 *regexp.Regexp = regexp.MustCompile("(\\?|？|单位|#|xxxx|\\*\\*|%|万元|设计企业|免费|代表|代码标识|盖电子|测试测试|删除|错误|吊销|注销|发起人|待清理|&#|护照号|身份证号|法人|&nbsp|国家拨入|借款|积累资金|认股人|--|、|&|`|美元)")
 // GSE is the package-level gse segmenter instance.
 var GSE *gse.Segmenter  = &gse.Segmenter{}       // word segmentation
@@ -26,9 +26,10 @@ func qyNameIsExistsQYXY(name string) bool{
 		"company_name": name,
 	}
 	data :=qy_mgo.FindOne(qy_coll_name,q)
-	if data==nil {
+	if data==nil || len(data)<2{
 		return false
 	}
+
 	return true
 }
 

+ 20 - 6
data_quality/src/scorebuyer.go

@@ -6,39 +6,49 @@ import (
 )
 
 
-func buyerFieldScore(tmp map[string]interface{}) (int64) {
+func buyerFieldScore(tmp map[string]interface{}) (map[string]interface{}) {
 	buyer := qu.ObjToString(tmp["buyer"])
 	agency := qu.ObjToString(tmp["agency"])
 	winner := qu.ObjToString(tmp["winner"])
 	score := int64(100)
-
+	reason:=""
 	/*错误项*/
-	if buyer=="" || !isIncludingHan(buyer) || utf8.RuneCountInString(buyer) < 4{
-		return 0
+	if (utf8.RuneCountInString(buyer) > 0 && utf8.RuneCountInString(buyer) < 4 ) ||
+		buyer=="" || !isIncludingHan(buyer) {
+		return map[string]interface{}{
+			"score":int64(0),
+			"reason":"错误项",
+		}
 	}
 	/*减分项*/
 	//1、企业库匹配
 	if !isTest && !qyNameIsExistsQYXY(buyer)  {
-		score -= 2
+		reason+="~企业-1"
+		score -= 1
 	}
 	//2、前缀校验
 	if specHeadReg.MatchString(buyer) || !unHanHeadReg.MatchString(buyer) {
+		reason+="~前缀-2"
 		score -= 2
 	}
 	//3、后缀校验
 	if unConReg.MatchString(buyer) || unEndReg.MatchString(buyer) {
 		if unenableReg1.MatchString(buyer) || unenableReg2.MatchString(buyer) {
+			reason+="~略特殊-2"
 			score -= 2
 		}
 	}else {
+		reason+="~后缀-2"
 		score -= 2
 	}
 	//4、与其他单位比对
 	if buyer==agency || buyer==winner {
+		reason+="~其他单位-2"
 		score -= 2
 	}
 	//5、中英文结合
 	if isIncludingOtherHan(buyer){
+		reason+="~非纯中文-2"
 		score -= 2
 	}
 
@@ -47,10 +57,14 @@ func buyerFieldScore(tmp map[string]interface{}) (int64) {
 	if len(buyer_jb_arr)>0 && buyer_jb_arr!=nil {
 		head_char := qu.ObjToString(buyer_jb_arr[0])
 		if utf8.RuneCountInString(head_char) == 1{
+			reason+="~分词-2"
 			score -= 2
 		}
 	}
 
-	return score
+	return map[string]interface{}{
+		"score":score,
+		"reason":reason,
+	}
 }
 

+ 25 - 11
data_quality/src/scorewinner.go

@@ -6,48 +6,58 @@ import (
 	"unicode/utf8"
 )
 
-func winnerFieldScore(tmp map[string]interface{}) (int64) {
+func winnerFieldScore(tmp map[string]interface{}) (map[string]interface{}) {
 	s_winner := qu.ObjToString(tmp["s_winner"])
 	buyer := qu.ObjToString(tmp["buyer"])
 	agency := qu.ObjToString(tmp["agency"])
-	subtype := qu.ObjToString(tmp["subtype"])
 	package_map:=*qu.ObjToMap(tmp["package"])
 	score := int64(100)
+	reason := ""
 	s_winner_arr := strings.Split(s_winner,",")
-	if len(s_winner_arr)!=len(package_map) {
-		score -= 2
+	if len(s_winner_arr)!=len(package_map) && len(package_map)>0 {
+		reason+="~分包量-1"
+		score -= 1
 	}
 
 	for _,winner:=range s_winner_arr{
 		/*错误项*/
-		if ((subtype=="中标"||subtype=="成交") && winner=="") || !isIncludingHan(buyer) ||
-			(utf8.RuneCountInString(winner) > 0 && utf8.RuneCountInString(winner) < 4 ) {
-			return 0
+		if (utf8.RuneCountInString(winner) > 0 && utf8.RuneCountInString(winner) < 4 ) ||
+			winner=="" || !isIncludingHan(winner) {
+			return map[string]interface{}{
+				"score":int64(0),
+				"reason":"错误项",
+			}
 		}
 
 		/*减分项*/
 		//1、企业库匹配
-		if !isTest && !qyNameIsExistsQYXY(buyer){
-			score -= 2
+		if !isTest && !qyNameIsExistsQYXY(winner){
+			reason+="~企业-1"
+			score -= 1
 		}
 		//2、前缀校验
 		if specHeadReg.MatchString(winner) || !unHanHeadReg.MatchString(winner) {
+			reason+="~前缀-2"
 			score -= 2
 		}
 		//3、后缀校验
 		if unConReg.MatchString(winner) || unEndReg.MatchString(winner) {
 			if unenableReg1.MatchString(winner) || unenableReg2.MatchString(winner) {
+				reason+="~略特殊-2"
 				score -= 2
 			}
 		}else {
+			reason+="~后缀-2"
 			score -= 2
 		}
 		//4、与其他单位比对
 		if (winner==agency || winner==buyer) && winner !="" {
+			reason+="~其他单位-2"
 			score -= 2
 		}
 		//5、中英文结合
-		if isIncludingOtherHan(buyer){
+		if isIncludingOtherHan(winner){
+			reason+="~非纯中文-2"
 			score -= 2
 		}
 
@@ -56,9 +66,13 @@ func winnerFieldScore(tmp map[string]interface{}) (int64) {
 		if len(winner_jb_arr)>0 && winner_jb_arr!=nil {
 			head_char := qu.ObjToString(winner_jb_arr[0])
 			if utf8.RuneCountInString(head_char) == 1{
+				reason+="~分词-2"
 				score -= 2
 			}
 		}
 	}
-	return score
+	return map[string]interface{}{
+		"score":score,
+		"reason":reason,
+	}
 }

+ 7 - 7
src/jy/cluster/ssh.go

@@ -47,13 +47,13 @@ func ssHConnect(user, password, host string, port int) (*ssh.Session, error) {
 var sshstr = `
 #!/bin/bash
 cd /opt
-kill -9 $(pidof extract_v3)
-rm -rf extract_v3*
-wget http://172.17.145.179:9080/res/extract_v3.tgz
-tar -xzvf extract_v3.tgz
-cd /opt/extract_v3
-chmod 777 extract_v3
-nohup ./extract_v3 >/opt/extract_v3/nohup 2>&1 &
+kill -9 $(pidof extract_fbs)
+rm -rf extract_fbs*
+wget http://172.17.145.179:9080/res/extract_fbs.tgz
+tar -xzvf extract_fbs.tgz
+cd /opt/extract_fbs
+chmod 777 extract_fbs
+nohup ./extract_fbs >/opt/extract_fbs/nohup 2>&1 &
 exit
 `
 

+ 20 - 2
src/jy/extract/extractudp.go

@@ -146,9 +146,15 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				//	continue
 				//}
 				if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
-					//log.Debug(index, qu.BsonIdToSId(v["_id"]), "//开标记录")
+					log.Debug(index, qu.BsonIdToSId(v["_id"]), "//开标记录")
+					continue
+				}
+				if qu.ObjToString(v["subtype"])!="中标" &&
+					qu.ObjToString(v["subtype"])!="成交" &&
+					qu.ObjToString(v["subtype"])!="合同" {
 					continue
 				}
+
 				var j, jf *ju.Job
 				var isSite bool
 				if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
@@ -163,9 +169,21 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 			}
 			list2, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl+"_back", query, nil, Fields, false, -1, -1)
 			for _, v := range *list2 {
-				if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
+				//if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
+				//	continue
+				//}
+
+				if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
+					log.Debug(index, qu.BsonIdToSId(v["_id"]), "//开标记录")
 					continue
 				}
+				if qu.ObjToString(v["subtype"])!="中标" &&
+					qu.ObjToString(v["subtype"])!="成交" &&
+					qu.ObjToString(v["subtype"])!="合同" {
+					continue
+				}
+
+
 				var j, jf *ju.Job
 				var isSite bool
 				if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {