Parcourir la source

11月1日 , 阶段告一段落 , 留念

zhengkun il y a 1 an
Parent
commit
282d3c8233

+ 21 - 0
src/jy/extract/extractcheck.go

@@ -61,6 +61,17 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 		tmp["bidamount_threshold"] = bg
 		delete(tmp, "bidamount")
 	}
+
+	//对分包存储校验···package
+	if tmp["package"] != nil {
+		if isExistsPackage(tmp["package"].(map[string]map[string]interface{})) {
+			tmp["is_exist_package"] = true
+		} else {
+			tmp["package_c"] = tmp["package"]
+			delete(tmp, "package")
+		}
+	}
+
 	//对于单位,金额与候选信息进行相互校验与选取
 	if winner := qu.ObjToString(tmp["winner"]); winner != "" {
 		if winnerorder := ju.IsMarkInterfaceMap(tmp["winnerorder"]); len(winnerorder) > 0 {
@@ -187,6 +198,16 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 	delete(tmp, "biddiscount_up")
 	delete(tmp, "biddiscount_down")
 
+	//budget bidamount 阈值限定再次
+	if bg, ok := tmp["budget"].(float64); ok && bg >= 1000000000 {
+		tmp["budget_threshold"] = bg
+		delete(tmp, "budget")
+	}
+	if bg, ok := tmp["bidamount"].(float64); ok && bg >= 1000000000 {
+		tmp["bidamount_threshold"] = bg
+		delete(tmp, "bidamount")
+	}
+
 	//检查剑鱼发布-爬虫
 	jyfb_data := *qu.ObjToMap(j_data["jyfb_data"])
 	if jyfb_data != nil {

+ 3 - 0
src/jy/extract/extractinit.go

@@ -152,6 +152,7 @@ type SiteCity struct {
 	D string //区全称
 	T string //站点类型
 	Q string //企业地域
+	B string //所属企业
 }
 
 type ClearTaskInfo struct {
@@ -1021,6 +1022,7 @@ func InitSite() []map[string]interface{} {
 		"f_district": 1,
 		"site_type":  1,
 		"area":       1,
+		"site_buyer": 1,
 	})
 	return list
 	//list, _ := db.Mgo.Find("site", query, nil, `{"site":1,"area":1,"city":1,"district":1}`, false, -1, -1)
@@ -1108,6 +1110,7 @@ func (e *ExtractTask) InitUpdateSite() {
 			D: qu.ObjToString(v["f_district"]),
 			T: qu.ObjToString(v["site_type"]),
 			Q: qu.ObjToString(v["area"]),
+			B: qu.ObjToString(v["site_buyer"]),
 		}
 		e.SiteCityMap[site] = s
 	}

+ 6 - 6
src/jy/extract/extractsave.go

@@ -187,7 +187,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					}
 					tmp["budget"] = tmpBudget
 				}
-
 				if qu.Float64All(tmp["agencyfee"]) < tmpAgencyfee {
 					fieldSource["agencyfee"] = map[string]interface{}{
 						"ext_type": "",
@@ -196,7 +195,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					}
 					tmp["agencyfee"] = tmpAgencyfee
 				}
-
 				if qu.Float64All(tmp["bidamount"]) < tmpBidamount {
 					//特殊爬虫分包金额-不要
 					if j.SpiderCode == "sx_sxgzszcgxt_fzb_cjhxrgs_bu" && qu.Float64All(tmp["bidamount"]) > 0.0 {
@@ -210,7 +208,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 						tmp["bidamount"] = tmpBidamount
 					}
 				}
-
 				//if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) {
 				//	fieldSource["bidamount"] = map[string]interface{}{
 				//		"ext_type": "",
@@ -238,7 +235,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 						}
 					}
 				}
-
 				if tmp["agencyfee"] == nil || tmp["agencyfee"] == 0 {
 					for _, v := range j.PackageInfo {
 						if v["agencyfee"] != nil {
@@ -251,7 +247,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 						}
 					}
 				}
-
 				if tmp["bidamount"] == nil || tmp["bidamount"] == 0 {
 					for _, v := range j.PackageInfo {
 						if v["bidamount"] != nil {
@@ -396,6 +391,12 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if ju.QualityAudit {
 			e.QualityAudit(tmp)
 		}
+		//站点所有单位补充···
+		if site := e.SiteCityMap[j.Site]; site != nil && qu.ObjToString(tmp["buyer"]) == "" {
+			if site.B != "" {
+				tmp["buyer"] = site.B
+			}
+		}
 		//落款等文本识别
 		jf_text := ""
 		if jf != nil {
@@ -406,7 +407,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		e.getQualifications(&tmp, *j.Data)
 		//城市抽取
 		if e.IsExtractCity {
-			//e.NewExtractCity(j, &tmp) //旧版
 			e.ExtractRegionInfo(j, jf, &tmp, true)
 			e.ExtractRegionClean(&tmp)
 		}

+ 47 - 27
src/jy/extract/extraxtmethod.go

@@ -108,17 +108,17 @@ var clearWinnerReg = regexp.MustCompile("(名称|施工|拟定供应商名称|[:
 var unPackageWinnerReg = regexp.MustCompile("(重新招标)")
 
 // 包含字母的实体单位
-var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体])$")
+var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")
 
 // 落款单位抽取
-var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会|体]))\n([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")
-var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会|体]))([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))")
+var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s  ]*[\n]+([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")
+var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s  ]*([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))")
 
 // 特殊实体
-var inscribe_entity_3 = regexp.MustCompile("(招标组织部门|招标机构)[::]([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会|体]))")
+var inscribe_entity_3 = regexp.MustCompile("(招标组织部门|招标机构)[::]([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府))")
 
 // 有效企业
-var effectivefirm = regexp.MustCompile("^[\u4E00-\u9FA5]{4,15}(公司|集团|委员会|机构|企业|设计|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体])$")
+var effectivefirm = regexp.MustCompile("^[\u4E00-\u9FA5]{4,15}(公司|集团|委员会|办公室|车务段|机构|企业|设计|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")
 
 // 发布时间识别
 var inscribe_publishtime_1 = regexp.MustCompile("(\\d{4}[年-]\\d{1,2}[月-]\\d{1,2}[日-]*)")
@@ -253,6 +253,25 @@ func isUsedMultiPackage(pkg map[string]map[string]interface{}) bool {
 	return false
 }
 
+// isExistsPackage reports whether pkg should be kept as package data (original note: attachment-package validity check, not general-purpose): false when empty, true when it has several entries, and for one entry only if it has a winner or a positive budget/bidamount.
+func isExistsPackage(pkg map[string]map[string]interface{}) bool {
+	if pkg == nil || len(pkg) == 0 { // nothing to validate
+		return false
+	}
+	if len(pkg) == 1 { // a lone package must carry at least one meaningful field
+		for _, v := range pkg { // runs once: the map has exactly one entry here
+			winner := qu.ObjToString(v["winner"])
+			budget := qu.Float64All(v["budget"])
+			bidamout := qu.Float64All(v["bidamount"])
+			if winner != "" || budget > float64(0) || bidamout > float64(0) { // any one of the three is enough
+				return true
+			}
+		}
+		return false // the single entry had no winner and no positive amount
+	}
+	return true // two or more packages are accepted without inspecting their fields
+}
+
 // getQualifications 添加所有资质新字段
 func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[string]interface{}) {
 	/**
@@ -274,7 +293,7 @@ func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[
 		!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
 		if new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"]), *tmp); new_buyer != "" {
 			(*tmp)["buyer"] = new_buyer
-			(*tmp)["inscribe_buyer"] = "落款实体"
+			(*tmp)["inscribe_buyer"] = "落款结构实体"
 		}
 	}
 	//落款特殊实体
@@ -286,13 +305,13 @@ func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[
 		}
 	}
 	//实体服务识别
-	//if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
-	//	!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
-	//	if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
-	//		(*tmp)["buyer"] = new_buyer
-	//		(*tmp)["inscribe_buyer"] = "实体识别"
-	//	}
-	//}
+	if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
+		!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
+		if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
+			(*tmp)["buyer"] = new_buyer
+			(*tmp)["inscribe_buyer"] = "实体识别服务"
+		}
+	}
 	//拟建不能存buyer
 	if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
 		qu.ObjToString((*tmp)["subtype"]) == "拟建" {
@@ -349,6 +368,11 @@ func InscribeEntityDfa(detail string, jf_detail string, tmp map[string]interface
 	title := qu.ObjToString(tmp["title"])
 	winner := qu.ObjToString(tmp["winner"])
 	agency := qu.ObjToString(tmp["agency"])
+	toptype := qu.ObjToString(tmp["toptype"])
+	//采用-标题项目名称
+	if new_str = EmployEntDfaText(title+"\n"+projectname, winner, agency); new_str != "" {
+		return new_str
+	}
 	if !entdfa_filtration.MatchString(title) {
 		//采用-排除表格的文本识别
 		new_detail := pretreated.TextAfterRemoveTable(detail)
@@ -359,22 +383,18 @@ func InscribeEntityDfa(detail string, jf_detail string, tmp map[string]interface
 		if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
 			return new_str
 		}
-
-		//采用-去除标签的纯文本(含表格)
-		new_detail = pretreated.HtmlToText(detail)
-		new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
-		if len(new_detail) > 500 {
-			new_detail = new_detail[len(new_detail)-500:]
-		}
-		if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
-			return new_str
+		if toptype != "结果" {
+			//采用-去除标签的纯文本(含表格)
+			new_detail = pretreated.HtmlToText(detail)
+			new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
+			if len(new_detail) > 500 {
+				new_detail = new_detail[len(new_detail)-500:]
+			}
+			if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
+				return new_str
+			}
 		}
 	}
-	//采用-标题项目名称
-	if new_str = EmployEntDfaText(title+"\n"+projectname, winner, agency); new_str != "" {
-		return new_str
-	}
-
 	//采用-附件识别
 	if !entdfa_filtration.MatchString(title) {
 		if len(jf_detail) > 500 {

+ 1 - 1
src/jy/extract/score.go

@@ -27,7 +27,7 @@ var (
 	CommonScore     map[string]float64
 	FieldsScore     map[string]map[string]float64
 	lengthValidReg0 = regexp.MustCompile(`(金额|单价)`)
-	lengthValidReg1 = regexp.MustCompile(`^(.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位)|(联通|移动|电信))|某部|某单位)$`)
+	lengthValidReg1 = regexp.MustCompile(`^(.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位|公司)|(联通|移动|电信))|某部|某单位|某公司)$`)
 	lengthValidReg2 = regexp.MustCompile(`([,,、])`)
 	lengthValidReg3 = regexp.MustCompile(`(.{4,20}公司)`)
 )

+ 67 - 2
src/jy/extract/score_jsondata.go

@@ -5,6 +5,7 @@ import (
 	"jy/clear"
 	"jy/util"
 	"log"
+	qu "qfw/common/src/qfw/util"
 	util2 "qfw/util"
 	"regexp"
 	"strings"
@@ -48,8 +49,12 @@ func clearJd(jd *map[string]interface{}, e *ExtractTask, spiderCode string, iscl
 			vstring = endOfParenthesesClrear.ReplaceAllString(vstring, "")
 			vstring = endOfPunctuationClrear.ReplaceAllString(vstring, "")
 			if utf8.RuneCountInString(vstring) < 5 {
-				delete(*jd, k)
-				continue
+				if lengthValidReg1.MatchString(vstring) && (k == "buyer" || k == "winner") {
+					//非指定文本-不过滤
+				} else {
+					delete(*jd, k)
+					continue
+				}
 			}
 			if utf8.RuneCountInString(vstring) > 35 {
 				delete(*jd, k)
@@ -138,6 +143,21 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 							}
 							score := util2.Float64All(ranges[2])
 							if valueLen > gt && valueLen <= lte {
+								//特殊长度不进行减分操作
+								if score < 0.0 && (v == "winner" || v == "buyer") {
+									tmpValue := fmt.Sprint(tmpExtField.Value)
+									if IsPersonName(tmpValue) && !lengthValidReg0.MatchString(tmpValue) {
+										continue
+									}
+									if lengthValidReg1.MatchString(tmpValue) && gt == 0 && lte == 4 {
+										continue
+									}
+									if lengthValidReg2.MatchString(tmpValue) {
+										if isMultiSupplier(tmpValue) {
+											continue
+										}
+									}
+								}
 								tmpExtField.Score += score
 								break
 							}
@@ -153,6 +173,21 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 							if p["regexp"] != nil {
 								reg := p["regexp"].(*regexp.Regexp)
 								if reg.MatchString(util2.ObjToString(tmpExtField.Value)) {
+									//不进行减分操作
+									if qu.Float64All(p["score"]) < 0.0 && (v == "winner" || v == "buyer") {
+										tmpValue := fmt.Sprint(tmpExtField.Value)
+										if IsPersonName(tmpValue) && !lengthValidReg0.MatchString(tmpValue) {
+											return
+										}
+										if lengthValidReg1.MatchString(tmpValue) {
+											return
+										}
+										if lengthValidReg2.MatchString(tmpValue) {
+											if isMultiSupplier(tmpValue) {
+												return
+											}
+										}
+									}
 									tmpExtField.Score += util2.Float64All(p["score"])
 								}
 							}
@@ -278,6 +313,21 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 									}
 									score := util2.Float64All(ranges[2])
 									if valueLen > gt && valueLen <= lte {
+										//特殊长度不进行减分操作
+										if score < 0.0 && (k == "winner" || k == "buyer") {
+											tmpValue := fmt.Sprint(tmpsvalue.Value)
+											if IsPersonName(tmpValue) && !lengthValidReg0.MatchString(tmpValue) {
+												continue
+											}
+											if lengthValidReg1.MatchString(tmpValue) && gt == 0 && lte == 4 {
+												continue
+											}
+											if lengthValidReg2.MatchString(tmpValue) {
+												if isMultiSupplier(tmpValue) {
+													continue
+												}
+											}
+										}
 										v[i].Score += score
 										v[i].ScoreItem = append(v[i].ScoreItem, &util.ScoreItem{Des: "JsonData长度打分", Code: fmt.Sprint(gt, "<", valueLen, "<=", lte), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: score})
 										break
@@ -294,6 +344,21 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 									if p["regexp"] != nil {
 										reg := p["regexp"].(*regexp.Regexp)
 										if reg.MatchString(util2.ObjToString(tmpsvalue.Value)) {
+											//不进行减分操作
+											if qu.Float64All(p["score"]) < 0.0 && (k == "winner" || k == "buyer") {
+												tmpValue := fmt.Sprint(tmpsvalue.Value)
+												if IsPersonName(tmpValue) && !lengthValidReg0.MatchString(tmpValue) {
+													return
+												}
+												if lengthValidReg1.MatchString(tmpValue) {
+													return
+												}
+												if lengthValidReg2.MatchString(tmpValue) {
+													if isMultiSupplier(tmpValue) {
+														return
+													}
+												}
+											}
 											v[i].Score += util2.Float64All(p["score"])
 											v[i].ScoreItem = append(v[i].ScoreItem, &util.ScoreItem{Des: "JsonData负面词打分" + fmt.Sprint(p["describe"]), Code: "negativewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.negativewords", Value: tmpsvalue.Value, Score: util2.Float64All(p["score"])})
 										}

+ 4 - 0
src/main.go

@@ -44,3 +44,7 @@ func testMain() {
 	text := con[1:2]
 	log.Debug(text)
 }
+
+func testPostDfa() { // empty placeholder: no statements, the body only keeps a sample request URL (path /service/entity/test) as a note
+	//http://extcity.spdata.jianyu360.com/service/entity/test?text=我是正文开滦(集团)有限责任公司
+}