fengweiqiang před 4 roky
rodič
revize
d9f9ff8c2e

+ 55 - 39
src/jy/extract/extract.go

@@ -216,9 +216,7 @@ func RunExtractTask(taskId string) {
 			//if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 			//	continue
 			//}
-			if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl" { //临时
-				continue
-			}
+			//根据标题判断是否抽取
 			b := IsExtract("title", qu.ObjToString(v["title"]), "")
 			if !b {
 				continue
@@ -328,6 +326,17 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		if (*toMap)["jsoncontent"] != nil {
 			delete(*toMap, "jsoncontent")
 		}
+		for k,v := range *toMap{
+			if _,ok := v.(float64);ok{
+				continue
+			}else if _,ok := v.(int64);ok{
+				continue
+			}else if _,ok2 := v.(string);ok2{
+				continue
+			}else {
+				delete(*toMap,k)
+			}
+		}
 	}
 	j = &ju.Job{
 		SourceMid:      qu.BsonIdToSId(doc["_id"]),
@@ -459,12 +468,21 @@ func file2text(doc *map[string]interface{}) {
 
 //抽取
 func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
+
 	e.ExtractDetail(j, isSite, j.SpiderCode)
 	if jf != nil && jf.IsFile {
-		e.ExtractFile(jf, isSite, j.SpiderCode)
-		for tmpk, _ := range jf.Result {
+		e.ExtractDetail(jf, isSite, j.SpiderCode)
+		for tmpk, xs := range jf.Result {
 			if len(j.Result[tmpk]) == 0 {
+				if tmpk == "budget" || tmpk == "bidamount" {
+					for _, v := range xs {
+						if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
+							j.Result[tmpk] = append(j.Result[tmpk], v)
+						}
+					}
+				} else {
 					j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
+				}
 			}
 		}
 		if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
@@ -1455,6 +1473,12 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
 						return
 					}
 					text := qu.ObjToString(v.Value)
+					if v.Field == "bidamount" || v.Field == "budget" {
+						if strings.Contains(qu.ObjToString(v.SourceValue), "费率") {
+							j.Result[in.Field][k].IsTrue = false
+							continue
+						}
+					}
 					if text != "" {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
 					}
@@ -1825,7 +1849,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if len(j.Winnerorder) > 0 { //候选人信息
 			for i, v := range j.Winnerorder {
 				if v["price"] != nil {
-					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode)[0]				}
+					tmpPrice := clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode, j.IsClearnMoney)
+					if tmpPrice[len(tmpPrice)-1].(bool) {
+						j.Winnerorder[i]["price"] = tmpPrice[0]
+					} else {
+						delete(j.Winnerorder[i], "price")
+					}
+				}
 			}
 			tmp["winnerorder"] = j.Winnerorder
 		}
@@ -1839,12 +1869,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					if v.Score > -1 {
 						ffield[v.Field] = v.Value
 						if tmp[v.Field] == nil {
-							if v.Field == "budget" || v.Field == "bidamount" {
-								if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
-									tmp[v.Field] = v.Value
-								}
-							} else {
+							if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue {
 								tmp[v.Field] = v.Value
+								break
 							}
 						}
 						break
@@ -1946,7 +1973,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 						kvtext.WriteString(jv_k)
 						kvtext.WriteString(":")
 						kvtext.WriteString(jv_vv.Value)
-						kvtext.WriteString(" ")
+						kvtext.WriteString("\n")
 					}
 				}
 			}
@@ -1975,21 +2002,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			tmp["projectname"] = j.Title
 		}
 		tmp["repeat"] = 0
+		if ju.Ffield {
+			if len(ffield) > 0 {
+				tmp["ffield"] = ffield
+			}
+		}
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
-				/*	if len(e.SiteFields) <= 0 {
-						//for field, _ := range e.Fields {
-						//	if tmp[field] == nil &&  {
-						//		tmp[field] = "" //覆盖之前版本数据
-						//	}
-						//}
-					} else {
-						//for field, _ := range e.SiteFields {
-						//	if tmp[field] == nil &&{
-						//		tmp[field] = "" //覆盖之前版本数据
-						//	}
-						//}
-					}*/
 				tmparr := []map[string]interface{}{
 					map[string]interface{}{
 						"_id": qu.StringTOBsonId(_id),
@@ -2018,19 +2037,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			}
 		} else { //测试结果
 			delete(tmp, "_id")
-			//auxinfo := auxInfo(j)
-			//auxinfof := auxInfo(jf)
-			//if len(auxinfo) > 0 {
-			//	tmp["fieldall"] = auxinfo
-			//}
-			//if len(auxinfof) > 0 {
-			//	tmp["fieldallf"] = auxinfof
-			//}
-			if ju.Ffield {
-				if len(ffield) > 0 {
-					tmp["ffield"] = ffield
-				}
-			}
 			delete(tmp, "fieldall")
 			if len(j.BlockPackage) > 0 { //分包详情
 				if len(j.BlockPackage) > 10 {
@@ -2410,7 +2416,17 @@ func resetWinnerorder(j *ju.Job) {
 	} else if len(bidamounts) > 0 {
 		j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...)
 	}
-	
+	if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 {
+		winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
+		j.Result["winner"] = winners
+		if j.Winnerorder[0]["price"] != nil {
+			tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney)
+			if tmpPrice[len(tmpPrice)-1].(bool) {
+				bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true})
+			}
+			j.Result["bidamount"] = bidamounts
+		}
+	}
 }
 func RemoveReplicaSliceString(slc []string) []string {
 	result := make([]string, 0)

+ 1 - 1
src/jy/extract/extractudp.go

@@ -161,7 +161,7 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				//	log.Debug(index, qu.BsonIdToSId(v["_id"]), "//去除含敏感词数据")
 				//	continue
 				//}
-				if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl" { //临时开标记录
+				if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl" || "a_hbszbtbggfwpt_kbjl" == qu.ObjToString(v["spidercode"]) { //临时开标记录
 					log.Debug(index, qu.BsonIdToSId(v["_id"]), "//开标记录")
 					continue
 				}

+ 19 - 18
src/jy/extract/score_jsondata.go

@@ -24,7 +24,7 @@ var endOfPunctuationClrear = regexp.MustCompile("[,,.。??;;]+$")
 var keysClrear = regexp.MustCompile("(详见|公告|X|内文|某单位|某部|文件|\\*|暂无|?|\\?)")
 
 //jsondata清理
-func clearJd(jd *map[string]interface{}, e *ExtractTask, spiderCode,IsClearnMoney string) {
+func clearJd(jd *map[string]interface{}, e *ExtractTask, spiderCode string, isclearnMoney string) {
 	for k, v := range *jd {
 		if k == "buyer" || k == "winner" || k == "agency" || k == "projectcode" || k == "projectname" {
 			vstring := util2.ObjToString(v)
@@ -37,7 +37,7 @@ func clearJd(jd *map[string]interface{}, e *ExtractTask, spiderCode,IsClearnMone
 			cfn := e.ClearFn[k]
 			lockclear.Unlock()
 			if len(cfn) > 0 {
-				data := clear.DoClearFn(cfn, []interface{}{vstring, ""}, spiderCode,IsClearnMoney)
+				data := clear.DoClearFn(cfn, []interface{}{vstring, ""}, spiderCode, isclearnMoney)
 				lockclear.Lock()
 				if clear.AsyField[k] != nil || clear.SymField[k] != nil || clear.MesField[k] != nil {
 					vstring = clear.OtherClean(k, util2.ObjToString(data[0]))
@@ -47,7 +47,6 @@ func clearJd(jd *map[string]interface{}, e *ExtractTask, spiderCode,IsClearnMone
 			vstring = htmlclrear.ReplaceAllString(vstring, "")
 			vstring = endOfParenthesesClrear.ReplaceAllString(vstring, "")
 			vstring = endOfPunctuationClrear.ReplaceAllString(vstring, "")
-			vstring = keysClrear.ReplaceAllString(vstring, "")
 			if utf8.RuneCountInString(vstring) < 5 {
 				delete(*jd, k)
 				continue
@@ -85,7 +84,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if len(cfn) == 0 {
 					continue
 				}
-				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode,j.IsClearnMoney)
+				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode, j.IsClearnMoney)
 				//if util2.IntAll(newNum[0]) != 0 {
 				extFields := make([]*util.ExtField, 0)
 				extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1, IsTrue: newNum[len(newNum)-1].(bool)})
@@ -105,7 +104,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if bt, ok := (*j.Jsondata)[v].(float64); ok && bt > 0 {
 					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: bt, Score: 0.1})
 				} else {
-					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode)
+					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode, j.IsClearnMoney)
 					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1})
 				}
 				j.Result[v] = extFields
@@ -203,21 +202,23 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if len(cfn) == 0 {
 					continue
 				}
-				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode,j.IsClearnMoney)
-				//if util2.IntAll(newNum[0]) != 0 {
-				extFields := make([]*util.ExtField, 0)
-				if jdextweight > 1 {
-					if oneScore < 0 {
-						oneScore = 0.1
-					}
-					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: oneScore + 1, IsTrue: newNum[len(newNum)-1].(bool)})
-				} else {
-					if oneScore < 0 {
-						oneScore = 0.1
+				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""}, j.SpiderCode, j.IsClearnMoney)
+				if newNum[len(newNum)-1].(bool) {
+					//if util2.IntAll(newNum[0]) != 0 {
+					extFields := make([]*util.ExtField, 0)
+					if jdextweight > 1 {
+						if oneScore < 0 {
+							oneScore = 0.1
+						}
+						extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: oneScore + 1, IsTrue: newNum[len(newNum)-1].(bool)})
+					} else {
+						if oneScore < 0 {
+							oneScore = 0.1
+						}
+						extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: oneScore, IsTrue: newNum[len(newNum)-1].(bool)})
 					}
-					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: oneScore, IsTrue: newNum[len(newNum)-1].(bool)})
+					j.Result[v] = append(j.Result[v], extFields...)
 				}
-				j.Result[v] = append(j.Result[v], extFields...)
 				continue
 			}