Browse Source

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

maxiaoshan 5 years ago
parent
commit
35f77cb941
3 changed files with 59 additions and 1 deletions
  1. 2 0
      src/jy/extract/extract.go
  2. 52 1
      src/jy/extract/score_jsondata.go
  3. 5 0
      src/res/fieldscore.json

+ 2 - 0
src/jy/extract/extract.go

@@ -2091,6 +2091,8 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 		marshalbt, _ := json.Marshal(j.Jsondata)
 		tmpjddata := make(map[string]interface{})
 		json.Unmarshal(marshalbt,&tmpjddata)
+		//jsondata清理
+		clearJd(j.Jsondata)
 		for _, jdkey := range ju.JsonData {
 			if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
 				for tmpk, tmpv := range j.Result[jdkey][:5] {

+ 52 - 1
src/jy/extract/score_jsondata.go

@@ -11,9 +11,60 @@ import (
 	"unicode/utf8"
 )
 
// htmlclrear matches any opening or closing HTML-style tag (e.g. <b>, </td>)
// so that markup can be stripped from a field value.
var htmlclrear = regexp.MustCompile("</?[^>]+>")

// endOfParenthesesClrear matches a trailing bracketed remark at the very end
// of a value: one or more opening brackets, an optional run of CJK
// ideographs / whitespace, and one or more closing brackets.
//
// The ideograph range must be spelled \x{4e00}-\x{9fa5}: Go's regexp (RE2)
// has no \uXXXX escape, and the previous `\\u4e00-\\u9fa5` inside a raw string
// was parsed as a handful of ASCII characters, so Chinese text never matched.
// \x{ff08} and \x{ff09} are the fullwidth parentheses; they are written as
// code-point escapes so width-normalizing tools cannot fold them into "(" ")".
var endOfParenthesesClrear = regexp.MustCompile(`[\(\[\x{ff08}【\{]+[\x{4e00}-\x{9fa5}\s]*[\)\x{ff09}\]】\}]+$`)

// endOfPunctuationClrear matches a trailing run of punctuation: ASCII and
// fullwidth comma (U+FF0C), ASCII full stop, ideographic full stop, ASCII and
// fullwidth question mark (U+FF1F), ASCII and fullwidth semicolon (U+FF1B).
var endOfPunctuationClrear = regexp.MustCompile("[,\uff0c.。?\uff1f;\uff1b]+$")

// keysClrear matches placeholder / filler keywords that carry no information
// and are removed wherever they occur in a value.
//
// The question-mark alternatives are the fullwidth form (U+FF1F) and the
// escaped ASCII form; a bare "?" alternative is a syntax error ("missing
// argument to repetition operator") and would make MustCompile panic at init.
var keysClrear = regexp.MustCompile("(详见|公告|X|内文|某单位|某部|文件|\\*|暂无|\uff1f|\\?)")
+
+//jsondata清理
+func clearJd(jd *map[string]interface{}, e *ExtractTask) {
+	for k, v := range *jd {
+		if k == "buyer" || k == "winner" || k == "agency" || k == "projectcode" || k == "projectname" {
+			vstring := util2.ObjToString(v)
+			if vstring == "" {
+				delete(*jd, k)
+				continue
+			}
+			//函数清理
+			lockclear.Lock()
+			cfn := e.ClearFn[k]
+			lockclear.Unlock()
+			if len(cfn) > 0 {
+				data := clear.DoClearFn(cfn, []interface{}{vstring, ""})
+				lockclear.Lock()
+				if clear.AsyField[k] != nil || clear.SymField[k] != nil || clear.MesField[k] != nil {
+					vstring = clear.OtherClean(k, util2.ObjToString(data[0]))
+				}
+				lockclear.Unlock()
+			}
+			vstring = htmlclrear.ReplaceAllString(vstring, "")
+			vstring = endOfParenthesesClrear.ReplaceAllString(vstring, "")
+			vstring = endOfPunctuationClrear.ReplaceAllString(vstring, "")
+			vstring = keysClrear.ReplaceAllString(vstring, "")
+			if utf8.RuneCountInString(vstring) < 5 {
+				delete(*jd, k)
+				continue
+			}
+			if utf8.RuneCountInString(vstring) > 35 {
+				delete(*jd, k)
+				continue
+			} else {
+				(*jd)[k] = vstring
+			}
+		}
+	}
+}
+
 func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.ExtField {
 	jdextweight := util2.IntAll((*j.Jsondata)["extweight"])
-	if jdextweight==0{
+	if jdextweight == 0 {
 		return j.Result
 	}
 	tmps := make(map[string][]*util.ExtField)

+ 5 - 0
src/res/fieldscore.json

@@ -175,6 +175,11 @@
                 "regstr": "(代表|招标|交易中心|顾问|单位)",
                 "score": -5
             },
+            {
+                "describe": "包含特殊符号",
+                "regstr": "(-|—)",
+                "score": -50
+            },
             {
                 "describe": "包含负分不再展示",
                 "regstr": "(详见|提出|面向|施工)",