Bläddra i källkod

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

maxiaoshan 5 år sedan
förälder
incheckning
e32974c104

+ 2 - 2
src/config.json

@@ -2,8 +2,8 @@
     "port": "9090",
     "mgodb": "192.168.3.207:27092",
     "dbsize": 3,
-    "dbname": "extract_v3",
-    "redis": "qyk_redis=127.0.0.1:6379",
+    "dbname": "extract_v3xs",
+    "redis": "qyk_redis=192.168.3.207:6379",
     "elasticsearch": "http://127.0.0.1:9800",
     "elasticsearch_index": "winner_enterprise_tmp",
     "elasticsearch_type": "winnerent",

+ 1 - 1
src/jy/clear/clear.go

@@ -53,7 +53,7 @@ func DoClearFn(clear []string, data []interface{}) []interface{} {
 }
 
 //取手机号
-var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—-\\-])+\\d{2,}[×―—-\\-]+(\\d{3,})+|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
+var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,5})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—-\\-])+\\d{2,}[×―—-\\-]+(\\d{3,})+|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
 
 func GetPhone(data []interface{}) []interface{} {
 	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))

+ 16 - 0
src/jy/extract/score_jsondata.go

@@ -94,6 +94,22 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				//AddExtLog("clear", j.SourceMid, (*j.Jsondata)[v], newNum[0], &RegLuaInfo{ "JsonData_"+v, "", v, "", false, nil, nil}, e.TaskInfo) //抽取日志
 				//}
 				continue
+			}else if v == "bidopentime"{
+				lockclear.Lock()
+				cfn := e.ClearFn[v]
+				lockclear.Unlock()
+				if len(cfn) == 0 {
+					continue
+				}
+				extFields := make([]*util.ExtField, 0)
+				if bt,ok :=(*j.Jsondata)[v].(float64);ok && bt>0{
+					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: bt, Score: 0.1})
+				}else {
+					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""})
+					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1})
+				}
+				j.Result[v] = extFields
+				continue
 			}
 			vv := strings.TrimSpace(util2.ObjToString((*j.Jsondata)[v]))
 			if vv == "" || strings.Contains(vv, "详见公告") {

+ 2 - 2
src/jy/pretreated/analykv.go

@@ -10,7 +10,7 @@ import (
 var Han = regexp.MustCompile("[\\p{Han}]")
 var Han1 = regexp.MustCompile("[^:;;,:,。. \u3000\u2003\u00a0\\s]")
 var Han2 = regexp.MustCompile("[^:;;,:,。.]")
-var Key = regexp.MustCompile("[::]")
+var Key = regexp.MustCompile("[::]")
 var Time = regexp.MustCompile("[\\d]")
 var dh = regexp.MustCompile("[,,.]")
 var space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+")
@@ -401,7 +401,7 @@ func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, ma
 				}
 			}
 		} else if len([]rune(k)) == 2 {
-			if filter_zbje_jd.MatchString(k) { //钱
+			if !filter_zbje_jd.MatchString(k) { //钱
 				if tag != "" && filter_tag_zb.MatchString(tag) {
 					k = "中标" + k
 				} else {

+ 35 - 0
src/jy/pretreated/division.go

@@ -253,6 +253,7 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite
 		title = filterTitle(title)
 		//分割标题 [和及]。。。 参与
 		splitTitles := ProcTitle(title)
+		blockText = mergetext(splitTitles, blockText)
 		block := &util.Block{
 			Index:  index,     //序号
 			Text:   blockText, //内容
@@ -319,6 +320,32 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite
 	return returnBlocks, returnValue
 }
 
+func mergetext(titles []string, text string) string {
+	if len(titles) == 0 || utf8.RuneCountInString(text) > 150 {
+		return text
+	}
+	splitLenstrs := strings.Split(text, "\n")
+	if len(splitLenstrs) == 1 || len(titles) != len(splitLenstrs)-1 {
+		return text
+	}
+	tt := ""
+	for i, v := range splitLenstrs[1:] {
+		lentexts := regDivision.Split(v, -1)
+		if len(lentexts) == 2 {
+			if strings.Contains(titles[i], lentexts[0]) {
+				tt += titles[i] + ":" + lentexts[1] + "\n"
+			}else if strings.Contains(titles[i], lentexts[0]) ||strings.Contains(titles[i], lentexts[0]){
+				tt += titles[i] + ":" + lentexts[1] + "\n"
+			}
+		}
+	}
+	if len(tt) == 0 {
+		return text
+	} else {
+		return tt
+	}
+}
+
 //块标题处理
 func ProcTitle(title string) []string {
 	if title == "" {
@@ -345,6 +372,14 @@ func ProcTitle(title string) []string {
 				}
 				ara[kk] = start + vv
 			}
+		} else if vv == "联系人" || vv == "联系方式" {
+			if strings.Contains(prev, "代理") {
+				ara[kk] = "代理机构" + vv
+			} else if strings.Contains(prev, "中标") {
+				ara[kk] = "中标单位" + vv
+			} else if strings.Contains(prev, "采购") {
+				ara[kk] = "采购单位" + vv
+			}
 		}
 		if len([]rune(vv)) > 3 {
 			if direct == -1 {

+ 19 - 8
src/res/fieldscore.json

@@ -100,8 +100,13 @@
             },
             {
                 "describe": "乱码",
-                "regstr": "[±??¨êí¤ì×üàóμˉ÷°úéè]",
-                "score": -20
+                "regstr": "[±??¨êí¤ìüàóμˉ÷°úéè]",
+                "score": -10
+            },
+            {
+                "describe": "符号",
+                "regstr": "[,,.。!!]",
+                "score": -10
             }
         ],
         "length": [
@@ -172,8 +177,8 @@
             },
             {
                 "describe": "包含负分",
-                "regstr": "(代表|招标|交易中心|顾问|单位|测试)",
-                "score": -5
+                "regstr": "(代表|招标|交易中心|顾问|单位|测试|采购)",
+                "score": -10
             },
             {
                 "describe": "包含特殊符号2",
@@ -484,7 +489,13 @@
     },
  	"buyertel": {
         "type": "string",
-        "positivewords": [],
+        "positivewords": [
+            {
+                "describe": "区号开头",
+                "regstr": "^\\([0-9]{3,4}\\)",
+                "score": 2
+            }
+        ],
         "negativewords": [
             {
                 "describe": "出现中文汉字",
@@ -589,8 +600,8 @@
                 "score": -10
             },  {
                 "describe": "包含负分",
-                "regstr": "(详(见|情)|公告|test)",
-                "score": -10
+                "regstr": "(详(见|情)|公告|test|招标人)",
+                "score": -20
             }
         ],
         "length": [
@@ -650,7 +661,7 @@
             {
                 "describe": "[gt,∞,score]",
                 "range": [
-                    14,
+                    24,
                     -1,
                     -10
                 ]

+ 5 - 10
src/res/formattext.json

@@ -93,7 +93,7 @@
 		},
 		{
 			"reg": "\\n(.{2,8})联系方式[::](.+?)\\s+\\+\\s+(.+)",
-            "separator": "\n${1}联系人:$2\n${1}联系电话:$3",
+            "separator": "\n${1}联系人:$2\n${1}联系方式:$3",
             "desc": ""
 		},
 		{
@@ -109,21 +109,16 @@
 		{
 			"reg": "(收货)(联系人)和(联系方式)[::](.+?)/(.+)",
             "separator": "${1}${2}:${4}\n${1}${3}:${5}",
-            "desc": ""
-		},
-		{
-			"reg": "(招标人[::][^::,、,]+?)(联系人[::].+)",
-            "separator": "${1}\n${2}",
             "desc": ""
 		},
 		{
 			"reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{2,8})联系人[::]([\u4e00-\u9fa5、]+)\\s+((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
-            "separator": "${1}联系人:${2}\n${1}联系电话:${3}",
+            "separator": "${1}联系人:${2}\n${1}联系方式:${3}",
             "desc": ""
 		},
 		{
 			"reg": "(采购[^方式]{1,8})[::]([^::]{3,15})[,,]([^::]{2,5})((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
-            "separator": "${1}:${2}\n${1}联系人:${3}\n${1}联系电话:${4}",
+            "separator": "${1}:${2}\n${1}联系人:${3}\n${1}联系方式:${4}",
             "desc": ""
 		},
 		{
@@ -143,8 +138,8 @@
 		},
 		{
 			"reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8}?)(联系(方式|电话|人)和?)+[::]([^\\d::]{2,8}?)[((]?[\\s\u3000\u2003\u00a0]*((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
-            "separator": "${1}联系人:${4}\n${1}联系电话:${5}",
-            "desc": "采购人联系方式:李静  0311-66629799 or 联系电话:张先生 0917―2660282"
+            "separator": "${1}联系人:${4}\n${1}联系方式:${5}",
+            "desc": "采购人联系方式:李静  0311-66629799 or 联系方式:张先生 0917―2660282"
 		},
 		{
 			"reg": "[((]([^))]{2,8}联系人)[::](.+?)[,,]((联系)?(电话|手机)(号码)?)[::](.+)[))]",

+ 0 - 1
src/res/specialsymbols.json

@@ -108,7 +108,6 @@
             "buyer": true,
             "winner": true,
             "agency": true,
-            "agency": true,
             "buyertel": true,
             "buyerperson": true
         },