5 年之前 · 307df54eaa
--- a/src/config.json
+++ b/src/config.json
@@ -2,8 +2,8 @@
 
				     "port": "9090",
			
 
				     "mgodb": "192.168.3.207:27092",
			
 
				     "dbsize": 3,
			
 
				-    "dbname": "extract_v3",
			
 
				-    "redis": "qyk_redis=127.0.0.1:6379",
			
 
				+    "dbname": "extract_v3xs",
			
 
				+    "redis": "qyk_redis=192.168.3.207:6379",
			
 
				     "elasticsearch": "http://127.0.0.1:9800",
			
 
				     "elasticsearch_index": "winner_enterprise_tmp",
			
 
				     "elasticsearch_type": "winnerent",
			
--- a/src/jy/clear/clear.go
+++ b/src/jy/clear/clear.go
@@ -53,7 +53,7 @@ func DoClearFn(clear []string, data []interface{}) []interface{} {
 
				 }
			
 
				 
			
 
				 //取手机号
			
 
				-var PhoneReg = regexp.MustCompile("((([（(]\\d{3,4}[)）])?(\\d{6,12}([×―—－\\-]+\\d{3,4})?|\\d{3,5}[×―—－\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—－\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—－\\-])+\\d{2,}[×―—－\\-]+(\\d{3,})+|(\\d{2}[×―—－\\-])+\\d{8}[×―—－\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,，;；\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
			
 
				+var PhoneReg = regexp.MustCompile("((([（(]\\d{3,4}[)）])?(\\d{6,12}([×―—－\\-]+\\d{3,5})?|\\d{3,5}[×―—－\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—－\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—－\\-])+\\d{2,}[×―—－\\-]+(\\d{3,})+|(\\d{2}[×―—－\\-])+\\d{8}[×―—－\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,，;；\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
			
 
				 
			
 
				 func GetPhone(data []interface{}) []interface{} {
			
 
				 	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))
			
--- a/src/jy/pretreated/analykv.go
+++ b/src/jy/pretreated/analykv.go
@@ -10,7 +10,7 @@ import (
 
				 var Han = regexp.MustCompile("[\\p{Han}]")
			
 
				 var Han1 = regexp.MustCompile("[^:;；,：，。． \u3000\u2003\u00a0\\s]")
			
 
				 var Han2 = regexp.MustCompile("[^:;；,：，。.]")
			
 
				-var Key = regexp.MustCompile("[:：：]")
			
 
				+var Key = regexp.MustCompile("[:：]")
			
 
				 var Time = regexp.MustCompile("[\\d]")
			
 
				 var dh = regexp.MustCompile("[，,.]")
			
 
				 var space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+")
			
--- a/src/jy/pretreated/division.go
+++ b/src/jy/pretreated/division.go
@@ -253,6 +253,7 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite
 
				 		title = filterTitle(title)
			
 
				 		//分割标题 [和及]。。。 参与
			
 
				 		splitTitles := ProcTitle(title)
			
 
				+		blockText = mergetext(splitTitles, blockText)
			
 
				 		block := &util.Block{
			
 
				 			Index:  index,     //序号
			
 
				 			Text:   blockText, //内容
			
@@ -319,6 +320,32 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite
 
				 	return returnBlocks, returnValue
			
 
				 }
			
 
				 
			
 
				+// mergetext rewrites a short block body so that each line is keyed by the
				+// split title at the same position.
				+//
				+// text is returned unchanged when titles is empty, when text is longer than
				+// 150 runes, or when the number of lines after the first one differs from
				+// len(titles). Otherwise every line after the first is split with
				+// regDivision; a line that yields exactly two parts, and whose first part is
				+// contained in the title at the same index, is emitted as
				+// "<title>:<second part>\n". Lines that do not qualify are dropped. If no
				+// line qualifies, the original text is returned.
				+func mergetext(titles []string, text string) string {
				+	if len(titles) == 0 || utf8.RuneCountInString(text) > 150 {
				+		return text
				+	}
				+	lines := strings.Split(text, "\n")
				+	// The first line is skipped; the remaining lines must pair 1:1 with
				+	// titles. A single-line text can never match because titles is non-empty.
				+	if len(lines)-1 != len(titles) {
				+		return text
				+	}
				+	var merged strings.Builder
				+	for i, line := range lines[1:] {
				+		parts := regDivision.Split(line, -1)
				+		if len(parts) != 2 || !strings.Contains(titles[i], parts[0]) {
				+			continue
				+		}
				+		merged.WriteString(titles[i] + ":" + parts[1] + "\n")
				+	}
				+	if merged.Len() == 0 {
				+		return text
				+	}
				+	return merged.String()
				+}
			
 
				+
			
 
				 //块标题处理
			
 
				 func ProcTitle(title string) []string {
			
 
				 	if title == "" {
			
@@ -345,6 +372,14 @@ func ProcTitle(title string) []string {
 
				 				}
			
 
				 				ara[kk] = start + vv
			
 
				 			}
			
 
				+		} else if vv == "联系人" || vv == "联系方式" {
			
 
				+			if strings.Contains(prev, "代理") {
			
 
				+				ara[kk] = "代理机构" + vv
			
 
				+			} else if strings.Contains(prev, "中标") {
			
 
				+				ara[kk] = "中标单位" + vv
			
 
				+			} else if strings.Contains(prev, "采购") {
			
 
				+				ara[kk] = "采购单位" + vv
			
 
				+			}
			
 
				 		}
			
 
				 		if len([]rune(vv)) > 3 {
			
 
				 			if direct == -1 {
			
--- a/src/res/fieldscore.json
+++ b/src/res/fieldscore.json
@@ -100,8 +100,13 @@
 
				             },
			
 
				             {
			
 
				                 "describe": "乱码",
			
 
				-                "regstr": "[±?？¨êí¤ì×üàóμˉ÷°úéè]",
			
 
				-                "score": -20
			
 
				+                "regstr": "[±?？¨êí¤ìüàóμˉ÷°úéè]",
			
 
				+                "score": -10
			
 
				+            },
			
 
				+            {
			
 
				+                "describe": "符号",
			
 
				+                "regstr": "[,，.。!！]",
			
 
				+                "score": -10
			
 
				             }
			
 
				         ],
			
 
				         "length": [
			
@@ -172,8 +177,8 @@
 
				             },
			
 
				             {
			
 
				                 "describe": "包含负分",
			
 
				-                "regstr": "(代表|招标|交易中心|顾问|单位|测试)",
			
 
				-                "score": -5
			
 
				+                "regstr": "(代表|招标|交易中心|顾问|单位|测试|采购)",
			
 
				+                "score": -10
			
 
				             },
			
 
				             {
			
 
				                 "describe": "包含特殊符号2",
			
@@ -484,7 +489,13 @@
 
				     },
			
 
				  	"buyertel": {
			
 
				         "type": "string",
			
 
				-        "positivewords": [],
			
 
				+        "positivewords": [
			
 
				+            {
			
 
				+                "describe": "区号开头",
			
 
				+                "regstr": "^\\([0-9]{3,4}\\)",
			
 
				+                "score": 2
			
 
				+            }
			
 
				+        ],
			
 
				         "negativewords": [
			
 
				             {
			
 
				                 "describe": "出现中文汉字",
			
--- a/src/res/specialsymbols.json
+++ b/src/res/specialsymbols.json
@@ -108,7 +108,6 @@
 
				             "buyer": true,
			
 
				             "winner": true,
			
 
				             "agency": true,
			
 
				-            "agency": true,
			
 
				             "buyertel": true,
			
 
				             "buyerperson": true
			
 
				         },