fengweiqiang 5 年之前
父节点
当前提交
307df54eaa
共有 6 个文件被更改,包括 55 次插入和 10 次删除
  1. 2 2
      src/config.json
  2. 1 1
      src/jy/clear/clear.go
  3. 1 1
      src/jy/pretreated/analykv.go
  4. 35 0
      src/jy/pretreated/division.go
  5. 16 5
      src/res/fieldscore.json
  6. 0 1
      src/res/specialsymbols.json

+ 2 - 2
src/config.json

@@ -2,8 +2,8 @@
     "port": "9090",
     "mgodb": "192.168.3.207:27092",
     "dbsize": 3,
-    "dbname": "extract_v3",
-    "redis": "qyk_redis=127.0.0.1:6379",
+    "dbname": "extract_v3xs",
+    "redis": "qyk_redis=192.168.3.207:6379",
     "elasticsearch": "http://127.0.0.1:9800",
     "elasticsearch_index": "winner_enterprise_tmp",
     "elasticsearch_type": "winnerent",

+ 1 - 1
src/jy/clear/clear.go

@@ -53,7 +53,7 @@ func DoClearFn(clear []string, data []interface{}) []interface{} {
 }
 
 //取手机号
-var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—-\\-])+\\d{2,}[×―—-\\-]+(\\d{3,})+|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
+var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,5})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—-\\-])+\\d{2,}[×―—-\\-]+(\\d{3,})+|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
 
 func GetPhone(data []interface{}) []interface{} {
 	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))

+ 1 - 1
src/jy/pretreated/analykv.go

@@ -10,7 +10,7 @@ import (
 var Han = regexp.MustCompile("[\\p{Han}]")
 var Han1 = regexp.MustCompile("[^:;;,:,。. \u3000\u2003\u00a0\\s]")
 var Han2 = regexp.MustCompile("[^:;;,:,。.]")
-var Key = regexp.MustCompile("[::]")
+var Key = regexp.MustCompile("[::]")
 var Time = regexp.MustCompile("[\\d]")
 var dh = regexp.MustCompile("[,,.]")
 var space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+")

+ 35 - 0
src/jy/pretreated/division.go

@@ -253,6 +253,7 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite
 		title = filterTitle(title)
 		//分割标题 [和及]。。。 参与
 		splitTitles := ProcTitle(title)
+		blockText = mergetext(splitTitles, blockText)
 		block := &util.Block{
 			Index:  index,     //序号
 			Text:   blockText, //内容
@@ -319,6 +320,32 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock, isSite
 	return returnBlocks, returnValue
 }
 
+func mergetext(titles []string, text string) string {
+	if len(titles) == 0 || utf8.RuneCountInString(text) > 150 {
+		return text
+	}
+	splitLenstrs := strings.Split(text, "\n")
+	if len(splitLenstrs) == 1 || len(titles) != len(splitLenstrs)-1 {
+		return text
+	}
+	tt := ""
+	for i, v := range splitLenstrs[1:] {
+		lentexts := regDivision.Split(v, -1)
+		if len(lentexts) == 2 {
+			if strings.Contains(titles[i], lentexts[0]) {
+				tt += titles[i] + ":" + lentexts[1] + "\n"
+			}else if strings.Contains(titles[i], lentexts[0]) ||strings.Contains(titles[i], lentexts[0]){
+				tt += titles[i] + ":" + lentexts[1] + "\n"
+			}
+		}
+	}
+	if len(tt) == 0 {
+		return text
+	} else {
+		return tt
+	}
+}
+
 //块标题处理
 func ProcTitle(title string) []string {
 	if title == "" {
@@ -345,6 +372,14 @@ func ProcTitle(title string) []string {
 				}
 				ara[kk] = start + vv
 			}
+		} else if vv == "联系人" || vv == "联系方式" {
+			if strings.Contains(prev, "代理") {
+				ara[kk] = "代理机构" + vv
+			} else if strings.Contains(prev, "中标") {
+				ara[kk] = "中标单位" + vv
+			} else if strings.Contains(prev, "采购") {
+				ara[kk] = "采购单位" + vv
+			}
 		}
 		if len([]rune(vv)) > 3 {
 			if direct == -1 {

+ 16 - 5
src/res/fieldscore.json

@@ -100,8 +100,13 @@
             },
             {
                 "describe": "乱码",
-                "regstr": "[±??¨êí¤ì×üàóμˉ÷°úéè]",
-                "score": -20
+                "regstr": "[±??¨êí¤ìüàóμˉ÷°úéè]",
+                "score": -10
+            },
+            {
+                "describe": "符号",
+                "regstr": "[,,.。!!]",
+                "score": -10
             }
         ],
         "length": [
@@ -172,8 +177,8 @@
             },
             {
                 "describe": "包含负分",
-                "regstr": "(代表|招标|交易中心|顾问|单位|测试)",
-                "score": -5
+                "regstr": "(代表|招标|交易中心|顾问|单位|测试|采购)",
+                "score": -10
             },
             {
                 "describe": "包含特殊符号2",
@@ -484,7 +489,13 @@
     },
  	"buyertel": {
         "type": "string",
-        "positivewords": [],
+        "positivewords": [
+            {
+                "describe": "区号开头",
+                "regstr": "^\\([0-9]{3,4}\\)",
+                "score": 2
+            }
+        ],
         "negativewords": [
             {
                 "describe": "出现中文汉字",

+ 0 - 1
src/res/specialsymbols.json

@@ -108,7 +108,6 @@
             "buyer": true,
             "winner": true,
             "agency": true,
-            "agency": true,
             "buyertel": true,
             "buyerperson": true
         },