Browse Source

项目编号干扰项清理

fengweiqiang cách đây 6 năm
mục cha
commit
1914ba7c8c

+ 4 - 1
src/jy/pretreated/analykv.go

@@ -196,7 +196,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 					v = str1
 					if k0 < len(strs)-1 {
 						s2 := u.TrimLRSpace(strings.Join(strs[k0+1], ""), "")
-						if len([]rune(s2)) < 10 && !Key.MatchString(s2) {
+						if len([]rune(s2)) < 10 && !regexp.MustCompile("^[0-9]+[、]+$").MatchString(s2) && !Key.MatchString(s2) {
 							v += s2
 							k0++
 						}
@@ -334,6 +334,9 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 }
 
 func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, matchMap map[string]map[string]bool, from int) {
+	if regexp.MustCompile("^[0-9]+[、]+$").MatchString(v){
+		return
+	}
 	k = space.ReplaceAllString(k, "")
 	if len([]rune(k)) > 1 {
 		if len([]rune(k)) < 5 && strings.HasPrefix(k, "联系") || ContactInfoMustReg.MatchString(k) {

+ 1 - 1
src/jy/pretreated/analytable.go

@@ -107,7 +107,7 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(编号|项目编号|标段编号){1}(:|:)(.){4,30}()|\)|\])`)
+	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(编号|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
 	projectcodeReg2             = regexp.MustCompile(`((?:^|\n)编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9a-zA-Z]`)
 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}

+ 4 - 4
src/res/fieldscore.json

@@ -331,19 +331,19 @@
         "positivewords": [
             {
                 "describe": "有关键字加分",
-                "regstr": "(财采|招字|财购){1}",
+                "regstr": "(财采|招字|财购|赣购){1}",
                 "score": 2
             },
             {
                 "describe": "号结尾加分",
                 "regstr": ".{4,35}(号)$",
-                "score": 3
+                "score": 2
             }
         ],
         "negativewords": [
             {
-                "describe": "纯数字减分",
-                "regstr": "^\\d{8,}$",
+                "describe": "长度年月日纯数字减分",
+                "regstr": "^\\d{8}$",
                 "score": -1
             },
             {