fengweiqiang hai 6 anos
pai
achega
253a765c05

+ 38 - 38
src/jy/pretreated/analystep.go

@@ -7,7 +7,6 @@ import (
 	"encoding/json"
 	"jy/util"
 	//"log"
-	"unicode/utf8"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -52,6 +51,7 @@ func AnalyStart(job *util.Job) {
 			job.Block = append(job.Block, bl)
 		}
 	} else { //未分块,创建分块
+		//log.Println(con)
 		bl := &util.Block{}
 		newCon := con
 		if len(tabs) > 0 { //解析表格逻辑
@@ -102,50 +102,50 @@ func processTableInBlock(bl *util.Block, job *util.Job) {
 
 //匹配项目编号
 func FindProjectCode(newCon string, job *util.Job) {
-	newCon = TextAfterRemoveTable(newCon)
+	newCon = HtmlToText(newCon)
 	if strings.TrimSpace(newCon) == "" {
 		return
 	}
 	var proCode string
 	blCode := &util.Block{}
-	if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
-		//5d424bdfa5cb26b9b7ac7a85
-		//5d425a48a5cb26b9b7df5fec
-		//5d425506a5cb26b9b7cd2c3c
-		splitStr := strings.Split(newConTMP, " ")
-		if len(splitStr) >= 2 {
-			if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
-				newCon = "项目编号:" + splitStr[len(splitStr)-1]
-			} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
-				//5d4253f3a5cb26b9b7ca2662
-				newCon = "项目编号:" + tmpstr
-			}
-		} else if len(splitStr) == 1 {
-			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
-				newCon = "项目编号:" + tmpstr
-			} else if strings.Contains(newConTMP, "、") {
-				tmpstrs := strings.Split(newCon, "、")
-				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+	/*		if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
+			//5d424bdfa5cb26b9b7ac7a85
+			//5d425a48a5cb26b9b7df5fec
+			//5d425506a5cb26b9b7cd2c3c
+			splitStr := strings.Split(newConTMP, " ")
+			if len(splitStr) >= 2 {
+				if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
+					newCon = "项目编号:" + splitStr[len(splitStr)-1]
+				} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+					//5d4253f3a5cb26b9b7ca2662
+					newCon = "项目编号:" + tmpstr
+				}
+			} else if len(splitStr) == 1 {
+				if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+					newCon = "项目编号:" + tmpstr
+				} else if strings.Contains(newConTMP, "、") {
+					tmpstrs := strings.Split(newCon, "、")
+					newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
+				}
 			}
 		}
-	}
-	proCode = projectcodeReg.FindString(newCon)
-	if proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.ColonKV = ckv
-		blCode.Text = proCode
-		job.Block = append(job.Block, blCode)
-	} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.ColonKV = ckv
-		blCode.Text = proCode
-		job.Block = append(job.Block, blCode)
-	} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
-		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.Text = proCode
-		blCode.ColonKV = ckv
-		job.Block = append(job.Block, blCode)
-	}
+		proCode = projectcodeReg.FindString(newCon)
+		if proCode != "" {
+			ckv := GetKVAll(proCode, job.Title, nil, 1)
+			blCode.ColonKV = ckv
+			blCode.Text = proCode
+			job.Block = append(job.Block, blCode)
+		} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
+			ckv := GetKVAll(proCode, job.Title, nil, 1)
+			blCode.ColonKV = ckv
+			blCode.Text = proCode
+			job.Block = append(job.Block, blCode)
+		} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
+			ckv := GetKVAll(proCode, job.Title, nil, 1)
+			blCode.Text = proCode
+			blCode.ColonKV = ckv
+			job.Block = append(job.Block, blCode)
+		}*/
 	if proCode = jsonReg.FindString(newCon); proCode != "" {
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode), &jsonMap)

+ 0 - 5
src/jy/pretreated/analytable.go

@@ -107,11 +107,6 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeRegAll           = regexp.MustCompile(`(采购)?项目名称及(项目)?编号[:|:]?.*[\n]?`)
-	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]?")
-	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
-	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
-	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 	winnerOrderAndBidResult     = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")

+ 7 - 2
src/res/fieldscore.json

@@ -351,7 +351,7 @@
         "negativewords": [
             {
                 "describe": "以什么开始的减分",
-                "regstr": "^【",
+                "regstr": "^[|-]",
                 "score": -1
             },
             {
@@ -366,9 +366,14 @@
             },
             {
                 "describe": "中文汉字大于6个",
-                "regstr": "[\\u4e00-\\u9fa5]{6,}",
+                "regstr": "[\\u4e00-\\u9fa5]{6,9}",
                 "score": -1.3
             },
+            {
+                "describe": "中文汉字大于10个",
+                "regstr": "[\\u4e00-\\u9fa5]{10,}",
+                "score": -3
+            },
             {
                 "describe": "全为中文汉字或符号",
                 "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",

+ 0 - 5
src/res/formattext.json

@@ -50,11 +50,6 @@
             "separator": "([\u4e00-\u9fa5]+?)[\u3000\u2003\u00a0\\s]+__$1",
             "desc": "例如:把采 购 人替换成采购人"
         },
-        {
-            "reg": "([\u4e00-\u9fa5][^((,,。、.;;\r\n]{1,30}?[::][^\\s\u3000\u2003\u00a0,、。;;\r\n]+)([((])([\u4e00-\u9fa5][^,,。、.;;\r\n))包段]{1,30}?[::].+?)([))])",
-            "separator": "$1\n$2\n$3\n$4\n",
-            "desc": "例如:采购项目名称:脱贫攻坚大数据平台建设项目(项目编号:YLLBC20164002-HS)"
-        },
         {
             "reg_c": "([\u4e00-\u9fa5][^((,,。、.;;\r\n]{1,30}?[::][^\\s\u3000\u2003\u00a0,、。;;\r\n]+)([((])(.+?[::].+?)([))])",
 			"reg": "[((]([^::))\\r\\n]{2,10}[::][^::))\\r\\n]+)+[))]",