fengweiqiang 5 rokov pred
rodič
commit
0beb1ea6d9

+ 0 - 3
src/jy/pretreated/README.MD

@@ -1,3 +0,0 @@
-**预处理**
-转换文档-》想要的Article格式
-

+ 3 - 2
src/jy/pretreated/tablev2.go

@@ -11,6 +11,7 @@ import (
 	"regexp"
 	"strings"
 	"sync"
+	"unicode/utf8"
 
 	"github.com/PuerkitoBio/goquery"
 )
@@ -288,7 +289,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite string)
 
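 // tdIsHb checks whether a td cell's value is a table header and splits the cell into blocks according to the length of its content.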
 func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite string) {
-	lenval := len([]rune(td.Val)) //经过处理的td内容长度
+	lenval := utf8.RuneCountInString(td.Val)//经过处理的td内容长度
 	//if lentxt > 9 {
 	//td.KV = GetKVAll(txt, "")
 	ub := []*u.Block{}
@@ -385,7 +386,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite stri
 		if !bsontable {
 			txt := repSpace.ReplaceAllString(td.Val, "")
 			btw, must, _, _, repl := CheckHeader(txt)
-			if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
+			if lenval > 18 {
 				btw = false
 			}
 			if strings.Contains(td.Val, "个项目") || strings.Contains(td.Val, "奥图码") {

+ 4 - 0
src/jy/pretreated/winnerorder.go

@@ -45,6 +45,7 @@ var (
 	findCandidate  = regexp.MustCompile("(^.{5,}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体))|工作室)")
 	findCandidate2 = regexp.MustCompile("(^.{5,}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|局|站|城|处|行|部|队|联合(会|体)|工作室)$)")
 	clearSpace1    = regexp.MustCompile("([((][\\d一二三四五六七八九十][))][\\s\u3000\u2003\u00a0\\t]*|<[^>].+?>)")
+	clearSpace2    = regexp.MustCompile("</?[^>]+>")
 	offerReg       = regexp.MustCompile("(中标|磋商|投标|报|单|成交)总?(价|金额)")
 )
 
@@ -54,6 +55,9 @@ var (
  *from 来源
  */
 func (wo *WinnerOrderEntity) Find(text string, flag bool, from int, isSite bool, codeSite string) []map[string]interface{} {
+	if clearSpace2.MatchString(text) {
+		return  []map[string]interface{}{}
+	}
 	text = winnerReg5.ReplaceAllString(text, "\n$3:$1\n")
 	/*
 		"_id" : ObjectId("5c2c6f60a5cb26b9b7b62cd8")

+ 0 - 0
src/jy/statistics/statistics.go


+ 6 - 1
src/res/fieldscore.json

@@ -172,9 +172,14 @@
             },
             {
                 "describe": "包含负分",
-                "regstr": "(代表|招标|交易中心|顾问|单位)",
+                "regstr": "(代表|招标|交易中心|顾问|单位|测试)",
                 "score": -5
             },
+            {
+                "describe": "包含特殊符号2",
+                "regstr": "(·|/.|-|、|(|/+|//|~)",
+                "score": -3
+            },
             {
                 "describe": "包含特殊符号",
                 "regstr": "(-|—)",