Browse Source

切割段落

fengweiqiang 6 years ago
parent
commit
bcc3721270

+ 12 - 6
src/jy/pretreated/analystep.go

@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"jy/util"
 	//"log"
+	"unicode/utf8"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -30,7 +31,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 {                                                //有分块
+	if len(blockArrays) > 0 { //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -107,18 +108,23 @@ func FindProjectCode(newCon string, job *util.Job) {
 	}
 	var proCode string
 	blCode := &util.Block{}
-	if newCon = projectcodeRegAll.FindString(newCon); newCon != "" { //项目名称项目编号一起的
+	if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
 		//5d424bdfa5cb26b9b7ac7a85
 		//5d425a48a5cb26b9b7df5fec
 		//5d425506a5cb26b9b7cd2c3c
-		splitStr := strings.Split(newCon, " ")
+		splitStr := strings.Split(newConTMP, " ")
 		if len(splitStr) >= 2 {
-			newCon = "项目编号:" + splitStr[len(splitStr)-1]
+			if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
+				newCon = "项目编号:" + splitStr[len(splitStr)-1]
+			} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+				//5d4253f3a5cb26b9b7ca2662
+				newCon = "项目编号:" + tmpstr
+			}
 		} else if len(splitStr) == 1 {
 			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
 				newCon = "项目编号:" + tmpstr
-			}else if strings.Contains(newCon,"、"){
-				tmpstrs :=strings.Split(newCon,"、")
+			} else if strings.Contains(newConTMP, "、") {
+				tmpstrs := strings.Split(newCon, "、")
 				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
 			}
 		}

+ 2 - 2
src/jy/pretreated/analytable.go

@@ -107,8 +107,8 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeRegAll           = regexp.MustCompile(`(采购项目|项目)名称及[项目]?编号[:|:]?.*[\n]?`)
-	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]")
+	projectcodeRegAll           = regexp.MustCompile(`(采购)?项目名称及(项目)?编号[:|:]?.*[\n]?`)
+	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]?")
 	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
 	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")

+ 20 - 3
src/jy/pretreated/colonkv.go

@@ -202,17 +202,34 @@ func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
 
 //根据配置文件中的规则,格式化正文
 func formatText(content, key string) string {
-	segment := DivideSegment(content)
+	segments := make([]*Segment, 0)
+	if key == "all" {
+		segments = DivideSegmentHtml(content)
+	} else if key == "kv" {
+		segments = DivideSegment(content)
+		//log.Println("清理前:\n",content)
+	}
 	newCon := ""
-	for _, v := range segment {
-		if v.Index > len(segment)-3 {
+	for _, v := range segments {
+		if v.Index > len(segments)-3 {
 			if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
 				break
 			}
 		}
+		if key == "kv" && utf8.RuneCountInString(v.Text)>=1{
+			//log.Println("清理前:",v.Text, []rune(v.Text)[len([]rune(v.Text))-1])
+			v.Text = strings.TrimRightFunc(v.Text, func(r rune) bool {
+				return r == 19968 || r == 20108 || r == 19977 ||
+					r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061
+			})
+			//log.Println("清理前后",v.Text)
+		}
 		newCon += v.Text + "\n"
 	}
 	content = regEndWrap.ReplaceAllString(newCon, "")
+	//if key == "kv"{
+	//	log.Println("清理前后\n",content)
+	//}
 	for _, v := range FormatTextMap[key] {
 		reg, _ := v["reg"].(*regexp.Regexp)
 		separator, isString := v["separator"].(string)

+ 66 - 6
src/jy/pretreated/division.go

@@ -1,6 +1,7 @@
 package pretreated
 
 import (
+	"fmt"
 	"jy/util"
 	qutil "qfw/util"
 	"regexp"
@@ -65,6 +66,24 @@ var (
 	*/
 	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
 	filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
+	xuhao             = map[string]bool{ // Set of "<rune>_<rune>" keys (decimal code points of two consecutive runes) that DivideSegment looks up to decide where to cut: a CJK numeral one..nine followed by U+3001 (ideographic comma, 12289) or '.' (46).
+		"19968_12289": true, // U+4E00 (one) + ideographic comma
+		"19968_46":    true, // U+4E00 (one) + '.'
+		"20108_12289": true, // U+4E8C (two) + ideographic comma
+		"20108_46":    true, // U+4E8C (two) + '.'
+		"19977_12289": true, // U+4E09 (three) + ideographic comma
+		"19977_46":    true, // U+4E09 (three) + '.'
+		"22235_12289": true, // U+56DB (four) + ideographic comma
+		"22235_46":    true, // U+56DB (four) + '.'
+		"20116_12289": true, // U+4E94 (five) + ideographic comma
+		"20116_46":    true, // U+4E94 (five) + '.'
+		"20845_12289": true, // U+516D (six) + ideographic comma
+		"20845_46":    true, // U+516D (six) + '.'
+		"19971_12289": true, // U+4E03 (seven) + ideographic comma
+		"19971_46":    true, // U+4E03 (seven) + '.'
+		"20843_12289": true, // U+516B (eight) + ideographic comma. NOTE(review): there is no "20843_46" (eight + '.') entry, unlike numerals one..seven; confirm whether that is intentional.
+		"20061_46":    true, // U+4E5D (nine) + '.'. NOTE(review): there is no "20061_12289" (nine + ideographic comma) entry; confirm whether that is intentional.
+	}
 )
 
 //分块
@@ -409,11 +428,52 @@ func appendWarpStop(text string) string {
 	}
 	return text
 }
-
+// DivideSegmentHtml splits txt into segments at line breaks, discards near-empty segments, and returns the kept ones as *util.Segment values numbered from 1 in order of appearance.
+func DivideSegmentHtml(txt string) []*util.Segment {
+	// Split into segments first: cut at every run of LF (10) or CR (13); FieldsFunc never yields empty fields.
+	_segs := strings.FieldsFunc(txt, func(r rune) bool {
+		return r == 10 || r == 13
+	})
+	// Then drop blank lines: only segments longer than one byte are kept (len counts bytes, so a lone " " is excluded too).
+	segs := make([]*util.Segment, 0)
+	_index := 0
+	for _, seg := range _segs {
+		if seg != " " && len(seg) > 1 {
+			_seg := util.Segment{}
+			_index = _index + 1 // counts kept segments only, so Index is 1-based and has no gaps
+			_seg.Index = _index
+			_seg.Text = seg
+			segs = append(segs, &_seg)
+		}
+	}
+	return segs
+}
 //分段
 func DivideSegment(txt string) []*util.Segment {
 	//先分段
+	tmpstr := ""
 	_segs := strings.FieldsFunc(txt, func(r rune) bool {
+		if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
+			r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
+			if tmpstr == "" {
+				tmpstr += fmt.Sprint(r)
+				return false
+			} else if strings.Contains(tmpstr, "_") {
+				tmpstr = ""
+				tmpstr += fmt.Sprint(r)
+				return false
+			} else if tmpstr == fmt.Sprint(r) {
+				if r == 46 || r == 12289{
+					tmpstr = ""
+				}
+				return false
+			}
+			tmpstr += "_" + fmt.Sprint(r)
+			if xuhao[tmpstr] {
+				return true
+			}
+		}
+		tmpstr= ""
 		return r == 10 || r == 13
 	})
 	//再去除空行
@@ -528,7 +588,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			return false, ""
 		}
 		//
-		is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
+		is := regexp.MustCompile(v[0] + "[::]*").FindAllStringIndex(con, -1)
 		for _, sv := range is {
 			appendWarpIndex = append(appendWarpIndex, sv[0])
 		}
@@ -568,13 +628,13 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			indexPkgMap[sv[0]] = v[0]
 		}
 		//key在包前面,并且在一行的开头
-		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
+		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		if len(keys) == 0 {
 			//key在包前面,并且key以冒号结尾
-			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		if len(keys) == 0 {
-			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		for _, key := range keys {
 			startEndMap[key[5]] = key[4]
@@ -628,7 +688,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			}
 			index := util.PackageNumberConvert(bk)
 			//去掉前缀,空格必须要加,分kv的时候要用
-			text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
+			text = regexp.MustCompile(bv[0] + "[::]*").ReplaceAllString(text, "")
 			headKey := ""
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {

+ 6 - 1
src/res/fieldscore.json

@@ -122,7 +122,7 @@
         "positivewords": [
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|会|场)$",
+                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|场)$",
                 "score": 3
             }
         ],
@@ -136,6 +136,11 @@
                 "describe": "包含负分不再展示",
                 "regstr": "(详见|提出|面向|施工)",
                 "score": -50
+            },
+            {
+                "describe": "一个字或者两个字不再显示",
+                "regstr": "^[\\s]*[\\u4e00-\\u9fa5]{1,2}[\\s]*$",
+                "score": -50
             }
         ],
         "length": [