6 years ago · bcc3721270
--- a/src/jy/pretreated/analystep.go
+++ b/src/jy/pretreated/analystep.go
@@ -7,6 +7,7 @@ import (
 
				 	"encoding/json"
			
 
				 	"jy/util"
			
 
				 	//"log"
			
 
				+	"unicode/utf8"
			
 
				 	"strings"
			
 
				 
			
 
				 	"github.com/PuerkitoBio/goquery"
			
@@ -30,7 +31,7 @@ func AnalyStart(job *util.Job) {
 
				 		}
			
 
				 	}
			
 
				 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
			
 
				-	if len(blockArrays) > 0 {                                                //有分块
			
 
				+	if len(blockArrays) > 0 { //有分块
			
 
				 		//从块里面找分包
			
 
				 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
			
 
				 		for _, bl := range blockArrays {
			
@@ -107,18 +108,23 @@ func FindProjectCode(newCon string, job *util.Job) {
 
				 	}
			
 
				 	var proCode string
			
 
				 	blCode := &util.Block{}
			
 
				-	if newCon = projectcodeRegAll.FindString(newCon); newCon != "" { //项目名称项目编号一起的
			
 
				+	if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
			
 
				 		//5d424bdfa5cb26b9b7ac7a85
			
 
				 		//5d425a48a5cb26b9b7df5fec
			
 
				 		//5d425506a5cb26b9b7cd2c3c
			
 
				-		splitStr := strings.Split(newCon, " ")
			
 
				+		splitStr := strings.Split(newConTMP, " ")
			
 
				 		if len(splitStr) >= 2 {
			
 
				-			newCon = "项目编号:" + splitStr[len(splitStr)-1]
			
 
				+			if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
			
 
				+				newCon = "项目编号:" + splitStr[len(splitStr)-1]
			
 
				+			} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
			
 
				+				//5d4253f3a5cb26b9b7ca2662
			
 
				+				newCon = "项目编号:" + tmpstr
			
 
				+			}
			
 
				 		} else if len(splitStr) == 1 {
			
 
				 			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
			
 
				 				newCon = "项目编号:" + tmpstr
			
 
				-			}else if strings.Contains(newCon,"、"){
			
 
				-				tmpstrs :=strings.Split(newCon,"、")
			
 
				+			} else if strings.Contains(newConTMP, "、") {
			
 
				+				// Split the matched snippet rather than the whole document: the
			
 
				+				// enclosing condition tests newConTMP for "、".
			
 
				+				tmpstrs := strings.Split(newConTMP, "、")
			
 
				 				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
			
 
				 			}
			
 
				 		}
			
--- a/src/jy/pretreated/analytable.go
+++ b/src/jy/pretreated/analytable.go
@@ -107,8 +107,8 @@ var (
 
				 	underline                   = regexp.MustCompile("_+$")
			
 
				 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
			
 
				 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
			
 
				-	projectcodeRegAll           = regexp.MustCompile(`(采购项目|项目)名称及[项目]?编号[:|：]?.*[\n]?`)
			
 
				-	projectcodeRegAll2          = regexp.MustCompile("[(（].{4,30}[)）]")
			
 
				+	projectcodeRegAll           = regexp.MustCompile(`(采购)?项目名称及(项目)?编号[:|：]?.*[\n]?`)
			
 
				+	projectcodeRegAll2          = regexp.MustCompile("[(（].{4,30}[)）]?")
			
 
				 	projectcodeReg              = regexp.MustCompile(`(（|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|：)(.){4,30}(）|\)|\])`)
			
 
				 	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|：)(.{4,39})[0-9a-zA-Z）号]`)
			
 
				 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
			
--- a/src/jy/pretreated/colonkv.go
+++ b/src/jy/pretreated/colonkv.go
@@ -202,17 +202,34 @@ func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
 
				 
			
 
				 //根据配置文件中的规则，格式化正文
			
 
				 func formatText(content, key string) string {
			
 
				-	segment := DivideSegment(content)
			
 
				+	segments := make([]*Segment, 0)
			
 
				+	if key == "all" {
			
 
				+		segments = DivideSegmentHtml(content)
			
 
				+	} else if key == "kv" {
			
 
				+		segments = DivideSegment(content)
			
 
				+		//log.Println("清理前:\n",content)
			
 
				+	}
			
 
				 	newCon := ""
			
 
				-	for _, v := range segment {
			
 
				-		if v.Index > len(segment)-3 {
			
 
				+	for _, v := range segments {
			
 
				+		if v.Index > len(segments)-3 {
			
 
				 			if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
			
 
				 				break
			
 
				 			}
			
 
				 		}
			
 
				+		if key == "kv" && utf8.RuneCountInString(v.Text)>=1{
			
 
				+			//log.Println("清理前:",v.Text, []rune(v.Text)[len([]rune(v.Text))-1])
			
 
				+			v.Text = strings.TrimRightFunc(v.Text, func(r rune) bool {
			
 
				+				return r == 19968 || r == 20108 || r == 19977 ||
			
 
				+					r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061
			
 
				+			})
			
 
				+			//log.Println("清理前后",v.Text)
			
 
				+		}
			
 
				 		newCon += v.Text + "\n"
			
 
				 	}
			
 
				 	content = regEndWrap.ReplaceAllString(newCon, "")
			
 
				+	//if key == "kv"{
			
 
				+	//	log.Println("清理前后\n",content)
			
 
				+	//}
			
 
				 	for _, v := range FormatTextMap[key] {
			
 
				 		reg, _ := v["reg"].(*regexp.Regexp)
			
 
				 		separator, isString := v["separator"].(string)
			
--- a/src/jy/pretreated/division.go
+++ b/src/jy/pretreated/division.go
@@ -1,6 +1,7 @@
 
				 package pretreated
			
 
				 
			
 
				 import (
			
 
				+	"fmt"
			
 
				 	"jy/util"
			
 
				 	qutil "qfw/util"
			
 
				 	"regexp"
			
@@ -65,6 +66,24 @@ var (
 
				 	*/
			
 
				 	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[ 　\u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
			
 
				 	filterPkgTitleKey = regexp.MustCompile("结果[:：]?$")
			
 
				+	// xuhao is the set of "<numeral>_<separator>" rune pairs, written as decimal
			
 
				+	// code points joined by "_", on which DivideSegment breaks a segment.
			
 
				+	// Numerals: 19968 一, 20108 二, 19977 三, 22235 四, 20116 五, 20845 六,
			
 
				+	// 19971 七, 20843 八, 20061 九. Separators: 12289 "、", 46 ".".
			
 
				+	// Every numeral is paired with both separators.
			
 
				+	xuhao             = map[string]bool{
			
 
				+		"19968_12289": true,
			
 
				+		"19968_46":    true,
			
 
				+		"20108_12289": true,
			
 
				+		"20108_46":    true,
			
 
				+		"19977_12289": true,
			
 
				+		"19977_46":    true,
			
 
				+		"22235_12289": true,
			
 
				+		"22235_46":    true,
			
 
				+		"20116_12289": true,
			
 
				+		"20116_46":    true,
			
 
				+		"20845_12289": true,
			
 
				+		"20845_46":    true,
			
 
				+		"19971_12289": true,
			
 
				+		"19971_46":    true,
			
 
				+		"20843_12289": true,
			
 
				+		"20843_46":    true,
			
 
				+		"20061_12289": true,
			
 
				+		"20061_46":    true,
			
 
				+	}
			
 
				 )
			
 
				 
			
 
				 //分块
			
@@ -409,11 +428,52 @@ func appendWarpStop(text string) string {
 
				 	}
			
 
				 	return text
			
 
				 }
			
 
				-
			
 
				+// DivideSegmentHtml splits txt into segments on line-feed and carriage-return
			
 
				+// characters. Pieces that are at most one byte long are dropped; each kept
			
 
				+// piece becomes a util.Segment whose Index counts up from 1 in order of
			
 
				+// appearance and whose Text is the piece unchanged.
			
 
				+func DivideSegmentHtml(txt string) []*util.Segment {
			
 
				+	isLineBreak := func(r rune) bool { return r == '\n' || r == '\r' }
			
 
				+	segs := make([]*util.Segment, 0)
			
 
				+	for _, line := range strings.FieldsFunc(txt, isLineBreak) {
			
 
				+		// A lone " " is also one byte long, so this single length test
			
 
				+		// covers both skip conditions.
			
 
				+		if len(line) <= 1 {
			
 
				+			continue
			
 
				+		}
			
 
				+		segs = append(segs, &util.Segment{Index: len(segs) + 1, Text: line})
			
 
				+	}
			
 
				+	return segs
			
 
				+}
			
 
				 //分段
			
 
				 func DivideSegment(txt string) []*util.Segment {
			
 
				 	//先分段
			
 
				+	tmpstr := ""
			
 
				 	_segs := strings.FieldsFunc(txt, func(r rune) bool {
			
 
				+		if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
			
 
				+			r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
			
 
				+			if tmpstr == "" {
			
 
				+				tmpstr += fmt.Sprint(r)
			
 
				+				return false
			
 
				+			} else if strings.Contains(tmpstr, "_") {
			
 
				+				tmpstr = ""
			
 
				+				tmpstr += fmt.Sprint(r)
			
 
				+				return false
			
 
				+			} else if tmpstr == fmt.Sprint(r) {
			
 
				+				if r == 46 || r == 12289{
			
 
				+					tmpstr = ""
			
 
				+				}
			
 
				+				return false
			
 
				+			}
			
 
				+			tmpstr += "_" + fmt.Sprint(r)
			
 
				+			if xuhao[tmpstr] {
			
 
				+				return true
			
 
				+			}
			
 
				+		}
			
 
				+		tmpstr= ""
			
 
				 		return r == 10 || r == 13
			
 
				 	})
			
 
				 	//再去除空行
			
@@ -528,7 +588,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 
				 			return false, ""
			
 
				 		}
			
 
				 		//
			
 
				-		is := regexp.MustCompile(v[0]+"[:：]*").FindAllStringIndex(con, -1)
			
 
				+		is := regexp.MustCompile(v[0] + "[:：]*").FindAllStringIndex(con, -1)
			
 
				 		for _, sv := range is {
			
 
				 			appendWarpIndex = append(appendWarpIndex, sv[0])
			
 
				 		}
			
@@ -568,13 +628,13 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 
				 			indexPkgMap[sv[0]] = v[0]
			
 
				 		}
			
 
				 		//key在包前面，并且在一行的开头
			
 
				-		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([（(].{1,8}?[)）])?[:：\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
			
 
				+		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([（(].{1,8}?[)）])?[:：\\s\u3000\u2003\u00a0]+.*?)" + pgflag).FindAllStringSubmatchIndex(con, -1)
			
 
				 		if len(keys) == 0 {
			
 
				 			//key在包前面，并且key以冒号结尾
			
 
				-			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([（(].{1,8}?[)）])?[:：]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
			
 
				+			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([（(].{1,8}?[)）])?[:：]+[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
			
 
				 		}
			
 
				 		if len(keys) == 0 {
			
 
				-			keys = regexp.MustCompile("()注[:：]([\u4e00-\u9fa5]{2,8}?([（(].{1,8}?[)）])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
			
 
				+			keys = regexp.MustCompile("()注[:：]([\u4e00-\u9fa5]{2,8}?([（(].{1,8}?[)）])?[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
			
 
				 		}
			
 
				 		for _, key := range keys {
			
 
				 			startEndMap[key[5]] = key[4]
			
@@ -628,7 +688,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 
				 			}
			
 
				 			index := util.PackageNumberConvert(bk)
			
 
				 			//去掉前缀,空格必须要加，分kv的时候要用
			
 
				-			text = regexp.MustCompile(bv[0]+"[:：]*").ReplaceAllString(text, "")
			
 
				+			text = regexp.MustCompile(bv[0] + "[:：]*").ReplaceAllString(text, "")
			
 
				 			headKey := ""
			
 
				 			if indexKeyStringMap[iv] != "" {
			
 
				 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
			
--- a/src/res/fieldscore.json
+++ b/src/res/fieldscore.json
@@ -122,7 +122,7 @@
 
				         "positivewords": [
			
 
				             {
			
 
				                 "describe": "以*结尾",
			
 
				-                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|会|场)$",
			
 
				+                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|场)$",
			
 
				                 "score": 3
			
 
				             }
			
 
				         ],
			
@@ -136,6 +136,11 @@
 
				                 "describe": "包含负分不再展示",
			
 
				                 "regstr": "(详见|提出|面向|施工)",
			
 
				                 "score": -50
			
 
				+            },
			
 
				+            {
			
 
				+                "describe": "一个字或者两个字不再显示",
			
 
				+                "regstr": "^[\\s]*[\\u4e00-\\u9fa5]{1,2}[\\s]*$",
			
 
				+                "score": -50
			
 
				             }
			
 
				         ],
			
 
				         "length": [