Browse Source

候选人过滤

wcj 6 years ago
parent
commit
023d59b624

+ 1 - 1
src/config.json

@@ -8,7 +8,7 @@
     "elasticPoolSize": 30,
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
     "mergetablealias": "projectset_v1",
-    "saveresult": false,
+    "saveresult": true,
     "qualityaudit": false,
     "qualityaudit": false,
     "saveblock": false,
     "saveblock": false,
     "filelength": 100000,
     "filelength": 100000,

+ 9 - 4
src/jy/clear/cutspace.go

@@ -20,7 +20,7 @@ func init() {
 	cutSpace, _ = regexp.Compile(`^\s*|\s*$`)
 	cutSpace, _ = regexp.Compile(`^\s*|\s*$`)
 	cutAllSpace, _ = regexp.Compile(`\s*`)
 	cutAllSpace, _ = regexp.Compile(`\s*`)
 	catSymbol, _ = regexp.Compile(`[]+`)
 	catSymbol, _ = regexp.Compile(`[]+`)
-	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/|]")
+	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/|]")
 	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$")
 	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$")
 }
 }
 
 
@@ -115,8 +115,8 @@ func childCutNotPrs(data []interface{}, count int) []interface{} {
 	if count >= 50 || value == "" {
 	if count >= 50 || value == "" {
 		return data
 		return data
 	}
 	}
-	startChars := []string{"[((]", "[\\[【]", "[{{]", "[<《]", "[>》]", "〔"}
-	endChars := []string{"[)]", "[\\]】]", "[}}]", "[<《]", "[>》]", "〕"}
+	startChars := []string{"[((]", "[\\[【]", "[{{]", "[<《]", "〔"}
+	endChars := []string{"[)]", "[\\]】]", "[}}]", "[>》]", "〕"}
 	for k, v := range startChars {
 	for k, v := range startChars {
 		sReg := regexp.MustCompile(v)
 		sReg := regexp.MustCompile(v)
 		eReg := regexp.MustCompile(endChars[k])
 		eReg := regexp.MustCompile(endChars[k])
@@ -185,7 +185,11 @@ func ClearBuyerPerson(data []interface{}) []interface{} {
 			}
 			}
 			if i == 0 && placeReg.MatchString(v) {
 			if i == 0 && placeReg.MatchString(v) {
 				if length == 1 {
 				if length == 1 {
-					tmp = tmp + v
+					if len([]rune(v)) >= 4 {
+						tmp = ""
+					} else {
+						tmp = tmp + v
+					}
 				} else {
 				} else {
 					tmp = tmp + v + "-"
 					tmp = tmp + v + "-"
 				}
 				}
@@ -198,6 +202,7 @@ func ClearBuyerPerson(data []interface{}) []interface{} {
 			}
 			}
 		}
 		}
 		data[0] = tmp
 		data[0] = tmp
+
 	} else {
 	} else {
 		value = separateSymbol.ReplaceAllString(value, "")
 		value = separateSymbol.ReplaceAllString(value, "")
 		data[0] = value
 		data[0] = value

+ 14 - 8
src/jy/pretreated/analystep.go

@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"encoding/json"
 	"jy/util"
 	"jy/util"
 	//"log"
 	//"log"
+	"unicode/utf8"
 	"strings"
 	"strings"
 
 
 	"github.com/PuerkitoBio/goquery"
 	"github.com/PuerkitoBio/goquery"
@@ -30,7 +31,7 @@ func AnalyStart(job *util.Job) {
 		}
 		}
 	}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 {                                                //有分块
+	if len(blockArrays) > 0 { //有分块
 		//从块里面找分包
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
 		for _, bl := range blockArrays {
@@ -70,8 +71,9 @@ func AnalyStart(job *util.Job) {
 			//从正文里面找分包
 			//从正文里面找分包
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 		}
 		}
-		FindProjectCode(newCon, job) //匹配项目编号
 		bl.Text = HtmlToText(con)
 		bl.Text = HtmlToText(con)
+		//log.Println(bl.Text)
+		FindProjectCode(bl.Text, job) //匹配项目编号
 		//调用kv解析
 		//调用kv解析
 		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1)
 		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1)
 		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil)
 		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil)
@@ -80,7 +82,6 @@ func AnalyStart(job *util.Job) {
 			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
 			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
 			job.Winnerorder = bl.Winnerorder
 			job.Winnerorder = bl.Winnerorder
 		}
 		}
-		//log.Println(bl.Text)
 		job.Block = append(job.Block, bl)
 		job.Block = append(job.Block, bl)
 	}
 	}
 }
 }
@@ -107,18 +108,23 @@ func FindProjectCode(newCon string, job *util.Job) {
 	}
 	}
 	var proCode string
 	var proCode string
 	blCode := &util.Block{}
 	blCode := &util.Block{}
-	if newCon = projectcodeRegAll.FindString(newCon); newCon != "" { //项目名称项目编号一起的
+	if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
 		//5d424bdfa5cb26b9b7ac7a85
 		//5d424bdfa5cb26b9b7ac7a85
 		//5d425a48a5cb26b9b7df5fec
 		//5d425a48a5cb26b9b7df5fec
 		//5d425506a5cb26b9b7cd2c3c
 		//5d425506a5cb26b9b7cd2c3c
-		splitStr := strings.Split(newCon, " ")
+		splitStr := strings.Split(newConTMP, " ")
 		if len(splitStr) >= 2 {
 		if len(splitStr) >= 2 {
-			newCon = "项目编号:" + splitStr[len(splitStr)-1]
+			if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
+				newCon = "项目编号:" + splitStr[len(splitStr)-1]
+			} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+				//5d4253f3a5cb26b9b7ca2662
+				newCon = "项目编号:" + tmpstr
+			}
 		} else if len(splitStr) == 1 {
 		} else if len(splitStr) == 1 {
 			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
 			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
 				newCon = "项目编号:" + tmpstr
 				newCon = "项目编号:" + tmpstr
-			}else if strings.Contains(newCon,"、"){
-				tmpstrs :=strings.Split(newCon,"、")
+			} else if strings.Contains(newConTMP, "、") {
+				tmpstrs := strings.Split(newCon, "、")
 				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
 				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
 			}
 			}
 		}
 		}

+ 11 - 10
src/jy/pretreated/analytable.go

@@ -93,10 +93,10 @@ var (
 	projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$")
 	projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$")
 	MhSpilt        = regexp.MustCompile("[::]")
 	MhSpilt        = regexp.MustCompile("[::]")
 	//识别采购单位联系人、联系电话、代理机构联系人、联系电话
 	//识别采购单位联系人、联系电话、代理机构联系人、联系电话
-	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公)?((电话([//]传真)?|手机)(号码)?|邮箱(地址)?|(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|(项目)?经办)人)|采购方代表")
+	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公)?((电话([//]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|(项目)?经办)人)|采购方代表")
 	ContactInfoMustReg  = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$")
 	ContactInfoMustReg  = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$")
 	ContactType         = map[string]*regexp.Regexp{
 	ContactType         = map[string]*regexp.Regexp{
-		"采购单位": regexp.MustCompile("(^采购(项目.{2}|服务)?|比选|询价|甲|招标(服务)?|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心地址|业主|收料人|采购部"),
+		"采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|甲|招标(服务)?|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心地址|业主|收料人|采购部"),
 		"代理机构": regexp.MustCompile("(代理|受托|集中采购).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"),
 		"代理机构": regexp.MustCompile("(代理|受托|集中采购).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"),
 	}
 	}
 	ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$")
 	ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$")
@@ -106,8 +106,8 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+|标的|班子成员")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+|标的|班子成员")
-	projectcodeRegAll           = regexp.MustCompile(`(采购项目|项目)名称及[项目]?编号[:|:]?.*[\n]?`)
-	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]")
+	projectcodeRegAll           = regexp.MustCompile(`(采购)?项目名称及(项目)?编号[:|:]?.*[\n]?`)
+	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]?")
 	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
 	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
 	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
 	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
@@ -2448,6 +2448,7 @@ func (tn *Table) TdContactFormat(contactFormat *u.ContactFormat) {
 	//处理表格中的联系人信息
 	//处理表格中的联系人信息
 	indexMap := contactFormat.IndexMap
 	indexMap := contactFormat.IndexMap
 	matchMap := contactFormat.MatchMap
 	matchMap := contactFormat.MatchMap
+	//qutil.Debug("==============================td=======================", indexMap, matchMap)
 	weightMap := map[string]map[string]interface{}{} //权重
 	weightMap := map[string]map[string]interface{}{} //权重
 	mustMatchFirst := len(indexMap) > 0              //第一个必须匹配上
 	mustMatchFirst := len(indexMap) > 0              //第一个必须匹配上
 	reCreate := false
 	reCreate := false
@@ -2635,7 +2636,7 @@ L:
 	(*contactFormat).MatchMap = matchMap
 	(*contactFormat).MatchMap = matchMap
 	//	for _, tr := range tn.TRs {
 	//	for _, tr := range tn.TRs {
 	//		for _, td := range tr.TDs {
 	//		for _, td := range tr.TDs {
-	//			log.Println(td.SortKV.Map)
+	//			qutil.Debug("td.sort.map---", td.SortKV.Map)
 	//		}
 	//		}
 	//	}
 	//	}
 }
 }
@@ -2734,13 +2735,13 @@ func (tn *Table) asdFind(td_k string, matchCount int, weightMap map[string]map[s
 	}
 	}
 	if len(indexMap) == 0 && td_kv.Title != "" {
 	if len(indexMap) == 0 && td_kv.Title != "" {
 		//td_kv.Title
 		//td_kv.Title
-		if titleMatchType := ContactTypeTitleMatch(td_kv.Title); len(titleMatchType) != 0 {
+		if titleMatchType := ContactTypeTitleMatch(td_kv.Title); titleMatchType != "" {
 			thidTdIndex = 0
 			thidTdIndex = 0
 			matchMap = map[string]map[string]bool{}
 			matchMap = map[string]map[string]bool{}
-			//indexMap = map[int]string{1: titleMatchType}
-			for i, t := range titleMatchType {
-				indexMap[i+1] = t
-			}
+			indexMap = map[int]string{1: titleMatchType}
+			//			for i, t := range titleMatchType {
+			//				indexMap[i+1] = t
+			//			}
 		}
 		}
 	}
 	}
 	return matchCount, weightMap, matchMap, thisTrHasMatch, indexMap, iscontinue, reCreate, thidTdIndex
 	return matchCount, weightMap, matchMap, thisTrHasMatch, indexMap, iscontinue, reCreate, thidTdIndex

+ 88 - 71
src/jy/pretreated/colonkv.go

@@ -15,14 +15,15 @@ import (
 type ColonkvEntity struct{}
 type ColonkvEntity struct{}
 
 
 var (
 var (
-	colonkvEntity = &ColonkvEntity{}
-	regReplKV     = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?")
-	regReplKV2    = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
-	regKV         = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
-	filterK       = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
-	filterValue   = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)")
-	regReplKey    = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
-	BlockTagMap   = map[string]bool{
+	colonkvEntity  = &ColonkvEntity{}
+	regReplKV      = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?")
+	regReplKV2     = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
+	regKV          = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
+	filterK        = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
+	filterValue    = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)")
+	regReplKey     = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
+	buyerAndAgency = regexp.MustCompile("(代理(机构|人)|采购(人|单位))")
+	BlockTagMap    = map[string]bool{
 		"招标范围": true,
 		"招标范围": true,
 		"资格要求": true,
 		"资格要求": true,
 	}
 	}
@@ -202,17 +203,34 @@ func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
 
 
 //根据配置文件中的规则,格式化正文
 //根据配置文件中的规则,格式化正文
 func formatText(content, key string) string {
 func formatText(content, key string) string {
-	segment := DivideSegment(content)
+	segments := make([]*Segment, 0)
+	if key == "all" {
+		segments = DivideSegmentHtml(content)
+	} else if key == "kv" {
+		segments = DivideSegment(content)
+		//log.Println("清理前:\n",content)
+	}
 	newCon := ""
 	newCon := ""
-	for _, v := range segment {
-		if v.Index > len(segment)-3 {
+	for _, v := range segments {
+		if v.Index > len(segments)-3 {
 			if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
 			if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
 				break
 				break
 			}
 			}
 		}
 		}
+		if key == "kv" && utf8.RuneCountInString(v.Text) >= 1 {
+			//log.Println("清理前:",v.Text, []rune(v.Text)[len([]rune(v.Text))-1])
+			v.Text = strings.TrimRightFunc(v.Text, func(r rune) bool {
+				return r == 19968 || r == 20108 || r == 19977 ||
+					r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061
+			})
+			//log.Println("清理前后",v.Text)
+		}
 		newCon += v.Text + "\n"
 		newCon += v.Text + "\n"
 	}
 	}
 	content = regEndWrap.ReplaceAllString(newCon, "")
 	content = regEndWrap.ReplaceAllString(newCon, "")
+	//if key == "kv"{
+	//	log.Println("清理前后\n",content)
+	//}
 	for _, v := range FormatTextMap[key] {
 	for _, v := range FormatTextMap[key] {
 		reg, _ := v["reg"].(*regexp.Regexp)
 		reg, _ := v["reg"].(*regexp.Regexp)
 		separator, isString := v["separator"].(string)
 		separator, isString := v["separator"].(string)
@@ -312,14 +330,18 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	}
 	}
 	mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
 	mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
 	titleMatch := false
 	titleMatch := false
-	if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 {
+	if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" {
 		titleMatch = true
 		titleMatch = true
 		mustMatchFirst = false
 		mustMatchFirst = false
-		for i, t := range titleMatchType {
-			indexMap[i+1] = t
-		}
-		//indexMap = map[int]string{1: titleMatchType}
+		indexMap = map[int]string{1: titleMatchType}
 	}
 	}
+	//	if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 {
+	//		titleMatch = true
+	//		mustMatchFirst = false
+	//		for i, t := range titleMatchType {
+	//			indexMap[i+1] = t
+	//		}
+	//	}
 	//	if buyers == nil {
 	//	if buyers == nil {
 	//		Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
 	//		Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
 	//	}
 	//	}
@@ -330,7 +352,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	//		}
 	//		}
 	//	}
 	//	}
 	startIndex := 0
 	startIndex := 0
-	//prevKey := ""
+	prevKey := ""
 	index, notmatchCount, allMatchCount := 0, 0, 0
 	index, notmatchCount, allMatchCount := 0, 0, 0
 	weightMap := map[string]map[string]interface{}{}     //权重
 	weightMap := map[string]map[string]interface{}{}     //权重
 	mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
 	mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
@@ -349,6 +371,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 				if buyer == "" {
 				if buyer == "" {
 					continue
 					continue
 				}
 				}
+
 				prevLine := kv.PrevLine
 				prevLine := kv.PrevLine
 				prevLine = strings.TrimSpace(prevLine)
 				prevLine = strings.TrimSpace(prevLine)
 				prevLine = strings.Split(prevLine, " ")[0]
 				prevLine = strings.Split(prevLine, " ")[0]
@@ -450,7 +473,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 							weightMap[ct_k][ct_k] = weightVal.(int)
 							weightMap[ct_k][ct_k] = weightVal.(int)
 							matchMap[ct_k] = map[string]bool{}
 							matchMap[ct_k] = map[string]bool{}
 							isBreak = false
 							isBreak = false
-							//prevKey = ""
+							prevKey = ""
 						}
 						}
 					}
 					}
 				}
 				}
@@ -470,7 +493,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 				if startIndex == 0 {
 				if startIndex == 0 {
 					indexMap = map[int]string{}
 					indexMap = map[int]string{}
 				}
 				}
-				//prevKey = ""
+				prevKey = ""
 				startIndex++
 				startIndex++
 				indexMap[startIndex] = ct_k
 				indexMap[startIndex] = ct_k
 				isContinue = true
 				isContinue = true
@@ -493,7 +516,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			isBreak = false
 			isBreak = false
 			continue
 			continue
 		}
 		}
-		if !ContactInfoMustReg.MatchString(k) {
+		if !ContactInfoMustReg.MatchString(k) { //判断是否是电话、邮箱、地址等信息
 			if mustMatchFirst {
 			if mustMatchFirst {
 				mustMatchFirst = false
 				mustMatchFirst = false
 				continue
 				continue
@@ -520,17 +543,18 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		//		} else if index < 2 {
 		//		} else if index < 2 {
 		//			index++
 		//			index++
 		//		}
 		//		}
-		//		if prevKey != k {
-		//			prevKey = k
-		//			index = 1
-		//		} else if prevKey == k {
-		//			index++
-		//		}
-		if startIndex == 0 || startIndex%2 == 1 {
+		if prevKey != k {
+			prevKey = k
 			index = 1
 			index = 1
-		} else if startIndex%2 == 0 {
-			index = 2
+		} else if prevKey == k {
+			index++
 		}
 		}
+		//		if startIndex == 0 || startIndex%2 == 1 || index == 0 {
+		//			index = 1
+		//		} else if startIndex%2 == 0 {
+		//			index = 2
+		//		}
+
 		//hasMatch[k] = true
 		//hasMatch[k] = true
 		//过滤值
 		//过滤值
 		if filterValue.MatchString(v) {
 		if filterValue.MatchString(v) {
@@ -607,42 +631,42 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	//	}
 	//	}
 	//Debug("totalIndexMap", len(totalIndexMap))
 	//Debug("totalIndexMap", len(totalIndexMap))
 }
 }
-func ContactTypeTitleMatch(title string) []string {
-	matchType := []string{}
-	matchTypeMap := map[string]bool{}
-	if title != "" && len([]rune(title)) < 25 {
-		if ContactBuyerTitleReg.MatchString(title) {
-			matchType = append(matchType, "采购单位")
-			matchTypeMap["采购单位"] = true
-		}
-		if ContactAgencyTitleReg.MatchString(title) {
-			matchType = append(matchType, "代理机构")
-			matchTypeMap["代理机构"] = true
-		}
-		if len(matchType) == 2 {
-			return matchType
-		}
-		for _, ct_k := range HasOrderContactType(title) {
-			if ContactType[ct_k].MatchString(title) && !matchTypeMap[ct_k] {
-				matchType = append(matchType, ct_k)
-			}
-		}
-	}
-	//	matchType := ""
-	//	if title != "" && len([]rune(title)) < 15 {
+func ContactTypeTitleMatch(title string) string {
+	//	matchType := []string{}
+	//	matchTypeMap := map[string]bool{}
+	//	if title != "" && len([]rune(title)) < 25 {
 	//		if ContactBuyerTitleReg.MatchString(title) {
 	//		if ContactBuyerTitleReg.MatchString(title) {
-	//			matchType = "采购单位"
-	//		} else if ContactAgencyTitleReg.MatchString(title) {
-	//			matchType = "代理机构"
-	//		} else {
-	//			for _, ct_k := range HasOrderContactType(title) {
-	//				if ContactType[ct_k].MatchString(title) {
-	//					matchType = ct_k
-	//					break
-	//				}
+	//			matchType = append(matchType, "采购单位")
+	//			matchTypeMap["采购单位"] = true
+	//		}
+	//		if ContactAgencyTitleReg.MatchString(title) {
+	//			matchType = append(matchType, "代理机构")
+	//			matchTypeMap["代理机构"] = true
+	//		}
+	//		if len(matchType) == 2 {
+	//			return matchType
+	//		}
+	//		for _, ct_k := range HasOrderContactType(title) {
+	//			if ContactType[ct_k].MatchString(title) && !matchTypeMap[ct_k] {
+	//				matchType = append(matchType, ct_k)
 	//			}
 	//			}
 	//		}
 	//		}
 	//	}
 	//	}
+	matchType := ""
+	if title != "" && len([]rune(title)) < 15 {
+		if ContactBuyerTitleReg.MatchString(title) {
+			matchType = "采购单位"
+		} else if ContactAgencyTitleReg.MatchString(title) {
+			matchType = "代理机构"
+		} else {
+			for _, ct_k := range HasOrderContactType(title) {
+				if ContactType[ct_k].MatchString(title) {
+					matchType = ct_k
+					break
+				}
+			}
+		}
+	}
 	return matchType
 	return matchType
 }
 }
 
 
@@ -748,16 +772,6 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 		kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
 		kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
 	}
 	}
 	for _, findkv := range findkvs {
 	for _, findkv := range findkvs {
-		//		if ContactInfoMustReg.MatchString(findkv.Value) { //名称、地址、联系人、邮编、电话
-		//			preval := findkv.PrevLine
-		//			ctkarr := HasOrderContactType(preval)
-		//			if len(ctkarr) > 0 {
-		//				for i, ct_k := range ctkarr {
-		//					indexMap[i+1] = ct_k
-		//				}
-		//			}
-		//			qutil.Debug("----", indexMap)
-		//		}
 		k, val, nextval := findkv.Key, strings.TrimSpace(findkv.Value), strings.TrimSpace(findkv.NextLine)
 		k, val, nextval := findkv.Key, strings.TrimSpace(findkv.Value), strings.TrimSpace(findkv.NextLine)
 		//val是空的话,不打标签
 		//val是空的话,不打标签
 		if filterValue.MatchString(val) {
 		if filterValue.MatchString(val) {
@@ -863,6 +877,9 @@ func RemoveWarpOfTdVal(text string) string {
 //打标签的时候,清理key
 //打标签的时候,清理key
 //from 1--冒号key 2--table key
 //from 1--冒号key 2--table key
 func ClearKey(k string, from int) string {
 func ClearKey(k string, from int) string {
+	if buyerAndAgency.MatchString(filterK.FindString(k)) { //采购项目联系人(代理机构)5d423d70a5cb26b9b76fa2e7
+		return k
+	}
 	for {
 	for {
 		old := k
 		old := k
 		if from == 1 {
 		if from == 1 {

+ 66 - 6
src/jy/pretreated/division.go

@@ -1,6 +1,7 @@
 package pretreated
 package pretreated
 
 
 import (
 import (
+	"fmt"
 	"jy/util"
 	"jy/util"
 	qutil "qfw/util"
 	qutil "qfw/util"
 	"regexp"
 	"regexp"
@@ -65,6 +66,24 @@ var (
 	*/
 	*/
 	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
 	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
 	filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
 	filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
+	xuhao             = map[string]bool{ // ordinal markers keyed as "<rune>_<rune>" in decimal code points, the same format DivideSegment builds in tmpstr before looking it up here
+		"19968_12289": true, // 一、
+		"19968_46":    true, // 一.
+		"20108_12289": true, // 二、
+		"20108_46":    true, // 二.
+		"19977_12289": true, // 三、
+		"19977_46":    true, // 三.
+		"22235_12289": true, // 四、
+		"22235_46":    true, // 四.
+		"20116_12289": true, // 五、
+		"20116_46":    true, // 五.
+		"20845_12289": true, // 六、
+		"20845_46":    true, // 六.
+		"19971_12289": true, // 七、
+		"19971_46":    true, // 七.
+		"20843_12289": true, // 八、 NOTE(review): there is no "20843_46" (八.) entry, unlike the numerals above; confirm whether that is intentional
+		"20061_46":    true, // 九. NOTE(review): there is no "20061_12289" (九、) entry, unlike the numerals above; confirm whether that is intentional
+	}
 )
 )
 
 
 //分块
 //分块
@@ -409,11 +428,52 @@ func appendWarpStop(text string) string {
 	}
 	}
 	return text
 	return text
 }
 }
-
+// DivideSegmentHtml splits txt on LF (10) and CR (13) and returns every piece longer than one byte as a Segment, with Index counting up from 1.
+func DivideSegmentHtml(txt string) []*util.Segment {
+	// Step 1: cut the text into lines; a rune is a separator when it is LF or CR.
+	_segs := strings.FieldsFunc(txt, func(r rune) bool {
+		return r == 10 || r == 13
+	})
+	// Step 2: drop trivial lines and number the ones that are kept.
+	segs := make([]*util.Segment, 0)
+	_index := 0
+	for _, seg := range _segs {
+		if seg != " " && len(seg) > 1 { // len is in bytes, so any one-byte line (including a lone space) is skipped; longer whitespace-only lines are kept
+			_seg := util.Segment{}
+			_index = _index + 1 // Index is 1-based and counts only the kept segments
+			_seg.Index = _index
+			_seg.Text = seg
+			segs = append(segs, &_seg)
+		}
+	}
+	return segs
+}
 //分段
 //分段
 func DivideSegment(txt string) []*util.Segment {
 func DivideSegment(txt string) []*util.Segment {
 	//先分段
 	//先分段
+	tmpstr := ""
 	_segs := strings.FieldsFunc(txt, func(r rune) bool {
 	_segs := strings.FieldsFunc(txt, func(r rune) bool {
+		if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
+			r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
+			if tmpstr == "" {
+				tmpstr += fmt.Sprint(r)
+				return false
+			} else if strings.Contains(tmpstr, "_") {
+				tmpstr = ""
+				tmpstr += fmt.Sprint(r)
+				return false
+			} else if tmpstr == fmt.Sprint(r) {
+				if r == 46 || r == 12289{
+					tmpstr = ""
+				}
+				return false
+			}
+			tmpstr += "_" + fmt.Sprint(r)
+			if xuhao[tmpstr] {
+				return true
+			}
+		}
+		tmpstr= ""
 		return r == 10 || r == 13
 		return r == 10 || r == 13
 	})
 	})
 	//再去除空行
 	//再去除空行
@@ -528,7 +588,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			return false, ""
 			return false, ""
 		}
 		}
 		//
 		//
-		is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
+		is := regexp.MustCompile(v[0] + "[::]*").FindAllStringIndex(con, -1)
 		for _, sv := range is {
 		for _, sv := range is {
 			appendWarpIndex = append(appendWarpIndex, sv[0])
 			appendWarpIndex = append(appendWarpIndex, sv[0])
 		}
 		}
@@ -568,13 +628,13 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			indexPkgMap[sv[0]] = v[0]
 			indexPkgMap[sv[0]] = v[0]
 		}
 		}
 		//key在包前面,并且在一行的开头
 		//key在包前面,并且在一行的开头
-		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
+		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		if len(keys) == 0 {
 		if len(keys) == 0 {
 			//key在包前面,并且key以冒号结尾
 			//key在包前面,并且key以冒号结尾
-			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		}
 		if len(keys) == 0 {
 		if len(keys) == 0 {
-			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		}
 		for _, key := range keys {
 		for _, key := range keys {
 			startEndMap[key[5]] = key[4]
 			startEndMap[key[5]] = key[4]
@@ -628,7 +688,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			}
 			}
 			index := util.PackageNumberConvert(bk)
 			index := util.PackageNumberConvert(bk)
 			//去掉前缀,空格必须要加,分kv的时候要用
 			//去掉前缀,空格必须要加,分kv的时候要用
-			text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
+			text = regexp.MustCompile(bv[0] + "[::]*").ReplaceAllString(text, "")
 			headKey := ""
 			headKey := ""
 			if indexKeyStringMap[iv] != "" {
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {

+ 5 - 0
src/jy/pretreated/tablev2.go

@@ -877,6 +877,11 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 // HtmlToText parses con as HTML with goquery and returns the text of the resulting document.
 // HtmlToText parses con as HTML with goquery and returns the text of the resulting document.
 func HtmlToText(con string) string {
 func HtmlToText(con string) string {
 	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) // the parse error is discarded
 	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con)) // the parse error is discarded
+	//log.Println(doc2.Html())
+	doc2.Find("tr").Each(func(i int, selection *goquery.Selection) { // visit every <tr> element
+		selection.AfterHtml(string(rune(10))) // insert a newline (code point 10) after the row; presumably so each table row lands on its own line in the returned text
+	})
+	//log.Println(doc2.Html())
 	return doc2.Text()
 	return doc2.Text()
 }
 }
 
 

+ 1 - 1
src/main_blocktest.go

@@ -51,7 +51,7 @@ func all() {
 }
 }
 func one() { // one fetches a single "bidding" document by a hard-coded id and hands it to com
 func one() { // one fetches a single "bidding" document by a hard-coded id and hands it to com
 	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw") // server address and database name are hard-coded
 	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw") // server address and database name are hard-coded
-	d, _ := m.FindById("bidding", "5d423af9a5cb26b9b766ec44", extract.Fields)
+	d, _ := m.FindById("bidding", "5d424df7a5cb26b9b7b61fde", extract.Fields) // second return value is discarded; NOTE(review): d is dereferenced below without a nil check
 	com(*d)
 	com(*d)
 }
 }
 func com(doc map[string]interface{}) {
 func com(doc map[string]interface{}) {

+ 33 - 115
src/main_test.go

@@ -3,7 +3,6 @@ package main
 import (
 import (
 	"fmt"
 	"fmt"
 	"jy/admin/track"
 	"jy/admin/track"
-	"jy/clear"
 	"jy/extract"
 	"jy/extract"
 	. "jy/mongodbutil"
 	. "jy/mongodbutil"
 	"log"
 	"log"
@@ -44,6 +43,8 @@ func Test_reg(t *testing.T) {
 	log.Println("---", reg1.FindAllString("05939-5365001(兰陵县芦柞镇人民政府)", -1))
 	log.Println("---", reg1.FindAllString("05939-5365001(兰陵县芦柞镇人民政府)", -1))
 	reg2, _ := regexp.Compile("^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$")
 	reg2, _ := regexp.Compile("^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$")
 	log.Println("---", reg2.MatchString("张女士/"))
 	log.Println("---", reg2.MatchString("张女士/"))
+	filterK := regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
+	log.Println(filterK.FindString("二)采购项目联系人(代理机构)"))
 }
 }
 
 
 func Test_reg1(t *testing.T) {
 func Test_reg1(t *testing.T) {
@@ -72,123 +73,40 @@ func Test_paths(t *testing.T) {
 }
 }
 
 
 // Test_clear runs the bracket clean-up steps on a sample whose brackets are
// crossed and unbalanced, and checks the cleaned result.
//
// For every bracket family the opening and closing occurrences are counted.
// With more openers than closers, everything up to and including the first
// surplus opener is cut from the front. With more closers than openers,
// everything from the first surplus closer onwards is cut from the back.
// A final pass removes a trailing "opener + no closer" tail and a leading
// "no opener + closer" head, which is what crossed brackets leave behind.
func Test_clear(t *testing.T) {
	value := "法拉(盛(客{)户)端副科级沙发俩括号的"
	log.Println("pre---", value)
	startChars := []string{"[((]", "[\\[【]", "[{{]", "[<《]", "〔"}
	endChars := []string{"[))]", "[\\]】]", "[}}]", "[>》]", "〕"}
	for k, v := range startChars {
		sReg := regexp.MustCompile(v)
		eReg := regexp.MustCompile(endChars[k])
		sIndex := sReg.FindAllStringIndex(value, -1)
		eIndex := eReg.FindAllStringIndex(value, -1)
		sCount := len(sIndex)
		eCount := len(eIndex)
		if sCount == eCount {
			continue
		}
		log.Println("value1---", value, sCount, eCount)
		// Trim the front: drop everything through the first surplus opener.
		if sCount > eCount {
			value = value[sIndex[eCount][1]:]
		}
		log.Println("value2---", value)
		// Trim the back: drop everything from the first surplus closer.
		if sCount < eCount {
			value = value[:eIndex[sCount][0]]
		}
		log.Println("value3---", value)
	}
	log.Println("value4---", value)
	// Handle crossed brackets left over after the per-family pass.
	sReplReg := regexp.MustCompile("[((\\[【{{〔<《][^))\\]】}}〕>》]*$")
	eReplReg := regexp.MustCompile("^[^((\\[【{{〔<《]*[))\\]】}}〕>》]")
	if sReplReg.MatchString(value) || eReplReg.MatchString(value) {
		value = sReplReg.ReplaceAllString(value, "")
		value = eReplReg.ReplaceAllString(value, "")
	}
	log.Println("result---", value)
	// Without an assertion this test could never fail.
	if want := "端副科级沙发俩括号的"; value != want {
		t.Errorf("cleaned value = %q, want %q", value, want)
	}
}

+ 7 - 2
src/res/fieldscore.json

@@ -122,7 +122,7 @@
         "positivewords": [
         "positivewords": [
             {
             {
                 "describe": "以*结尾",
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|会|场)$",
+                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|场)$",
                 "score": 3
                 "score": 3
             }
             }
         ],
         ],
@@ -136,6 +136,11 @@
                 "describe": "包含负分不再展示",
                 "describe": "包含负分不再展示",
                 "regstr": "(详见|提出|面向|施工)",
                 "regstr": "(详见|提出|面向|施工)",
                 "score": -50
                 "score": -50
+            },
+            {
+                "describe": "一个字或者两个字不再显示",
+                "regstr": "^[\\s]*[\\u4e00-\\u9fa5]{1,2}[\\s]*$",
+                "score": -50
             }
             }
         ],
         ],
         "length": [
         "length": [
@@ -250,7 +255,7 @@
                 "range": [
                 "range": [
                     30,
                     30,
                     -1,
                     -1,
-                    -1
+                    -10
                 ]
                 ]
             }
             }
         ]
         ]

+ 4 - 4
versioncomparison/config.json

@@ -1,9 +1,9 @@
 {
 {
-    "premgo": "192.168.3.207:27081",
-    "predb": "qfw",
+    "premgo": "192.168.3.207:27082",
+    "predb": "extract_kf",
     "prec": "result_v3",
     "prec": "result_v3",
-    "newmgo": "192.168.3.207:27081",
-    "newdb": "extract_v3",
+    "newmgo": "192.168.3.207:27082",
+    "newdb": "extract_kf",
     "newc": "result_data",
     "newc": "result_data",
     "fields": [
     "fields": [
         "projectname",
         "projectname",

+ 124 - 2
versioncomparison/main.go

@@ -23,6 +23,7 @@ var (
 	Sid, Eid    string
 	Sid, Eid    string
 	Fields      []string
 	Fields      []string
 	FieldsQuery string
 	FieldsQuery string
+	Url         = "https://www.jianyu360.com/article/content/%s.html"
 )
 )
 
 
 type Compare struct {
 type Compare struct {
@@ -59,7 +60,9 @@ func init() {
 func main() {
 func main() {
 	getVersionData()
 	getVersionData()
 	createXlsx()
 	createXlsx()
+	//biaozhucompare()
 }
 }
+
 func createXlsx() {
 func createXlsx() {
 	xf, err := xlsx.OpenFile("template.xlsx")
 	xf, err := xlsx.OpenFile("template.xlsx")
 	if err != nil {
 	if err != nil {
@@ -87,7 +90,7 @@ func createXlsx() {
 		}
 		}
 	}
 	}
 	//生成信息sheet
 	//生成信息sheet
-	url := "https://www.jianyu360.com/article/content/%s.html"
+
 	for _, field := range Fields {
 	for _, field := range Fields {
 		sh, _ := xf.AddSheet(field)
 		sh, _ := xf.AddSheet(field)
 		rowh := sh.AddRow()
 		rowh := sh.AddRow()
@@ -102,7 +105,7 @@ func createXlsx() {
 				row.AddCell().SetString(k)
 				row.AddCell().SetString(k)
 				row.AddCell().SetString(v.PreVal)
 				row.AddCell().SetString(v.PreVal)
 				row.AddCell().SetString(v.NewVal)
 				row.AddCell().SetString(v.NewVal)
-				row.AddCell().SetString(fmt.Sprintf(url, qu.CommonEncodeArticle("content", v.Id)))
+				row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", v.Id)))
 			}
 			}
 		}
 		}
 	}
 	}
@@ -184,3 +187,122 @@ func getVersionData() {
 		Compares[k] = cp
 		Compares[k] = cp
 	}
 	}
 }
 }
+
+type BidData struct {
+	id  string
+	key map[string]interface{}
+}
+type BidCom struct {
+	Val []int
+	Ids []map[string]interface{}
+}
+
+//标注正确率统计
+func biaozhucompare() {
+	exts, _ := Newmgo.Find("bid_v3", `{}`, `{"_id":1}`, nil, false, -1, -1)
+	extDatas := []BidData{}
+	for _, v := range *exts {
+		key := map[string]interface{}{
+			"projectname": v["projectname"],
+			"projectcode": v["projectcode"],
+			"buyer":       v["buyer"],
+			"budget":      qu.Float64All(v["budget"]),
+			"bidamount":   qu.Float64All(v["bidamount"]),
+			"agency":      v["agency"],
+			"buyerperson": v["buyerperson"],
+			"buyertel":    v["buyertel"],
+		}
+		ext := BidData{
+			id:  qu.BsonIdToSId(v["_id"]),
+			key: key,
+		}
+		extDatas = append(extDatas, ext)
+	}
+	log.Println("exts ok")
+	bzs, _ := Newmgo.Find("bid_biaozhuid", `{}`, `{"_id":1}`, nil, false, -1, -1)
+	bzDatas := []BidData{}
+	for _, v := range *bzs {
+		bidamount := float64(0)
+		if bigprices, ok := v["bigprice"].([]interface{}); ok {
+			bidamount = qu.Float64All(bigprices[0])
+		}
+		key := map[string]interface{}{
+			"projectname": qu.ObjToString(v["projectname"]),
+			"projectcode": qu.ObjToString(v["projectcode"]),           //qu.If(qu.ObjToString(v["t_bidno"]) == "", qu.ObjToString(v["b_projectno"]), qu.ObjToString(v["t_bidno"])),
+			"buyer":       qu.ObjToString(v["buyer"]),                 // qu.If(qu.ObjToString(v["t_buyer"]) == "", qu.ObjToString(v["b_buyer"]), qu.ObjToString(v["t_buyer"])),
+			"budget":      qu.Float64All(qu.ObjToString(v["budget"])), //  qu.Float64All(qu.ObjToString(v["t_budget"])),
+			"bidamount":   bidamount,
+			"agency":      qu.ObjToString(v["agency"]),
+			"buyerperson": qu.ObjToString(v["buyerperson"]),
+			"buyertel":    qu.ObjToString(v["buyertel"]),
+		}
+		bz := BidData{
+			id:  qu.BsonIdToSId(v["_id"]),
+			key: key,
+		}
+		bzDatas = append(bzDatas, bz)
+	}
+	log.Println("bzs ok")
+
+	bcoms := map[string]*BidCom{}
+	for _, ext := range extDatas {
+		for _, bz := range bzDatas {
+			if bz.id == ext.id {
+				for key, val := range ext.key {
+					//					if key == "budget" {
+					//						log.Println(key, ext.key[key], ";;;;;", bz.key[key])
+					//					}
+					if qu.ObjToString(val) != "" || qu.Float64All(val) > 0 {
+						bcom := bcoms[key]
+						if bcom == nil {
+							bcom = &BidCom{
+								Val: []int{0, 0},
+								Ids: []map[string]interface{}{},
+							}
+						}
+						if val == bz.key[key] {
+							bcom.Val[0] += 1
+						} else {
+							bcom.Val[1] += 1
+							tmp := map[string]interface{}{
+								"id":  ext.id,
+								"ext": val,
+								"bz":  bz.key[key],
+							}
+							bcom.Ids = append(bcom.Ids, tmp)
+						}
+						bcoms[key] = bcom
+					}
+				}
+				break
+			}
+		}
+	}
+	xl := xlsx.NewFile()
+	sh, _ := xl.AddSheet("统计")
+	h := sh.AddRow()
+	h.AddCell().SetString("field")
+	h.AddCell().SetString("相同")
+	h.AddCell().SetString("不同")
+	for k, v := range bcoms {
+		row := sh.AddRow()
+		row.AddCell().SetString(k)
+		row.AddCell().SetInt(v.Val[0])
+		row.AddCell().SetInt(v.Val[1])
+		ksh, _ := xl.AddSheet(k)
+		rh := ksh.AddRow()
+		rh.AddCell().SetString("id")
+		rh.AddCell().SetString("标注")
+		rh.AddCell().SetString("抽取")
+		rh.AddCell().SetString("url")
+		for _, v := range v.Ids {
+			rw := ksh.AddRow()
+			rw.AddCell().SetString(qu.ObjToString(v["id"]))
+			rw.AddCell().SetString(fmt.Sprint(v["bz"]))
+			rw.AddCell().SetString(fmt.Sprint(v["ext"]))
+			rw.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", qu.ObjToString(v["id"]))))
+		}
+		log.Println(k, v.Val)
+	}
+	xl.Save("ext_bz.xlsx")
+}