maxiaoshan пре 6 година
родитељ
комит
c854efabc0

+ 4 - 4
src/config.json

@@ -8,11 +8,11 @@
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": false,
+    "saveresult": true,
     "fieldscore": true,
-    "qualityaudit": true,
-    "iscltlog": false,
-    "brandgoods": true,
+    "qualityaudit": false,
+    "iscltlog": true,
+    "brandgoods": false,
     "udptaskid": "5be107e600746bf92debf080",
     "udpip": "127.0.0.1",
     "udpport": "1484",

+ 19 - 23
src/jy/extract/extract.go

@@ -233,11 +233,11 @@ func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 		toptype = "*"
 	}
 	j = &ju.Job{
-		SourceMid:  qu.BsonIdToSId(doc["_id"]),
-		Category:   toptype,
-		CategorySecond:subtype,
-		Content:    qu.ObjToString(doc["detail"]),
-		SpiderCode: qu.ObjToString(doc["spidercode"]),
+		SourceMid:      qu.BsonIdToSId(doc["_id"]),
+		Category:       toptype,
+		CategorySecond: subtype,
+		Content:        qu.ObjToString(doc["detail"]),
+		SpiderCode:     qu.ObjToString(doc["spidercode"]),
 		//Domain:     qu.ObjToString(doc["domain"]),
 		//Href:       qu.ObjToString(doc["href"]),
 		Title:     qu.ObjToString(doc["title"]),
@@ -323,12 +323,12 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 		for _, v := range e.RulePres {
 			doc = ExtRegPre(doc, j, v, e.TaskInfo)
 		}
-		if j.CategorySecond=="" {
+		if j.CategorySecond == "" {
 			//抽取规则
-			tmprules:= map[string][]*RuleCore{}
+			tmprules := map[string][]*RuleCore{}
 			lock.Lock()
 			for k, vc1 := range e.RuleCores[j.Category] {
-				tmprules[k]=vc1
+				tmprules[k] = vc1
 			}
 			lock.Unlock()
 			for _, vc1 := range tmprules {
@@ -364,10 +364,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					// log.Debug("抽取-后置规则", tmp)
 				}
 			}
-		}else{
-			fmt.Println(e.RuleCores)
-			fmt.Println("++++++++++++++++")
-			fmt.Println(e.RuleCores[j.Category+"_"+j.CategorySecond])
+		} else {
 			for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
 				for _, vc := range vc1 {
 					tmp := ju.DeepCopy(doc).(map[string]interface{})
@@ -462,12 +459,12 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 		doc := *j.Data
 		//全局前置规则,结果覆盖doc属性
 		for _, v := range e.RulePres {
-			if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
+			if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 				doc = ExtRegPre(doc, j, v, e.TaskInfo)
 			}
 		}
 		//抽取规则
-		if j.CategorySecond==""{
+		if j.CategorySecond == "" {
 			for _, vc1 := range e.RuleCores[j.Category] {
 				for _, vc := range vc1 {
 					tmp := ju.DeepCopy(doc).(map[string]interface{})
@@ -477,7 +474,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 					}
 					//抽取-前置规则
 					for _, v := range vc.RulePres {
-						if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
+						if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 							tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
 						}
 					}
@@ -485,7 +482,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 
 					//抽取-规则
 					for _, v := range vc.RuleCores {
-						if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
+						if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 							ExtRegCore(vc.ExtFrom, tmp, j, v, e)
 						}
 					}
@@ -493,14 +490,14 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 
 					//抽取-后置规则
 					for _, v := range vc.RuleBacks {
-						if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
+						if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 							ExtRegBack(j, v, e.TaskInfo)
 						}
 					}
 					// log.Debug("抽取-后置规则", tmp)
 				}
 			}
-		}else{
+		} else {
 			for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
 				for _, vc := range vc1 {
 					tmp := ju.DeepCopy(doc).(map[string]interface{})
@@ -510,7 +507,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 					}
 					//抽取-前置规则
 					for _, v := range vc.RulePres {
-						if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
+						if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 							tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
 						}
 					}
@@ -518,7 +515,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 
 					//抽取-规则
 					for _, v := range vc.RuleCores {
-						if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
+						if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 							ExtRegCore(vc.ExtFrom, tmp, j, v, e)
 						}
 					}
@@ -526,7 +523,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 
 					//抽取-后置规则
 					for _, v := range vc.RuleBacks {
-						if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
+						if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 							ExtRegBack(j, v, e.TaskInfo)
 						}
 					}
@@ -535,10 +532,9 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
 			}
 		}
 
-
 		//全局后置规则
 		for _, v := range e.RuleBacks {
-			if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
+			if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 				ExtRegBack(j, v, e.TaskInfo)
 			}
 		}

+ 11 - 0
src/jy/pretreated/analystep.go

@@ -59,6 +59,11 @@ func AnalyStart(job *util.Job) {
 				//				}
 			}
 			job.Block = append(job.Block, bl)
+
+			if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
+				//新加table未找到winnerorder, 从分块文本中找中标候选人
+				job.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
+			}
 		}
 	} else { //未分块,创建分块
 		bl := &util.Block{}
@@ -72,10 +77,16 @@ func AnalyStart(job *util.Job) {
 			//			for k, v := range bl.TableKV.Kv {
 			//				log.Println("bl.TableKV.Kv", k, v)
 			//			}
+
 		} else {
 			//从正文里面找分包
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 		}
+		//新加 未分块table中未能解析到中标候选人,从正文中解析
+		if job.Winnerorder == nil || len(job.Winnerorder) == 0 {
+			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
+		}
+
 		//调用kv解析
 		bl.ColonKV = GetKVAll(newCon, "", 1)
 		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "")

+ 97 - 17
src/jy/pretreated/analytable.go

@@ -88,7 +88,7 @@ var (
 
 	//Tg = map[string]interface{}{}
 	//一些表格没有表头,是空的,对值是排序的做处理对应 NullTxBid
-	NullTdReg      = regexp.MustCompile("(首选|第[一二三四五1-5])(中标|成交)?(名|(候选|排序)?(人|单位|供应商))")
+	NullTdReg      = regexp.MustCompile("(首选|第[一二三四五1-5])(中标|成交)?(名(称)?|(候选|排序)?(人|单位|供应商))")
 	NullTxtBid     = "成交供应商排名"
 	projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$")
 	MhSpilt        = regexp.MustCompile("[::]")
@@ -105,6 +105,8 @@ var (
 	FilterSerial                = regexp.MustCompile(".+[、..::,]")
 	filterTableWror             = regexp.MustCompile("班子成员")
 	underline                   = regexp.MustCompile("_+$")
+	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
+	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -201,6 +203,10 @@ func (table *Table) KVFilter() {
 	//4.对KV的处理
 	//判断表格是否有用,调用abandontable正则数组进行判断
 	//遍历每一行
+	winnertag := iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.Tag) //table标签
+	if !winnertag {
+		winnertag = iswinnertabletag.MatchString(table.TableResult.BlockTag) && !nswinnertabletag.MatchString(table.TableResult.BlockTag) // fallback: judge by the enclosing block's tag, not table.Tag again
+	}
 	for _, tr := range table.TRs {
 		for _, td := range tr.TDs {
 			//fmt.Println(td.BH, td.MustBH, td.Val, td.SortKV.Map)
@@ -260,11 +266,10 @@ func (table *Table) KVFilter() {
 			continue
 		}
 		v := table.SortKV.Map[k]
-		//u.Debug(k, v)
 		if _, ok := v.(string); ok {
 			k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "")
 			k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v)
-			//u.Debug(k, v, k1, w1, v1, tag, b)
+			//qutil.Debug(k, v, k1, w1, v1, tag, b)
 			if b {
 				//降低冒号值的权重
 				if MhSpilt.MatchString(v1) {
@@ -293,6 +298,7 @@ func (table *Table) KVFilter() {
 			as.AddKey(k, v)
 		}
 	}
+
 	//处理值是数组的kv放入标准化kv中
 	checkKey := map[int]bool{}
 	for kn, k := range as.Keys {
@@ -310,12 +316,17 @@ func (table *Table) KVFilter() {
 					for n1, _ := range vs1 {
 						smap[n1] = map[string]interface{}{}
 					}
+					//hadSort := false
+					tmpEntname := make([]string, len(vs1))
+					tmpPrice := make([]string, len(vs1))
 					for kn1, k := range as.Keys[kn:] {
 						v := as.Map[k]
 						if ContactType["采购单位"].MatchString(k) || ContactType["代理机构"].MatchString(k) {
 							continue
 						}
-						if vs, ok := v.([]string); ok && len(vs) == len(vs1) {
+						//目前对数组数据的key做判断,但是某些额可以是不满足情况的
+						//载明内容:[第一中标候选人 第二中标候选人] id:5d00587da5cb26b9b75e367b
+						if vs, ok := v.([]string); ok && len(vs) == len(vs1) { //数组值的个数相同
 							res, _, _, _, repl := CheckCommon(k, "bidorder")
 							kv := ""
 							if !res {
@@ -324,23 +335,72 @@ func (table *Table) KVFilter() {
 									kv = kt[0].Value
 								}
 							}
+							//qutil.Debug(k, res, repl, kv, "--", vs)
+							if !res && kv == "" { //key未验证出,验证数组的val值
+								checkKey[kn+kn1] = true
+								if winnertag { //如果是中标信息 在根据val数组信息解析候选人
+									for vsk, vsv := range vs {
+										if NullTdReg.MatchString(vsv) { //数据先验证val是否有排序
+											//hadSort = true
+											smap[vsk]["sortstr"] = vsv
+											smap[vsk]["sort"] = GetBidSort(vsv, vsk+1)
+										} else if findCandidate2.MatchString(vsv) && tmpEntname[vsk] == "" { //数据验证val是否是候选人
+											entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string)
+											if entname != "" {
+												tmpEntname[vsk] = entname
+											}
+										} else { //验证val时如果数组中的第一条数据既不满足sort或者entname 判定此数组数据错误
+											break
+										}
+									}
+								}
+							}
 							if res || kv != "" { //连续往下找几个key
 								checkKey[kn+kn1] = true
+							SORT:
 								if repl == "sort" {
+									//hadSort = true
 									for vsk, vsv := range vs {
 										smap[vsk]["sortstr"] = vsv
 										smap[vsk]["sort"] = GetBidSort(vsv, vsk+1)
 									}
 								} else if repl == "entname" || kv == "中标单位" {
 									for vsk, vsv := range vs {
-										smap[vsk]["entname"] = winnerOrderEntity.clear("中标单位", vsv)
+										if winnerReg6.MatchString(vsv) { //k:中标候选人 v:["第一名","第二名"]
+											repl = "sort"
+											goto SORT
+										}
+										//										if entname, _ := smap[vsk]["entname"].(string); entname != "" || len([]rune(vsv)) < 3 {
+										//											break
+										//										}
+										//										entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string)
+										//										if entname != "" {
+										//											smap[vsk]["entname"] = entname
+										//
+										if tmpEntname[vsk] != "" || len([]rune(vsv)) < 4 { //排除 单位:["台","个","套"]
+											break
+										}
+										entname, _ := winnerOrderEntity.clear("中标单位", vsv).(string)
+										if entname != "" {
+											tmpEntname[vsk] = entname
+										}
 									}
 								} else if kv == "中标金额" {
 									for vsk, vsv := range vs {
-										p1 := qutil.Float64All(smap[vsk]["price"])
-										p2 := qutil.Float64All(vsv)
+										//过滤price 2348273.432元(万元)-->2348273.432
+										//tmp1, _ := smap[vsk]["price"].(string)
+										tmp1 := tmpPrice[vsk]
+										p1num := numberReg2.FindString(tmp1)
+										p2num := numberReg2.FindString(vsv)
+										p1 := qutil.Float64All(p1num)
+										p2 := qutil.Float64All(p2num)
 										if p2 > p1 {
-											smap[vsk]["price"] = winnerOrderEntity.clear("中标金额", vsv+GetMoneyUnit(k, vsv))
+											//smap[vsk]["price"] = winnerOrderEntity.clear("中标金额", vsv+GetMoneyUnit(k, vsv))
+											price := winnerOrderEntity.clear("中标金额", vsv+GetMoneyUnit(k, vsv))
+											if pricestr, _ := price.(string); len(pricestr) < 30 && len(pricestr) > 0 {
+												tmpPrice[vsk] = pricestr
+											}
+
 										}
 									}
 								}
@@ -350,8 +410,22 @@ func (table *Table) KVFilter() {
 						}
 					}
 					newSmap := []map[string]interface{}{}
-					for _, smap_v := range smap {
-						if len(smap_v) > 0 {
+					//qutil.Debug("smap=======", smap)
+					//qutil.Debug("tmpEntname--", len(tmpEntname), tmpEntname)
+					//qutil.Debug("tmpPrice--", len(tmpPrice), tmpPrice)
+					for n, smap_v := range smap {
+						//if hadSort { //有排序,再添加entname和price
+						if len(tmpEntname) > 0 && n < len(tmpEntname) && tmpEntname[n] != "" {
+							smap_v["entname"] = tmpEntname[n]
+
+							if len(tmpPrice) > 0 && n < len(tmpPrice) && tmpPrice[n] != "" {
+								smap_v["price"] = tmpPrice[n]
+							}
+						}
+						//} else if len(tmpEntname) > 0 {
+						//fmt.Println("table winnerorder only has entname", tmpEntname)
+						//}
+						if len(smap_v) > 2 { //只有排序信息 sort和sortstr
 							newSmap = append(newSmap, smap_v)
 						}
 					}
@@ -430,7 +504,7 @@ func (table *Table) KVFilter() {
 					table.StandKVWeight["中标单位"] = -25
 				}
 			}
-		} else if !table.BPackage {
+		} else if !table.BPackage { //没有table.WinnerOrder也没有分包 将td中的WinnerOrder赋值给table.WinnerOrder
 			if len(winnerOrder) > 1 {
 				table.WinnerOrder = winnerOrder
 			}
@@ -507,6 +581,7 @@ func (table *Table) MergerToTableresult() {
 	if table.TableResult.BlockTag == "" && table.Tag != "" {
 		table.TableResult.BlockTag = table.Tag
 	}
+	//中标候选人(多个table,现在默认取第一个table的信息,考虑需不需要多个table分析合并数据?)
 	if table.TableResult.WinnerOrder == nil || len(table.TableResult.WinnerOrder) == 0 {
 		table.TableResult.WinnerOrder = table.WinnerOrder
 	}
@@ -634,7 +709,7 @@ func (ts *TableResult) Analy() {
 	for _, table := range tabs {
 		table.MergerToTableresult()
 		//		for k, v := range table.TableResult.SortKV.Map {
-		//			log.Println(k, v)
+		//			qutil.Debug(k, "=====", v)
 		//		}
 	}
 }
@@ -788,6 +863,7 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 			if u.IsBrandGoods {
 				table.analyBrand()
 			}
+
 			//判断是否是多包,并处理分包的
 			table.CheckMultiPackageByTable()
 			str := "\n"
@@ -1335,7 +1411,7 @@ func (table *Table) FindKV() {
 				}
 				**/
 				//				if td.Val == "电视" || td.Val == "电话机" || td.Val == "传真机" || td.Val == "音响" {
-				//	qutil.Debug("----", "td.BH:", td.BH, "KVDirect:", td.KVDirect, "Val:", td.Val, "direct:", direct, "vdirect:", vdirect)
+				//qutil.Debug("----td.Valtype", td.Valtype, "td.BH:", td.BH, "KVDirect:", td.KVDirect, "Val:", td.Val, "direct:", direct, "vdirect:", vdirect)
 				//				}
 				if !td.BH && td.KVDirect < 3 {
 					if !table.FindTdVal(td, direct, vdirect) {
@@ -1564,6 +1640,11 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 	//	if near != nil {
 	//		fmt.Println("near----", near.Val, td.Val)
 	//	}
+	//	qutil.Debug(near != nil)
+	//	qutil.Debug(near.BH)
+	//	qutil.Debug(near.KeyDirect == vdirect, near.KeyDirect == 0)
+	//	qutil.Debug(near.KVDirect == direct, near.KVDirect == 0)
+	//	qutil.Debug(near.KVDirect < 3)
 	if near != nil && near.BH && (near.KeyDirect == vdirect || near.KeyDirect == 0) && (near.KVDirect == direct || near.KVDirect == 0) && near.KVDirect < 3 {
 		near.KVDirect = direct
 		near.KeyDirect = vdirect
@@ -1691,7 +1772,6 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 		}
 		b = true
 	}
-	//qutil.Debug("map", b, table.SortKV.Map)
 	return
 }
 
@@ -2022,8 +2102,8 @@ func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
 					//if !(len(k1tags) > 0 && k1tags[0].Value == "采购单位") {
 					//	tn.SortKV.RemoveKey(k1)
 					//}
-					for _,vcgdw:=range k1tags{
-						if vcgdw.Value =="采购单位"{
+					for _, vcgdw := range k1tags {
+						if vcgdw.Value == "采购单位" {
 							tn.SortKV.RemoveKey(k1)
 						}
 					}
@@ -2751,7 +2831,7 @@ func (table *Table) analyBrand() {
 		} else {
 			// "_id" : ObjectId("5c2c3802a5cb26b9b78646c4")5c2b0551a5cb26b9b7cb05db否5c2a42e6a5cb26b9b763ba5a采购人:一、采购人5c2b06f5a5cb26b9b7cc4409
 			//成交供应商排名 [map[entname:昆明合优科技有限公司 sortstr:第一中标候选人 sort:1] map[sort:2 entname:昆明厚起科技有限公司 sortstr:第二中标候选人] map[entname:云南远安科技发展有限公司 sortstr:第三中标候选人 sort:3]]
-			//fmt.Println("err data:", key, val)
+			//qutil.Debug("err data:", key, val)
 		}
 	}
 	//处理数组数据后,匹配必须title和替换要保存的title

+ 1 - 1
src/jy/pretreated/colonkv.go

@@ -18,7 +18,7 @@ var (
 	regReplKV2    = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
 	regKV         = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
 	filterK       = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
-	filterValue   = regexp.MustCompile("^(无)$")
+	filterValue   = regexp.MustCompile("(^(无)$|.+%.*|[\r\n\\s\u3000\u2003\u00a0]+|^<.*>)")
 	regReplKey    = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
 	BlockTagMap   = map[string]bool{
 		"招标范围": true,

+ 1 - 1
src/jy/pretreated/tablev2.go

@@ -115,6 +115,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	ht := td.Goquery.ChildrenFiltered("table")
 	bsontable := false
 	txt := ""
+	//子table处理合并
 	if ht.Size() > 0 {
 		//qutil.Debug("有子表格")
 		txt = TextAfterRemoveTable(td.Html)
@@ -219,7 +220,6 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 			for _, bl := range ub {
 				//冒号kv
 				for bl_ck, bl_cv := range bl.ColonKV.Kv {
-					//u.Debug(bl_ck, bl_cv)
 					if td.SortKV.Map[bl_ck] == nil || bl.ColonKV.KvTag[bl_ck].Weight >= colonKvWeight[bl_ck] {
 						colonKvWeight[bl_ck] = bl.ColonKV.KvTag[bl_ck].Weight
 						td.SortKV.AddKey(bl_ck, bl_cv)

+ 36 - 11
src/jy/pretreated/winnerorder.go

@@ -24,19 +24,25 @@ type WinnerFlag struct {
 var (
 	winnerOrderEntity = &WinnerOrderEntity{}
 	numberReg         = regexp.MustCompile("[一二三四五六七八九十0-9]+")
+	numberReg2        = regexp.MustCompile("[\\d一二三四五六七八九十.,,]+")
 	thisNumberReg     = regexp.MustCompile("第" + numberReg.String())
 	winnerReg1        = regexp.MustCompile("(^|[^为])(【?(推荐)?第[一二三四五六七八九十1-9]+(合格|名|包|标段)?】?((候|侯)选)?(备选|成交|中(标|选))人?([((]成交[))])?((候|侯)选|排序)?(人(单位)?|供应商|单位|机构)(名称)?为?)($|[^,;;。,])")
 	winnerReg2        = regexp.MustCompile("(排名第[一二三四五六七八九十1-9]+|第[一二三四五六七八九十1-9]+(候|侯)选人)")
-	winnerReg3        = regexp.MustCompile("(第[一二三四五六七八九十1-9]+名)")
-	winnerReg4        = regexp.MustCompile("((确认|推荐|评审|排(名|序))[为::]+|(由高到低排序前.名|公示下列内容|(确定|推荐)的?中(标|选)候选人|\n中(标|选)候选.{1,3}\\s*\n|\n(中(标|选)候选.{1,3}[::\u3000\u2003\u00a0\\s]|成交候选供应商)|(排(名|序)|公(示|告)|具体|推荐|结果(公示)?|中(标|选)候选人.{0,2})如下|[一二三四五六七八九十\\d]+、(中(标|选)候选[^\n::]{1,8}|.{0,8}(成交|结果)信息|成交[^\n::]{2,8}))[为::]?)")
-	winnerReg5        = regexp.MustCompile("([^,;;。,、\n]+)(为?第[一二三四五六七八九十1-9]+(成交|中标)?((候|侯)选(人|供应商|单位|机构)|名)|排名第[一二三四五六七八九十1-9]+)([,;;。,、]|\\s+\n)")
-	colonEndReg       = regexp.MustCompile("[::]$")
-	toWarpReg         = regexp.MustCompile("[,。,;;]+")
-	findamountReg     = regexp.MustCompile("[,。,;;\u3000\u2003\u00a0\\s]+")
-	amountReg         = regexp.MustCompile("^\\d+(\\.\\d+)?((百|千)?元|(百|千)?(万|亿)元?)$")
-	companyWarpReg    = regexp.MustCompile("(公司)(.+?[::])")
-	findCompanyReg    = regexp.MustCompile("[^::]+公司")
-	colonSpaceReg     = regexp.MustCompile("[::]\\s+")
+	//winnerReg2     = regexp.MustCompile("(第[一二三四五六七八九十1-9]+(候|侯)选人)")
+	winnerReg3     = regexp.MustCompile("(第[一二三四五六七八九十1-9]+名)")
+	winnerReg4     = regexp.MustCompile("((确认|推荐|评审|排(名|序))[为::]+|(由高到低排序前.名|公示下列内容|(确定|推荐)的?中(标|选)候选人|\n中(标|选)候选.{1,3}\\s*\n|\n(中(标|选)候选.{1,3}[::\u3000\u2003\u00a0\\s]|成交候选供应商)|(排(名|序)|公(示|告)|具体|推荐|结果(公示)?|中(标|选)候选人.{0,2})如下|[一二三四五六七八九十\\d]+、(中(标|选)候选[^\n::]{1,8}|.{0,8}(成交|结果)信息|成交[^\n::]{2,8}))[为::]?)")
+	winnerReg5     = regexp.MustCompile("([^,;;。,、\n]+?)(为?)(第[一二三四五六七八九十1-9]+(成交|中标)?((候|侯)选(人|供应商|单位|机构)|名)|排名第[一二三四五六七八九十1-9]+)([,;;。,、]|\\s+\n)")
+	winnerReg6     = regexp.MustCompile("(^(排名)?第[一二三四五六七八九十1-9]+[名中标成交备选候选人单位供应商]*)")
+	colonEndReg    = regexp.MustCompile("[::]$")
+	toWarpReg      = regexp.MustCompile("[,。,;;]+")
+	findamountReg  = regexp.MustCompile("[,。,;;\u3000\u2003\u00a0\\s]+")
+	amountReg      = regexp.MustCompile("^\\d+(\\.\\d+)?((百|千)?元|(百|千)?(万|亿)元?)$")
+	companyWarpReg = regexp.MustCompile("(公司)(.+?[::])")
+	findCompanyReg = regexp.MustCompile("[^::]+公司")
+	colonSpaceReg  = regexp.MustCompile("[::]\\s+")
+	findCandidate  = regexp.MustCompile("(^.{5,}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室))") // 工作室 must sit inside the suffix group (as in findCandidate2); outside it, a bare "工作室" matched anywhere
+	findCandidate2 = regexp.MustCompile("(^.{5,}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$)")
+	clearSpace1    = regexp.MustCompile("([((][\\d一二三四五六七八九十][))][\\s\u3000\u2003\u00a0\\t]*|<[^>].+?>)")
 )
 
 /*
@@ -45,7 +51,17 @@ var (
  *from 来源
  */
 func (wo *WinnerOrderEntity) Find(text string, flag bool, from int) []map[string]interface{} {
-	text = winnerReg5.ReplaceAllString(text, "\n$2:$1\n")
+	text = winnerReg5.ReplaceAllString(text, "\n$3:$1\n")
+	/*
+		"_id" : ObjectId("5c2c6f60a5cb26b9b7b62cd8")
+
+		1 .第一中选候选人:
+		(1)	单位名称:成都维诺信科技有限公司
+		(2)	参选报价:522,00.00元
+		(3)	质量:符合比选文件规定的质量标准
+
+	*/
+	text = clearSpace1.ReplaceAllString(text, "") //清理(1)	单位名称:成都维诺信科技有限公司-->单位名称:成都维诺信科技有限公司
 	if strings.TrimSpace(text) == "" {
 		return []map[string]interface{}{}
 	}
@@ -192,6 +208,13 @@ func (wo *WinnerOrderEntity) findByReg(content string, blocks []string, reg_2 *r
 				winners = append(winners, object)
 				object = map[string]interface{}{}
 			}
+			//新加 从正文抽取时对v校验
+			if from == 1 || from == 3 {
+				v = findCandidate.FindString(v)
+				if v == "" {
+					continue
+				}
+			}
 			val := wo.clear("中标单位", v)
 			if val != nil {
 				count++
@@ -236,6 +259,8 @@ func (wo *WinnerOrderEntity) clear(typ, v string) interface{} {
 	if typ == "中标单位" && regDivision.MatchString(v) {
 		v = findCompanyReg.FindString(v)
 	}
+	v = filterValue.ReplaceAllString(v, "")
+	//过滤
 	return v //clear.ClearResult(typ, v)
 }
 

+ 30 - 29
src/jy/util/article.go

@@ -2,23 +2,23 @@ package util
 
 //
 type Job struct {
-	SourceMid    string                            //数据源的MongoId
-	Category     string                            //类别
-	CategorySecond string							//二级分类
-	Content      string                            //正文
-	Title        string                            //标题
-	SpiderCode   string                            //爬虫代码
-	Domain       string                            //网站域名
-	Href         string                            //原文链接
-	City         string                            //城市
-	Province     string                            //省份
-	Data         *map[string]interface{}           //数据库源数据
-	Block        []*Block                          //分块
-	Result       map[string][]*ExtField            //结果
-	BuyerAddr    string                            //采购单位地址
-	BlockPackage map[string]*BlockPackage          //块中的分包
-	Winnerorder  []map[string]interface{}          //中标候选人排序
-	PackageInfo  map[string]map[string]interface{} //分包信息
+	SourceMid      string                            //数据源的MongoId
+	Category       string                            //类别
+	CategorySecond string                            //二级分类
+	Content        string                            //正文
+	Title          string                            //标题
+	SpiderCode     string                            //爬虫代码
+	Domain         string                            //网站域名
+	Href           string                            //原文链接
+	City           string                            //城市
+	Province       string                            //省份
+	Data           *map[string]interface{}           //数据库源数据
+	Block          []*Block                          //分块
+	Result         map[string][]*ExtField            //结果
+	BuyerAddr      string                            //采购单位地址
+	BlockPackage   map[string]*BlockPackage          //块中的分包
+	Winnerorder    []map[string]interface{}          //中标候选人排序
+	PackageInfo    map[string]map[string]interface{} //分包信息
 
 	BrandData [][]map[string]string //
 	HasTable  int                   //有table
@@ -41,18 +41,19 @@ type ExtField struct {
 
 //块
 type Block struct {
-	Tags     []Tags          //对块做的标签,可以作为数据抽取的依据
-	Title    string          //块标题
-	Index    int             //块索引
-	Text     string          //块内容
-	Start    int             //开始索引
-	End      int             //结束索引
-	ColonKV  *JobKv          //冒号kv (分出的对应的KV值)
-	TableKV  *JobKv          //table kv (分出的对应的KV值)
-	SpaceKV  *JobKv          //空格 kv (分出的对应的KV值)
-	BPackage *BlockPackage   //分包信息
-	Tag      map[string]bool //块标签
-	Block    []*Block        //子块
+	Tags        []Tags                   //对块做的标签,可以作为数据抽取的依据
+	Title       string                   //块标题
+	Index       int                      //块索引
+	Text        string                   //块内容
+	Start       int                      //开始索引
+	End         int                      //结束索引
+	ColonKV     *JobKv                   //冒号kv (分出的对应的KV值)
+	TableKV     *JobKv                   //table kv (分出的对应的KV值)
+	SpaceKV     *JobKv                   //空格 kv (分出的对应的KV值)
+	BPackage    *BlockPackage            //分包信息
+	Tag         map[string]bool          //块标签
+	Block       []*Block                 //子块
+	Winnerorder []map[string]interface{} //块中,中标候选人排序
 }
 
 //段落

+ 5 - 5
src/main_test.go

@@ -29,7 +29,7 @@ func Test_han(t *testing.T) {
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	extract.StartExtractTestTask("5b8f804025e29a290415aee1", "5a53966e40d2d9bbe8f7d30a", "1", "mxs_v2", "mxs_v2")
+	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5c2b55f1a5cb26b9b7fac3c3", "1", "mxs_v2", "mxs_v2")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }
@@ -40,11 +40,11 @@ func Test_extractcity(t *testing.T) {
 	extract.FindBuyer()
 }
 func Test_reg(t *testing.T) {
-	context := `sfsa.`
-	reg := regexp.MustCompile(`(勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,\.。、::“”‘’"])`)
-	//reg := regexp.MustCompile(`[\\p{Han}]`)
+	context := `sfsa发斯蒂芬.`
+	//reg := regexp.MustCompile(`(勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,\.。、::“”‘’"])`)
+	reg := regexp.MustCompile(`[\p{Han}]`) // raw string needs a single backslash; `\\p{Han}` in a class matches the literal characters \ p { H a n }
 	tmp := reg.MatchString(context)
-	log.Println(tmp)
+	log.Println("--", tmp)
 }
 
 func Test_reg1(t *testing.T) {

+ 2 - 2
src/res/tablev1.json

@@ -62,7 +62,7 @@
 		"其他投标人"
 	],
 	"bidorder":[
-		".{0,8}排[序名]$__sort",
-		"(人|供应商|单位)(名称)?$__entname"
+		"(.{0,8}排[序名]$|名次|^序号$)__sort",
+		"([^负责联系需求]+人|供应商|^单位)(名称)?$__entname"
 	]
 }