Bläddra i källkod

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

zhangjinkun 6 år sedan
förälder
incheckning
34f6ec17d5

+ 38 - 39
src/jy/extract/extract.go

@@ -722,53 +722,52 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 // Lua script rule: extract KV values according to the configured field attributes.
 func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) map[string][]map[string]interface{} {
 	kvmap := map[string][]map[string]interface{}{}
-	blocks := []*ju.Block{}
-	for _, bl := range j.Block {
-		if len(bl.Block) > 0 {
-			blocks = append(blocks, bl.Block...)
-		} else {
-			blocks = append(blocks, bl)
-		}
-	}
 	for fieldname, field := range vc.LFields {
 		if field != vc.Field {
 			continue
 		}
-		for _, bl := range blocks {
-			tp := ""
-			for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
-				if k == 0 {
-					tp = "colon"
-				} else if k == 1 {
-					tp = "space"
-				} else if k == 2 {
-					tp = "table"
-				}
-				if v == nil || v.KvTags == nil {
-					continue
-				}
-				for _, vv := range v.KvTags[fieldname] {
-					text := ju.TrimLRSpace(vv.Value, "")
-					if text != "" {
-						kvmap[field] = append(kvmap[field], map[string]interface{}{
-							"code":        "CL_" + vv.Key,
-							"field":       field,
-							"ruletext":    vv.Key,
-							"extfrom":     vc.ExtFrom,
-							"sourcevalue": text,
-							"value":       text,
-							"type":        tp,
-							"matchtype":   "tag_string",
-							"blocktag":    bl.Classify,
-							"weight":      vv.Weight,
-						})
-					}
+		extractFromKv(field, fieldname, j.Block, vc, kvmap)
+	}
+	AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) // extraction log
+	return kvmap
+}
+
+func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) { // appends one result map to kvmap[field] per non-empty tagged value found under fieldname in each block's KV sets
+	for _, bl := range blocks {
+		tp := ""
+		for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} { // slice position selects the "type" label: 0 colon, 1 space, 2 table
+			if k == 0 {
+				tp = "colon"
+			} else if k == 1 {
+				tp = "space"
+			} else if k == 2 {
+				tp = "table"
+			}
+			if v == nil || v.KvTags == nil {
+				continue
+			}
+			for _, vv := range v.KvTags[fieldname] {
+				text := ju.TrimLRSpace(vv.Value, "")
+				if text != "" {
+					kvmap[field] = append(kvmap[field], map[string]interface{}{
+						"code":        "CL_" + vv.Key,
+						"field":       field,
+						"ruletext":    vv.Key,
+						"extfrom":     vc.ExtFrom,
+						"sourcevalue": text,
+						"value":       text,
+						"type":        tp,
+						"matchtype":   "tag_string",
+						"blocktag":    bl.Classify,
+						"weight":      vv.Weight,
+					})
 				}
 			}
 		}
+		if len(kvmap[field]) == 0 { // recurse into this block's child blocks only while nothing has been collected for field yet
+			extractFromKv(field, fieldname, bl.Block, vc, kvmap)
+		}
 	}
-	AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) // extraction log
-	return kvmap
 }
 
 //正则提取结果

+ 0 - 2
src/jy/extract/score.go

@@ -41,7 +41,6 @@ func init() {
 			CommonScore[k] = qu.Float64All(v)
 		}
 	}
-	log.Println(CommonScore)
 	//指定抽取属性打分配置
 	if tmp, ok := SoreConfig["extractype"]["fields"].(map[string]interface{}); ok {
 		FieldsScore = map[string]map[string]float64{}
@@ -55,7 +54,6 @@ func init() {
 			FieldsScore[key] = fieldscore
 		}
 	}
-	log.Println(FieldsScore)
 	//实例化正则
 	for _, tmp := range SoreConfig {
 		//log.Println(tmp)

+ 12 - 6
src/jy/pretreated/analystep.go

@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"jy/util"
 	//"log"
+	"unicode/utf8"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -30,7 +31,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 {                                                //有分块
+	if len(blockArrays) > 0 { //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -107,18 +108,23 @@ func FindProjectCode(newCon string, job *util.Job) {
 	}
 	var proCode string
 	blCode := &util.Block{}
-	if newCon = projectcodeRegAll.FindString(newCon); newCon != "" { //项目名称项目编号一起的
+	if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
 		//5d424bdfa5cb26b9b7ac7a85
 		//5d425a48a5cb26b9b7df5fec
 		//5d425506a5cb26b9b7cd2c3c
-		splitStr := strings.Split(newCon, " ")
+		splitStr := strings.Split(newConTMP, " ")
 		if len(splitStr) >= 2 {
-			newCon = "项目编号:" + splitStr[len(splitStr)-1]
+			if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
+				newCon = "项目编号:" + splitStr[len(splitStr)-1]
+			} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+				//5d4253f3a5cb26b9b7ca2662
+				newCon = "项目编号:" + tmpstr
+			}
 		} else if len(splitStr) == 1 {
 			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
 				newCon = "项目编号:" + tmpstr
-			}else if strings.Contains(newCon,"、"){
-				tmpstrs :=strings.Split(newCon,"、")
+			} else if strings.Contains(newConTMP, "、") {
+				tmpstrs := strings.Split(newCon, "、")
 				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
 			}
 		}

+ 3 - 3
src/jy/pretreated/analytable.go

@@ -107,8 +107,8 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeRegAll           = regexp.MustCompile(`(采购项目|项目)名称及[项目]?编号[:|:]?.*[\n]?`)
-	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]")
+	projectcodeRegAll           = regexp.MustCompile(`(采购)?项目名称及(项目)?编号[:|:]?.*[\n]?`)
+	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]?")
 	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
 	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
@@ -3151,7 +3151,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 20 - 3
src/jy/pretreated/colonkv.go

@@ -202,17 +202,34 @@ func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
 
 //根据配置文件中的规则,格式化正文
 func formatText(content, key string) string {
-	segment := DivideSegment(content)
+	segments := make([]*Segment, 0)
+	if key == "all" {
+		segments = DivideSegmentHtml(content)
+	} else if key == "kv" {
+		segments = DivideSegment(content)
+		//log.Println("清理前:\n",content)
+	}
 	newCon := ""
-	for _, v := range segment {
-		if v.Index > len(segment)-3 {
+	for _, v := range segments {
+		if v.Index > len(segments)-3 {
 			if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
 				break
 			}
 		}
+		if key == "kv" && utf8.RuneCountInString(v.Text)>=1{
+			//log.Println("清理前:",v.Text, []rune(v.Text)[len([]rune(v.Text))-1])
+			v.Text = strings.TrimRightFunc(v.Text, func(r rune) bool {
+				return r == 19968 || r == 20108 || r == 19977 ||
+					r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061
+			})
+			//log.Println("清理前后",v.Text)
+		}
 		newCon += v.Text + "\n"
 	}
 	content = regEndWrap.ReplaceAllString(newCon, "")
+	//if key == "kv"{
+	//	log.Println("清理前后\n",content)
+	//}
 	for _, v := range FormatTextMap[key] {
 		reg, _ := v["reg"].(*regexp.Regexp)
 		separator, isString := v["separator"].(string)

+ 66 - 6
src/jy/pretreated/division.go

@@ -1,6 +1,7 @@
 package pretreated
 
 import (
+	"fmt"
 	"jy/util"
 	qutil "qfw/util"
 	"regexp"
@@ -65,6 +66,24 @@ var (
 	*/
 	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
 	filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
+	xuhao             = map[string]bool{
+		"19968_12289": true,
+		"19968_46":    true,
+		"20108_12289": true,
+		"20108_46":    true,
+		"19977_12289": true,
+		"19977_46":    true,
+		"22235_12289": true,
+		"22235_46":    true,
+		"20116_12289": true,
+		"20116_46":    true,
+		"20845_12289": true,
+		"20845_46":    true,
+		"19971_12289": true,
+		"19971_46":    true,
+		"20843_12289": true,
+		"20061_46":    true,
+	}
 )
 
 //分块
@@ -409,11 +428,52 @@ func appendWarpStop(text string) string {
 	}
 	return text
 }
-
+// DivideSegmentHtml splits txt into segments at LF/CR runes and returns the segments longer than one byte, indexed from 1.
+func DivideSegmentHtml(txt string) []*util.Segment {
+	// First split into segments at line-feed (10) and carriage-return (13) runes.
+	_segs := strings.FieldsFunc(txt, func(r rune) bool {
+		return r == 10 || r == 13
+	})
+	// Then drop blank lines: a segment is kept only when it is not a single space and is longer than one byte.
+	segs := make([]*util.Segment, 0)
+	_index := 0
+	for _, seg := range _segs {
+		if seg != " " && len(seg) > 1 {
+			_seg := util.Segment{}
+			_index = _index + 1
+			_seg.Index = _index
+			_seg.Text = seg
+			segs = append(segs, &_seg)
+		}
+	}
+	return segs
+}
 //分段
 func DivideSegment(txt string) []*util.Segment {
 	//先分段
+	tmpstr := ""
 	_segs := strings.FieldsFunc(txt, func(r rune) bool {
+		if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
+			r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
+			if tmpstr == "" {
+				tmpstr += fmt.Sprint(r)
+				return false
+			} else if strings.Contains(tmpstr, "_") {
+				tmpstr = ""
+				tmpstr += fmt.Sprint(r)
+				return false
+			} else if tmpstr == fmt.Sprint(r) {
+				if r == 46 || r == 12289{
+					tmpstr = ""
+				}
+				return false
+			}
+			tmpstr += "_" + fmt.Sprint(r)
+			if xuhao[tmpstr] {
+				return true
+			}
+		}
+		tmpstr= ""
 		return r == 10 || r == 13
 	})
 	//再去除空行
@@ -528,7 +588,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			return false, ""
 		}
 		//
-		is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
+		is := regexp.MustCompile(v[0] + "[::]*").FindAllStringIndex(con, -1)
 		for _, sv := range is {
 			appendWarpIndex = append(appendWarpIndex, sv[0])
 		}
@@ -568,13 +628,13 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			indexPkgMap[sv[0]] = v[0]
 		}
 		//key在包前面,并且在一行的开头
-		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
+		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		if len(keys) == 0 {
 			//key在包前面,并且key以冒号结尾
-			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		if len(keys) == 0 {
-			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		for _, key := range keys {
 			startEndMap[key[5]] = key[4]
@@ -628,7 +688,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			}
 			index := util.PackageNumberConvert(bk)
 			//去掉前缀,空格必须要加,分kv的时候要用
-			text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
+			text = regexp.MustCompile(bv[0] + "[::]*").ReplaceAllString(text, "")
 			headKey := ""
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {

+ 1 - 1
src/jy/pretreated/tablev2.go

@@ -396,7 +396,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
 		if len(td.SortKV.Keys) > 0 {
 			//td.KVDirect = 3 //不当头也不当值,忽略
 			if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
-				td.Val = td.SortKV.Keys[0]
+				td.Val, _ = td.SortKV.Map[td.SortKV.Keys[0]].(string)
 				td.BH = true
 			}
 		} else if !bsontable {

+ 13 - 6
src/jy/pretreated/winnerorder.go

@@ -1,7 +1,6 @@
 package pretreated
 
 import (
-	"log"
 	//"jy/clear"
 	"jy/util"
 	qutil "qfw/util"
@@ -27,7 +26,7 @@ var (
 	numberReg         = regexp.MustCompile("[一二三四五六七八九十0-9]+")
 	numberReg2        = regexp.MustCompile("[\\d一二三四五六七八九十.,,]+")
 	thisNumberReg     = regexp.MustCompile("第" + numberReg.String())
-	winnerReg1        = regexp.MustCompile("(^|[^为])(【?(推荐)?第[一二三四五六七八九十1-9]+(合格|名|包|标段)?】?((候|侯)选)?(备选|成交|中(标|选))人?([((]成交[))])?((候|侯)选|排序)?(人(单位)?|供应商|单位|机构)(名称)?为?)($|[^,;;。,])")
+	winnerReg1        = regexp.MustCompile("(^|[^为])(【?(推荐)?第[一二三四五六七八九十1-9]+(合格|名|包|标段)?】?((候|侯)选)?(入围|备选|成交|中(标|选))人?([((]成交[))])?((候|侯)选|排序)?(人(单位)?|供[货]商|单位|机构)(名称)?为?)($|[^,;;。,])")
 	winnerReg2        = regexp.MustCompile("(排名第[一二三四五六七八九十1-9]+|第[一二三四五六七八九十1-9]+(候|侯)选人)")
 	//winnerReg2     = regexp.MustCompile("(第[一二三四五六七八九十1-9]+(候|侯)选人)")
 	winnerReg3     = regexp.MustCompile("(第[一二三四五六七八九十1-9]+名)")
@@ -44,6 +43,7 @@ var (
 	findCandidate  = regexp.MustCompile("(^.{5,}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体))|工作室)")
 	findCandidate2 = regexp.MustCompile("(^.{5,}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$)")
 	clearSpace1    = regexp.MustCompile("([((][\\d一二三四五六七八九十][))][\\s\u3000\u2003\u00a0\\t]*|<[^>].+?>)")
+	offerReg       = regexp.MustCompile("(中标|磋商|投标|报|单|成交)总?(价|金额)")
 )
 
 /*
@@ -102,7 +102,6 @@ func (wo *WinnerOrderEntity) Find(text string, flag bool, from int) []map[string
 			invalidCount++
 		}
 	}
-	log.Println(invalidCount)
 	if invalidCount > len(winners)/2 {
 		return []map[string]interface{}{}
 	}
@@ -229,10 +228,18 @@ func (wo *WinnerOrderEntity) findByReg(content string, blocks []string, reg_2 *r
 				object["type"] = 1
 			}
 		} else { //中标金额
-			kvTags := GetKvTags([]*util.Kv{&util.Kv{Key: k, Value: v}}, "", []string{"中标金额"})
+			findOfferFlag := false
+			if offerReg.MatchString(k) {
+				findOfferFlag = true
+			} else {
+				kvTags := GetKvTags([]*util.Kv{&util.Kv{Key: k, Value: v}}, "", []string{"中标金额"})
+				if len(kvTags["中标金额"]) > 0 {
+					findOfferFlag = true
+				}
+			}
 			//找到了中标金额
-			if len(kvTags["中标金额"]) > 0 && object["entname"] != nil {
-				val := wo.clear("中标金额", kvTags["中标金额"][0].Value)
+			if findOfferFlag && object["entname"] != nil {
+				val := wo.clear("中标金额", v)
 				if val != nil {
 					object["price"] = val
 				}

+ 15 - 3
src/main_blocktest.go

@@ -17,6 +17,8 @@ var f *os.File
 var m = map[string]bool{}
 
 func main12() {
+	//winnerorder()
+	//return
 	//log.Println(pretreated.ProcTitle("以上公告内容如有变动将在相关网络媒体上另行通知凡购买本招标文件的单位必须就此采购项目的相关事宜详细咨询否则参与投标即被视为已经充分了解了招标方的需求中标后承担该文件范围内的所有要求投标前如对招标文件存有疑问请在投标截止日期前三个工作日以实名制书面文件向我公司询问否则视为接受已报名购买招标文件的投标商未递交投标文件或虽递交投标文件但未参加开标大会的投标商不得再参加该项目的采购活动"))
 	//return
 	//f, _ = os.OpenFile("./title.txt", os.O_RDWR|os.O_CREATE, 777)
@@ -49,7 +51,7 @@ func all() {
 }
 func one() {
 	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
-	d, _ := m.FindById("bidding", "5d4260f7a5cb26b9b7ea8c63", extract.Fields)
+	d, _ := m.FindById("bidding", "5d423af9a5cb26b9b766ec44", extract.Fields)
 	com(*d)
 }
 func com(doc map[string]interface{}) {
@@ -152,8 +154,7 @@ func com(doc map[string]interface{}) {
 	}
 	log.Println("=============抽取结果 result================")
 	for k, v := range set["result"].(map[string][]*ju.ExtField) {
-		break
-		if k != "budget" {
+		if k != "winner" {
 			continue
 		}
 		for _, vv := range v {
@@ -241,3 +242,14 @@ func GetDetail(doc map[string]interface{}) (detail string) {
 	}
 	return detail
 }
+func winnerorder() { // manual check helper: runs WinnerOrderEntity.Find on a hard-coded sample announcement and logs the result
+	text := `评审专家名单:
+吴殿波、韩屹、孙胜进、郑丹、李海波
+ 
+中标标的名称、规格型号、数量、单价、服务要求:
+2019年沈阳惠涌供热有限责任公司、沈阳圣达热力供暖有限责任公司、沈阳惠盛供热有限责任公司PE管保温
+第一入围供货商:沈阳曲暖鼎盛保温安装有限公司 、总单价:11.833300
+第二入围供货商:沈阳国盛防腐保温有限公司、总单价:11.102100
+第三入围供货商:沈阳泰豪管材有限公司、总单价:13.258100`
+	log.Println((&pretreated.WinnerOrderEntity{}).Find(text, true, 1))
+}

+ 6 - 1
src/res/fieldscore.json

@@ -122,7 +122,7 @@
         "positivewords": [
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|会|场)$",
+                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|场)$",
                 "score": 3
             }
         ],
@@ -136,6 +136,11 @@
                 "describe": "包含负分不再展示",
                 "regstr": "(详见|提出|面向|施工)",
                 "score": -50
+            },
+            {
+                "describe": "一个字或者两个字不再显示",
+                "regstr": "^[\\s]*[\\u4e00-\\u9fa5]{1,2}[\\s]*$",
+                "score": -50
             }
         ],
         "length": [