Browse Source

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

maxiaoshan 6 years ago
parent
commit
d3d00520e9

+ 4 - 4
src/config.json

@@ -8,11 +8,11 @@
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": true,
+    "saveresult": false,
     "fieldscore": true,
-    "qualityaudit": false,
-    "iscltlog": true,
-    "brandgoods": false,
+    "qualityaudit": true,
+    "iscltlog": false,
+    "brandgoods": true,
     "udptaskid": "5be107e600746bf92debf080",
     "udpip": "127.0.0.1",
     "udpport": "1484",

+ 9 - 2
src/jy/admin/rulecheck.go

@@ -198,6 +198,8 @@ func checkPreReg(content, ruleText string) string {
 		tmp := strings.Split(ruleText, "__")
 		var pattern string
 		if strings.Contains(tmp[0], "\\u") {
+			tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+			tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 			pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 		} else {
 			pattern = tmp[0]
@@ -222,6 +224,8 @@ func checkBackReg(content, ruleText string) string {
 		tmp := strings.Split(ruleText, "__")
 		var pattern string
 		if strings.Contains(tmp[0], "\\u") {
+			tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+			tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 			pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 		} else {
 			pattern = tmp[0]
@@ -258,10 +262,13 @@ func checkCoreReg(field, content, ruleText string) map[string]string {
 			}
 			var pattern string
 			if strings.Contains(tmp[0], "\\u") {
+				tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+				tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 				pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 			} else {
 				pattern = tmp[0]
 			}
+			log.Println("pattern", pattern)
 			reg := regexp.MustCompile(pattern)
 			apos := reg.FindAllStringSubmatchIndex(content, -1)
 			if len(apos) > 0 {
@@ -313,7 +320,7 @@ func checkBackScript(table, code, name, version, infoid, script string, alone bo
 		doc = extract.ExtRegPre(doc, j, v, e.TaskInfo)
 	}
 	//抽取规则
-	if j.CategorySecond==""{
+	if j.CategorySecond == "" {
 		for _, vc1 := range e.RuleCores[j.Category] {
 			for _, vc := range vc1 {
 				tmp := ju.DeepCopy(doc).(map[string]interface{})
@@ -331,7 +338,7 @@ func checkBackScript(table, code, name, version, infoid, script string, alone bo
 				}
 			}
 		}
-	}else{
+	} else {
 		for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
 			for _, vc := range vc1 {
 				tmp := ju.DeepCopy(doc).(map[string]interface{})

+ 27 - 3
src/jy/clear/totimestamp.go

@@ -4,11 +4,12 @@ package clear
 import (
 	"fmt"
 	"regexp"
+	"strconv"
 	"strings"
 	"time"
 )
 
-var reg, regA, regB, regC, regAfter *regexp.Regexp
+var reg, regA, regB, regC, regD, regAfter *regexp.Regexp
 
 const (
 	T = 365 * 86400
@@ -22,10 +23,12 @@ var item = map[string]string{
 }
 
 func init() { // Compiles the package-level date/time patterns; the error returned by each regexp.Compile call is discarded.
+	// Sample input targeted by regD (a date written entirely in Chinese numerals): 二〇一五年十一月四日十五时
 	reg, _ = regexp.Compile(`\d+`) // a run of ASCII digits
 	regA, _ = regexp.Compile(`[一|二|三|四|五|六|七|八|九|十|零|〇|1|2|3|4|5|6|7|8|9|0]`) // one Chinese numeral or ASCII digit; inside a character class '|' is literal, so it matches '|' too
 	regB, _ = regexp.Compile(`\d+年\d+月\d+日((上|下)午)?\s*\d+[::时]\d+分?[-—]\d+[::时]\d+时?分?`) // a full date followed by a start-end time range
 	regC, _ = regexp.Compile(`\s*\d+[::时]\d+分?[-—]`) // the "start time + dash" prefix of such a range
+	regD, _ = regexp.Compile(`([一|二|三|四|五|六|七|八|九|十|零|〇]{4})年([一|二|三|四|五|六|七|八|九|十]{1,2})月([一|二|三|四|五|六|七|八|九|十]{1,3})日([一|二|三|四|五|六|七|八|九|十]{1,3})时`) // Chinese-numeral date; groups 1-4 capture year, month, day and hour
 	regAfter, _ = regexp.Compile(`(下午D?\d{1,2}[时|:|:|h|H])`) // "下午" + optional literal 'D' + 1-2 digits + an hour marker; NOTE(review): `D?` may have been meant as `\D?` — confirm
 }
 
@@ -41,6 +44,27 @@ func init() {
 */
 func ObjToTimestamp(data []interface{}) []interface{} {
 	tmp := fmt.Sprint(data[0])
+	//处理类似:二〇一五年十一月四日十五时
+	cht := regD.FindStringSubmatch(tmp)
+	if len(cht) == 5 {
+		y := chineseToNumber(cht[1])
+		m := 0
+		for _, v := range []rune(cht[2]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			m += it
+		}
+		d := 0
+		for _, v := range []rune(cht[3]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			d += it
+		}
+		M := 0
+		for _, v := range []rune(cht[4]) {
+			it, _ := strconv.Atoi(item[string(v)])
+			M += it
+		}
+		tmp = fmt.Sprintf("%s年%d月%d日%d时", y, m, d, M)
+	}
 	//2016年12月7日上午9:00-11:30时 时间范围处理 取后面的时间
 	if regB.MatchString(tmp) {
 		tmp = regC.ReplaceAllString(tmp, "")
@@ -97,8 +121,8 @@ func ObjToTimestamp(data []interface{}) []interface{} {
 		t, _ := time.ParseInLocation("2006-01-02 15:04", timestr, time.Local)
 		timestamp = t.Unix()
 	}
-	if timestamp < 0 || timestamp > (time.Now().Unix()+T) {
-		data[0] = 0
+	if timestamp <= 0 || timestamp > (time.Now().Unix()+T) {
+		data[0] = ""
 	} else {
 		if addreptime > 0 {
 			timestamp += addreptime

+ 1 - 0
src/jy/extract/exportask.go

@@ -57,6 +57,7 @@ func extractAndExport(v string, t map[string]interface{}) {
 	e.InitTag()
 	e.InitClearFn()
 	e.InfoTypeList()
+	e.InitBlockRule()
 	//品牌抽取是否开启
 	ju.IsBrandGoods = ju.Config["brandgoods"].(bool)
 

+ 2 - 0
src/jy/extract/extract.go

@@ -46,6 +46,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ext.InitRuleCore()
 	ext.InitPkgCore()
 	ext.InitBlockRule()
+	ext.InfoTypeList()
 	ext.InitTag()
 	ext.InitClearFn()
 	if ext.IsExtractCity { //版本上控制是否开始城市抽取
@@ -118,6 +119,7 @@ func StartExtractTaskId(taskId string) bool {
 	ext.InitRuleCore()
 	ext.InitPkgCore()
 	ext.InitBlockRule()
+	ext.InfoTypeList()
 	ext.InitTag()
 	ext.InitClearFn()
 	if ext.IsExtractCity { //版本上控制是否开始城市抽取

+ 12 - 0
src/jy/extract/extractInit.go

@@ -214,6 +214,8 @@ func (e *ExtractTask) InitRulePres() {
 				tmp := strings.Split(rinfo.RuleText, "__")
 				var pattern string
 				if strings.Contains(tmp[0], "\\u") {
+					tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+					tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 					pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 				} else {
 					pattern = tmp[0]
@@ -251,6 +253,8 @@ func (e *ExtractTask) InitRuleBacks() {
 				tmp := strings.Split(rinfo.RuleText, "__")
 				var pattern string
 				if strings.Contains(tmp[0], "\\u") {
+					tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+					tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 					pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 				} else {
 					pattern = tmp[0]
@@ -335,6 +339,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -371,6 +377,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -414,6 +422,8 @@ func (e *ExtractTask) InfoRole(vinfo map[string]interface{}) []*RuleCore {
 					tmp := strings.Split(rinfo.RuleText, "__")
 					var pattern string
 					if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 						pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 					} else {
 						pattern = tmp[0]
@@ -485,6 +495,8 @@ func (e *ExtractTask) InitPkgCore() {
 						tmp := strings.Split(rinfo.RuleText, "__")
 						var pattern string
 						if strings.Contains(tmp[0], "\\u") {
+							tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
+							tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
 							pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
 						} else {
 							pattern = tmp[0]

+ 9 - 5
src/jy/extract/score.go

@@ -17,7 +17,7 @@ func init() {
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
 	//实例化正则
 	for _, tmp := range SoreConfig {
-		log.Println(tmp)
+		//log.Println(tmp)
 		if tmp["type"] == "string" {
 			if positions, ok := tmp["position"].([]interface{}); ok {
 				for _, position := range positions {
@@ -25,8 +25,10 @@ func init() {
 						qu.Try(func() {
 							strReq, _ := p["regstr"].(string)
 							if strings.Contains(strReq, "\\u") {
-								pattern, _ := strconv.Unquote(`"` + strReq + `"`)
-								p["regexp"] = regexp.MustCompile(pattern)
+								strReq = strings.Replace(strReq, "\\", "\\\\", -1)
+								strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
+								strReq, _ = strconv.Unquote(`"` + strReq + `"`)
+								p["regexp"] = regexp.MustCompile(strReq)
 							} else {
 								p["regexp"] = regexp.MustCompile(strReq)
 							}
@@ -42,8 +44,10 @@ func init() {
 						qu.Try(func() {
 							strReq, _ := p["regstr"].(string)
 							if strings.Contains(strReq, "\\u") {
-								pattern, _ := strconv.Unquote(`"` + strReq + `"`)
-								p["regexp"] = regexp.MustCompile(pattern)
+								strReq = strings.Replace(strReq, "\\", "\\\\", -1)
+								strReq = strings.Replace(strReq, "\\\\u", "\\u", -1)
+								strReq, _ = strconv.Unquote(`"` + strReq + `"`)
+								p["regexp"] = regexp.MustCompile(strReq)
 							} else {
 								p["regexp"] = regexp.MustCompile(strReq)
 							}

+ 16 - 24
src/jy/pretreated/analystep.go

@@ -18,39 +18,30 @@ func AnalyStart(job *util.Job) {
 	//格式化正文
 	con = formatText(con, "all")
 	job.Content = con
-	//
+	//计算表格占比,返回表格数组、占比
 	tabs, ration := ComputeConRatio(con, 1)
 	if len(tabs) > 0 {
 		newcon, newtabs, newration := FindBigText(con, ration, tabs)
-		if newcon != "" && newration == 0 {
+		if newcon != "" {
 			con = newcon
 			tabs = newtabs
 			ration = newration
 		}
 	}
-	blockArrays, _ := DivideBlock(con, 1, job.RuleBlock)
-	if len(blockArrays) > 0 { //有分块
+	blockArrays, _ := DivideBlock(con, 1, job.RuleBlock) //分块
+	if len(blockArrays) > 0 {                            //有分块
 		//从块里面找分包
-		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title)
+		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
 			if len([]rune(bl.Text)) > 80 {
-				ba1, _ := DivideBlock(bl.Text, 1, job.RuleBlock)
-				if len(ba1) > 0 {
-					t := ""
-					for _, t1 := range ba1 {
-						t += t1.Text
-					}
-					bl.Text = t
-					bl.ColonKV = GetKVAll(t, bl.Title, 1)
-					bl.SpaceKV = SspacekvEntity.Entrance(t, bl.Title)
-				}
+				bl.Block, _ = DivideBlock(bl.Text, 1, job.RuleBlock)
 			}
 			//块中再查找表格(块,处理完把值赋到块)
 			t1, _ := ComputeConRatio(bl.Text, 2)
 			if len(t1) > 0 {
-				job.HasTable = 1 //添加标识:文本中有table
-				tabres := AnalyTableV2(t1, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock)
-				processTableResult(tabres, bl, job)
+				job.HasTable = 1                                                                             //添加标识:文本中有table
+				tabres := AnalyTableV2(t1, job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
+				processTableResult(tabres, bl, job)                                                          //分析table解析结果
 				if bl.Title == "" && tabres.BlockTag != "" {
 					bl.Title = tabres.BlockTag
 				}
@@ -77,7 +68,6 @@ func AnalyStart(job *util.Job) {
 			//			for k, v := range bl.TableKV.Kv {
 			//				log.Println("bl.TableKV.Kv", k, v)
 			//			}
-
 		} else {
 			//从正文里面找分包
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
@@ -88,8 +78,8 @@ func AnalyStart(job *util.Job) {
 		}
 
 		//调用kv解析
-		bl.ColonKV = GetKVAll(newCon, "", 1)
-		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "")
+		bl.ColonKV = GetKVAll(newCon, "", nil, 1)
+		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "", nil)
 		job.Block = append(job.Block, bl)
 	}
 }
@@ -279,7 +269,7 @@ func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) str
 			content = regEndWrap.ReplaceAllString(content, "")
 			doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
 			doc.Find("table").Eq(0).ReplaceWithHtml(content)
-			con, _ = doc.Html()
+			con, _ = doc.Find("body").Html()
 		}
 	}
 	return con
@@ -295,9 +285,11 @@ func FindBigText(con string, r float32, t []*goquery.Selection) (content string,
 		if content != "" {
 			tabs, ration = ComputeConRatio(content, 1)
 			if len(tabs) > 0 {
-				content = tableDivideBlock(content, ration, tabs)
-				if content == "" {
+				con := tableDivideBlock(content, ration, tabs)
+				if con == "" {
 					return
+				} else {
+					content = con
 				}
 			} else {
 				doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))

File diff suppressed because it is too large
+ 409 - 367
src/jy/pretreated/analytable.go


+ 8 - 7
src/jy/pretreated/colonkv.go

@@ -63,8 +63,11 @@ func (ce *ColonkvEntity) divisionMoreKV(con string) string {
 }
 
 //获取冒号kv入口
-func (ce *ColonkvEntity) entrance(con, title string, from int) ([]*Kv, map[string]string) {
+func (ce *ColonkvEntity) entrance(con, title string, contactFormat *ContactFormat, from int) ([]*Kv, map[string]string) {
 	kvs := ce.GetKvs(con, title, from)
+	if from == 1 {
+		FormatContactKv(&kvs, title, nil, contactFormat)
+	}
 	kv := map[string]string{}
 	for _, v := range kvs {
 		if strings.TrimSpace(v.Value) == "" {
@@ -84,7 +87,7 @@ func (ce *ColonkvEntity) GetKvs(con, title string, from int) []*Kv {
 
 //处理正文
 func (ce *ColonkvEntity) processText(con string) string {
-	con = ce.divisionMoreKV(con)
+	con = ce.divisionMoreKV(con)//一行多个冒号kv处理
 	for {
 		tmp := con
 		con = ce.divisionMoreKV(con)
@@ -238,8 +241,7 @@ func IsContactKvHandle(value string, m map[string]bool) bool {
 
 //kv关于联系人信息的处理
 //采购人>集中采购机构
-/*
-func FormatContactKv(kvs *[]*Kv, title string, buyers []string) {
+func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *ContactFormat) {
 	////////////////////////////
 	//处理联系人信息
 	var indexMap map[int]string
@@ -565,7 +567,6 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string) {
 	//	}
 	//Debug("totalIndexMap", len(totalIndexMap))
 }
-*/
 func ContactTypeTitleMatch(title string) string {
 	matchType := ""
 	if title != "" && len([]rune(title)) < 15 {
@@ -614,9 +615,9 @@ func HasOrderContactType(text string) []string {
 
 //两种冒号kv结合到一起
 //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包
-func GetKVAll(content, title string, from int) *JobKv {
+func GetKVAll(content, title string, contactFormat *ContactFormat, from int) *JobKv {
 	content = formatText(content, "kv")
-	m1Kvs, _ := colonkvEntity.entrance(content, title, from)
+	m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from)
 	m1, m1Weight := KvTagsToKV(m1Kvs, title, nil, from)
 	if m1 == nil {
 		m1 = map[string]string{}

+ 105 - 57
src/jy/pretreated/division.go

@@ -19,7 +19,7 @@ var (
 		"(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)",
 		"1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)",
 	}*/
-	/*regSerialTitles_1 = []*regexp.Regexp{
+	regSerialTitles_1 = []*regexp.Regexp{
 		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)([一二三四五六七八九十]+)[\u3000\u2003\u00a0\\s]*[、..::,](.*)"),
 		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)[((]([一二三四五六七八九十]+)[))][\u3000\u2003\u00a0\\s]*[、..::]?(.*)"),
 		regexp.MustCompile("([\r\n][\u3000\u2003\u00a0\\s]*|^[\u3000\u2003\u00a0\\s]*)(\\d+)[\u3000\u2003\u00a0\\s]*、(.*)"),
@@ -36,7 +36,7 @@ var (
 		regexp.MustCompile("^(\\d+)[\u3000\u2003\u00a0\\s]+([^\\d][^\r\n]+)$"),
 		regexp.MustCompile("^1[..](\\d+)[\u3000\u2003\u00a0\\s]+([^\\d..][^\r\n]+)$"),
 		regexp.MustCompile("^[(](\\d+)[\u3000\u2003\u00a0\\s)]+([^\r\n]+)$"),
-	}*/
+	}
 	regReplAllTd       = regexp.MustCompile("(?smi)<td.*?>.+?</td>")
 	regIsNumber        = regexp.MustCompile("^\\d+$")
 	regIsChineseNumber = regexp.MustCompile("^[一二三四五六七八九十]+$")
@@ -48,12 +48,16 @@ var (
 	regDivision        = regexp.MustCompile("[::]")
 	regSpliteSegment   = regexp.MustCompile("[\r\n]")
 	regFilterNumber    = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
-	regSplit           = regexp.MustCompile("和|以?及|与|、")
+	regSplit           = regexp.MustCompile("或|和|以?及|与|、|或")
 	regStartWrap       = regexp.MustCompile("^[\r\n]")
 	regEndWrap         = regexp.MustCompile("[\r\n]$")
 	regMoreWrap        = regexp.MustCompile("[\r\n]{2,}")
 	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
 	moreColonReg       = regexp.MustCompile("[::]+")
+	regFilter          = regexp.MustCompile("等$")
+	confusion          = map[string]string{
+		"参与": "canyu",
+	}
 	//查找分包之前,先对内容进行预处理
 	/*
 		第一包:采购设备清单
@@ -75,7 +79,13 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 	//contentTemp := regReplAllTd.ReplaceAllString(content, "")
 	contentTemp := TextAfterRemoveTable(content)
 	tdIndexs := regReplAllTd.FindAllStringSubmatchIndex(content, -1)
-	regContenSerialTitle, regSerialTitleIndex := getSerialType(contentTemp, ruleBlock.BlockRegs)
+	var regContenSerialTitle *regexp.Regexp
+	var regSerialTitleIndex int
+	if ruleBlock!=nil && len(ruleBlock.BlockRegs)>0{
+		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp, ruleBlock.BlockRegs)
+	}else {
+		regContenSerialTitle, regSerialTitleIndex = getSerialType(contentTemp,  regSerialTitles_1)
+	}
 	//没有分块
 	if regSerialTitleIndex == -1 {
 		if len(contentTemp) == len(content) {
@@ -86,7 +96,12 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 		}
 	}
 	//匹配序号和标题
-	regSerialTitle := ruleBlock.TitleRegs[regSerialTitleIndex]
+	var regSerialTitle *regexp.Regexp
+	if ruleBlock != nil && len(ruleBlock.TitleRegs)>0{
+		regSerialTitle = ruleBlock.TitleRegs[regSerialTitleIndex]
+	}else {
+		regSerialTitle = regSerialTitles_2[regSerialTitleIndex]
+	}
 	indexs := regContenSerialTitle.FindAllStringIndex(content, -1)
 	indexs = filterSerial(content, indexs, tdIndexs)
 	//头块
@@ -154,7 +169,6 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 		}
 		//获取块中除了序号和标题的内容
 		blockText := regTrimSpace.ReplaceAllString(content[end:nextStart], "")
-		var titles = []string{}
 		if title != "" {
 			blockTextTemp := regReplAllSpace.ReplaceAllString(blockText, "")
 			//特殊情况处理
@@ -173,6 +187,7 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 						十二、开标时间:2017年3月20日9时30分
 					*/
 					blockText = title
+					title = ""
 				}
 			} else if blockTextTemp != "" && regDivision.MatchString(title) {
 				/*
@@ -185,34 +200,16 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 				divisionIndexs := regDivision.FindStringIndex(title)
 				titleBefore := regReplAllSpace.ReplaceAllString(title[:divisionIndexs[0]], "")
 				titleAfter := regReplAllSpace.ReplaceAllString(title[divisionIndexs[1]:], "")
+				blockText = title + "\n" + blockText
 				if titleAfter != "" {
-					titles = append(titles, titleBefore)
-					//分段 去每一个冒号前面的key
-					segments := regSpliteSegment.Split(blockText, -1)
-					for _, sv := range segments {
-						divisionIndexs = regDivision.FindStringIndex(sv)
-						if len(divisionIndexs) == 0 {
-							continue
-						}
-						titleTemp := regReplAllSpace.ReplaceAllString(sv[:divisionIndexs[0]], "")
-						if titleTemp == "" {
-							continue
-						}
-						titles = append(titles, titleTemp)
-					}
-					blockText = title + "\n" + blockText
 					title = ""
 				} else {
-					blockText = title + "\n" + blockText
 					title = titleBefore
 				}
 			} else {
 				blockText = title + "\n" + blockText
 			}
 		}
-		if len(titles) == 0 {
-			titles = append(titles, title)
-		}
 		//没有内容的块,不打标签,不分段
 		if blockText == "" {
 			continue
@@ -222,29 +219,29 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 			continue
 		}
 		blockText = hasMergeKV(title, blockText)
-		block := &util.Block{
-			Index: index,     //序号
-			Text:  blockText, //内容
-			Title: title,     //标题
-			Start: start,
-			End:   nextStart,
-		}
 		//
 		titleIsExists := map[string]bool{} //去重
-		for _, tv := range titles {
-			tv = filterTitle(tv)
-			//分割标题 [和及]。。。
-			splitTitles := regSplit.Split(tv, -1)
-			for _, sv := range splitTitles {
-				if sv == "" || titleIsExists[sv] {
-					continue
-				}
-				titleIsExists[sv] = true
-				//标题过短过长不打标签
-				if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
-					//打标签
-					block.Tags = append(block.Tags, util.GetBlockTags(sv))
-				}
+		title = filterTitle(title)
+		//分割标题 [和及]。。。 参与
+		splitTitles := ProcTitle(title)
+		block := &util.Block{
+			Index:  index,     //序号
+			Text:   blockText, //内容
+			Title:  title,     //标题
+			Titles: splitTitles,
+			Start:  start,
+			End:    nextStart,
+		}
+
+		for _, sv := range splitTitles {
+			if sv == "" || titleIsExists[sv] {
+				continue
+			}
+			titleIsExists[sv] = true
+			//标题过短过长不打标签
+			if len([]rune(sv)) >= 2 && len([]rune(sv)) <= 10 {
+				//打标签
+				block.Tags = append(block.Tags, util.GetBlockTags(sv))
 			}
 		}
 		tagsToBlocks(blocks, block)
@@ -268,18 +265,62 @@ func DivideBlock(content string, from int, ruleBlock *util.RuleBlock) ([]*util.B
 			returnValue = 1
 		}
 	}
-
+	contactFormat := &util.ContactFormat{
+		IndexMap: map[int]string{},
+		MatchMap: map[string]map[string]bool{},
+	}
 	for _, bl := range returnBlocks {
 		//解析kv
 		newText := TextAfterRemoveTable(bl.Text)
-		bl.ColonKV = GetKVAll(newText, bl.Title, from)
-		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title)
+		bl.ColonKV = GetKVAll(newText, bl.Title, contactFormat, from)
+		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title, contactFormat)
 		//正则抽取的时候有时需要匹配换行或者句号,这里在解析完kv之后,在块结尾添加换行和句号
 		bl.Text = appendWarpStop(bl.Text)
 	}
 	return returnBlocks, returnValue
 }
 
+// ProcTitle post-processes a block title: it splits the title on the regSplit connectors and completes two-rune fragments by borrowing the missing prefix or suffix from a neighbouring fragment longer than three runes. An empty title yields an empty slice.
+func ProcTitle(title string) []string {
+	if title == "" {
+		return []string{}
+	}
+	for k, v := range confusion { // replace each confusion key with its placeholder value before splitting
+		title = strings.Replace(title, k, v, -1)
+	}
+	direct := 1 // 1: short fragments take a prefix from an earlier long fragment; -1: the first fragment was short, so a later long fragment supplies a suffix
+	prev := "" // most recently seen fragment longer than three runes
+	ara := regSplit.Split(title, -1)
+	for kk, vv := range ara {
+		for kkk, vvv := range confusion { // turn the placeholder back into the original word for this fragment
+			vv = strings.Replace(vv, vvv, kkk, -1)
+		}
+		ara[kk] = vv
+		if len([]rune(vv)) == 2 { // two-rune fragment: treated as an abbreviated title
+			if kk == 0 {
+				direct = -1
+			} else {
+				start := ""
+				if len([]rune(prev)) > 3 {
+					start = string([]rune(prev)[:len([]rune(prev))-2]) // prev without its last two runes becomes the shared prefix
+				}
+				ara[kk] = start + vv
+			}
+		}
+		if len([]rune(vv)) > 3 {
+			if direct == -1 {
+				end := string([]rune(vv)[len([]rune(vv))-2:]) // last two runes of the long fragment become the shared suffix
+				for i := 0; i < kk; i++ { // the suffix is appended to every earlier fragment, whatever its length
+					ara[i] = ara[i] + end
+				}
+				break // NOTE(review): fragments after kk are left unprocessed, so any confusion placeholder in them is not restored — confirm this is intended
+			}
+			prev = vv
+		}
+	}
+	return ara
+}
+
 //有合并kv的 例如项目名称及编号
 func hasMergeKV(title, text string) string {
 	title = regDivision.ReplaceAllString(title, "")
@@ -413,6 +454,12 @@ func tagsToBlocks(blocks []*util.Block, block *util.Block) {
 }
 
 func filterTitle(title string) string {
+	if strings.Contains(title, ",") && strings.Contains(title, "。") {
+		return ""
+	}
+	if len([]rune(title)) > 30 {
+		return ""
+	}
 	//清理空格
 	title = regReplAllSpace.ReplaceAllString(title, "")
 	//清理成对出现的符号中的内容
@@ -421,6 +468,7 @@ func filterTitle(title string) string {
 	title = regReplAllSymbol.ReplaceAllString(title, "")
 	//清理序号
 	title = regFilterNumber.ReplaceAllString(title, "")
+	title = regFilter.ReplaceAllString(title, "")
 	return title
 }
 
@@ -438,8 +486,8 @@ func FindPackageFromBlocks(blocks *[]*util.Block, title string) (blockPackage ma
 		//把分包内容摘除掉有问题 有的项目名称中包含二标段
 		if ok && false {
 			v.Text = surplusText
-			v.ColonKV = GetKVAll(surplusText, v.Title, 1)
-			v.SpaceKV = SspacekvEntity.Entrance(surplusText, v.Title)
+			v.ColonKV = GetKVAll(surplusText, v.Title, nil, 1)
+			v.SpaceKV = SspacekvEntity.Entrance(surplusText, v.Title, nil)
 		}
 	}
 	return
@@ -588,9 +636,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 				//合并文本
 				(*blockPackage)[index].Text += "\n" + text
 				//合并冒号kv
-				colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", 1)
+				colonJobKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 1)
 				if headKey != "" {
-					kvAgain := GetKVAll(text, "", 4)
+					kvAgain := GetKVAll(text, "", nil, 4)
 					for kv_k, kv_v := range kvAgain.Kv {
 						if colonJobKv.Kv[kv_k] == "" {
 							colonJobKv.Kv[kv_k] = kv_v
@@ -608,7 +656,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 					(*blockPackage)[index].ColonKV.Kv[kv_k] = kv_v
 				}
 				//合并空格kv
-				spaceJobKv := SspacekvEntity.Entrance(text, "")
+				spaceJobKv := SspacekvEntity.Entrance(text, "", nil)
 				for kv_k, kv_v := range spaceJobKv.Kv {
 					if kv_v == "" {
 						continue
@@ -626,9 +674,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 					Type:     bv[1],
 					Accuracy: accuracy,
 				}
-				finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", 4)
+				finalKv := GetKVAll(strings.TrimLeft(text, headKey), "", nil, 4)
 				if headKey != "" {
-					kvAgain := GetKVAll(text, "", 4)
+					kvAgain := GetKVAll(text, "", nil, 4)
 					for kv_k, kv_v := range kvAgain.Kv {
 						if finalKv.Kv[kv_k] == "" {
 							finalKv.Kv[kv_k] = kv_v
@@ -637,7 +685,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 					}
 				}
 				newBpkg.ColonKV = finalKv
-				newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "")
+				newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "", nil)
 				(*blockPackage)[index] = newBpkg
 			}
 		}

+ 2 - 2
src/jy/pretreated/spacekv.go

@@ -16,7 +16,7 @@ var (
 	excludeSpaceKey = regexp.MustCompile("[.、�\\[【{{〔<《\\]】}}〕>》]")
 )
 
-func (se *SpacekvEntity) Entrance(text, title string) *util.JobKv {
+func (se *SpacekvEntity) Entrance(text, title string, contactFormat *util.ContactFormat) *util.JobKv {
 	lines := se.getLines(text)
 	kvMaps := []*util.Kv{}
 	for _, line := range lines {
@@ -26,7 +26,7 @@ func (se *SpacekvEntity) Entrance(text, title string) *util.JobKv {
 		}
 		kvMaps = append(kvMaps, kvMap...)
 	}
-	//FormatContactKv(&kvMaps, title, nil, contactFormat)
+	FormatContactKv(&kvMaps, title, nil, contactFormat)
 	kv, tagKv := KvTagsToKV(kvMaps, title, nil, 1)
 	return &util.JobKv{
 		Kvs:   kvMaps,

+ 111 - 81
src/jy/pretreated/tablev2.go

@@ -64,24 +64,24 @@ type TD struct {
 	Val            string             //值
 	Text           string             //原始串
 	SortKV         *SortMap           //存放kv值
-	Html           string
-	BH             bool         //是否是表头
-	MustBH         bool         //不能修改的表头
-	StandardKey    string       //标准表头
-	Colspan        int          //合并列
-	Rowspan        int          //合并行
-	StartCol       int          //起始列
-	EndCol         int          //终止列
-	StartRow       int          //起始行
-	EndRow         int          //终止行
-	ColPos         int          //当前在TR中的位置
-	HeadTd         *TD          //(是val元素)k节点
-	KVDirect       int          //键-值方向,0未知,1横 2纵//指值和k的方向
-	KeyDirect      int          //k方向,k纵值横,k横值纵 1横 2纵
-	SonTds         []*TD        //(是key元素)值节点数组
-	SonTableResult *TableResult //子值表格集
-	ArrVal         []string     //数组值,当是左临元素是合并行的元素时!
-	Valtype        string       //"BO=中标人顺序"
+	Html           string             //html值
+	BH             bool               //是否是表头
+	MustBH         bool               //不能修改的表头
+	StandardKey    string             //标准表头
+	Colspan        int                //合并列
+	Rowspan        int                //合并行
+	StartCol       int                //起始列
+	EndCol         int                //终止列
+	StartRow       int                //起始行
+	EndRow         int                //终止行
+	ColPos         int                //当前在TR中的位置
+	HeadTd         *TD                //(是val元素)k节点
+	KVDirect       int                //键-值方向,0未知,1横 2纵//指值和k的方向
+	KeyDirect      int                //k方向,k纵值横,k横值纵 1横 2纵
+	SonTds         []*TD              //(是key元素)值节点数组
+	SonTableResult *TableResult       //子值表格集
+	ArrVal         []string           //数组值,当是左临元素是合并行的元素时!
+	Valtype        string             //"BO=中标人顺序"
 }
 
 var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[、])([\\S]{4,12})|([\\S]{2,12}))[::]([\\S]{5,60})([一二三四五六七八九]+[、])?`)
@@ -112,44 +112,68 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	if rowspan == 0 {
 		rowspan = 1
 	}
-	td.Colspan, td.Rowspan = colspan, rowspan
-	td.Html, _ = td.Goquery.Html()
-	ht := td.Goquery.ChildrenFiltered("table")
-	bsontable := false
+	td.Colspan, td.Rowspan = colspan, rowspan  //合并列,合并行
+	td.Html, _ = td.Goquery.Html()             //html值
+	ht := td.Goquery.ChildrenFiltered("table") //获取td的table
+	bsontable := false                         //默认td中没有table
 	txt := ""
 	//子table处理合并
 	if ht.Size() > 0 {
 		//qutil.Debug("有子表格")
 		txt = TextAfterRemoveTable(td.Html)
-		ts := td.TR.Table.TableResult
-		tabs, _ := ComputeConRatio(td.Html, 2)
-		if len(tabs) > 0 {
-			bsontable = true
-			stag := ts.BlockTag
-			if stag == "" {
-				var tdleft *TD
-				if len(tr.TDs) > 0 {
-					tdleft = tr.TDs[len(tr.TDs)-1]
-					if tdleft.BH {
-						//u.Debug(tdleft.Val),如果不存在就是上一行的
-						stag = tdleft.Val
-					}
-				} else if len(tr.Table.TRs) > 0 {
-					lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
-					str := ""
-					for _, td3 := range lasttr.TDs {
-						str += td3.Val
-						if len([]rune(str)) > 14 {
-							str = ""
-							break
-						}
+		td.tdHasTable(&bsontable, tr, table) //处理td中的table,块标签处理,子表解析集处理
+	} else {
+		txt = strings.TrimSpace(td.Goquery.Text())
+	}
+	text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
+	td.Val = text //值
+	td.Text = txt //原始串
+	//对td单元格值判断是否是表头和根据td内容长度进行分块处理
+	td.tdIsHb(tr, table, bsontable)
+	bhead := false
+	if td.TR.RowPos == 0 { //第一行
+		if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
+			bhead = true
+		}
+	}
+	if bhead && !bsontable {
+		td.BH = true
+		td.KeyDirect = 1 //k方向,k纵值横,k横值纵 1横 2纵
+		td.KVDirect = 2  //键-值方向,0未知,1横 2纵//指值和k的方向
+	}
+	//u.Debug(td.BH, td.Val)
+	return td
+}
+
+//处理td中的table,块标签处理,子表解析集处理
+func (td *TD) tdHasTable(bsontable *bool, tr *TR, table *Table) {
+	ts := td.TR.Table.TableResult
+	tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
+	if len(tabs) > 0 {
+		(*bsontable) = true
+		stag := ts.BlockTag //块标签
+		if stag == "" {
+			var tdleft *TD
+			if len(tr.TDs) > 0 {
+				tdleft = tr.TDs[len(tr.TDs)-1]
+				if tdleft.BH {
+					//u.Debug(tdleft.Val),如果不存在就是上一行的
+					stag = tdleft.Val
+				}
+			} else if len(tr.Table.TRs) > 0 {
+				lasttr := tr.Table.TRs[len(tr.Table.TRs)-1]
+				str := ""
+				for _, td3 := range lasttr.TDs {
+					str += td3.Val
+					if len([]rune(str)) > 14 {
+						str = ""
+						break
 					}
-					stag = str
 				}
+				stag = str
 			}
-			sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock)
+			sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
 			td.BH = false
-
 			td.SonTableResult = sonts
 			//for _, k := range sonts.SortKV.Keys {
 			//u.Debug(k, sonts.SortKV.Map[k])
@@ -201,21 +225,20 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 				//u.Debug(fmt.Sprintf("%v", td.TR.Table.BlockPackage.Map["1"]))
 			}
 		}
-	} else {
-		txt = td.Goquery.Text()
 	}
-	text := dwReg.ReplaceAllString(u.TrimLRAll(txt, ""), "$1")
-	//u.Debug(txt, text)
-	td.Val = text
-	td.Text = txt
-	//对td单元格值判断是否是key
-	lentxt := len([]rune(text))
+}
+
+//对td单元格值判断是否是表头和根据td内容长度进行分块处理
+func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
+	lenval := len([]rune(td.Val)) //经过处理的td内容长度
 	//if lentxt > 9 {
 	//td.KV = GetKVAll(txt, "")
 	ub := []*u.Block{}
-	if lentxt > 50 { //看是否划块
+	//经过处理的td内容长度大于50,划块,分包
+	if lenval > 50 { //看是否划块
 		//u.Debug(txt)
-		ub, _ = DivideBlock(txt, 2, nil)
+		ub, _ = DivideBlock(td.Text, 2, table.TableResult.RuleBlock) //对td的原始值
+		//看是否划块
 		if len(ub) > 0 {
 			colonKvWeight := map[string]int{}
 			spaceKvWeight := map[string]int{}
@@ -249,15 +272,15 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		}*/
 		if len(tr.TDs) > 0 {
 			tdleft := tr.TDs[len(tr.TDs)-1]
-			if tdleft.BH && excludeKey.MatchString(tdleft.Text) {
+			if tdleft.BH && excludeKey.MatchString(tdleft.Text) { //(涉及包号|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号)
 				isFindPkg = false
 			}
 		}
 		if isFindPkg {
 			if len(ub) > 0 {
-				blockPackage = FindPackageFromBlocks(&ub, "")
+				blockPackage = FindPackageFromBlocks(&ub, "") //从块里面找分包
 			} else {
-				blockPackage = FindPackageFromText("", text)
+				blockPackage = FindPackageFromText("", td.Val) //从正文里面找分包
 			}
 		}
 		if len(blockPackage) > 0 {
@@ -287,16 +310,28 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 			}
 		}
 	}
-	//
-	if lentxt < 50 {
+	//经过处理的td内容长度小于50,冒号kv,td表头
+	if lenval < 50 {
 		//		td.SortKV = FindKv(text, "")
 		kvTitle := ""
 		if len(td.TR.TDs) > 0 {
 			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
 		}
-		_, resm := colonkvEntity.entrance(text, kvTitle, 2)
+		/*
+					预算总价
+			(人民币:元)
+		*/
+		if td.Text != "" && strings.Contains(td.Text, "预算总价") && (strings.Contains(td.Text, "(") || strings.Contains(td.Text, "(")) {
+			tagindex := 0
+			if tagindex = strings.Index(td.Text, "("); tagindex <= 0 {
+				tagindex = strings.Index(td.Text, "(")
+			}
+			td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
+			td.BH = true
+		}
+		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3) //td冒号kv
 		for k, v := range resm {
-			td.SortKV.AddKey(k, v)
+			td.SortKV.AddKey(k, v) //存放kv值
 		}
 		//u.Debug(td.SortKV.Keys, "-------2--------------------------------")
 		//		td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
@@ -308,13 +343,21 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 				td.BH = true
 			}
 		} else if !bsontable {
-			txt := repSpace.ReplaceAllString(text, "")
+			txt := repSpace.ReplaceAllString(td.Val, "")
 			btw, must, _, _, repl := CheckHeader(txt)
+			if lenval > 15 {
+				btw = false
+			}
+			if strings.Contains(td.Val, "个项目") {
+				must = false
+				btw = false
+			}
 			td.Valtype = repl
 			td.MustBH = must
 			td.BH = btw
 		}
-	} else if len(ub) == 0 { //之前这里没加判断,现在加上判断,造成分块之后的kv被覆盖掉
+	} else if len(ub) == 0 {
+		//之前这里没加判断,现在加上判断,造成分块之后的kv被覆盖掉
 		//u.Debug("----\n\n\n", txt, "\n\n\n----")
 		//u.Debug(GetKVAll(txt, ""))
 		/*
@@ -329,7 +372,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 			}
 		*/
 
-		td.SortKV = FindKv(text, "", 2)
+		td.SortKV = FindKv(td.Val, "", 2)
 
 		//		td.LeftNode.Val
 		//		for _, vvv := range *td.TR {
@@ -339,24 +382,11 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		if len(td.TR.TDs) > 0 {
 			kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
 		}
-		_, resm := colonkvEntity.entrance(text, kvTitle, 2)
+		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2) //获取冒号kv入口
 		for k, v := range resm {
 			td.SortKV.AddKey(k, v)
 		}
 	}
-	bhead := false
-	if td.TR.RowPos == 0 { //第一行
-		if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
-			bhead = true
-		}
-	}
-	if bhead && !bsontable {
-		td.BH = true
-		td.KeyDirect = 1
-		td.KVDirect = 2
-	}
-	//u.Debug(td.BH, td.Val)
-	return td
 }
 func (t *Table) Print() {
 	for row, trs := range t.TRs {

+ 2 - 0
src/jy/util/article.go

@@ -53,6 +53,7 @@ type RuleBlock struct {
 type Block struct {
 	Tags        []Tags                   //对块做的标签,可以作为数据抽取的依据
 	Title       string                   //块标题
+	Titles      []string                 //拆分以后多个块标题
 	Index       int                      //块索引
 	Text        string                   //块内容
 	Start       int                      //开始索引
@@ -63,6 +64,7 @@ type Block struct {
 	BPackage    *BlockPackage            //分包信息
 	Tag         map[string]bool          //块标签
 	Block       []*Block                 //子块
+	Category    string                   //块分类
 	Winnerorder []map[string]interface{} //块中,中标候选人排序
 }
 

+ 3 - 3
src/jy/util/config.go

@@ -12,9 +12,9 @@ import (
 var FormatTextMap map[string][]map[string]interface{}
 
 func init() {
-	//loadFormatText()
-	//LoadTagDb("./res/tagdb")
-	//LoadTagDb("./res/blocktagdb")
+	loadFormatText()
+	LoadTagDb("./res/tagdb")
+	LoadTagDb("./res/blocktagdb")
 }
 
 //加载格式化正文配置

+ 1 - 1
src/res/fieldscore.json

@@ -172,7 +172,7 @@
         "position": [
             {
                 "describe": "全为中文汉字或符号",
-                "regstr": "^[\\u4e00-\\u9fa5()()【】\\\\[\\\\],,。、::《》]+$",
+                "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",
                 "score": -20
             },
             {

+ 5 - 5
src/res/formattext.json

@@ -20,11 +20,6 @@
             "separator": " ",
             "desc": "替换掉无效的kv"
         },
-        {
-            "reg": "[^\\n::]{2,18}[::]\\s*详见[^,。,.::\\s]{2,18}",
-            "separator": "",
-            "desc": "替换掉无效的kv"
-        },
         {
             "reg": "(\\d+[,,.]+)+\\d+((百|千)?元|(百|千)?(万|亿)元?)",
             "separator": "[,,]__",
@@ -182,6 +177,11 @@
             "reg": "\n[\\d.\u3000\u2003\u00a0\\s]*(联系人)及(电话)[::](.+?)[\u3000\u2003\u00a0\\s]+(.+)",
             "separator": "\n$1:$3\n$2:$4",
             "desc": ""
+        },
+        {
+            "reg": "[^\\n::]{2,18}[::]\\s*详见[^,。,.::\\s]{2,18}",
+            "separator": "",
+            "desc": "替换掉无效的kv"
         }
     ]
 }

+ 89 - 2
src/udpfileserver/main.go

@@ -2,6 +2,8 @@ package main
 
 import (
 	"encoding/json"
+	"fmt"
+	"github.com/go-gomail/gomail"
 	"gopkg.in/mgo.v2/bson"
 	"jy/mongodbutil"
 	"log"
@@ -9,8 +11,11 @@ import (
 	"net"
 	"net/rpc"
 	"path"
+	"qfw/common/src/qfw/util"
 	qu "qfw/util"
+	"strconv"
 	"strings"
+	"sync"
 	"time"
 )
 
@@ -61,7 +66,7 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 		lid := strings.TrimSpace(mapInfo["lteid"].(string))
 		if bson.IsObjectIdHex(gid) && bson.IsObjectIdHex(lid) {
 			var jsq int64
-			query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(gid),"$lte": bson.ObjectIdHex(lid),}}
+			query := bson.M{"_id": bson.M{"$gt": bson.ObjectIdHex(gid),"$lte": bson.ObjectIdHex(lid),}}
 			log.Println("query---:", query)
 			sum :=mongodbutil.Mgo.Count(MgoC,query)
 			log.Println("sum:", sum)
@@ -71,7 +76,7 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 				limit = sum
 			}
 			for i := 0; i < pageNum; i++ {
-				query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(gid), "$lte": bson.ObjectIdHex(lid)}}
+				query = bson.M{"_id": bson.M{"$gt": bson.ObjectIdHex(gid), "$lte": bson.ObjectIdHex(lid)}}
 				log.Println("page=", i+1,"query=", query,limit)
 				list, b := mongodbutil.Mgo.Find(MgoC,query,nil,bson.M{"_id": 1,MgoFileFiled:1},false,0, limit)
 				if !b{
@@ -103,6 +108,11 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 										log.Println(mid, "mgo ", MgoFileFiled,"没有fid ")
 										continue
 									}
+									//if qu.ObjToString(fileinfo["update"]) ==""{
+									//	<-ChanB
+									//	log.Println(mid, "mgo ", MgoFileFiled,"没有update ")
+									//	continue
+									//}
 									save(mid,attk, qmap, &fileinfo,&updateNum)
 									<-ChanB
 								}
@@ -111,7 +121,10 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 					}
 				}
 			}
+			//识别完以后再次查询数据库,进行下一轮识别
 			log.Println("处理查询数据结束...",jsq,time.Now().Sub(stime))
+			//进行下一轮识别
+			forfunc(lid)
 		} else {
 			log.Println("开始id或结束id参数错误:", string(data))
 		}
@@ -185,6 +198,7 @@ func save(mid interface{},attk string, qmap, fileinfo *map[string]interface{},up
 			(*fileinfo)["content"] = rdata["context"]
 		}
 		(*fileinfo)["expend"] = rdata["expend"]
+		delete(*fileinfo,"update")
 		//log.Println((*fileinfo))
 
 		(*qmap)[MgoFileFiled].(map[string]interface{})["attachments"].(map[string]interface{})[attk]=*fileinfo
@@ -208,8 +222,81 @@ func save(mid interface{},attk string, qmap, fileinfo *map[string]interface{},up
 		}else {
 			log.Println(mid, "mongo更新数据失败",qu.ObjToString((*fileinfo)["fid"]))
 		}
+		nowHour := time.Now().Hour()
+		rdlock.Lock()
+		if nowHour != hourNum{
+			log.Println("send email:",SendMail(fmt.Sprint(updateBool,mid)))
+			hourNum = nowHour
+		}
+		rdlock.Unlock()
 	} else {
 		log.Println(mid, "调用rpc服务解析异常:",qu.ObjToString((*fileinfo)["fid"]), rdata["err"])
 	}
 
 }
+var hourNum int
+var rdlock sync.RWMutex
+func SendMail( body string ) error {
+	//定义邮箱服务器连接信息,如果是阿里邮箱 pass填密码,qq邮箱填授权码
+	mailConn := map[string]string {
+		"user": "550838476@qq.com",
+		"pass": "",
+		"host": "smtp.qq.com",
+		"port": "465",
+	}
+
+	port, _ := strconv.Atoi(mailConn["port"]) //转换端口类型为int
+
+	m := gomail.NewMessage()
+	m.SetHeader("From","Get to" + "<" + mailConn["user"] + ">")  //这种方式可以添加别名,即“XD Game”, 也可以直接用<code>m.SetHeader("From",mailConn["user"])</code> 读者可以自行实验下效果
+	m.SetHeader("To", []string{"550838476@qq.com"}...)  //发送给多个用户
+	m.SetHeader("Subject", "MongoId")  //设置邮件主题
+	m.SetBody("text/html", body)     //设置邮件正文
+
+	d := gomail.NewDialer(mailConn["host"], port, mailConn["user"], mailConn["pass"])
+
+	err := d.DialAndSend(m)
+	return err
+
+}
+
+// forfunc triggers the next recognition round after the round that ended at
+// lid has finished.
+//
+// It loops until a UDP message has been sent successfully:
+//   - it queries MgoC for the last _id;
+//   - if the query yields nothing usable, it waits one minute and retries;
+//   - if the last _id equals lid (no new data), it sends a notification mail,
+//     waits one hour and retries;
+//   - otherwise it sends {"gtid": lid, "lteid": <last id>} over UDP to the
+//     address configured in Sysconfig ("udpip"/"udpport"); on send failure it
+//     waits one minute and retries.
+//
+// The function blocks its caller for as long as the loop runs.
+func forfunc(lid string) {
+	for {
+		// Query the last id (presumably sorted by _id descending via "-_id";
+		// verify against mongodbutil.Mgo.Find).
+		lastObjectId, _ := mongodbutil.Mgo.Find(MgoC, nil, "-_id", bson.M{"_id": 1}, true, -1, -1)
+		// Guard before indexing: a failed or empty query must take the retry
+		// path instead of panicking on a nil pointer or an empty result.
+		if lastObjectId == nil || len(*lastObjectId) == 0 {
+			log.Println("查询异常", lastObjectId)
+			time.Sleep(time.Minute)
+			continue
+		}
+		lastId, ok := (*lastObjectId)[0]["_id"].(bson.ObjectId)
+		log.Println("lastID:", lastId)
+		// The _id is not an ObjectId (type assertion failed): query again later.
+		if !ok {
+			log.Println("查询异常", *lastObjectId)
+			time.Sleep(time.Minute)
+			continue
+		}
+		// The last id equals the end of the previous round: no new data yet.
+		if lastId.Hex() == lid {
+			log.Println("没有新数据", lastId.Hex())
+			// Notification is best effort; a mail failure must not stop the loop.
+			if err := SendMail(time.Now().String() + "没有最新数据,当前最后一条数据id:" + lastId.Hex()); err != nil {
+				log.Println("send email:", err)
+			}
+			time.Sleep(time.Hour)
+			continue
+		}
+		// New data exists: describe the next round.
+		next := map[string]string{
+			"gtid":  lid,          // last id of the previous round
+			"lteid": lastId.Hex(), // last id found by this query
+		}
+		// Marshalling a map[string]string cannot fail, so the error is ignored.
+		payload, _ := json.Marshal(next)
+		// Send the request over UDP.
+		err := udpclient.WriteUdp(payload, mu.OP_TYPE_DATA, &net.UDPAddr{
+			IP:   net.ParseIP(util.ObjToString(Sysconfig["udpip"])),
+			Port: util.IntAll(Sysconfig["udpport"]),
+		})
+		if err != nil {
+			log.Println("发送udp失败", err, string(payload))
+			time.Sleep(time.Minute)
+			continue
+		}
+		if err := SendMail(time.Now().String() + fmt.Sprint("发送udp成功,gtid:", lid, ",lteid:", lastId.Hex())); err != nil {
+			log.Println("send email:", err)
+		}
+		log.Println("发送udp成功,gtid:", lid, ",lteid:", lastId.Hex())
+		break // message sent: leave the loop
+	}
+}

+ 6 - 3
udpprojectset/src/cleareids.go

@@ -35,10 +35,10 @@ func clearPKey() {
 	wg := sync.WaitGroup{}
 	for _, pncb := range []*KeyMap{PNKey, PCKey, PBKey} {
 		wg.Add(1)
-		go func() {
+		go func(pncb *KeyMap) {
 			defer wg.Done()
 			clearPNCBKey(pncb, nowtime)
-		}()
+		}(pncb)
 	}
 	wg.Wait()
 	log.Println("清理结束")
@@ -60,6 +60,9 @@ func clearIdsKeys(pKey *KeyMap, nowtime int64) []string {
 	for k, ma := range pKey.Map {
 		ids := ma.Arr
 		delids := []interface{}{}
+		if ids == nil {
+			continue
+		}
 		res := redis.Mget(REDISIDS, *ids)
 		for _, b1 := range res {
 			if b1 != nil {
@@ -100,7 +103,7 @@ func clearIdsKeys(pKey *KeyMap, nowtime int64) []string {
 }
 
 func deleteSliceId(a []string, id string) *[]string {
-	ret := make([]string, 0, len(a))
+	ret := make([]string, 0)
 	for _, val := range a {
 		if val != id {
 			ret = append(ret, val)

+ 2 - 0
udpprojectset/src/config.json

@@ -23,6 +23,8 @@
     },
     "taskstock": {
         "open": true,
+		"startTime":1325347200,
+        "startdate": "2015-11-01",
         "endate": "2019-06-30"
     },
     "udpport": ":1482",

+ 7 - 80
udpprojectset/src/fulldata.go

@@ -2,7 +2,6 @@ package main
 
 import (
 	"log"
-	"strings"
 
 	"qfw/util"
 	"qfw/util/mongodb"
@@ -14,10 +13,13 @@ import (
 
 var FullCount = 0
 
-func RunFullData() {
+func RunFullData(startTime int64) {
+	if startTime < 1325347200 {
+		log.Println("时间错误", startTime)
+	}
 	defer util.Catch()
 	var wg = sync.WaitGroup{}
-	startTime := int64(1325347200) //2012-01-01
+	//startTime := int64(1325347200) //2012-01-01
 	ps := 3
 	pool := make(chan *task, ps)
 	day := 0
@@ -27,7 +29,7 @@ func RunFullData() {
 		bComplete := false
 		for {
 			if startTime > now || bComplete {
-				log.Println("任务结束")
+				log.Println("任务结束", startTime)
 				endChan <- true
 				break
 			}
@@ -113,6 +115,7 @@ func (t *task) query() {
 			info := PreThisInfo(tmp)
 			if info != nil {
 				lockPNCBMap(info)
+				storeLock(info)
 				startProjectMerge(info, tmp)
 				redis.Put(INFOID, thisid, 1, INFOTIMEOUT)
 				currentMegerTime = info.Publishtime
@@ -126,79 +129,3 @@ func (t *task) query() {
 	log.Println("currentFull", FullCount)
 
 }
-
-//获取对比项目数组
-func getComeperProjects2(p PCBV, thisinfo *Info) (res []interface{}, pncb []*CompareInfo) {
-	newarr := []string{}
-	repeatId := map[string]bool{}
-	if p.PnameLen > 0 {
-		pn := NewCompareInfo("pn", thisinfo.PNKey, PNKey)
-		pncb = append(pncb, pn)
-		thisinfo.AllRelatePNKeyMap = map[string]*Key{}
-		pn.KeyMap.Lock.Lock()
-		for k, v := range pn.KeyMap.Map {
-			if strings.Contains(k, pn.Key) || strings.Contains(pn.Key, k) {
-				thisinfo.AllRelatePNKeyMap[k] = v
-				for _, id := range *v.Arr {
-					if !repeatId[id] {
-						newarr = append(newarr, id)
-						repeatId[id] = true
-					}
-				}
-			}
-		}
-		if thisinfo.AllRelatePNKeyMap[pn.Key] == nil {
-			K := &Key{&[]string{}, &sync.Mutex{}}
-			thisinfo.AllRelatePNKeyMap[pn.Key] = K
-			pn.KeyMap.Map[pn.Key] = K
-		}
-		pn.KeyMap.Lock.Unlock()
-	}
-	if p.PcodeLen > 0 {
-		pc := NewCompareInfo("pc", thisinfo.PCKey, PCKey)
-		pncb = append(pncb, pc)
-		thisinfo.AllRelatePCKeyMap = map[string]*Key{}
-		pc.KeyMap.Lock.Lock()
-		for k, v := range pc.KeyMap.Map {
-			if strings.Contains(k, pc.Key) || strings.Contains(pc.Key, k) {
-				thisinfo.AllRelatePCKeyMap[k] = v
-				for _, id := range *v.Arr {
-					if !repeatId[id] {
-						newarr = append(newarr, id)
-						repeatId[id] = true
-					}
-				}
-			}
-		}
-		if thisinfo.AllRelatePCKeyMap[pc.Key] == nil {
-			K := &Key{&[]string{}, &sync.Mutex{}}
-			thisinfo.AllRelatePCKeyMap[pc.Key] = K
-			pc.KeyMap.Map[pc.Key] = K
-		}
-		pc.KeyMap.Lock.Unlock()
-	}
-
-	if p.BuyerLen > 0 {
-		pb := NewCompareInfo("pb", thisinfo.PBKey, PBKey)
-		pncb = append(pncb, pb)
-		pb.KeyMap.Lock.Lock()
-		K := pb.KeyMap.Map[pb.Key]
-		if K == nil {
-			K = &Key{&[]string{}, &sync.Mutex{}}
-			pb.KeyMap.Map[pb.Key] = K
-		} else {
-			for _, id := range *K.Arr {
-				if !repeatId[id] {
-					newarr = append(newarr, id)
-					repeatId[id] = true
-				}
-			}
-		}
-		pb.KeyMap.Lock.Unlock()
-	}
-
-	if len(newarr) > 0 {
-		res = redis.Mget(REDISIDS, newarr)
-	}
-	return
-}

+ 10 - 4
udpprojectset/src/main.go

@@ -146,11 +146,15 @@ func main() {
 	log.Println("load data from redis finished.", n)
 	//清理redis
 	//clearedis()
+
 	if taskstock, ok := Sysconfig["taskstock"].(map[string]interface{}); ok { //跑存量数据
 		if b, _ := taskstock["open"].(bool); b {
-			endate, _ := taskstock["endate"].(string)
-			taskStock(endate)
+			RunFullData(util.Int64All(taskstock["startTime"]))
+			//			startdate, _ := taskstock["startdate"].(string)
+			//			endate, _ := taskstock["endate"].(string)
+			//			taskStock(startdate, endate)
 		}
+
 	}
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
@@ -245,6 +249,7 @@ func taskInc(mapInfo map[string]interface{}) {
 				info := PreThisInfo(tmp)
 				if info != nil {
 					lockPNCBMap(info)
+					storeLock(info)
 					startProjectMerge(info, tmp)
 					redis.Put(INFOID, thisid, 1, INFOTIMEOUT)
 					currentMegerTime = info.Publishtime
@@ -288,13 +293,13 @@ func taskInc(mapInfo map[string]interface{}) {
 	}
 }
 
-func taskStock(endDate string) {
+func taskStock(startDate, endDate string) {
 	defer func() {
 		<-SingleThread
 	}()
 	defer util.Catch()
 	publishtimes := []map[string]interface{}{}
-	start, _ := time.ParseInLocation(util.Date_Short_Layout, "2015-11-01", time.Local)
+	start, _ := time.ParseInLocation(util.Date_Short_Layout, startDate, time.Local)
 	end, _ := time.ParseInLocation(util.Date_Short_Layout, endDate, time.Local)
 	for {
 		publishtime := map[string]interface{}{
@@ -364,6 +369,7 @@ func taskStock(endDate string) {
 					info := PreThisInfo(tmp)
 					if info != nil {
 						lockPNCBMap(info)
+						storeLock(info)
 						startProjectMerge(info, tmp)
 						redis.Put(INFOID, thisid, 1, INFOTIMEOUT)
 						currentMegerTime = info.Publishtime

+ 30 - 24
udpprojectset/src/projectmeger.go

@@ -73,9 +73,6 @@ func startProjectMerge(thisinfo *Info, tmp map[string]interface{}) {
 	}
 	//合并流程
 	if bNormalScore {
-		PNKeyMap.Store(thisinfo.PNKey, true)
-		PBKeyMap.Store(thisinfo.PBKey, true)
-		PCKeyMap.Store(thisinfo.PCKey, true)
 		if pcbv.Buyer { //有采购单位
 			hasBuyer(pcbv, thisinfo, tmp)
 		} else { //无采购单位
@@ -189,7 +186,7 @@ func noBuyer(p PCBV, thisinfo *Info, tmp map[string]interface{}) {
 			sflag = "invalid"
 		}
 	}
-	//extInfoTag(sflag, thisinfo.Id)
+	extInfoTag(sflag, thisinfo.Id)
 	//go IS.Add(sflag) //数据统计使用
 }
 
@@ -375,30 +372,28 @@ func getComeperProjects(p PCBV, thisinfo *Info) (res []interface{}, pncb []*Comp
 		pncb = append(pncb, pb)
 	}
 	repeatId := map[string]bool{}
-	IdLock.Lock() //此处加id锁,会引进多线程的死锁,对比三个大map数组,找到key相同的项目id数组,并去重
+	//IdLock.Lock() //此处加id锁,会引进多线程的死锁,对比三个大map数组,找到key相同的项目id数组,并去重
 	for _, pv := range pncb {
-		if pv != nil {
-			pv.KeyMap.Lock.Lock()
-			K := pv.KeyMap.Map[pv.Key]
-			if K == nil {
-				K = &Key{&[]string{}, &sync.Mutex{}}
-				pv.KeyMap.Map[pv.Key] = K
-			}
-			pv.K = K
-			pv.K.Lock.Lock()
-			pv.KeyMap.Lock.Unlock()
-			defer pv.K.Lock.Unlock()
-			newarr := []string{}
-			for _, id := range *K.Arr {
-				if !repeatId[id] {
-					newarr = append(newarr, id)
-					repeatId[id] = true
-				}
+		pv.KeyMap.Lock.Lock()
+		K := pv.KeyMap.Map[pv.Key]
+		if K == nil {
+			K = &Key{&[]string{}, &sync.Mutex{}}
+			pv.KeyMap.Map[pv.Key] = K
+		}
+		pv.K = K
+		pv.K.Lock.Lock()
+		pv.KeyMap.Lock.Unlock()
+		defer pv.K.Lock.Unlock()
+		newarr := []string{}
+		for _, id := range *K.Arr {
+			if !repeatId[id] {
+				newarr = append(newarr, id)
+				repeatId[id] = true
 			}
-			pv.IdArr = newarr
 		}
+		pv.IdArr = newarr
 	}
-	IdLock.Unlock()
+	//IdLock.Unlock()
 	for _, pv := range pncb {
 		if len(pv.IdArr) > 0 {
 			res = append(res, redis.Mget(REDISIDS, pv.IdArr))
@@ -581,6 +576,7 @@ func lockPNCBMap(thisinfo *Info) {
 		if ok {
 			break
 		} else {
+			//log.Println("has key store")
 			time.Sleep(100 * time.Millisecond)
 		}
 	}
@@ -588,6 +584,7 @@ func lockPNCBMap(thisinfo *Info) {
 
 //pncbMap解锁
 func unlockPNCBMap(thisinfo *Info) {
+	//log.Println("del key store", thisinfo.PNKey)
 	//if len(thisinfo.PNKey) > 3 {
 	PNKeyMap.Delete(thisinfo.PNKey)
 	//}
@@ -598,3 +595,12 @@ func unlockPNCBMap(thisinfo *Info) {
 	PBKeyMap.Delete(thisinfo.PBKey)
 	//}
 }
+
+// storeLock records thisinfo's PNKey, PBKey and PCKey (each with the value
+// true) in PNKeyMap, PBKeyMap and PCKeyMap respectively. The three stores are
+// performed while holding PncbMayLock, so they happen as one step with respect
+// to other holders of that lock.
+func storeLock(thisinfo *Info) {
+	PncbMayLock.Lock()
+	defer PncbMayLock.Unlock()
+
+	PNKeyMap.Store(thisinfo.PNKey, true)
+	PBKeyMap.Store(thisinfo.PBKey, true)
+	PCKeyMap.Store(thisinfo.PCKey, true)
+}

Some files were not shown because too many files changed in this diff