unknown 6 лет назад
Родитель
Commit
0431dd97ab

+ 17 - 18
src/jy/extract/extract.go

@@ -39,30 +39,28 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ext.IsRun = true
 	ext.InitTestTaskInfo(resultcoll, trackcoll)
 	ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
-	//	ext.InitRulePres()
-	//	ext.InitRuleBacks()
-	//	ext.InitRuleCore()
-	//	ext.InitPkgCore()
-	//	ext.InitTag()
-	//	ext.InitClearFn()
-	//	if ext.IsExtractCity { //版本上控制是否开始城市抽取
-	//		//初始化城市DFA信息
-	//		ext.InitDFA()
-	//	}
-	//	//质量审核
-	//	ext.InitAuditFields()
-	//	ext.InitAuditRule()
-	//	ext.InitAuditClass()
-	//	ext.InitAuditRecogField()
+	ext.InitRulePres()
+	ext.InitRuleBacks()
+	ext.InitRuleCore()
+	ext.InitPkgCore()
+	ext.InitTag()
+	ext.InitClearFn()
+	if ext.IsExtractCity { //版本上控制是否开始城市抽取
+		//初始化城市DFA信息
+		ext.InitDFA()
+	}
+	//质量审核
+	ext.InitAuditFields()
+	ext.InitAuditRule()
+	ext.InitAuditClass()
+	ext.InitAuditRecogField()
 
 	//品牌抽取
 	if ju.Config["brandgoods"].(bool) {
 		ext.InitBrand()
 		ext.InitGoods()
 	}
-
-	return true
-	//return RunExtractTestTask(ext, startId, num)
+	return RunExtractTestTask(ext, startId, num)
 }
 
 func IdTrans(startId string) bson.ObjectId {
@@ -219,6 +217,7 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
 		Province:  qu.ObjToString(doc["area"]),
 		Result:    map[string][]*ju.ExtField{},
 		BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
+		HasTable:  0,
 	}
 	qu.Try(func() {
 		pretreated.AnalyStart(j)

+ 25 - 12
src/jy/extract/extractInit.go

@@ -80,14 +80,14 @@ type ExtractTask struct {
 	AreaToCity        map[string][]*City   //两个文件共用
 	DistrictCityMap   map[string]*City
 	StreetDistrictMap map[string]*District
-	AreaGet           DFA //市全称
-	AreaDistrict      DFA //区或县
-	AreaProvinceGet   DFA //省
-	AreaSimGet        DFA //市简称
-	AreaStreet        DFA //街道
-
-	GoodsGet DFA //商品
-	BrandGet DFA //品牌
+	AreaGet           *ju.DFA //市全称
+	AreaDistrict      *ju.DFA //区或县
+	AreaProvinceGet   *ju.DFA //省
+	AreaSimGet        *ju.DFA //市简称
+	AreaStreet        *ju.DFA //街道
+
+	GoodsGet *ju.DFA //商品
+	BrandGet *ju.DFA //品牌
 }
 
 type ClearTaskInfo struct {
@@ -547,9 +547,10 @@ func InitCityAll(version string) map[string]map[string]interface{} {
 //初始化城市省份敏感词
 func (e *ExtractTask) InitDFA() {
 	defer qu.Catch()
-	e.AreaGet = DFA{}
-	e.AreaProvinceGet = DFA{}
-	e.AreaStreet = DFA{}
+	e.AreaGet = &ju.DFA{}
+	e.AreaDistrict = &ju.DFA{}
+	e.AreaProvinceGet = &ju.DFA{}
+	e.AreaStreet = &ju.DFA{}
 	//初始化map
 	if e.ProvinceMap == nil {
 		e.ProvinceMap = make(map[string]string)
@@ -638,7 +639,7 @@ func (e *ExtractTask) InitDFA() {
 	}
 	//初始化城市简称
 	fn3 := InitCitySim(e.TaskInfo.Version)
-	e.AreaSimGet = DFA{}
+	e.AreaSimGet = &ju.DFA{}
 	for k, v := range fn3 {
 		pb := v["brief"].(string)
 		p := e.ProvinceBrief[pb]
@@ -914,3 +915,15 @@ func (c *ClearTask) InitClearLuas() {
 		}
 	}
 }
+
+// InitGoods replaces e.GoodsGet with a fresh matcher and loads every entry of
+// ju.GoodsConfig into it.
+func (e *ExtractTask) InitGoods() {
+	// Assign first, then fill: the field always points at the matcher being built.
+	e.GoodsGet = new(ju.DFA)
+	goodsWords := ju.GoodsConfig
+	e.GoodsGet.AddWord(goodsWords...)
+}
+
+// InitBrand replaces e.BrandGet with a fresh matcher and loads every entry of
+// ju.BrandConfig into it.
+func (e *ExtractTask) InitBrand() {
+	// Assign first, then fill: the field always points at the matcher being built.
+	e.BrandGet = new(ju.DFA)
+	brandWords := ju.BrandConfig
+	e.BrandGet.AddWord(brandWords...)
+}

+ 0 - 23
src/jy/extract/extractbrandgoods.go

@@ -1,23 +0,0 @@
-package extract
-
-import (
-	qu "qfw/util"
-)
-
-var GoodsConfig []string
-var BrandConfig []string
-
-func init() {
-	qu.ReadConfig("./res/goods.json", &GoodsConfig)
-	qu.ReadConfig("./res/brand.json", &BrandConfig)
-}
-
-//初始化商品
-func (e *ExtractTask) InitGoods() {
-	e.GoodsGet.AddWord(GoodsConfig...)
-}
-
-//初始化品牌
-func (e *ExtractTask) InitBrand() {
-	e.BrandGet.AddWord(BrandConfig...)
-}

+ 3 - 68
src/jy/extract/extractcity.go

@@ -2,6 +2,7 @@ package extract
 
 import (
 	"fmt"
+	ju "jy/util"
 	"log"
 	qu "qfw/util"
 	"strings"
@@ -34,11 +35,6 @@ type Street struct {
 	D    *District
 }
 
-//敏感词
-type DFA struct {
-	Link map[string]interface{}
-}
-
 var SortField []string
 
 func init() {
@@ -70,7 +66,7 @@ func (e *ExtractTask) ExtractDistrict(field []string, bres bool, c, p, id string
 	d := ""
 	for _, str := range field {
 		//log.Println("field===========", str)
-		for pos, GET := range []DFA{e.AreaDistrict, e.AreaStreet} { //先匹配区或县再匹配街道
+		for pos, GET := range []*ju.DFA{e.AreaDistrict, e.AreaStreet} { //先匹配区或县再匹配街道
 			word := GET.CheckSensitiveWord(str)
 			//log.Println("word================", word)
 			if word != "" {
@@ -175,7 +171,7 @@ func (e *ExtractTask) ExtractProvinceCity(province, city, id string, text []stri
 	}
 	//匹配城市
 	if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
-		for pos, GET := range []DFA{e.AreaGet, e.AreaSimGet} { //AreaGet市全称,AreaSimGet省全称和简称
+		for pos, GET := range []*ju.DFA{e.AreaGet, e.AreaSimGet} { //AreaGet市全称,AreaSimGet省全称和简称
 			ws := make([]string, 5)
 			for n, str := range text {
 				if str != "" {
@@ -285,64 +281,3 @@ func (e *ExtractTask) ExtractProvinceCity(province, city, id string, text []stri
 	}
 	return
 }
-
-func (d *DFA) AddWord(keys ...string) {
-	d.AddWordAll(true, keys...)
-}
-
-func (d *DFA) AddWordAll(haskey bool, keys ...string) {
-	if d.Link == nil {
-		d.Link = make(map[string]interface{})
-	}
-	for _, key := range keys {
-		nowMap := &d.Link
-		for i := 0; i < len(key); i++ {
-			kc := key[i : i+1]
-			if v, ok := (*nowMap)[kc]; ok {
-				nowMap, _ = v.(*map[string]interface{})
-			} else {
-				newMap := map[string]interface{}{}
-				newMap["YN"] = "0"
-				(*nowMap)[kc] = &newMap
-				nowMap = &newMap
-			}
-			if i == len(key)-1 {
-				(*nowMap)["YN"] = "1"
-				if haskey {
-					(*nowMap)["K"] = key
-				}
-			}
-		}
-	}
-}
-
-func (d *DFA) CheckSensitiveWord(src string) string {
-	pos := 0
-	nowMap := &d.Link
-	res := ""
-	for i := 0; i < len(src); i++ {
-		word := src[i : i+1]
-		nowMap, _ = (*nowMap)[word].(*map[string]interface{})
-		if nowMap != nil { // 存在,则判断是否为最后一个
-			if pos == 0 {
-				pos = i
-			}
-			if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
-				res = qu.ObjToString((*nowMap)["K"])
-				//pos = 0
-				//break
-			}
-		} else {
-			if res != "" {
-				break
-			} else {
-				nowMap = &d.Link
-				if pos > 0 {
-					i = pos
-					pos = 0
-				}
-			}
-		}
-	}
-	return res
-}

+ 14 - 12
src/jy/extract/extractcity2.go

@@ -2,6 +2,7 @@ package extract
 
 import (
 	db "jy/mongodbutil"
+	ju "jy/util"
 	"log"
 	qu "qfw/util"
 	"strings"
@@ -13,17 +14,18 @@ var ProvinceBrief2 map[string]*Province //只加载一次
 var AreaToCity2 map[string][]*City      //两个文件共用
 var DistrictCityMap2 map[string]*City
 var StreetDistrictMap2 map[string]*District
-var AreaGet2 DFA         //市全称
-var AreaDistrict2 DFA    //区或县
-var AreaProvinceGet2 DFA //省
-var AreaSimGet2 DFA      //市简称
-var AreaStreet2 DFA      //街道
+var AreaGet2 *ju.DFA         //市全称
+var AreaDistrict2 *ju.DFA    //区或县
+var AreaProvinceGet2 *ju.DFA //省
+var AreaSimGet2 *ju.DFA      //市简称
+var AreaStreet2 *ju.DFA      //街道
 
 func InitDFA2() {
 	defer qu.Catch()
-	AreaGet2 = DFA{}
-	AreaProvinceGet2 = DFA{}
-	AreaStreet2 = DFA{}
+	AreaGet2 = &ju.DFA{}
+	AreaDistrict2 = &ju.DFA{}
+	AreaProvinceGet2 = &ju.DFA{}
+	AreaStreet2 = &ju.DFA{}
 	//初始化map
 	if ProvinceMap2 == nil {
 		ProvinceMap2 = make(map[string]string)
@@ -111,7 +113,7 @@ func InitDFA2() {
 	}
 	//初始化城市简称
 	fn3 := InitCitySim("v3.0")
-	AreaSimGet2 = DFA{}
+	AreaSimGet2 = &ju.DFA{}
 	for k, v := range fn3 {
 		pb := v["brief"].(string)
 		p := ProvinceBrief2[pb]
@@ -182,7 +184,7 @@ func FindBuyer() {
 			//开始抽取城市省份
 			bres, c, p := ExtractProvinceCity2("", "", qu.BsonIdToSId(l["_id"]), []string{val})
 			bres, p, c, d := ExtractDistrict2([]string{val}, bres, c, p, qu.BsonIdToSId(l["_id"])) //抽取区或县
-			log.Println(bres, c, p, d)
+			log.Println("---------------------", bres, c, p, d)
 		}
 	}
 }
@@ -233,7 +235,7 @@ func ExtractProvinceCity2(province, city, id string, text []string) (bres bool,
 	}
 	//匹配城市
 	if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
-		for pos, GET := range []DFA{AreaGet2, AreaSimGet2} { //AreaGet市全称,AreaSimGet省全称和简称
+		for pos, GET := range []*ju.DFA{AreaGet2, AreaSimGet2} { //AreaGet市全称,AreaSimGet省全称和简称
 			ws := make([]string, 5)
 			for n, str := range text {
 				if str != "" {
@@ -348,7 +350,7 @@ func ExtractDistrict2(field []string, bres bool, c, p, id string) (bool, string,
 	d := ""
 	for _, str := range field {
 		//log.Println("field===========", str)
-		for pos, GET := range []DFA{AreaDistrict2, AreaStreet2} { //先匹配区或县再匹配街道
+		for pos, GET := range []*ju.DFA{AreaDistrict2, AreaStreet2} { //先匹配区或县再匹配街道
 			word := GET.CheckSensitiveWord(str)
 			//log.Println("word================", word)
 			if word != "" {

+ 19 - 0
src/jy/pretreated/analystep.go

@@ -48,6 +48,7 @@ func AnalyStart(job *util.Job) {
 			//块中再查找表格(块,处理完把值赋到块)
 			t1, _ := ComputeConRatio(bl.Text, 2)
 			if len(t1) > 0 {
+				job.HasTable = 1 //添加标识:文本中有table
 				tabres := AnalyTableV2(t1, job.Category, bl.Title, bl.Text, 2, job.SourceMid)
 				processTableResult(tabres, bl, job)
 				if bl.Title == "" && tabres.BlockTag != "" {
@@ -63,6 +64,7 @@ func AnalyStart(job *util.Job) {
 		bl := &util.Block{}
 		newCon := con
 		if len(tabs) > 0 { //解析表格逻辑
+			job.HasTable = 1 //添加标识:文本中有table
 			newCon = TextAfterRemoveTable(con)
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 			tabres := AnalyTableV2(tabs, job.Category, "", con, 1, job.SourceMid)
@@ -200,6 +202,23 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 			job.BlockPackage[k] = v
 		}
 	}
+	//增加brand
+	if tabres.HasKey != 0 {
+		job.HasKey = tabres.HasKey
+	}
+	if tabres.HasBrand != 0 {
+		job.HasBrand = tabres.HasBrand
+	}
+	if tabres.HasGoods != 0 {
+		job.HasGoods = tabres.HasGoods
+	}
+	job.HasGoods = tabres.HasGoods
+
+	if len(tabres.BrandData) > 0 { //分块table合并
+		for _, v := range tabres.BrandData {
+			job.BrandData = append(job.BrandData, v) //加入job
+		}
+	}
 }
 
 //一行多列 一列多行,按照分块逻辑处理

+ 385 - 2
src/jy/pretreated/analytable.go

@@ -4,6 +4,7 @@ import (
 	"fmt"
 	u "jy/util"
 	qutil "qfw/util"
+	//	"reflect"
 	"regexp"
 	"strings"
 
@@ -14,8 +15,12 @@ import (
 全局变量,主要是一堆判断正则
 **/
 var (
+	//清理表格title中的不需要的内容
+	tabletitleclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/((人民币万元件个公斤))]")
 	//清理表格中是key中包含的空格或数字等
 	tablekeyclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/]+|^[\\d一二三四五六七八九十]+[、.]*|[((【\\[].*?[))】\\]]")
+	//清理表格td中的符号
+	tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_??;;~\\-#\\\\]*")
 	//判断key是金额,对万元的处理
 	moneyreg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
 	//根据表格的内容判断是不是表头,如果含有金额则不是表头
@@ -98,6 +103,7 @@ var (
 	BuyerContacts               = []string{"采购单位联系人", "采购单位联系电话", "采购单位联系地址"}
 	FilterSerial                = regexp.MustCompile(".+[、..::,]")
 	filterTableWror             = regexp.MustCompile("班子成员")
+	underline                   = regexp.MustCompile("_+$")
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -503,6 +509,14 @@ func (table *Table) MergerToTableresult() {
 	if table.TableResult.WinnerOrder == nil || len(table.TableResult.WinnerOrder) == 0 {
 		table.TableResult.WinnerOrder = table.WinnerOrder
 	}
+	//增加brand 并列table
+	if len(table.BrandData) > 0 {
+		for _, v := range table.BrandData {
+			if len(v) > 0 {
+				table.TableResult.BrandData = append(table.TableResult.BrandData, v)
+			}
+		}
+	}
 }
 
 /**
@@ -544,7 +558,7 @@ func (ts *TableResult) Analy() {
 		ts := tn.Analy(contactFormat)
 		for _, tab := range ts {
 			tabs = append(tabs, tab)
-			//log.Println("tab.SortKV.Map", tab.SortKV.Map)
+			fmt.Println("tab.SortKV.Map", tab.SortKV.Map)
 		}
 		//tn.SonTables = append(tn.SonTables, tn)
 	}
@@ -641,6 +655,7 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 		//遍历每行的td
 		tds := sel.ChildrenFiltered("td,th")
 		TR := NewTR(table)
+		tdTextIsNull := true
 		tds.Each(func(m int, selm *goquery.Selection) {
 			//对隐藏列不处理!!!
 			if IsHide(selm) {
@@ -651,8 +666,14 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 			//num++
 			//log.Println(td.SortKV.Keys, td.SortKV.Map)
 			TR.AddTD(td)
+			if td.Val != "" { //删除一个tr,tr中所有td是空值的
+				tdTextIsNull = false
+			}
 		})
-		table.AddTR(TR)
+		//tr中所有td的内容为空 将tr删除
+		if !tdTextIsNull {
+			table.AddTR(TR)
+		}
 	})
 	//重置行列
 	table.ComputeRowColSpan()
@@ -758,6 +779,9 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
 			table.TdContactFormat(contactFormat)
 			//开始查找kv,核心模块
 			table.FindKV()
+			fmt.Println("KvMap----------", table.SortKV.Map)
+			//table中抽取品牌
+			//table.analyBrand()
 			//判断是否是多包,并处理分包的
 			table.CheckMultiPackageByTable()
 			str := "\n"
@@ -1283,6 +1307,19 @@ func (table *Table) FindKV() {
 			if bcon {
 				continue
 			}
+			if tr.TDs[0].StartRow > 0 {
+				numbh := 0
+				for _, td := range tr.TDs {
+					if td.BH {
+						numbh++
+					}
+				}
+				if numbh > 0 && numbh <= len(tr.TDs)/2 {
+					direct, vdirect = 1, 2
+				} else {
+					direct, vdirect = 2, 1
+				}
+			}
 			for _, td := range tr.TDs {
 				/**
 				rt := table.StartAndEndRation[fmtkey("r", td.StartCol, td.EndCol)]
@@ -2618,3 +2655,349 @@ L:
 	//		}
 	//	}
 }
+
+//func (table *Table) analyBrand() {
+//	//产品名称 品牌 规格 单价 单位 数量  小计 质保期
+//	lineMap := make(map[string]map[string]string)
+//	lineMapArr := make(map[string]map[string][]string)
+//	brandRule := u.BrandRules
+//	//将val为数组和string的分开
+//	for key, val := range table.SortKV.Map {
+//		key = regReplAllSpace.ReplaceAllString(key, "")
+//		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
+//		kind := reflect.TypeOf(val).String()
+//		//处理多个key相同的数据
+//		if kind == "[]string" { //val为数组 {"数量":["1","2","3"]}
+//			/*
+//				{
+//					"商品":["","",],
+//					"商品_"["",""],
+//				}
+
+//			*/
+//			realTypeVal := val.([]string)
+//			hasGoods(table, realTypeVal) //判断val中是否含产品
+//			hasBrand(table, realTypeVal) //判断val中是否含品牌
+//			line := underline.FindString(key)
+//			lineValMap := lineMapArr[line]
+//			i := 1
+//		L:
+//			for { //去除数组空数据
+//				last := realTypeVal[len(realTypeVal)-i]
+//				if last == "" {
+//					i++
+//					if i > len(realTypeVal) {
+//						break
+//					}
+//					goto L
+//				} else {
+//					break
+//				}
+//			}
+//			dislodgeNull := realTypeVal[:(len(realTypeVal) - i + 1)] //去除数组中空数据
+//			if len(lineValMap) == 0 && len(realTypeVal) != 0 {       //没有数据
+//				lineMapArr[line] = map[string][]string{key: dislodgeNull}
+//			} else { //新增数据
+//				if len(dislodgeNull) != 0 {
+//					lineValMap[key] = dislodgeNull
+//				}
+//			}
+//		} else if kind == "string" { //val为字符串 {"数量":"1"}
+//			/*
+//				{
+//					"商品:"",名称:"",
+//					"商品_:"",名称_:"",
+//					"商品__:"",名称__:"",
+//				}
+//			*/
+
+//			realTypeVal := val.(string)
+//			afterFilter := tabletdclear.ReplaceAllString(realTypeVal, "")
+//			if afterFilter == "" { //空val值舍弃
+//				continue
+//			}
+//			hasGoods(table, realTypeVal) //判断val中是否含产品
+//			hasBrand(table, realTypeVal) //判断val中是否含品牌
+//			line := underline.FindString(key)
+//			lineValMap := lineMap[line]
+//			if len(lineValMap) == 0 { //没有数据
+//				lineMap[line] = map[string]string{key: realTypeVal}
+//			} else { //新增数据
+//				lineValMap[key] = realTypeVal
+//			}
+//		}
+//	}
+//	//fmt.Println("lineMapArr------------------------------", lineMapArr)
+//	//fmt.Println("lineMap------------------------------", lineMap)
+//	//处理数组数据后,匹配必须title和替换要保存的title
+//	if len(lineMapArr) > 0 {
+//		for _, aMap := range lineMapArr {
+//			//u.Debug(aMap)
+//			//minNum := 0
+//			maxNum := 0
+//			arrcount := 0                   //记录key是否存在必须title(数组数据)
+//			ka := make(map[string][]string) //最终存储数据
+//			for k0, v0 := range aMap {
+//				//匹配必须title
+//				for nameM, r := range brandRule["must"] {
+//					if convert(k0, r) { //匹配成功
+//						if len(ka[nameM]) != 0 && strings.Contains(k0, "描述") { //防止k0匹配到多次 和特殊情况 物料名称 物料描述同时出现
+//							continue
+//						}
+//						ka[nameM] = v0
+//						arrcount++
+//					}
+//				}
+//				//fmt.Println(arrcount, k0, v0)
+//				//替换其它要保存字段
+//				for nameR, r := range brandRule["replace"] {
+//					if convert(k0, r) { //匹配成功
+//						ka[nameR] = v0
+//					}
+//				}
+//			}
+//			//找最终存储数据的最小len(arr)
+//			//			for _, vf := range ka {
+//			//				//找最短的数组
+//			//				lenVal := len(vf)
+//			//				if minNum == 0 || minNum > lenVal { //maxNum = len(最短数组)
+//			//					minNum = lenVal
+//			//				}
+//			//			}
+//			//找最终存储数据的最大len(arr),小的补空
+//			for _, vf1 := range ka {
+//				lenVal := len(vf1)
+//				if lenVal > maxNum {
+//					maxNum = lenVal
+//				}
+//			}
+//			finishKa := make(map[string][]string)
+//			for vf2K, vf2 := range ka {
+//				if len(vf2) < maxNum {
+//					lenMv := maxNum - len(vf2)
+//					for i := 0; i < lenMv; i++ {
+//						vf2 = append(vf2, "")
+//					}
+//				}
+//				finishKa[vf2K] = vf2
+//			}
+//			hasKey(table, arrcount) //是否匹配到两个以上的key
+//			if arrcount >= 1 {
+//				finishData := dealArrData(maxNum, finishKa)
+//				table.BrandData = append(table.BrandData, finishData)
+//			}
+//		}
+//	}
+//	//处理string数据后,匹配必须title和替换要保存的title
+//	if len(lineMap) > 0 {
+//		for _, sMap := range lineMap {
+//			strcount := 0 //记录key是否存在必须title(字符串数据)
+//			//fmt.Println("---------------", sMap)
+//			endStrMap := make(map[string]string)
+//			for k1, v1 := range sMap {
+//				//匹配必须title
+//				for nameM, r := range brandRule["must"] {
+//					if convert(k1, r) { //匹配成功
+//						endStrMap[nameM] = v1
+//						strcount++
+//						//fmt.Println(strcount, k1, v1)
+//					}
+//				}
+//				//替换其它要保存字段
+//				for nameR, r := range brandRule["replace"] {
+//					if convert(k1, r) { //匹配成功
+//						endStrMap[nameR] = v1
+//					}
+//				}
+//			}
+//			//原始字符串数据处理
+//			hasKey(table, strcount) //是否匹配到两个以上的key
+//			if strcount >= 1 {
+//				finishData := dealStrData(endStrMap)
+//				table.BrandData = append(table.BrandData, finishData)
+//			}
+//		}
+//	}
+//	//fmt.Println("finish---", table.BrandData)
+//}
+
+//func dealArrData(minNum int, ka map[string][]string) []map[string]string {
+//	for k2, v2 := range ka {
+//		//处理数组长度不相等,使长度一致
+//		if len(v2) > minNum {
+//			ka[k2] = v2[:minNum]
+//		}
+//	}
+//	finalData := assembleData(ka)
+//	if len(finalData) > 0 {
+//		return finalData
+//	}
+//	return nil
+
+//}
+//func dealStrData(kv map[string]string) []map[string]string {
+//	finalData := assembleData(kv)
+//	if len(finalData) > 0 {
+//		return finalData
+//	}
+//	return nil
+
+//}
+
+////组装数据,每一行的数据为一数据集合
+//func assembleData(m interface{}) []map[string]string {
+//	defer qutil.Catch()
+//	/*
+//		{
+//			"itemname":["计算机","打印机","机柜"],
+//			"number"  :["1","12","4"]
+//		}
+//	*/
+//	datas := []map[string]string{}
+//	switch reflect.TypeOf(m).String() {
+//	case "map[string][]string": //数组数据
+//		realTypeM := m.(map[string][]string)
+//		//根据数组数据的顺序 将多个数组中索引相同的数据拼装成一个map,并将这多个map放入一个arr
+//		/*
+//			arr1 ["a1","b1","c1"]
+//			arr2 ["a2","b2","c2"]
+
+//			[
+//				{"a1","a2"},
+//				{"b1","b2"},
+//				{"c1","c2"}
+//			]
+//		*/
+//		//start
+//		for k3, v3 := range realTypeM {
+//			for _, val := range v3 {
+//				data := make(map[string]string)
+//				data[k3] = val
+//				datas = append(datas, data)
+//			}
+//			break
+//		}
+//		for i, data := range datas {
+//			for k4, v4 := range realTypeM {
+//				if i < len(v4) { //数组数据长度不一致
+//					if v4[i] != " " {
+//						data[k4] = v4[i]
+//					} else {
+//						delete(data, k4)
+//						continue
+//					}
+//				} else {
+//					fmt.Println("err table")
+//					continue
+//				}
+//			}
+//			datas[i] = data
+//		}
+//		//end
+//		for _, fdv := range datas { //清除空数据和只含特殊符号的数据
+//			for fmk, fmv := range fdv {
+//				if tabletdclear.ReplaceAllString(fmv, "") == "" {
+//					delete(fdv, fmk)
+//				}
+//			}
+//		}
+//	case "map[string]string": //字符串数据
+//		realTypeM := m.(map[string]string)
+//		datas = append(datas, realTypeM)
+//	default:
+//	}
+//	return datas
+//}
+
+//func convert(key, r string) bool {
+//	flag := false
+//	//fmt.Println("key1---", key)
+//	key = tabletitleclear.ReplaceAllString(key, "")
+//	//fmt.Println("key2---", key)
+//	reg, err := regexp.Compile(r)
+//	if err != nil {
+//		fmt.Println("reg err:", err)
+//		return flag
+//	}
+//	flag = reg.MatchString(key)
+//	//fmt.Println(key, "	", r, "	", flag)
+//	return flag
+//}
+
+//func hasKey(table *Table, n int) {
+//	//fmt.Println("key匹配到:", n, table.TableResult.HasKey)
+//	if table.TableResult.HasKey == 1 {
+//		return
+//	}
+//	if n >= 1 {
+//		table.TableResult.HasKey = 1
+//	}
+//	//fmt.Println(table.TableResult.HasKey)
+//}
+
+////是否有商品
+//func hasGoods(table *Table, data interface{}) {
+//	if table.TableResult.HasGoods == 1 {
+//		return
+//	}
+//	sData, ok := data.(string)
+//	proFlag := false
+//	if ok { //string数据检查goods
+//		if sData != "" {
+//			//			if name == "goods" {
+//			//				return GoodsGet.CheckSensitiveWord(src)
+//			//			} else {
+//			//				return BrandGet.CheckSensitiveWord(src)
+//			//			}
+
+//			proFlag = u.CheckSensitiveWord("goods", sData)
+//			if proFlag {
+//				table.TableResult.HasGoods = 1
+//			}
+//		}
+//	} else { //arr数据检查goods
+//		arrData := data.([]string)
+//		if len(arrData) > 0 {
+//			for _, src := range arrData {
+//				if src != "" {
+//					proFlag = u.CheckSensitiveWord("pro", src)
+//					if proFlag {
+//						table.TableResult.HasGoods = 1
+//						break
+//					}
+//				}
+//			}
+//		}
+//	}
+//}
+
+////是否有品牌
+//func hasBrand(table *Table, data interface{}) {
+//	if table.TableResult.HasBrand == 1 {
+//		return
+//	}
+//	sData, ok := data.(string)
+//	brandFlag := false
+//	if ok { //string数据检查goods
+//		if sData != "" {
+//			brandFlag = u.CheckSensitiveWord("brand", sData)
+//			//fmt.Println("--------------", sData, brandFlag)
+//			if brandFlag {
+//				table.TableResult.HasBrand = 1
+//			}
+//		}
+//	} else { //arr数据检查goods
+//		arrData := data.([]string)
+//		if len(arrData) > 0 {
+//			for _, src := range arrData {
+//				if src != "" {
+//					brandFlag = u.CheckSensitiveWord("brand", src)
+//					if brandFlag {
+//						table.TableResult.HasBrand = 1
+//						break
+//					}
+//				}
+//			}
+//		}
+//	}
+//}

+ 30 - 0
src/jy/pretreated/tablev2.go

@@ -29,6 +29,11 @@ type TableResult struct {
 	SortKV         *SortMap             //全局KVmap值,标准化处理过的
 	SortKVWeight   map[string]int       //全局KVmap值,标准化处理过的
 	WinnerOrder    []map[string]interface{}
+	BrandData      [][]map[string]string
+	HasKey         int //有key
+	HasVal         int //有val
+	HasBrand       int //有品牌
+	HasGoods       int //有商品
 }
 
 //快速创建TableResult对象
@@ -148,6 +153,26 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 			//				td.TR.Table.StandKV[k] = sonts.SortKV.Map[k].(string)
 			//				td.TR.Table.StandKVWeight[k] = sonts.SortKVWeight[k]
 			//}
+			//增加brand (子表)
+			//fmt.Println("sonsHasKey=============", sonts.HasKey)
+			//fmt.Println("sonsHasGoods========", sonts.HasGoods)
+			//fmt.Println("sonsHasBrand========", sonts.HasBrand)
+			if sonts.HasKey != 0 {
+				td.TR.Table.TableResult.HasKey = sonts.HasKey
+			}
+			if sonts.HasGoods != 0 {
+				td.TR.Table.TableResult.HasGoods = sonts.HasGoods
+			}
+			if sonts.HasBrand != 0 {
+				td.TR.Table.TableResult.HasBrand = sonts.HasBrand
+			}
+			if sonts.BrandData != nil && len(sonts.BrandData) > 0 { //子table
+				for _, v := range sonts.BrandData {
+					if len(v) > 0 {
+						td.TR.Table.TableResult.BrandData = append(td.TR.Table.TableResult.BrandData, v)
+					}
+				}
+			}
 			if sonts.WinnerOrder != nil && len(sonts.WinnerOrder) > 0 {
 				td.TR.Table.TableResult.WinnerOrder = sonts.WinnerOrder
 			}
@@ -469,6 +494,11 @@ type Table struct {
 	WinnerOrder            []map[string]interface{}
 	BSplit                 bool //是否是有一个表拆分成的多个表
 	BHeader                bool //拆分表是否有表头
+	BrandData              [][]map[string]string
+	HasKey                 int //有key
+	HasVal                 int //有val
+	HasBrand               int //有品牌
+	HasGoods               int //有商品
 }
 
 func NewTable(Html string, TableResult *TableResult, tab *goquery.Selection) *Table {

+ 69 - 0
src/jy/util/util.go

@@ -12,10 +12,19 @@ import (
 	"gopkg.in/natefinch/lumberjack.v2"
 )
 
+// DFA is a keyword matcher (the original comment labels it "sensitive words").
+type DFA struct {
+	Link map[string]interface{} // root of the nested-map tree built by AddWordAll and walked by CheckSensitiveWord
+}
+
 var syncint chan bool //获取下标锁
 var Config map[string]interface{}
 var Se = qu.SimpleEncrypt{Key: "topnet@extract"}
 
+var BrandRules map[string]map[string]string // rule table; filled from ./res/brandrule.json by main's init
+var GoodsConfig []string // goods keyword list; filled from ./res/goods.json by main's init
+var BrandConfig []string // brand keyword list; filled from ./res/brand.json by main's init
+
 func init() {
 	//输出日志配置,多输出源
 	filelog := &lumberjack.Logger{
@@ -71,3 +80,63 @@ func DeepCopy(value interface{}) interface{} {
 	}
 	return value
 }
+func (d *DFA) AddWord(keys ...string) {
+	d.AddWordAll(true, keys...)
+}
+
+func (d *DFA) AddWordAll(haskey bool, keys ...string) {
+	if d.Link == nil {
+		d.Link = make(map[string]interface{})
+	}
+	for _, key := range keys {
+		nowMap := &d.Link
+		for i := 0; i < len(key); i++ {
+			kc := key[i : i+1]
+			if v, ok := (*nowMap)[kc]; ok {
+				nowMap, _ = v.(*map[string]interface{})
+			} else {
+				newMap := map[string]interface{}{}
+				newMap["YN"] = "0"
+				(*nowMap)[kc] = &newMap
+				nowMap = &newMap
+			}
+			if i == len(key)-1 {
+				(*nowMap)["YN"] = "1"
+				if haskey {
+					(*nowMap)["K"] = key
+				}
+			}
+		}
+	}
+}
+
+func (d *DFA) CheckSensitiveWord(src string) string {
+	pos := 0
+	nowMap := &d.Link
+	res := ""
+	for i := 0; i < len(src); i++ {
+		word := src[i : i+1]
+		nowMap, _ = (*nowMap)[word].(*map[string]interface{})
+		if nowMap != nil { // 存在,则判断是否为最后一个
+			if pos == 0 {
+				pos = i
+			}
+			if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
+				res = qu.ObjToString((*nowMap)["K"])
+				//pos = 0
+				//break
+			}
+		} else {
+			if res != "" {
+				break
+			} else {
+				nowMap = &d.Link
+				if pos > 0 {
+					i = pos
+					pos = 0
+				}
+			}
+		}
+	}
+	return res
+}

+ 3 - 0
src/main.go

@@ -17,6 +17,9 @@ import (
 
 func init() {
 	qu.ReadConfig(&util.Config)
+	qu.ReadConfig("./res/brandrule.json", &util.BrandRules)
+	qu.ReadConfig("./res/goods.json", &util.GoodsConfig)
+	qu.ReadConfig("./res/brand.json", &util.BrandConfig)
 	//初始化mongo连接
 	util.InitMgoPool()
 	//初始化redis

+ 1 - 1
src/main_test.go

@@ -14,7 +14,7 @@ import (
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")
-	extract.StartExtractTestTask("5b8f804025e29a290415aee1", "5c2a49d0a5cb26b9b766bd98", "1", "mxs_v3", "mxs_v3")
+	extract.StartExtractTestTask("5b8f804025e29a290415aee1", "5caafa83a5cb26b9b7ec03b7", "1", "mxs_v3", "mxs_v3")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }