Ver Fonte

抽取正则匹配

fengweiqiang há 6 anos atrás
pai
commit
f4ddb1fdf0
2 ficheiros alterados com 73 adições e 18 exclusões
  1. 62 9
      src/jy/extract/extract.go
  2. 11 9
      src/jy/pretreated/analystep.go

+ 62 - 9
src/jy/extract/extract.go

@@ -24,12 +24,12 @@ import (
 var (
 	lock, lockrule, lockclear sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 200                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 200                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -892,7 +892,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if v.RegCore.NumSign == -1 { //正负值修正
 							val = "-" + val
 						}
-						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: val, Value: val}
+						exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
 						if extfrom == "title" {
 							exfield.Score = 4
 						}
@@ -919,6 +919,59 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 					}
 				}
 			}
+			if len(extinfo) == 0 {
+				regArr := strings.Split(v.RuleText, "__")
+				//fmt.Println(regArr[0])
+				if len(regArr) > 0 {
+					reg, err := regexp.Compile(regArr[0])
+					if err == nil {
+						datavals := reg.FindStringSubmatch(text)
+						tmps := []map[string]interface{}{}
+						for _, value := range datavals {
+							if value == "" {
+								continue
+							}
+							tmp := map[string]interface{}{
+								"field":     v.Field,
+								"code":      v.Code + "去除__*后",
+								"ruletext":  regArr[0],
+								"extfrom":   extfrom,
+								"value":     value,
+								"type":      "regexp",
+								"matchtype": "regcontent",
+								"blocktag":  *tag,
+							}
+							tmps = append(tmps, tmp)
+							extinfo[v.Field] = tmps
+
+							exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
+							if extfrom == "title" {
+								exfield.Score = 4
+							}
+							if tmp["blocktag"] != nil {
+								exfield.BlockTag = tmp["blocktag"].(map[string]bool)
+							}
+							item := ju.ScoreItem{Des: "初始化抽取规则去除__*", Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: value}
+							if extfrom == "title" {
+								item.Score = 4
+							}
+							if strings.Contains(value, "\n") {
+								item.Score -= 1
+								exfield.Score -= 1
+							}
+							if tmp["scoreitem"] == nil {
+								sitems := make([]*ju.ScoreItem, 0)
+								sitems = append(sitems, &item)
+								exfield.ScoreItem = sitems
+							} else {
+								exfield.ScoreItem = append(exfield.ScoreItem, &item)
+							}
+							j.Result[v.Field] = append(j.Result[v.Field], &exfield)
+							//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
+						}
+					}
+				}
+			}
 		}
 	} else {
 		pos := v.RegCore.Reg.FindStringIndex(text)
@@ -948,7 +1001,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
-			field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: val, Value: val}
+			field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
 			if extfrom == "title" {
 				field.Score = 4
 			}
@@ -1501,7 +1554,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 11 - 9
src/jy/pretreated/analystep.go

@@ -30,7 +30,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 {                                                //有分块
+	if len(blockArrays) > 0 { //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -41,15 +41,15 @@ func AnalyStart(job *util.Job) {
 			t1, _ := ComputeConRatio(bl.Text, 2)
 			if len(t1) > 0 {
 				job.HasTable = 1
-				for i:=0;i<len(tabs);i++{
+				for i := 0; i < len(tabs); i++ {
 					bl := &util.Block{}
 					//添加标识:文本中有table
 					tabres := AnalyTableV2(t1[0], job.Category, bl.Title, bl.Text, 2, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
-					processTableResult(tabres, bl, job)                                                          //分析table解析结果
+					processTableResult(tabres, bl, job)                                                             //分析table解析结果
 					if bl.Title == "" && tabres.BlockTag != "" {
 						bl.Title = tabres.BlockTag
 					}
-					if len(bl.TableKV.Kv)>0{
+					if len(bl.TableKV.Kv) > 0 {
 						bl.Text = tabs[i].Text()
 						job.Block = append(job.Block, bl)
 					}
@@ -71,15 +71,15 @@ func AnalyStart(job *util.Job) {
 			job.HasTable = 1 //添加标识:文本中有table
 			newCon = TextAfterRemoveTable(con)
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
-			for i:=0;i<len(tabs);i++{
+			for i := 0; i < len(tabs); i++ {
 				bl := &util.Block{}
 				//添加标识:文本中有table
-				tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock)//解析表格入口 返回:汇总表格对象
-				processTableResult(tabres, bl, job)                                                          //分析table解析结果
+				tabres := AnalyTableV2(tabs[i], job.Category, "", con, 1, job.SourceMid, job.RuleBlock) //解析表格入口 返回:汇总表格对象
+				processTableResult(tabres, bl, job)                                                     //分析table解析结果
 				if bl.Title == "" && tabres.BlockTag != "" {
 					bl.Title = tabres.BlockTag
 				}
-				if len(bl.TableKV.Kv) >0 {
+				if len(bl.TableKV.Kv) > 0 {
 					bl.Text = tabs[i].Text()
 					job.Block = append(job.Block, bl)
 				}
@@ -122,17 +122,19 @@ func FindProjectCode(newCon string, job *util.Job) {
 	var proCode string
 	proCode = projectcodeReg.FindString(newCon)
 	blCode := &util.Block{}
-	blCode.Text = proCode
 	if proCode != "" {
 		ckv := GetKVAll(proCode, job.Title, nil, 1)
 		blCode.ColonKV = ckv
+		blCode.Text = proCode
 		job.Block = append(job.Block, blCode)
 	} else if proCode = projectcodeReg2.FindString(newCon); proCode != "" {
 		ckv := GetKVAll(proCode, job.Title, nil, 1)
 		blCode.ColonKV = ckv
+		blCode.Text = proCode
 		job.Block = append(job.Block, blCode)
 	} else if proCode = projectcodeReg3.FindString(newCon); proCode != "" {
 		ckv := GetKVAll(proCode, job.Title, nil, 1)
+		blCode.Text = proCode
 		blCode.ColonKV = ckv
 		job.Block = append(job.Block, blCode)
 	}