zhangjinkun 6 年之前
父節點
當前提交
5d16a026bf
共有 2 個文件被更改,包括 35 次插入和 28 次删除
  1. 4 4
      src/config.json
  2. 31 24
      src/jy/extract/extract.go

+ 4 - 4
src/config.json

@@ -11,6 +11,8 @@
     "saveresult": false,
     "fieldscore": true,
     "qualityaudit": true,
+    "saveblock": false,
+    "filelength": 100000,
     "iscltlog": false,
     "brandgoods": true,
     "udptaskid": "5be107e600746bf92debf080",
@@ -51,11 +53,9 @@
             {
                 "zoneid": "cn-beijing-h",
                 "LaunchTemplateId4": "lt-2ze5ir54gy4ui8okr71f",
-                "LaunchTemplateId8": "	lt-2ze5fzxwgt8jcqczvmjy",
+                "LaunchTemplateId8": "lt-2ze5fzxwgt8jcqczvmjy",
                 "vswitchid": "vsw-2ze1n1k3mo3fv2irsfdps"
             }
         ]
-    },
-    "filelength": 100000,
-    "saveblock": true
+    }
 } 

+ 31 - 24
src/jy/extract/extract.go

@@ -23,13 +23,13 @@ import (
 )
 
 var (
-	lock    sync.RWMutex
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask          //任务列表
-	ClearTaskList map[string]*ClearTask            //清理任务列表
-	saveLimit     = 200                            //抽取日志批量保存
-	PageSize      = 5000                           //查询分页
+	lock          sync.RWMutex
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 200                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -372,12 +372,12 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					if vc.Field == "projectname" {
 						if len(j.Result[vc.Field]) < 1 {
 							items := make([]*ju.ScoreItem, 1)
-							items[0] = &ju.ScoreItem{Des:"项目名称未能抽取到,标题来凑初始化",ExtFrom: "title", Value: j.Title, Score: 4}
+							items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
 							field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
 							if tmp["blocktag"] != nil {
 								field.BlockTag = tmp["blocktag"].(map[string]bool)
 							}
-							j.Result[vc.Field] = append(j.Result[vc.Field],field)
+							j.Result[vc.Field] = append(j.Result[vc.Field], field)
 							//j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
 						}
 					}
@@ -412,13 +412,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 					//项目名称未能抽取到,标题来凑
 					if vc.Field == "projectname" {
 						items := make([]*ju.ScoreItem, 1)
-						items[0] = &ju.ScoreItem{Des:"项目名称未能抽取到,标题来凑初始化",ExtFrom: "title", Value: j.Title, Score: 4}
+						items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
 						field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
 						if len(j.Result[vc.Field]) < 1 {
 							if tmp["blocktag"] != nil {
 								field.BlockTag = tmp["blocktag"].(map[string]bool)
 							}
-							j.Result[vc.Field] = append(j.Result[vc.Field],field)
+							j.Result[vc.Field] = append(j.Result[vc.Field], field)
 							//j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
 						}
 					}
@@ -676,15 +676,15 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 						if tmp["blocktag"] != nil {
 							field.BlockTag = tmp["blocktag"].(map[string]bool)
 						}
-						item := &ju.ScoreItem{Des:"初始化",Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0}
-						if tmp["scoreitem"] == nil{
+						item := &ju.ScoreItem{Des: "初始化", Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0}
+						if tmp["scoreitem"] == nil {
 							scoreItems := make([]*ju.ScoreItem, 0)
 							scoreItems = append(scoreItems, item)
 							field.ScoreItem = scoreItems
-						}else {
+						} else {
 							field.ScoreItem = append(field.ScoreItem, item)
 						}
-						j.Result[k] = append(j.Result[k],field)
+						j.Result[k] = append(j.Result[k], field)
 					}
 				}
 			}
@@ -703,11 +703,18 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 		//}
 		//块抽取
 		if in.Field != "" {
-			for _, v := range j.Block {
-				extinfo := extRegCoreToResult(extfrom, v.Text, &v.Tag, j, in)
+			if extfrom == "title" {
+				extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]bool{"title": true}, j, in)
 				if len(extinfo) > 0 {
 					AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 				}
+			} else {
+				for _, v := range j.Block {
+					extinfo := extRegCoreToResult(extfrom, v.Text, &v.Tag, j, in)
+					if len(extinfo) > 0 {
+						AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+					}
+				}
 			}
 		}
 	}
@@ -918,13 +925,13 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 						if tmp["blocktag"] != nil {
 							exfield.BlockTag = tmp["blocktag"].(map[string]bool)
 						}
-						item := ju.ScoreItem{Des:"初始化",Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
+						item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
 						if tmp["scoreitem"] == nil {
 							sitems := make([]*ju.ScoreItem, 0)
 							sitems = append(sitems, &item)
 							exfield.ScoreItem = sitems
 						} else {
-							exfield.ScoreItem = append(exfield.ScoreItem , &item)
+							exfield.ScoreItem = append(exfield.ScoreItem, &item)
 						}
 						j.Result[k] = append(j.Result[k], &exfield)
 						//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
@@ -960,19 +967,19 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
 			if j.Result[v.Field] == nil {
 				j.Result[v.Field] = [](*ju.ExtField){}
 			}
-			field := &ju.ExtField{Field: v.Field, Code:v.Code, RuleText:v.RuleText,Type: "regexp",MatchType: "regcontent", ExtFrom:extfrom,Value: val,Score: 0}
+			field := &ju.ExtField{Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
 			if tmp["blocktag"] != nil {
 				field.BlockTag = tmp["blocktag"].(map[string]bool)
 			}
-			item := ju.ScoreItem{Des:"初始化",Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
+			item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
 			if tmp["scoreitem"] == nil {
 				sitems := make([]*ju.ScoreItem, 0)
 				sitems = append(sitems, &item)
 				field.ScoreItem = sitems
 			} else {
-				field.ScoreItem = append(field.ScoreItem , &item)
+				field.ScoreItem = append(field.ScoreItem, &item)
 			}
-			j.Result[v.Field] = append(j.Result[v.Field],field )
+			j.Result[v.Field] = append(j.Result[v.Field], field)
 		}
 	}
 	return extinfo
@@ -1418,7 +1425,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库