|
@@ -13,6 +13,7 @@ import (
|
|
|
"reflect"
|
|
|
"regexp"
|
|
|
"strconv"
|
|
|
+ "strings"
|
|
|
"sync"
|
|
|
"time"
|
|
|
"unicode/utf8"
|
|
@@ -370,13 +371,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
//项目名称未能抽取到,标题来凑
|
|
|
if vc.Field == "projectname" {
|
|
|
if len(j.Result[vc.Field]) < 1 {
|
|
|
+ items := make([]*ju.ScoreItem, 1)
|
|
|
+ items[0] = &ju.ScoreItem{Des:"项目名称未能抽取到,标题来凑初始化",ExtFrom: "title", Value: j.Title, Score: 4}
|
|
|
+ field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
- j.Result[vc.Field] = append(j.Result[vc.Field],
|
|
|
- &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
|
|
|
- } else {
|
|
|
- j.Result[vc.Field] = append(j.Result[vc.Field],
|
|
|
- &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
|
|
|
+ field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
}
|
|
|
+ j.Result[vc.Field] = append(j.Result[vc.Field],field)
|
|
|
//j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
|
|
|
}
|
|
|
}
|
|
@@ -410,12 +411,14 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
|
|
|
//项目名称未能抽取到,标题来凑
|
|
|
if vc.Field == "projectname" {
|
|
|
+ items := make([]*ju.ScoreItem, 1)
|
|
|
+ items[0] = &ju.ScoreItem{Des:"项目名称未能抽取到,标题来凑初始化",ExtFrom: "title", Value: j.Title, Score: 4}
|
|
|
+ field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
|
|
|
if len(j.Result[vc.Field]) < 1 {
|
|
|
if tmp["blocktag"] != nil {
|
|
|
- j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
|
|
|
- } else {
|
|
|
- j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
|
|
|
+ field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
}
|
|
|
+ j.Result[vc.Field] = append(j.Result[vc.Field],field)
|
|
|
//j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
|
|
|
}
|
|
|
}
|
|
@@ -669,13 +672,19 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
for _, tmp := range tmps {
|
|
|
+ field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
- j.Result[k] = append(j.Result[k],
|
|
|
- &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
|
|
|
- } else {
|
|
|
- j.Result[k] = append(j.Result[k],
|
|
|
- &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0})
|
|
|
+ field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
+ }
|
|
|
+ item := &ju.ScoreItem{Des:"初始化",Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0}
|
|
|
+ if tmp["scoreitem"] == nil{
|
|
|
+ scoreItems := make([]*ju.ScoreItem, 0)
|
|
|
+ scoreItems = append(scoreItems, item)
|
|
|
+ field.ScoreItem = scoreItems
|
|
|
+ }else {
|
|
|
+ field.ScoreItem = append(field.ScoreItem, item)
|
|
|
}
|
|
|
+ j.Result[k] = append(j.Result[k],field)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -685,11 +694,20 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
} else {
|
|
|
//全文正则
|
|
|
- text := qu.ObjToString(doc[extfrom])
|
|
|
+ //text := qu.ObjToString(doc[extfrom])
|
|
|
+ //if in.Field != "" {
|
|
|
+ // extinfo := extRegCoreToResult(extfrom, text, j, in)
|
|
|
+ // if len(extinfo) > 0 {
|
|
|
+ // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //块抽取
|
|
|
if in.Field != "" {
|
|
|
- extinfo := extRegCoreToResult(extfrom, text, j, in)
|
|
|
- if len(extinfo) > 0 {
|
|
|
- AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ for _, v := range j.Block {
|
|
|
+ extinfo := extRegCoreToResult(extfrom, v.Text, &v.Tag, j, in)
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -869,7 +887,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
}
|
|
|
|
|
|
//正则提取结果
|
|
|
-func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
|
|
|
+func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
|
|
|
defer qu.Catch()
|
|
|
extinfo := map[string][]map[string]interface{}{}
|
|
|
if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
|
|
@@ -891,15 +909,24 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
|
|
|
"value": val,
|
|
|
"type": "regexp",
|
|
|
"matchtype": "regcontent",
|
|
|
+ "blocktag": *tag,
|
|
|
}
|
|
|
tmps = append(tmps, tmp)
|
|
|
extinfo[k] = tmps
|
|
|
- if val != "" {
|
|
|
+ if strings.TrimSpace(val) != "" {
|
|
|
+ exfield := ju.ExtField{Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ exfield.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
+ }
|
|
|
+ item := ju.ScoreItem{Des:"初始化",Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
|
|
|
+ if tmp["scoreitem"] == nil {
|
|
|
+ sitems := make([]*ju.ScoreItem, 0)
|
|
|
+ sitems = append(sitems, &item)
|
|
|
+ exfield.ScoreItem = sitems
|
|
|
} else {
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{nil, k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ exfield.ScoreItem = append(exfield.ScoreItem , &item)
|
|
|
}
|
|
|
+ j.Result[k] = append(j.Result[k], &exfield)
|
|
|
//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
}
|
|
|
}
|
|
@@ -926,17 +953,26 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
|
|
|
"value": val,
|
|
|
"type": "regexp",
|
|
|
"matchtype": "regcontent",
|
|
|
+ "blocktag": *tag,
|
|
|
}
|
|
|
tmps = append(tmps, tmp)
|
|
|
extinfo[v.Field] = tmps
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- if tmp["blocktag"] != nil{
|
|
|
- j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
- }else {
|
|
|
- j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{nil, v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ field := &ju.ExtField{Field: v.Field, Code:v.Code, RuleText:v.RuleText,Type: "regexp",MatchType: "regcontent", ExtFrom:extfrom,Value: val,Score: 0}
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
+ }
|
|
|
+ item := ju.ScoreItem{Des:"初始化",Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val, Score: 0}
|
|
|
+ if tmp["scoreitem"] == nil {
|
|
|
+ sitems := make([]*ju.ScoreItem, 0)
|
|
|
+ sitems = append(sitems, &item)
|
|
|
+ field.ScoreItem = sitems
|
|
|
+ } else {
|
|
|
+ field.ScoreItem = append(field.ScoreItem , &item)
|
|
|
}
|
|
|
+ j.Result[v.Field] = append(j.Result[v.Field],field )
|
|
|
}
|
|
|
}
|
|
|
return extinfo
|
|
@@ -956,7 +992,20 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
j.Result[k] = [](*ju.ExtField){}
|
|
|
for _, tmp := range tmps {
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
|
+ field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
+ }
|
|
|
+ item := ju.ScoreItem{Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
|
|
|
+ if tmp["scoreitem"] == nil {
|
|
|
+ scoreItems := make([]*ju.ScoreItem, 0)
|
|
|
+ scoreItems = append(scoreItems, &item)
|
|
|
+ field.ScoreItem = scoreItems
|
|
|
+ } else {
|
|
|
+ field.ScoreItem = append(field.ScoreItem, &item)
|
|
|
+ }
|
|
|
+ j.Result[k] = append(j.Result[k], field)
|
|
|
+ //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -1188,15 +1237,15 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
if ju.Config["saveblock"].(bool) {
|
|
|
blocks := make([]ju.BlockAndTag, 0)
|
|
|
for _, v := range j.Block {
|
|
|
- xx,_:=json.Marshal(v)
|
|
|
+ xx, _ := json.Marshal(v)
|
|
|
tmpblock := new(ju.TmpBlock)
|
|
|
- err:= json.Unmarshal(xx,&tmpblock)
|
|
|
- if err != nil{
|
|
|
- if v.BPackage!= nil{
|
|
|
+ err := json.Unmarshal(xx, &tmpblock)
|
|
|
+ if err != nil {
|
|
|
+ if v.BPackage != nil {
|
|
|
bpb, _ := json.Marshal(v.BPackage)
|
|
|
tmpblock.BPackage = string(bpb)
|
|
|
}
|
|
|
- tmpblock = rangeBlockToJson(v,*tmpblock)
|
|
|
+ tmpblock = rangeBlockToJson(v, *tmpblock)
|
|
|
}
|
|
|
blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
|
|
|
}
|
|
@@ -1248,32 +1297,33 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
log.Debug("AnalysisSaveResult err", err)
|
|
|
})
|
|
|
}
|
|
|
-func rangeBlockToJson(j *ju.Block,tmpblock ju.TmpBlock)(b *ju.TmpBlock){
|
|
|
- if j == nil{
|
|
|
+func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
|
|
|
+ if j == nil {
|
|
|
return nil
|
|
|
}
|
|
|
- if len(j.Block)>0{
|
|
|
- for i,v := range j.Block{
|
|
|
+ if len(j.Block) > 0 {
|
|
|
+ for i, v := range j.Block {
|
|
|
rangetmp := new(ju.TmpBlock)
|
|
|
- vb,_:=json.Marshal(v)
|
|
|
- json.Unmarshal(vb,&rangetmp)
|
|
|
- tmpblock.Block[i]=rangeBlockToJson(v,*rangetmp)
|
|
|
+ vb, _ := json.Marshal(v)
|
|
|
+ json.Unmarshal(vb, &rangetmp)
|
|
|
+ tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
|
|
|
}
|
|
|
}
|
|
|
- if j.ColonKV!= nil {
|
|
|
- cb,_ := json.Marshal(j.ColonKV)
|
|
|
+ if j.ColonKV != nil {
|
|
|
+ cb, _ := json.Marshal(j.ColonKV)
|
|
|
tmpblock.ColonKV = string(cb)
|
|
|
}
|
|
|
- if j.SpaceKV != nil{
|
|
|
- sb,_ := json.Marshal(j.SpaceKV)
|
|
|
+ if j.SpaceKV != nil {
|
|
|
+ sb, _ := json.Marshal(j.SpaceKV)
|
|
|
tmpblock.SpaceKV = string(sb)
|
|
|
}
|
|
|
- if j.TableKV != nil{
|
|
|
- tb,_ := json.Marshal(j.TableKV)
|
|
|
+ if j.TableKV != nil {
|
|
|
+ tb, _ := json.Marshal(j.TableKV)
|
|
|
tmpblock.TableKV = string(tb)
|
|
|
}
|
|
|
return &tmpblock
|
|
|
}
|
|
|
+
|
|
|
//去重冗余字段
|
|
|
func delFiled(k string) bool {
|
|
|
return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
|