|
@@ -22,13 +22,13 @@ import (
|
|
|
)
|
|
|
|
|
|
var (
|
|
|
- lock sync.RWMutex
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
- saveLimit = 200 //抽取日志批量保存
|
|
|
- PageSize = 5000 //查询分页
|
|
|
+ lock sync.RWMutex
|
|
|
+ cut = ju.NewCut() //获取正文并清理
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
+ TaskList map[string]*ExtractTask //任务列表
|
|
|
+ ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
+ saveLimit = 200 //抽取日志批量保存
|
|
|
+ PageSize = 5000 //查询分页
|
|
|
Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -932,9 +932,9 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- if tmp["blocktag"] != nil{
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
- }else {
|
|
|
+ } else {
|
|
|
j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{nil, v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
}
|
|
|
}
|
|
@@ -956,7 +956,12 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
j.Result[k] = [](*ju.ExtField){}
|
|
|
for _, tmp := range tmps {
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
|
+ } else {
|
|
|
+ j.Result[k] = append(j.Result[k], &ju.ExtField{nil, k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
|
+ }
|
|
|
+ //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -1188,15 +1193,15 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
if ju.Config["saveblock"].(bool) {
|
|
|
blocks := make([]ju.BlockAndTag, 0)
|
|
|
for _, v := range j.Block {
|
|
|
- xx,_:=json.Marshal(v)
|
|
|
+ xx, _ := json.Marshal(v)
|
|
|
tmpblock := new(ju.TmpBlock)
|
|
|
- err:= json.Unmarshal(xx,&tmpblock)
|
|
|
- if err != nil{
|
|
|
- if v.BPackage!= nil{
|
|
|
+ err := json.Unmarshal(xx, &tmpblock)
|
|
|
+ if err != nil {
|
|
|
+ if v.BPackage != nil {
|
|
|
bpb, _ := json.Marshal(v.BPackage)
|
|
|
tmpblock.BPackage = string(bpb)
|
|
|
}
|
|
|
- tmpblock = rangeBlockToJson(v,*tmpblock)
|
|
|
+ tmpblock = rangeBlockToJson(v, *tmpblock)
|
|
|
}
|
|
|
blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
|
|
|
}
|
|
@@ -1248,32 +1253,33 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
log.Debug("AnalysisSaveResult err", err)
|
|
|
})
|
|
|
}
|
|
|
-func rangeBlockToJson(j *ju.Block,tmpblock ju.TmpBlock)(b *ju.TmpBlock){
|
|
|
- if j == nil{
|
|
|
+func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
|
|
|
+ if j == nil {
|
|
|
return nil
|
|
|
}
|
|
|
- if len(j.Block)>0{
|
|
|
- for i,v := range j.Block{
|
|
|
+ if len(j.Block) > 0 {
|
|
|
+ for i, v := range j.Block {
|
|
|
rangetmp := new(ju.TmpBlock)
|
|
|
- vb,_:=json.Marshal(v)
|
|
|
- json.Unmarshal(vb,&rangetmp)
|
|
|
- tmpblock.Block[i]=rangeBlockToJson(v,*rangetmp)
|
|
|
+ vb, _ := json.Marshal(v)
|
|
|
+ json.Unmarshal(vb, &rangetmp)
|
|
|
+ tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp)
|
|
|
}
|
|
|
}
|
|
|
- if j.ColonKV!= nil {
|
|
|
- cb,_ := json.Marshal(j.ColonKV)
|
|
|
+ if j.ColonKV != nil {
|
|
|
+ cb, _ := json.Marshal(j.ColonKV)
|
|
|
tmpblock.ColonKV = string(cb)
|
|
|
}
|
|
|
- if j.SpaceKV != nil{
|
|
|
- sb,_ := json.Marshal(j.SpaceKV)
|
|
|
+ if j.SpaceKV != nil {
|
|
|
+ sb, _ := json.Marshal(j.SpaceKV)
|
|
|
tmpblock.SpaceKV = string(sb)
|
|
|
}
|
|
|
- if j.TableKV != nil{
|
|
|
- tb,_ := json.Marshal(j.TableKV)
|
|
|
+ if j.TableKV != nil {
|
|
|
+ tb, _ := json.Marshal(j.TableKV)
|
|
|
tmpblock.TableKV = string(tb)
|
|
|
}
|
|
|
return &tmpblock
|
|
|
}
|
|
|
+
|
|
|
//去重冗余字段
|
|
|
func delFiled(k string) bool {
|
|
|
return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
|
|
@@ -1368,7 +1374,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
+ if i == 0 { //reids未找到,执行规则匹配
|
|
|
val[field+"_isredis"] = false
|
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
|
} else { //redis找到,打标识存库
|