|
@@ -22,13 +22,13 @@ import (
|
|
|
)
|
|
|
|
|
|
var (
|
|
|
- lock sync.RWMutex
|
|
|
- cut = ju.NewCut() // extracts the main body text and cleans it
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
|
|
|
- TaskList map[string]*ExtractTask // task list
|
|
|
- ClearTaskList map[string]*ClearTask // cleanup task list
|
|
|
- saveLimit = 200 // batch size for saving extraction logs
|
|
|
- PageSize = 5000 // query page size
|
|
|
+ lock sync.RWMutex
|
|
|
+ cut = ju.NewCut() // extracts the main body text and cleans it
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
|
|
|
+ TaskList map[string]*ExtractTask // task list
|
|
|
+ ClearTaskList map[string]*ClearTask // cleanup task list
|
|
|
+ saveLimit = 200 // batch size for saving extraction logs
|
|
|
+ PageSize = 5000 // query page size
|
|
|
// NOTE(review): Fields looks like a field-selection document for loading source records (1 = include) — confirm against the query code.
Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1}`
|
|
|
// NOTE(review): Fields2 presumably selects a smaller set of extracted fields — confirm where it is used.
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -274,7 +274,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
|
|
|
}
|
|
|
}
|
|
|
qu.Try(func() {
|
|
|
- pretreated.AnalyStart(j)
|
|
|
+ pretreated.AnalyStart(j) //job.Block分块
|
|
|
if isextFile {
|
|
|
pretreated.AnalyStart(jf)
|
|
|
}
|
|
@@ -331,9 +331,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
qu.Try(func() {
|
|
|
doc := *j.Data
|
|
|
//全局前置规则,结果覆盖doc属性
|
|
|
- for _, v := range e.RulePres {
|
|
|
- doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
|
- }
|
|
|
+ //for _, v := range e.RulePres {
|
|
|
+ // doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
|
+ //}
|
|
|
if j.CategorySecond == "" {
|
|
|
//抽取规则
|
|
|
tmprules := map[string][]*RuleCore{}
|
|
@@ -349,10 +349,10 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
if !ju.Logic(vc.LuaLogic, tmp) {
|
|
|
continue
|
|
|
}
|
|
|
- //抽取-前置规则
|
|
|
- for _, v := range vc.RulePres {
|
|
|
- tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
|
- }
|
|
|
+ ////抽取-前置规则
|
|
|
+ //for _, v := range vc.RulePres {
|
|
|
+ // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
|
+ //}
|
|
|
// log.Debug("抽取-前置规则", tmp)
|
|
|
|
|
|
//抽取-规则
|
|
@@ -364,7 +364,14 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
//项目名称未能抽取到,标题来凑
|
|
|
if vc.Field == "projectname" {
|
|
|
if len(j.Result[vc.Field]) < 1 {
|
|
|
- j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ j.Result[vc.Field] = append(j.Result[vc.Field],
|
|
|
+ &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
|
|
|
+ } else {
|
|
|
+ j.Result[vc.Field] = append(j.Result[vc.Field],
|
|
|
+ &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
|
|
|
+ }
|
|
|
+ //j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -398,7 +405,12 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
//项目名称未能抽取到,标题来凑
|
|
|
if vc.Field == "projectname" {
|
|
|
if len(j.Result[vc.Field]) < 1 {
|
|
|
- j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
|
|
|
+ } else {
|
|
|
+ j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4})
|
|
|
+ }
|
|
|
+ //j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0})
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -651,8 +663,13 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
for _, tmp := range tmps {
|
|
|
- j.Result[k] = append(j.Result[k],
|
|
|
- &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ j.Result[k] = append(j.Result[k],
|
|
|
+ &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0})
|
|
|
+ } else {
|
|
|
+ j.Result[k] = append(j.Result[k],
|
|
|
+ &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"], Score: 0})
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -680,9 +697,12 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
lock.Lock()
|
|
|
tags := t[field] //获取对应标签库
|
|
|
lock.Unlock()
|
|
|
+ if tags == nil {
|
|
|
+ continue
|
|
|
+ }
|
|
|
for _, bl := range j.Block {
|
|
|
//冒号kv
|
|
|
- if bl.ColonKV != nil {
|
|
|
+ if bl.ColonKV != nil && len(bl.ColonKV.Kvs) > 0 {
|
|
|
kvs := bl.ColonKV.Kvs
|
|
|
kvs2 := bl.ColonKV.Kvs_2
|
|
|
// log.Debug("ColonKV1", kvs)
|
|
@@ -701,6 +721,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
"value": text,
|
|
|
"type": "colon1",
|
|
|
"matchtype": "tag_string",
|
|
|
+ "blocktag": bl.Tag,
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -717,6 +738,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
"value": text,
|
|
|
"type": "colon1",
|
|
|
"matchtype": "tag_regexp",
|
|
|
+ "blocktag": bl.Tag,
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -736,6 +758,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
"value": text,
|
|
|
"type": "colon2",
|
|
|
"matchtype": "tag_string",
|
|
|
+ "blocktag": bl.Tag,
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -752,6 +775,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
"value": text,
|
|
|
"type": "colon2",
|
|
|
"matchtype": "tag_regexp",
|
|
|
+ "blocktag": bl.Tag,
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -761,7 +785,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
}
|
|
|
}
|
|
|
//空格kv
|
|
|
- if bl.SpaceKV != nil {
|
|
|
+ if bl.SpaceKV != nil && len(bl.SpaceKV.Kvs) > 0 {
|
|
|
kvs := bl.SpaceKV.Kvs
|
|
|
// log.Debug("SpaceKV", kvs)
|
|
|
for _, tag := range tags {
|
|
@@ -778,6 +802,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
"value": text,
|
|
|
"type": "space",
|
|
|
"matchtype": "tag_string",
|
|
|
+ "blocktag": bl.Tag,
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -794,6 +819,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
"value": text,
|
|
|
"type": "space",
|
|
|
"matchtype": "tag_regexp",
|
|
|
+ "blocktag": bl.Tag,
|
|
|
})
|
|
|
}
|
|
|
break
|
|
@@ -803,7 +829,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
}
|
|
|
}
|
|
|
//表格kv
|
|
|
- if bl.TableKV != nil {
|
|
|
+ if bl.TableKV != nil && len(bl.TableKV.Kv) > 0 {
|
|
|
tkv := bl.TableKV
|
|
|
// log.Debug("tkv", tkv)
|
|
|
for k, v := range tkv.Kv {
|
|
@@ -823,6 +849,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
|
"value": v,
|
|
|
"type": "table",
|
|
|
"matchtype": "tag_string",
|
|
|
+ "blocktag": bl.Tag,
|
|
|
})
|
|
|
} else { //涉及其他待处理
|
|
|
// log.Debug(tags)
|
|
@@ -862,10 +889,12 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
|
|
|
tmps = append(tmps, tmp)
|
|
|
extinfo[k] = tmps
|
|
|
if val != "" {
|
|
|
- if j.Result[v.Field] == nil {
|
|
|
- j.Result[k] = [](*ju.ExtField){}
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ } else {
|
|
|
+ j.Result[k] = append(j.Result[k], &ju.ExtField{nil, k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
}
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -897,7 +926,11 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ if tmp["blocktag"] != nil{
|
|
|
+ j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{tmp["blocktag"].(map[string]bool), v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ }else {
|
|
|
+ j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{nil, v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
return extinfo
|
|
@@ -917,7 +950,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
j.Result[k] = [](*ju.ExtField){}
|
|
|
for _, tmp := range tmps {
|
|
|
- j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
|
+ j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -1145,6 +1178,24 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
}
|
|
|
// log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
|
|
|
}
|
|
|
+ //分包和标签
|
|
|
+ if ju.Config["saveblock"].(bool) {
|
|
|
+ blocks := make([]ju.BlockAndTag, 0)
|
|
|
+ for _, v := range j.Block {
|
|
|
+ xx,_:=json.Marshal(v)
|
|
|
+ tmpblock := new(ju.TmpBlock)
|
|
|
+ err:= json.Unmarshal(xx,&tmpblock)
|
|
|
+ if err != nil{
|
|
|
+ if v.BPackage!= nil{
|
|
|
+ bpb, _ := json.Marshal(v.BPackage)
|
|
|
+ tmpblock.BPackage = string(bpb)
|
|
|
+ }
|
|
|
+ tmpblock = rangeBlockToJson(v,*tmpblock)
|
|
|
+ }
|
|
|
+ blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
|
|
|
+ }
|
|
|
+ tmp["blocks"] = blocks
|
|
|
+ }
|
|
|
if e.TaskInfo.TestColl == "" {
|
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
|
for field, _ := range e.Fields {
|
|
@@ -1191,7 +1242,32 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
log.Debug("AnalysisSaveResult err", err)
|
|
|
})
|
|
|
}
|
|
|
-
|
|
|
+func rangeBlockToJson(j *ju.Block,tmpblock ju.TmpBlock)(b *ju.TmpBlock){
|
|
|
+ if j == nil{
|
|
|
+ return nil
|
|
|
+ }
|
|
|
+ if len(j.Block)>0{
|
|
|
+ for i,v := range j.Block{
|
|
|
+ rangetmp := new(ju.TmpBlock)
|
|
|
+ vb,_:=json.Marshal(v)
|
|
|
+ json.Unmarshal(vb,&rangetmp)
|
|
|
+ tmpblock.Block[i]=rangeBlockToJson(v,*rangetmp)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if j.ColonKV!= nil {
|
|
|
+ cb,_ := json.Marshal(j.ColonKV)
|
|
|
+ tmpblock.ColonKV = string(cb)
|
|
|
+ }
|
|
|
+ if j.SpaceKV != nil{
|
|
|
+ sb,_ := json.Marshal(j.SpaceKV)
|
|
|
+ tmpblock.SpaceKV = string(sb)
|
|
|
+ }
|
|
|
+ if j.TableKV != nil{
|
|
|
+ tb,_ := json.Marshal(j.TableKV)
|
|
|
+ tmpblock.TableKV = string(tb)
|
|
|
+ }
|
|
|
+ return &tmpblock
|
|
|
+}
|
|
|
//去重冗余字段
|
|
|
func delFiled(k string) bool {
|
|
|
return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo"
|
|
@@ -1286,7 +1362,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
+ if i == 0 { //reids未找到,执行规则匹配
|
|
|
val[field+"_isredis"] = false
|
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
|
} else { //redis找到,打标识存库
|