|
@@ -23,13 +23,13 @@ import (
|
|
|
)
|
|
|
|
|
|
var (
|
|
|
- lock sync.RWMutex
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
- saveLimit = 200 //抽取日志批量保存
|
|
|
- PageSize = 5000 //查询分页
|
|
|
+ lock sync.RWMutex
|
|
|
+ cut = ju.NewCut() // extracts the main text and cleans it
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs, keyed by *TaskInfo
|
|
|
+ TaskList map[string]*ExtractTask // task list
|
|
|
+ ClearTaskList map[string]*ClearTask // cleanup task list
|
|
|
+ saveLimit = 200 // batch size for saving extraction logs
|
|
|
+ PageSize = 5000 // page size for queries
|
|
|
Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -683,7 +683,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
for _, tmp := range tmps {
|
|
|
- field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"],}
|
|
|
+ field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
|
|
|
if extfrom == "title" {
|
|
|
field.Score = 4
|
|
|
}
|
|
@@ -1204,6 +1204,7 @@ type FieldValue struct {
|
|
|
func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
qu.Try(func() {
|
|
|
doc, result, _id, values := funcAnalysis(j)
|
|
|
+ go otherNeedSave(j, result, e)
|
|
|
//从排序结果中取值
|
|
|
tmp := map[string]interface{}{} //抽取值
|
|
|
for key, val := range values {
|
|
@@ -1278,10 +1279,12 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
}
|
|
|
// log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
|
|
|
}
|
|
|
- //分包和标签
|
|
|
- if ju.Config["saveblock"].(bool) {
|
|
|
- blocks := make([]ju.BlockAndTag, 0)
|
|
|
- for _, v := range j.Block {
|
|
|
+ //所有kv组成的字符串
|
|
|
+ var kvtext bytes.Buffer
|
|
|
+ blocks := make([]ju.BlockAndTag, 0)
|
|
|
+ for _, v := range j.Block {
|
|
|
+ //分包和标签
|
|
|
+ if ju.Config["saveblock"].(bool) {
|
|
|
xx, _ := json.Marshal(v)
|
|
|
tmpblock := new(ju.TmpBlock)
|
|
|
err := json.Unmarshal(xx, &tmpblock)
|
|
@@ -1294,8 +1297,33 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
}
|
|
|
blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
|
|
|
}
|
|
|
+ //把所有kv组装成一个字符串,存库
|
|
|
+ for ck, cv := range v.ColonKV.Kv {
|
|
|
+ kvtext.WriteString(ck)
|
|
|
+ kvtext.WriteString(":")
|
|
|
+ kvtext.WriteString(cv)
|
|
|
+ kvtext.WriteString(" ")
|
|
|
+ }
|
|
|
+ for sk, sv := range v.SpaceKV.Kv {
|
|
|
+ kvtext.WriteString(sk)
|
|
|
+ kvtext.WriteString(":")
|
|
|
+ kvtext.WriteString(sv)
|
|
|
+ kvtext.WriteString(" ")
|
|
|
+ }
|
|
|
+ for tk, tv := range v.TableKV.Kv {
|
|
|
+ kvtext.WriteString(tk)
|
|
|
+ kvtext.WriteString(":")
|
|
|
+ kvtext.WriteString(tv)
|
|
|
+ kvtext.WriteString(" ")
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if kvtext.Len() > 0 {
|
|
|
+ tmp["kvtext"] = kvtext.String()
|
|
|
+ }
|
|
|
+ if len(blocks) > 0 {
|
|
|
tmp["blocks"] = blocks
|
|
|
}
|
|
|
+ tmp["extract_content"] = j.Content
|
|
|
if e.TaskInfo.TestColl == "" {
|
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
|
for field, _ := range e.Fields {
|
|
@@ -1343,6 +1371,20 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
log.Debug("AnalysisSaveResult err", err)
|
|
|
})
|
|
|
}
|
|
|
+
|
|
|
+// otherNeedSave is meant to save "other" data for a job; as written it only derives a collection name and stores nothing (j and result are unused).
|
|
|
+// Intent per the original note: every new tag found on kv pairs, tables and blocks should be written to the database.
|
|
|
+// The original note lists "val type times firstid createtime" for deciding the field — presumably the planned record layout; TODO confirm.
|
|
|
+func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
|
|
|
+ coll := e.TaskInfo.TestColl // start from the task's test collection name
|
|
|
+ if coll == "" {
|
|
|
+ coll = "extract_tag_result" // no test collection set: use the default name
|
|
|
+ } else {
|
|
|
+ coll += "_tag" // test collection set: append the "_tag" suffix
|
|
|
+ }
|
|
|
+ // TODO: not implemented — coll is never written to. Original sketch: for _,v := range j.ColonKV
|
|
|
+}
|
|
|
+
|
|
|
func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
|
|
|
if j == nil {
|
|
|
return nil
|
|
@@ -1479,7 +1521,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
+ if i == 0 { //redis未找到,执行规则匹配
|
|
|
val[field+"_isredis"] = false
|
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
|
} else { //redis找到,打标识存库
|