|
@@ -10,7 +10,6 @@ import (
|
|
|
ju "jy/util"
|
|
|
qu "qfw/util"
|
|
|
"qfw/util/redis"
|
|
|
- "reflect"
|
|
|
"regexp"
|
|
|
"strconv"
|
|
|
"strings"
|
|
@@ -23,13 +22,13 @@ import (
|
|
|
)
|
|
|
|
|
|
var (
|
|
|
- lock sync.RWMutex
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
- saveLimit = 200 //抽取日志批量保存
|
|
|
- PageSize = 5000 //查询分页
|
|
|
+ lock sync.RWMutex
|
|
|
+ cut = ju.NewCut() //获取正文并清理
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
+ TaskList map[string]*ExtractTask //任务列表
|
|
|
+ ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
+ saveLimit = 200 //抽取日志批量保存
|
|
|
+ PageSize = 5000 //查询分页
|
|
|
Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -376,7 +375,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
if len(j.Result[vc.Field]) < 1 {
|
|
|
items := make([]*ju.ScoreItem, 1)
|
|
|
items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
|
|
|
- field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
|
|
|
+ field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
}
|
|
@@ -422,7 +421,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
|
if vc.Field == "projectname" {
|
|
|
items := make([]*ju.ScoreItem, 1)
|
|
|
items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
|
|
|
- field := &ju.ExtField{nil, vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 4, items}
|
|
|
+ field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "regexp", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
|
|
|
if len(j.Result[vc.Field]) < 1 {
|
|
|
if tmp["blocktag"] != nil {
|
|
|
field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
@@ -683,7 +682,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
|
}
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
for _, tmp := range tmps {
|
|
|
- field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"],}
|
|
|
+ field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
|
|
|
if extfrom == "title" {
|
|
|
field.Score = 4
|
|
|
}
|
|
@@ -947,7 +946,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
|
if v.RegCore.NumSign == -1 { //正负值修正
|
|
|
val = "-" + val
|
|
|
}
|
|
|
- exfield := ju.ExtField{BlockTag:*tag,Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
+ exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
if extfrom == "title" {
|
|
|
exfield.Score = 4
|
|
|
}
|
|
@@ -999,7 +998,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- field := &ju.ExtField{BlockTag:*tag,Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
+ field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
if extfrom == "title" {
|
|
|
field.Score = 4
|
|
|
}
|
|
@@ -1203,13 +1202,15 @@ type FieldValue struct {
|
|
|
//分析抽取结果并保存
|
|
|
func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
qu.Try(func() {
|
|
|
- doc, result, _id, values := funcAnalysis(j)
|
|
|
+ doc, result, _id := funcAnalysis(j)
|
|
|
+ auxinfo := auxInfo(j)
|
|
|
//从排序结果中取值
|
|
|
tmp := map[string]interface{}{} //抽取值
|
|
|
- for key, val := range values {
|
|
|
+ tmp["fieldall"] = auxinfo
|
|
|
+ for _, val := range result {
|
|
|
for _, v := range val { //取第一个非负数
|
|
|
- if v.Key != "" && v.Value > -1 {
|
|
|
- tmp[key] = v.Object
|
|
|
+ if v.Score > -1 {
|
|
|
+ tmp[v.Field] = v.Value
|
|
|
break
|
|
|
}
|
|
|
}
|
|
@@ -1222,14 +1223,15 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
}
|
|
|
//处理附件
|
|
|
var resultf map[string][]*ju.ExtField
|
|
|
- var filevalues map[string][]*ju.SortObject
|
|
|
if jf != nil {
|
|
|
- _, resultf, _, filevalues = funcAnalysis(jf)
|
|
|
+ _, resultf, _ = funcAnalysis(jf)
|
|
|
+ auxinfof := auxInfo(jf)
|
|
|
+ tmp["fieldallf"] = auxinfof
|
|
|
ffield := map[string]interface{}{}
|
|
|
- for key, val := range filevalues {
|
|
|
+ for _, val := range resultf {
|
|
|
for _, v := range val { //取第一个非负数
|
|
|
- if v.Key != "" && v.Value > -1 {
|
|
|
- ffield[key] = v.Object
|
|
|
+ if v.Score > -1 {
|
|
|
+ ffield[v.Field] = v.Value
|
|
|
break
|
|
|
}
|
|
|
}
|
|
@@ -1375,72 +1377,48 @@ func delFiled(k string) bool {
|
|
|
return k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
|
|
|
}
|
|
|
|
|
|
-func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string, map[string][]*ju.SortObject) {
|
|
|
+func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
|
|
|
defer qu.Catch()
|
|
|
doc := j.Data
|
|
|
result := j.Result
|
|
|
_id := qu.BsonIdToSId((*doc)["_id"])
|
|
|
- iscore, _ := ju.Config["fieldscore"].(bool)
|
|
|
- if iscore { //打分
|
|
|
- result = ScoreFields(j)
|
|
|
- }
|
|
|
+ result = ScoreFields(j)
|
|
|
+
|
|
|
//结果排序
|
|
|
- values := map[string][]*ju.SortObject{}
|
|
|
- for key, val := range result {
|
|
|
- fieldValue := map[string][]interface{}{}
|
|
|
- //cfscore := make(map[string]float64) //重复匹配加分
|
|
|
- if iscore { //走打分
|
|
|
- for _, v := range val {
|
|
|
- if len(fmt.Sprint(v.Value)) < 1 {
|
|
|
- continue //去除空串
|
|
|
- }
|
|
|
- //if v.Score >0 {
|
|
|
- // cfscore[fmt.Sprint(v.Value)] += 1
|
|
|
- //}
|
|
|
- if fieldValue[fmt.Sprint(v.Value)+v.Type] == nil {
|
|
|
- fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
|
|
|
- } else if fieldValue[fmt.Sprint(v.Value)+v.Type][0].(float64) < v.Score {
|
|
|
- fieldValue[fmt.Sprint(v.Value)+v.Type][0] = v.Score
|
|
|
- }
|
|
|
- }
|
|
|
- //for key := range fieldValue {
|
|
|
- // for cfkey, cfv := range cfscore {
|
|
|
- // if strings.Contains(key, cfkey) {
|
|
|
- // fieldValue[key][0] = fieldValue[key][0].(float64) + cfv
|
|
|
- // }
|
|
|
- // }
|
|
|
- //}
|
|
|
- } else { //不走打分,按出现频次
|
|
|
- for _, v := range val {
|
|
|
- if len(fmt.Sprint(v.Value)) < 1 {
|
|
|
- continue //去除空串
|
|
|
- }
|
|
|
- if fieldValue[fmt.Sprint(v.Value)] == nil {
|
|
|
- fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
|
|
|
- } else {
|
|
|
- fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
|
|
|
+ for _, val := range result {
|
|
|
+ ju.Sort(val)
|
|
|
+ }
|
|
|
+ return doc, result, _id
|
|
|
+}
|
|
|
+
|
|
|
+//辅助信息,如果没有排序先排序
|
|
|
+func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
|
|
|
+ fieldalls := map[string][]map[string]interface{}{}
|
|
|
+ for field, val := range j.Result {
|
|
|
+ //ju.Sort(val)
|
|
|
+ sfields := []map[string]interface{}{}
|
|
|
+ for _, v := range val {
|
|
|
+ standardized := false
|
|
|
+ if field == "buyer" || field == "winner" || field == "agency" {
|
|
|
+ i := redis.GetInt(field, field+"_"+qu.ObjToString(v.Value))
|
|
|
+ if i > 0 {
|
|
|
+ standardized = true
|
|
|
}
|
|
|
}
|
|
|
- }
|
|
|
- objects := []*ju.SortObject{}
|
|
|
- for k, v := range fieldValue {
|
|
|
- ValueStr := "" //第二排序
|
|
|
- if reflect.TypeOf(v[1]).String() == "string" {
|
|
|
- ValueStr = qu.ObjToString(v[1])
|
|
|
- }
|
|
|
- tmp := &ju.SortObject{
|
|
|
- Key: k,
|
|
|
- Value: qu.IntAll(v[0]),
|
|
|
- Object: v[1],
|
|
|
- ValueStr: ValueStr,
|
|
|
+ sfield := map[string]interface{}{
|
|
|
+ "val": v.Value,
|
|
|
+ "type": v.Type,
|
|
|
+ "score": v.Score,
|
|
|
+ "blocktag": v.BlockTag,
|
|
|
+ "sourceval": v.SourceValue,
|
|
|
+ "standardized": standardized,
|
|
|
}
|
|
|
- objects = append(objects, tmp)
|
|
|
+ sfields = append(sfields, sfield)
|
|
|
}
|
|
|
- values[key] = ju.ExtSort(objects)
|
|
|
+ fieldalls[field] = sfields
|
|
|
}
|
|
|
- return doc, result, _id, values
|
|
|
+ return fieldalls
|
|
|
}
|
|
|
-
|
|
|
func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
//获取审核字段
|
|
@@ -1479,7 +1457,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
+ if i == 0 { //reids未找到,执行规则匹配
|
|
|
val[field+"_isredis"] = false
|
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
|
} else { //redis找到,打标识存库
|