|
@@ -24,12 +24,12 @@ import (
|
|
|
var (
|
|
|
lock, lockrule, lockclear sync.RWMutex
|
|
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
- saveLimit = 200 //抽取日志批量保存
|
|
|
- PageSize = 5000 //查询分页
|
|
|
+ cut = ju.NewCut() // extracts the main body text and cleans it
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
|
|
|
+ TaskList map[string]*ExtractTask // task list
|
|
|
+ ClearTaskList map[string]*ClearTask // cleanup task list
|
|
|
+ saveLimit = 200 // batch-save limit for extraction logs
|
|
|
+ PageSize = 5000 // query page size
|
|
|
Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
@@ -892,7 +892,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
|
if v.RegCore.NumSign == -1 { //正负值修正
|
|
|
val = "-" + val
|
|
|
}
|
|
|
- exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: val, Value: val}
|
|
|
+ exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
|
|
|
if extfrom == "title" {
|
|
|
exfield.Score = 4
|
|
|
}
|
|
@@ -919,6 +919,59 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ if len(extinfo) == 0 {
|
|
|
+ regArr := strings.Split(v.RuleText, "__")
|
|
|
+ //fmt.Println(regArr[0])
|
|
|
+ if len(regArr) > 0 {
|
|
|
+ reg, err := regexp.Compile(regArr[0])
|
|
|
+ if err == nil {
|
|
|
+ datavals := reg.FindStringSubmatch(text)
|
|
|
+ tmps := []map[string]interface{}{}
|
|
|
+ for _, value := range datavals {
|
|
|
+ if value == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ tmp := map[string]interface{}{
|
|
|
+ "field": v.Field,
|
|
|
+ "code": v.Code + "去除__*后",
|
|
|
+ "ruletext": regArr[0],
|
|
|
+ "extfrom": extfrom,
|
|
|
+ "value": value,
|
|
|
+ "type": "regexp",
|
|
|
+ "matchtype": "regcontent",
|
|
|
+ "blocktag": *tag,
|
|
|
+ }
|
|
|
+ tmps = append(tmps, tmp)
|
|
|
+ extinfo[v.Field] = tmps
|
|
|
+
|
|
|
+ exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
|
|
|
+ if extfrom == "title" {
|
|
|
+ exfield.Score = 4
|
|
|
+ }
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ exfield.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
+ }
|
|
|
+ item := ju.ScoreItem{Des: "初始化抽取规则去除__*", Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: value}
|
|
|
+ if extfrom == "title" {
|
|
|
+ item.Score = 4
|
|
|
+ }
|
|
|
+ if strings.Contains(value, "\n") {
|
|
|
+ item.Score -= 1
|
|
|
+ exfield.Score -= 1
|
|
|
+ }
|
|
|
+ if tmp["scoreitem"] == nil {
|
|
|
+ sitems := make([]*ju.ScoreItem, 0)
|
|
|
+ sitems = append(sitems, &item)
|
|
|
+ exfield.ScoreItem = sitems
|
|
|
+ } else {
|
|
|
+ exfield.ScoreItem = append(exfield.ScoreItem, &item)
|
|
|
+ }
|
|
|
+ j.Result[v.Field] = append(j.Result[v.Field], &exfield)
|
|
|
+ //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
} else {
|
|
|
pos := v.RegCore.Reg.FindStringIndex(text)
|
|
@@ -948,7 +1001,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
|
if j.Result[v.Field] == nil {
|
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: val, Value: val}
|
|
|
+ field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
|
|
|
if extfrom == "title" {
|
|
|
field.Score = 4
|
|
|
}
|
|
@@ -1501,7 +1554,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
|
|
|
func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
|
|
|
defer qu.Catch()
|
|
|
i := redis.GetInt(field, field+"_"+fv) //查找redis
|
|
|
- if i == 0 { //reids未找到,执行规则匹配
|
|
|
+ if i == 0 { //redis未找到,执行规则匹配
|
|
|
val[field+"_isredis"] = false
|
|
|
e.RuleMatch(field, fv, val) //规则匹配
|
|
|
} else { //redis找到,打标识存库
|