|
@@ -45,6 +45,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
|
|
ext.InitRuleBacks()
|
|
ext.InitRuleBacks()
|
|
ext.InitRuleCore()
|
|
ext.InitRuleCore()
|
|
ext.InitPkgCore()
|
|
ext.InitPkgCore()
|
|
|
|
+ ext.InitBlockRule()
|
|
ext.InitTag()
|
|
ext.InitTag()
|
|
ext.InitClearFn()
|
|
ext.InitClearFn()
|
|
if ext.IsExtractCity { //版本上控制是否开始城市抽取
|
|
if ext.IsExtractCity { //版本上控制是否开始城市抽取
|
|
@@ -116,6 +117,7 @@ func StartExtractTaskId(taskId string) bool {
|
|
ext.InitRuleBacks()
|
|
ext.InitRuleBacks()
|
|
ext.InitRuleCore()
|
|
ext.InitRuleCore()
|
|
ext.InitPkgCore()
|
|
ext.InitPkgCore()
|
|
|
|
+ ext.InitBlockRule()
|
|
ext.InitTag()
|
|
ext.InitTag()
|
|
ext.InitClearFn()
|
|
ext.InitClearFn()
|
|
if ext.IsExtractCity { //版本上控制是否开始城市抽取
|
|
if ext.IsExtractCity { //版本上控制是否开始城市抽取
|
|
@@ -238,11 +240,11 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
|
|
toptype = "*"
|
|
toptype = "*"
|
|
}
|
|
}
|
|
j = &ju.Job{
|
|
j = &ju.Job{
|
|
- SourceMid: qu.BsonIdToSId(doc["_id"]),
|
|
|
|
- Category: toptype,
|
|
|
|
- CategorySecond:subtype,
|
|
|
|
- Content: qu.ObjToString(doc["detail"]),
|
|
|
|
- SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
|
|
|
|
+ SourceMid: qu.BsonIdToSId(doc["_id"]),
|
|
|
|
+ Category: toptype,
|
|
|
|
+ CategorySecond: subtype,
|
|
|
|
+ Content: qu.ObjToString(doc["detail"]),
|
|
|
|
+ SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
//Domain: qu.ObjToString(doc["domain"]),
|
|
//Domain: qu.ObjToString(doc["domain"]),
|
|
//Href: qu.ObjToString(doc["href"]),
|
|
//Href: qu.ObjToString(doc["href"]),
|
|
Title: qu.ObjToString(doc["title"]),
|
|
Title: qu.ObjToString(doc["title"]),
|
|
@@ -330,12 +332,12 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
for _, v := range e.RulePres {
|
|
for _, v := range e.RulePres {
|
|
doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
- if j.CategorySecond=="" {
|
|
|
|
|
|
+ if j.CategorySecond == "" {
|
|
//抽取规则
|
|
//抽取规则
|
|
- tmprules:= map[string][]*RuleCore{}
|
|
|
|
|
|
+ tmprules := map[string][]*RuleCore{}
|
|
lock.Lock()
|
|
lock.Lock()
|
|
for k, vc1 := range e.RuleCores[j.Category] {
|
|
for k, vc1 := range e.RuleCores[j.Category] {
|
|
- tmprules[k]=vc1
|
|
|
|
|
|
+ tmprules[k] = vc1
|
|
}
|
|
}
|
|
lock.Unlock()
|
|
lock.Unlock()
|
|
for _, vc1 := range tmprules {
|
|
for _, vc1 := range tmprules {
|
|
@@ -371,7 +373,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
// log.Debug("抽取-后置规则", tmp)
|
|
// log.Debug("抽取-后置规则", tmp)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- }else{
|
|
|
|
|
|
+ } else {
|
|
fmt.Println(e.RuleCores)
|
|
fmt.Println(e.RuleCores)
|
|
fmt.Println("++++++++++++++++")
|
|
fmt.Println("++++++++++++++++")
|
|
fmt.Println(e.RuleCores[j.Category+"_"+j.CategorySecond])
|
|
fmt.Println(e.RuleCores[j.Category+"_"+j.CategorySecond])
|
|
@@ -469,12 +471,12 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
doc := *j.Data
|
|
doc := *j.Data
|
|
//全局前置规则,结果覆盖doc属性
|
|
//全局前置规则,结果覆盖doc属性
|
|
for _, v := range e.RulePres {
|
|
for _, v := range e.RulePres {
|
|
- if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
|
|
|
|
|
|
+ if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//抽取规则
|
|
//抽取规则
|
|
- if j.CategorySecond==""{
|
|
|
|
|
|
+ if j.CategorySecond == "" {
|
|
for _, vc1 := range e.RuleCores[j.Category] {
|
|
for _, vc1 := range e.RuleCores[j.Category] {
|
|
for _, vc := range vc1 {
|
|
for _, vc := range vc1 {
|
|
tmp := ju.DeepCopy(doc).(map[string]interface{})
|
|
tmp := ju.DeepCopy(doc).(map[string]interface{})
|
|
@@ -484,7 +486,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
}
|
|
}
|
|
//抽取-前置规则
|
|
//抽取-前置规则
|
|
for _, v := range vc.RulePres {
|
|
for _, v := range vc.RulePres {
|
|
- if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
|
|
|
|
|
|
+ if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -492,7 +494,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
|
|
|
|
//抽取-规则
|
|
//抽取-规则
|
|
for _, v := range vc.RuleCores {
|
|
for _, v := range vc.RuleCores {
|
|
- if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
|
|
|
|
|
|
+ if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -500,14 +502,14 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
|
|
|
|
//抽取-后置规则
|
|
//抽取-后置规则
|
|
for _, v := range vc.RuleBacks {
|
|
for _, v := range vc.RuleBacks {
|
|
- if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
|
|
|
|
|
|
+ if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// log.Debug("抽取-后置规则", tmp)
|
|
// log.Debug("抽取-后置规则", tmp)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- }else{
|
|
|
|
|
|
+ } else {
|
|
for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
|
|
for _, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
|
|
for _, vc := range vc1 {
|
|
for _, vc := range vc1 {
|
|
tmp := ju.DeepCopy(doc).(map[string]interface{})
|
|
tmp := ju.DeepCopy(doc).(map[string]interface{})
|
|
@@ -517,7 +519,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
}
|
|
}
|
|
//抽取-前置规则
|
|
//抽取-前置规则
|
|
for _, v := range vc.RulePres {
|
|
for _, v := range vc.RulePres {
|
|
- if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
|
|
|
|
|
|
+ if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -525,7 +527,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
|
|
|
|
//抽取-规则
|
|
//抽取-规则
|
|
for _, v := range vc.RuleCores {
|
|
for _, v := range vc.RuleCores {
|
|
- if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
|
|
|
|
|
|
+ if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -533,7 +535,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
|
|
|
|
//抽取-后置规则
|
|
//抽取-后置规则
|
|
for _, v := range vc.RuleBacks {
|
|
for _, v := range vc.RuleBacks {
|
|
- if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
|
|
|
|
|
|
+ if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -542,10 +544,9 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-
|
|
|
|
//全局后置规则
|
|
//全局后置规则
|
|
for _, v := range e.RuleBacks {
|
|
for _, v := range e.RuleBacks {
|
|
- if value, ok := e.FileFields.Load(v.Field);ok && qu.IntAllDef(value,1) >0{
|
|
|
|
|
|
+ if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
}
|
|
}
|