|
@@ -27,12 +27,12 @@ import (
|
|
var (
|
|
var (
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
|
- saveLimit = 100 //抽取日志批量保存
|
|
|
|
- PageSize = 5000 //查询分页
|
|
|
|
|
|
+ cut = ju.NewCut() //获取正文并清理
|
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
|
+ TaskList map[string]*ExtractTask //任务列表
|
|
|
|
+ ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
|
+ saveLimit = 100 //抽取日志批量保存
|
|
|
|
+ PageSize = 5000 //查询分页
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1}`
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
)
|
|
)
|
|
@@ -98,7 +98,7 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
}
|
|
}
|
|
var j, jf *ju.Job
|
|
var j, jf *ju.Job
|
|
var isSite bool
|
|
var isSite bool
|
|
- if ext.IsFileField && v["projectinfo"] != nil {
|
|
|
|
|
|
+ if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
|
|
v["isextFile"] = true
|
|
v["isextFile"] = true
|
|
j, jf, isSite = ext.PreInfo(v)
|
|
j, jf, isSite = ext.PreInfo(v)
|
|
} else {
|
|
} else {
|
|
@@ -217,7 +217,7 @@ func RunExtractTask(taskId string) {
|
|
}
|
|
}
|
|
var j, jf *ju.Job
|
|
var j, jf *ju.Job
|
|
var isSite bool
|
|
var isSite bool
|
|
- if ext.IsFileField && v["projectinfo"] != nil {
|
|
|
|
|
|
+ if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
|
|
v["isextFile"] = true
|
|
v["isextFile"] = true
|
|
j, jf, isSite = ext.PreInfo(v)
|
|
j, jf, isSite = ext.PreInfo(v)
|
|
} else {
|
|
} else {
|
|
@@ -337,20 +337,21 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
}
|
|
}
|
|
if isextFile {
|
|
if isextFile {
|
|
jf = &ju.Job{
|
|
jf = &ju.Job{
|
|
- SourceMid: qu.BsonIdToSId(doc["_id"]),
|
|
|
|
- Category: toptype,
|
|
|
|
- Content: qu.ObjToString(doc["detailfile"]),
|
|
|
|
- SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
|
|
- Site: qu.ObjToString(doc["site"]),
|
|
|
|
- Title: qu.ObjToString(doc["title"]),
|
|
|
|
- Data: &doc,
|
|
|
|
- City: qu.ObjToString(doc["city"]),
|
|
|
|
- Province: qu.ObjToString(doc["area"]),
|
|
|
|
- Jsondata: toMap,
|
|
|
|
- Result: map[string][]*ju.ExtField{},
|
|
|
|
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
|
- RuleBlock: e.RuleBlock,
|
|
|
|
- IsFile: isextFile,
|
|
|
|
|
|
+ SourceMid: qu.BsonIdToSId(doc["_id"]),
|
|
|
|
+ Category: toptype,
|
|
|
|
+ CategorySecond: subtype,
|
|
|
|
+ Content: qu.ObjToString(doc["detailfile"]),
|
|
|
|
+ SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
|
|
+ Site: qu.ObjToString(doc["site"]),
|
|
|
|
+ Title: qu.ObjToString(doc["title"]),
|
|
|
|
+ Data: &doc,
|
|
|
|
+ City: qu.ObjToString(doc["city"]),
|
|
|
|
+ Province: qu.ObjToString(doc["area"]),
|
|
|
|
+ Jsondata: toMap,
|
|
|
|
+ Result: map[string][]*ju.ExtField{},
|
|
|
|
+ BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
|
+ RuleBlock: e.RuleBlock,
|
|
|
|
+ IsFile: isextFile,
|
|
}
|
|
}
|
|
if (jf.Jsondata != nil || (*jf.Jsondata) != nil) && (*jf.Jsondata)["jsoncontent"] != nil {
|
|
if (jf.Jsondata != nil || (*jf.Jsondata) != nil) && (*jf.Jsondata)["jsoncontent"] != nil {
|
|
delete((*jf.Jsondata), "jsoncontent")
|
|
delete((*jf.Jsondata), "jsoncontent")
|