|
@@ -78,9 +78,9 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
|
continue
|
|
|
}
|
|
|
//log.Println(v["_id"])
|
|
|
- j := PreInfo(v)
|
|
|
+ j, jf := PreInfo(v, false)
|
|
|
ext.TaskInfo.ProcessPool <- true
|
|
|
- go ext.ExtractProcess(j)
|
|
|
+ go ext.ExtractProcess(j, jf)
|
|
|
}
|
|
|
return true
|
|
|
} else {
|
|
@@ -171,9 +171,9 @@ func RunExtractTask(taskId string) {
|
|
|
if !ext.IsRun {
|
|
|
break
|
|
|
}
|
|
|
- j := PreInfo(v)
|
|
|
+ j, jf := PreInfo(v, false)
|
|
|
ext.TaskInfo.ProcessPool <- true
|
|
|
- go ext.ExtractProcess(j)
|
|
|
+ go ext.ExtractProcess(j, jf)
|
|
|
ext.TaskInfo.LastExtId = _id
|
|
|
}
|
|
|
db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
|
|
@@ -186,7 +186,7 @@ func RunExtractTask(taskId string) {
|
|
|
}
|
|
|
|
|
|
//信息预处理
|
|
|
-func PreInfo(doc map[string]interface{}) *ju.Job {
|
|
|
+func PreInfo(doc map[string]interface{}, isextFile bool) (j, jf *ju.Job) {
|
|
|
defer qu.Catch()
|
|
|
detail := ""
|
|
|
d1, _ := doc["detail"].(string)
|
|
@@ -199,6 +199,7 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
|
|
|
detail = ju.CutLableStr(detail)
|
|
|
detail = cut.ClearHtml(detail)
|
|
|
doc["detail"] = detail
|
|
|
+ doc["detailfile"] = "" //附件文本堆一起(后期可以考虑,分开处理)
|
|
|
toptype := qu.ObjToString(doc["toptype"])
|
|
|
if qu.ObjToString(doc["type"]) == "bid" {
|
|
|
toptype = "结果"
|
|
@@ -206,7 +207,7 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
|
|
|
if toptype == "" {
|
|
|
toptype = "*"
|
|
|
}
|
|
|
- j := &ju.Job{
|
|
|
+ j = &ju.Job{
|
|
|
SourceMid: qu.BsonIdToSId(doc["_id"]),
|
|
|
Category: toptype,
|
|
|
Content: qu.ObjToString(doc["detail"]),
|
|
@@ -220,17 +221,33 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
|
|
|
Result: map[string][]*ju.ExtField{},
|
|
|
BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
}
|
|
|
+ if isextFile {
|
|
|
+ jf = &ju.Job{
|
|
|
+ SourceMid: qu.BsonIdToSId(doc["_id"]),
|
|
|
+ Category: toptype,
|
|
|
+ Content: qu.ObjToString(doc["detailfile"]),
|
|
|
+ SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
|
+ Title: qu.ObjToString(doc["title"]),
|
|
|
+ Data: &doc,
|
|
|
+ City: qu.ObjToString(doc["city"]),
|
|
|
+ Province: qu.ObjToString(doc["area"]),
|
|
|
+ Result: map[string][]*ju.ExtField{},
|
|
|
+ BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
+ }
|
|
|
+ }
|
|
|
qu.Try(func() {
|
|
|
-
|
|
|
pretreated.AnalyStart(j)
|
|
|
+ if isextFile {
|
|
|
+ pretreated.AnalyStart(jf)
|
|
|
+ }
|
|
|
}, func(err interface{}) {
|
|
|
log.Println("pretreated.AnalyStart", err)
|
|
|
})
|
|
|
- return j
|
|
|
+ return j, jf
|
|
|
}
|
|
|
|
|
|
//抽取
|
|
|
-func (e *ExtractTask) ExtractProcess(j *ju.Job) {
|
|
|
+func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
|
|
|
qu.Try(func() {
|
|
|
doc := *j.Data
|
|
|
//全局前置规则,结果覆盖doc属性
|