|
@@ -74,7 +74,6 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
n, _ := strconv.Atoi(num)
|
|
n, _ := strconv.Atoi(num)
|
|
id := IdTrans(startId)
|
|
id := IdTrans(startId)
|
|
if id.Valid() {
|
|
if id.Valid() {
|
|
- //query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
|
|
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
|
|
for _, v := range *list {
|
|
for _, v := range *list {
|
|
@@ -83,12 +82,7 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
}
|
|
}
|
|
var j, jf *ju.Job
|
|
var j, jf *ju.Job
|
|
var isSite bool
|
|
var isSite bool
|
|
- if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
|
|
|
|
- v["isextFile"] = true
|
|
|
|
- j, jf, isSite = ext.PreInfo(v)
|
|
|
|
- } else { //无附件
|
|
|
|
- j, _, isSite = ext.PreInfo(v)
|
|
|
|
- }
|
|
|
|
|
|
+ j, _, isSite = ext.PreInfo(v)
|
|
go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
|
|
go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
|
|
ext.TaskInfo.ProcessPool <- true
|
|
ext.TaskInfo.ProcessPool <- true
|
|
}
|
|
}
|
|
@@ -231,28 +225,14 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
if doc["isextFile"] != nil {
|
|
if doc["isextFile"] != nil {
|
|
isextFile = doc["isextFile"].(bool)
|
|
isextFile = doc["isextFile"].(bool)
|
|
}
|
|
}
|
|
|
|
+ isextFile = false
|
|
detail := ""
|
|
detail := ""
|
|
summary := qu.ObjToString(doc["summary"])
|
|
summary := qu.ObjToString(doc["summary"])
|
|
detail = CleanDetailText(qu.ObjToString(doc["detail"]), summary)
|
|
detail = CleanDetailText(qu.ObjToString(doc["detail"]), summary)
|
|
- //d1 := CleanDetailText(qu.ObjToString(doc["detail"]), summary)
|
|
|
|
- //d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
|
|
|
|
- ////log.Debug("正文长度:", len(d1), "~", "源码长度:", len(d2))
|
|
|
|
- //if len(d1) > len(d2) || d2 == "" {
|
|
|
|
- // detail = d1
|
|
|
|
- // if SelectSourceStructText(d1, d2) {
|
|
|
|
- // detail = d2
|
|
|
|
- // }
|
|
|
|
- //} else {
|
|
|
|
- // detail = d2
|
|
|
|
- // if SelectDetailSourceText(d1, d2) {
|
|
|
|
- // detail = d1
|
|
|
|
- // }
|
|
|
|
- //}
|
|
|
|
//调整采用detail抽取
|
|
//调整采用detail抽取
|
|
- if utf8.RuneCountInString(detail) >= 100000 {
|
|
|
|
- detail = detail[:100000]
|
|
|
|
|
|
+ if utf8.RuneCountInString(detail) >= 50000 {
|
|
|
|
+ detail = detail[:50000]
|
|
}
|
|
}
|
|
-
|
|
|
|
doc["detail"] = detail
|
|
doc["detail"] = detail
|
|
isClearnMoney := !clearMoneyReg.MatchString(detail)
|
|
isClearnMoney := !clearMoneyReg.MatchString(detail)
|
|
if isClearnMoney {
|
|
if isClearnMoney {
|
|
@@ -303,19 +283,17 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
Content: qu.ObjToString(doc["detail"]),
|
|
Content: qu.ObjToString(doc["detail"]),
|
|
SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
Site: qu.ObjToString(doc["site"]),
|
|
Site: qu.ObjToString(doc["site"]),
|
|
- //Domain: qu.ObjToString(doc["domain"]),
|
|
|
|
- //Href: qu.ObjToString(doc["href"]),
|
|
|
|
- Title: qu.ObjToString(doc["title"]),
|
|
|
|
- Data: &doc,
|
|
|
|
- City: qu.ObjToString(doc["city"]),
|
|
|
|
- Province: qu.ObjToString(doc["area"]),
|
|
|
|
- Jsondata: toMap,
|
|
|
|
- Result: map[string][]*ju.ExtField{},
|
|
|
|
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
|
- RuleBlock: e.RuleBlock,
|
|
|
|
- Dataging: qu.IntAll(doc["dataging"]),
|
|
|
|
- IsClearnMoney: isClearnMoneystr,
|
|
|
|
- IsUnRulesTab: false,
|
|
|
|
|
|
+ Title: qu.ObjToString(doc["title"]),
|
|
|
|
+ Data: &doc,
|
|
|
|
+ City: qu.ObjToString(doc["city"]),
|
|
|
|
+ Province: qu.ObjToString(doc["area"]),
|
|
|
|
+ Jsondata: toMap,
|
|
|
|
+ Result: map[string][]*ju.ExtField{},
|
|
|
|
+ BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
|
+ RuleBlock: e.RuleBlock,
|
|
|
|
+ Dataging: qu.IntAll(doc["dataging"]),
|
|
|
|
+ IsClearnMoney: isClearnMoneystr,
|
|
|
|
+ IsUnRulesTab: false,
|
|
}
|
|
}
|
|
if isextFile {
|
|
if isextFile {
|
|
jf = &ju.Job{
|
|
jf = &ju.Job{
|
|
@@ -362,10 +340,10 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- qu.Try(func() {
|
|
|
|
- pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
|
|
|
|
|
|
+ qu.Try(func() { //不解析表格
|
|
|
|
+ pretreated.AnalyStartNoTable(j, isSite, codeSite) //job.Block分块
|
|
if isextFile && strings.TrimSpace(jf.Content) != "" {
|
|
if isextFile && strings.TrimSpace(jf.Content) != "" {
|
|
- pretreated.AnalyStart(jf, isSite, codeSite)
|
|
|
|
|
|
+ pretreated.AnalyStartNoTable(jf, isSite, codeSite)
|
|
}
|
|
}
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
log.Debug("pretreated.AnalyStart", err, j.SourceMid)
|
|
log.Debug("pretreated.AnalyStart", err, j.SourceMid)
|