|
@@ -13,7 +13,6 @@ import (
|
|
"reflect"
|
|
"reflect"
|
|
"regexp"
|
|
"regexp"
|
|
"strconv"
|
|
"strconv"
|
|
- "strings"
|
|
|
|
"sync"
|
|
"sync"
|
|
"time"
|
|
"time"
|
|
|
|
|
|
@@ -27,7 +26,7 @@ var (
|
|
TaskList map[string]*ExtractTask //任务列表
|
|
TaskList map[string]*ExtractTask //任务列表
|
|
saveLimit = 200 //抽取日志批量保存
|
|
saveLimit = 200 //抽取日志批量保存
|
|
PageSize = 5000 //查询分页
|
|
PageSize = 5000 //查询分页
|
|
- Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1}`
|
|
|
|
|
|
+ Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1}`
|
|
)
|
|
)
|
|
|
|
|
|
//启动测试抽取
|
|
//启动测试抽取
|
|
@@ -37,7 +36,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
|
|
ext.Id = taskId
|
|
ext.Id = taskId
|
|
ext.IsRun = true
|
|
ext.IsRun = true
|
|
ext.InitTestTaskInfo(resultcoll, trackcoll)
|
|
ext.InitTestTaskInfo(resultcoll, trackcoll)
|
|
- ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
|
|
|
|
|
|
+ ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
|
|
ext.InitRulePres()
|
|
ext.InitRulePres()
|
|
ext.InitRuleBacks()
|
|
ext.InitRuleBacks()
|
|
ext.InitRuleCore()
|
|
ext.InitRuleCore()
|
|
@@ -67,7 +66,7 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
id := IdTrans(startId)
|
|
id := IdTrans(startId)
|
|
if id.Valid() {
|
|
if id.Valid() {
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
|
|
- list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
|
|
|
|
|
|
+ list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
|
|
for _, v := range *list {
|
|
for _, v := range *list {
|
|
//log.Println(v["_id"])
|
|
//log.Println(v["_id"])
|
|
j := PreInfo(v)
|
|
j := PreInfo(v)
|
|
@@ -93,7 +92,8 @@ func StartExtractTaskId(taskId string) bool {
|
|
ext.Id = taskId
|
|
ext.Id = taskId
|
|
ext.InitTaskInfo()
|
|
ext.InitTaskInfo()
|
|
}
|
|
}
|
|
- ext.TaskInfo.DB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
|
|
|
|
|
|
+ ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
|
|
|
|
+ ext.TaskInfo.TDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
|
|
ext.InitRulePres()
|
|
ext.InitRulePres()
|
|
ext.InitRuleBacks()
|
|
ext.InitRuleBacks()
|
|
ext.InitRuleCore()
|
|
ext.InitRuleCore()
|
|
@@ -136,7 +136,7 @@ func StopExtractTaskId(taskId string) bool {
|
|
func RunExtractTask(taskId string) {
|
|
func RunExtractTask(taskId string) {
|
|
ext := TaskList[taskId]
|
|
ext := TaskList[taskId]
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
- count := ext.TaskInfo.DB.Count(ext.TaskInfo.FromColl, query)
|
|
|
|
|
|
+ count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
|
|
pageNum := (count + PageSize - 1) / PageSize
|
|
pageNum := (count + PageSize - 1) / PageSize
|
|
limit := PageSize
|
|
limit := PageSize
|
|
if count < PageSize {
|
|
if count < PageSize {
|
|
@@ -146,7 +146,7 @@ func RunExtractTask(taskId string) {
|
|
for i := 0; i < pageNum; i++ {
|
|
for i := 0; i < pageNum; i++ {
|
|
query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
log.Printf("page=%d,query=%v", i+1, query)
|
|
log.Printf("page=%d,query=%v", i+1, query)
|
|
- list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
|
|
|
|
|
|
+ list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
|
|
for _, v := range *list {
|
|
for _, v := range *list {
|
|
//log.Println(v["_id"])
|
|
//log.Println(v["_id"])
|
|
if !ext.IsRun {
|
|
if !ext.IsRun {
|
|
@@ -179,17 +179,6 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
|
|
detail = ju.CutLableStr(detail)
|
|
detail = ju.CutLableStr(detail)
|
|
detail = cut.ClearHtml(detail)
|
|
detail = cut.ClearHtml(detail)
|
|
doc["detail"] = detail
|
|
doc["detail"] = detail
|
|
- href := qu.ObjToString(doc["href"])
|
|
|
|
- if strings.HasPrefix(href, "http://") {
|
|
|
|
- href = href[7:]
|
|
|
|
- } else if strings.HasPrefix(href, "https://") {
|
|
|
|
- href = href[8:]
|
|
|
|
- }
|
|
|
|
- pos := strings.Index(href, "/")
|
|
|
|
- if pos > 0 {
|
|
|
|
- href = href[:pos]
|
|
|
|
- }
|
|
|
|
- doc["domain"] = href
|
|
|
|
toptype := qu.ObjToString(doc["toptype"])
|
|
toptype := qu.ObjToString(doc["toptype"])
|
|
if qu.ObjToString(doc["type"]) == "bid" {
|
|
if qu.ObjToString(doc["type"]) == "bid" {
|
|
toptype = "结果"
|
|
toptype = "结果"
|
|
@@ -202,14 +191,14 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
|
|
Category: toptype,
|
|
Category: toptype,
|
|
Content: qu.ObjToString(doc["detail"]),
|
|
Content: qu.ObjToString(doc["detail"]),
|
|
SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
SpiderCode: qu.ObjToString(doc["spidercode"]),
|
|
- Domain: qu.ObjToString(doc["domain"]),
|
|
|
|
- Href: qu.ObjToString(doc["href"]),
|
|
|
|
- Title: qu.ObjToString(doc["title"]),
|
|
|
|
- Data: &doc,
|
|
|
|
- City: qu.ObjToString(doc["city"]),
|
|
|
|
- Province: qu.ObjToString(doc["area"]),
|
|
|
|
- Result: map[string][]*ju.ExtField{},
|
|
|
|
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
|
|
|
+ //Domain: qu.ObjToString(doc["domain"]),
|
|
|
|
+ //Href: qu.ObjToString(doc["href"]),
|
|
|
|
+ Title: qu.ObjToString(doc["title"]),
|
|
|
|
+ Data: &doc,
|
|
|
|
+ City: qu.ObjToString(doc["city"]),
|
|
|
|
+ Province: qu.ObjToString(doc["area"]),
|
|
|
|
+ Result: map[string][]*ju.ExtField{},
|
|
|
|
+ BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
}
|
|
}
|
|
qu.Try(func() {
|
|
qu.Try(func() {
|
|
pretreated.AnalyStart(j)
|
|
pretreated.AnalyStart(j)
|
|
@@ -826,7 +815,8 @@ func AnalysisSaveResult(j *ju.Job, e *ExtractTask) {
|
|
tmp["winnerorder"] = j.Winnerorder
|
|
tmp["winnerorder"] = j.Winnerorder
|
|
}
|
|
}
|
|
for k, v := range *doc {
|
|
for k, v := range *doc {
|
|
- if k == "detail" || k == "contenthtml" {
|
|
|
|
|
|
+ //去重冗余字段
|
|
|
|
+ if k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
if tmp[k] == nil {
|
|
if tmp[k] == nil {
|