|
@@ -14,6 +14,7 @@ import (
|
|
mu "mfw/util"
|
|
mu "mfw/util"
|
|
mgo "mongodb"
|
|
mgo "mongodb"
|
|
qu "qfw/util"
|
|
qu "qfw/util"
|
|
|
|
+ mgu "qfw/util/mongodbutil"
|
|
"sync"
|
|
"sync"
|
|
|
|
|
|
//"sync"
|
|
//"sync"
|
|
@@ -30,6 +31,13 @@ import (
|
|
"github.com/yuin/gopher-lua"
|
|
"github.com/yuin/gopher-lua"
|
|
)
|
|
)
|
|
|
|
|
|
|
|
// Heart holds the most recent heartbeat timestamps recorded for one spider,
// together with the name of the user who maintains it.
type Heart struct {
	// DetailHeart is the heartbeat of the spider's detail (third-level) page execution.
	DetailHeart int64
	// DetailExecuteHeart is the heartbeat recorded when a detail page yielded data.
	DetailExecuteHeart int64
	// ListHeart is the heartbeat of the spider's list page execution.
	ListHeart int64
	// ModifyUser is the spider's maintainer.
	ModifyUser string
}
|
|
|
|
+
|
|
//爬虫()
|
|
//爬虫()
|
|
type Spider struct {
|
|
type Spider struct {
|
|
Script
|
|
Script
|
|
@@ -65,12 +73,46 @@ type Spider struct {
|
|
IsMustDownload bool //是否强制下载
|
|
IsMustDownload bool //是否强制下载
|
|
}
|
|
}
|
|
|
|
|
|
-var UpdataMgoCache = make(chan []map[string]interface{}, 1000) //更新要重下数据的状态
|
|
|
|
|
|
+var UpdataMgoCache = make(chan []map[string]interface{}, 1000) //更新要重下数据的状态
|
|
|
|
+var UpdataHeartCache = make(chan []map[string]interface{}, 1000) //更新爬虫心跳信息
|
|
var SP = make(chan bool, 5)
|
|
var SP = make(chan bool, 5)
|
|
|
|
+var SPH = make(chan bool, 5)
|
|
var Mgo *mgo.MongodbSim
|
|
var Mgo *mgo.MongodbSim
|
|
var TimeChan = make(chan bool, 1)
|
|
var TimeChan = make(chan bool, 1)
|
|
var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
|
|
var Reg = regexp.MustCompile(`(http|https)://([\w]+\.)+[\w]+(/?)`)
|
|
|
|
|
|
|
|
+//心跳
|
|
|
|
+func UpdateHeart(code, user, t string) {
|
|
|
|
+ if htmp, ok := SpiderHeart.Load(code); ok {
|
|
|
|
+ if heart, ok := htmp.(*Heart); ok {
|
|
|
|
+ if t == "list" {
|
|
|
|
+ heart.ListHeart = time.Now().Unix()
|
|
|
|
+ } else if t == "detail" {
|
|
|
|
+ heart.DetailHeart = time.Now().Unix()
|
|
|
|
+ } else if t == "detailexcute" {
|
|
|
|
+ heart.DetailExecuteHeart = time.Now().Unix()
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ if t == "list" {
|
|
|
|
+ SpiderHeart.Store(code, &Heart{
|
|
|
|
+ ListHeart: time.Now().Unix(),
|
|
|
|
+ ModifyUser: user,
|
|
|
|
+ })
|
|
|
|
+ } else if t == "detail" {
|
|
|
|
+ SpiderHeart.Store(code, &Heart{
|
|
|
|
+ DetailHeart: time.Now().Unix(),
|
|
|
|
+ ModifyUser: user,
|
|
|
|
+ })
|
|
|
|
+ } else if t == "detailexcute" {
|
|
|
|
+ SpiderHeart.Store(code, &Heart{
|
|
|
|
+ DetailExecuteHeart: time.Now().Unix(),
|
|
|
|
+ ModifyUser: user,
|
|
|
|
+ })
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
//任务
|
|
//任务
|
|
func (s *Spider) StartJob() {
|
|
func (s *Spider) StartJob() {
|
|
s.Stop = false
|
|
s.Stop = false
|
|
@@ -117,7 +159,8 @@ func (s *Spider) ExecJob(reload bool) {
|
|
if err != nil {
|
|
if err != nil {
|
|
logger.Error(s.Code, err)
|
|
logger.Error(s.Code, err)
|
|
}
|
|
}
|
|
- err = s.DownListPageItem() //下载列表
|
|
|
|
|
|
+ UpdateHeart(s.Code, s.MUserName, "list") //记录所有节点列表页心跳
|
|
|
|
+ err = s.DownListPageItem() //下载列表
|
|
if err != nil {
|
|
if err != nil {
|
|
logger.Error(s.Code, err)
|
|
logger.Error(s.Code, err)
|
|
}
|
|
}
|
|
@@ -144,10 +187,12 @@ func (s *Spider) ExecJob(reload bool) {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
*/
|
|
*/
|
|
- if s.IsMustDownload { //历史数据下载,只跑一轮
|
|
|
|
- fmt.Println("Delete History Code:", s.Code)
|
|
|
|
|
|
+ //if s.IsMustDownload { //历史数据下载,只跑一轮
|
|
|
|
+ if s.IsHistoricalMend && util.Config.IsHistoryEvent { //历史节点7000,高性能模式,历史补漏只下载一轮
|
|
s.Stop = true
|
|
s.Stop = true
|
|
s.L.Close()
|
|
s.L.Close()
|
|
|
|
+ b := mgu.Update("luaconfig", "editor", "editor", map[string]interface{}{"code": s.Code}, map[string]interface{}{"$set": map[string]interface{}{"state": 6}}, false, false)
|
|
|
|
+ logger.Info("Delete History Code:", s.Code, b)
|
|
} else {
|
|
} else {
|
|
if !s.Stop { //未下架定时执行
|
|
if !s.Stop { //未下架定时执行
|
|
util.TimeAfterFunc(time.Duration(s.SpiderRunRate)*time.Minute, func() {
|
|
util.TimeAfterFunc(time.Duration(s.SpiderRunRate)*time.Minute, func() {
|
|
@@ -420,6 +465,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
} else {
|
|
} else {
|
|
SaveListPageData(paramdata) //保存7000、7500、7700节点列表页采集的信息
|
|
SaveListPageData(paramdata) //保存7000、7500、7700节点列表页采集的信息
|
|
}
|
|
}
|
|
|
|
+ UpdateHeart(s.Code, s.MUserName, "detail") //记录modal=0老模式采集三级页心跳
|
|
//下载、解析、入库
|
|
//下载、解析、入库
|
|
data, err = s.DownloadDetailPage(paramdata, data)
|
|
data, err = s.DownloadDetailPage(paramdata, data)
|
|
if err != nil || data == nil {
|
|
if err != nil || data == nil {
|
|
@@ -445,6 +491,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
if t1 > time.Now().Unix() { //防止发布时间超前
|
|
if t1 > time.Now().Unix() { //防止发布时间超前
|
|
data["publishtime"] = time.Now().Unix()
|
|
data["publishtime"] = time.Now().Unix()
|
|
}
|
|
}
|
|
|
|
+ UpdateHeart(s.Code, s.MUserName, "detailexcute") //记录modal=0老模式采集到数据心跳
|
|
delete(data, "exit")
|
|
delete(data, "exit")
|
|
delete(data, "checkpublishtime")
|
|
delete(data, "checkpublishtime")
|
|
data["comeintime"] = time.Now().Unix()
|
|
data["comeintime"] = time.Now().Unix()
|
|
@@ -599,6 +646,7 @@ func (s *Spider) DownloadHighDetail() {
|
|
"comeintime": 0,
|
|
"comeintime": 0,
|
|
"event": 0,
|
|
"event": 0,
|
|
}
|
|
}
|
|
|
|
+ UpdateHeart(s.Code, s.MUserName, "detail") //记录modal=1采集三级页心跳
|
|
list, _ := Mgo.Find("spider_highlistdata", q, o, f, false, 0, 100)
|
|
list, _ := Mgo.Find("spider_highlistdata", q, o, f, false, 0, 100)
|
|
if list != nil && len(*list) > 0 {
|
|
if list != nil && len(*list) > 0 {
|
|
for _, tmp := range *list {
|
|
for _, tmp := range *list {
|
|
@@ -626,6 +674,7 @@ func (s *Spider) DownloadHighDetail() {
|
|
}
|
|
}
|
|
//下载、解析、入库
|
|
//下载、解析、入库
|
|
data, err = s.DownloadDetailPage(tmp, data)
|
|
data, err = s.DownloadDetailPage(tmp, data)
|
|
|
|
+ UpdateHeart(s.Code, s.MUserName, "detailexcute") //记录modal=1下载数据心跳
|
|
if err != nil || data == nil {
|
|
if err != nil || data == nil {
|
|
success = false
|
|
success = false
|
|
times++
|
|
times++
|
|
@@ -680,6 +729,8 @@ func (s *Spider) DownloadHighDetail() {
|
|
set := map[string]interface{}{"$set": map[string]interface{}{"state": 1}} //下载成功state置为1
|
|
set := map[string]interface{}{"$set": map[string]interface{}{"state": 1}} //下载成功state置为1
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
Mgo.Update("spider_highlistdata", query, set, false, false)
|
|
}
|
|
}
|
|
|
|
+ //重载spider
|
|
|
|
+ s.LoadScript(s.Code, s.ScriptFile, true)
|
|
} else { //没有数据
|
|
} else { //没有数据
|
|
time.Sleep(2 * time.Minute)
|
|
time.Sleep(2 * time.Minute)
|
|
}
|
|
}
|
|
@@ -715,6 +766,7 @@ func (s *Spider) DownloadListDetail() {
|
|
"comeintime": 0,
|
|
"comeintime": 0,
|
|
"event": 0,
|
|
"event": 0,
|
|
}
|
|
}
|
|
|
|
+ UpdateHeart(s.Code, s.MUserName, "detail") //记录modal=1采集三级页心跳
|
|
list, _ := Mgo.Find("spider_highlistdata", q, o, f, false, 0, 100)
|
|
list, _ := Mgo.Find("spider_highlistdata", q, o, f, false, 0, 100)
|
|
if list != nil && len(*list) > 0 {
|
|
if list != nil && len(*list) > 0 {
|
|
for _, tmp := range *list {
|
|
for _, tmp := range *list {
|
|
@@ -742,6 +794,7 @@ func (s *Spider) DownloadListDetail() {
|
|
}
|
|
}
|
|
//下载、解析、入库
|
|
//下载、解析、入库
|
|
data, err = s.DownloadDetailPage(tmp, data)
|
|
data, err = s.DownloadDetailPage(tmp, data)
|
|
|
|
+ UpdateHeart(s.Code, s.MUserName, "detailexcute") //记录modal=1下载数据心跳
|
|
if err != nil || data == nil {
|
|
if err != nil || data == nil {
|
|
success = false
|
|
success = false
|
|
times++
|
|
times++
|