|
@@ -1,4 +1,5 @@
|
|
-/**
|
|
|
|
|
|
+/*
|
|
|
|
+*
|
|
爬虫,脚本接口,需要扩展
|
|
爬虫,脚本接口,需要扩展
|
|
*/
|
|
*/
|
|
package spider
|
|
package spider
|
|
@@ -112,7 +113,7 @@ type DelaySite struct {
|
|
Compete bool
|
|
Compete bool
|
|
}
|
|
}
|
|
|
|
|
|
-//任务
|
|
|
|
|
|
+// 任务
|
|
func (s *Spider) StartJob() {
|
|
func (s *Spider) StartJob() {
|
|
s.Stop = false
|
|
s.Stop = false
|
|
s.Pass = false
|
|
s.Pass = false
|
|
@@ -120,7 +121,7 @@ func (s *Spider) StartJob() {
|
|
go s.ExecJob(false)
|
|
go s.ExecJob(false)
|
|
}
|
|
}
|
|
|
|
|
|
-//单次执行
|
|
|
|
|
|
+// 单次执行
|
|
func (s *Spider) ExecJob(reload bool) {
|
|
func (s *Spider) ExecJob(reload bool) {
|
|
defer func() {
|
|
defer func() {
|
|
s.ExecuteOkTime = time.Now().Unix()
|
|
s.ExecuteOkTime = time.Now().Unix()
|
|
@@ -150,11 +151,14 @@ func (s *Spider) ExecJob(reload bool) {
|
|
//}
|
|
//}
|
|
//判断是否使用高并发下载三级页
|
|
//判断是否使用高并发下载三级页
|
|
var err interface{}
|
|
var err interface{}
|
|
- if util.Config.PageTurnInfo.ListThreadsNum > 1 {
|
|
|
|
|
|
+ if Supplement {
|
|
|
|
+ err = s.SupplementDownListPageItem() //增量补采数据,下载列表
|
|
|
|
+ } else if util.Config.PageTurnInfo.ListThreadsNum > 1 {
|
|
err = s.DownListPageItemByThreads() //并发下载列表
|
|
err = s.DownListPageItemByThreads() //并发下载列表
|
|
} else {
|
|
} else {
|
|
err = s.DownListPageItem() //下载列表
|
|
err = s.DownListPageItem() //下载列表
|
|
}
|
|
}
|
|
|
|
+
|
|
//if util.Config.Working == 0 && util.Config.Modal == 1 && !util.Config.IsHistoryEvent {
|
|
//if util.Config.Working == 0 && util.Config.Modal == 1 && !util.Config.IsHistoryEvent {
|
|
// err = s.DownListPageItemByThreads() //下载列表
|
|
// err = s.DownListPageItemByThreads() //下载列表
|
|
//} else {
|
|
//} else {
|
|
@@ -168,7 +172,7 @@ func (s *Spider) ExecJob(reload bool) {
|
|
SpiderCodeSendToEditor(s.Code) //历史转增量爬虫发送编辑器,切换节点上下架
|
|
SpiderCodeSendToEditor(s.Code) //历史转增量爬虫发送编辑器,切换节点上下架
|
|
return
|
|
return
|
|
} else {
|
|
} else {
|
|
- if util.Config.Working == 0 { //高性能模式
|
|
|
|
|
|
+ if util.Config.Working == 0 && !Supplement { //高性能模式
|
|
/*
|
|
/*
|
|
for !s.Stop && s.Pass {
|
|
for !s.Stop && s.Pass {
|
|
util.TimeSleepFunc(2*time.Second, TimeSleepChan)
|
|
util.TimeSleepFunc(2*time.Second, TimeSleepChan)
|
|
@@ -191,13 +195,13 @@ func (s *Spider) ExecJob(reload bool) {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- } else { //排队模式
|
|
|
|
|
|
+ } else { //排队模式或者数据补采
|
|
return
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-//获取最新时间--作为最后更新时间
|
|
|
|
|
|
+// 获取最新时间--作为最后更新时间
|
|
func (s *Spider) GetLastPublishTime() (errs interface{}) {
|
|
func (s *Spider) GetLastPublishTime() (errs interface{}) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
var lastpublishtime string
|
|
var lastpublishtime string
|
|
@@ -229,7 +233,7 @@ func (s *Spider) GetLastPublishTime() (errs interface{}) {
|
|
return nil
|
|
return nil
|
|
}
|
|
}
|
|
|
|
|
|
-//下载列表(较DownListPageItemBack去掉了无数据的重试和重复页记录)
|
|
|
|
|
|
+// 下载列表(较DownListPageItemBack去掉了无数据的重试和重复页记录)
|
|
func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
|
|
start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
|
|
@@ -607,7 +611,7 @@ func (s *Spider) DownListPageItemBack() (errs interface{}) {
|
|
return errs
|
|
return errs
|
|
}
|
|
}
|
|
|
|
|
|
-//并发下载列表
|
|
|
|
|
|
+// 并发下载列表
|
|
func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
|
|
func (s *Spider) DownListPageItemByThreads() (errs interface{}) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
|
|
start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
|
|
@@ -932,7 +936,133 @@ func (s *Spider) DownListPageItemByThreadsBack() (errs interface{}) {
|
|
return errs
|
|
return errs
|
|
}
|
|
}
|
|
|
|
|
|
-//下载某一页数据
|
|
|
|
|
|
+// 补采下载列表
|
|
|
|
+func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ var (
|
|
|
|
+ errtimes int //采集异常次数(暂定10次)
|
|
|
|
+ errPageNum int //当前采集异常页码
|
|
|
|
+ downtimes int //记录某页重试次数(暂定3次)
|
|
|
|
+ downloadAllNum int //记录本次采集,信息采集总量
|
|
|
|
+ saveAllNum int //记录本次采集,信息补采总量
|
|
|
|
+ repeatAllNum int //记录本次采集,信息重复总量
|
|
|
|
+ pageTitleHash string //记录当前页所有title文本
|
|
|
|
+ finishText = "正常退出"
|
|
|
|
+ )
|
|
|
|
+ start := 1 //起始页
|
|
|
|
+ for {
|
|
|
|
+ if errtimes >= Supplement_MaxErrorTimes { //连续异常次数超过10次,爬虫不再翻页
|
|
|
|
+ finishText = "异常退出"
|
|
|
|
+ logger.Info(s.Code + "连续10页采集异常")
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ if err := s.L.CallByParam(lua.P{
|
|
|
|
+ Fn: s.L.GetGlobal("downloadAndParseListPage"),
|
|
|
|
+ NRet: 1,
|
|
|
|
+ Protect: true,
|
|
|
|
+ }, lua.LNumber(start)); err != nil {
|
|
|
|
+ //panic(s.Code + "," + err.Error())
|
|
|
|
+ logger.Error("列表页采集报错", start, s.Code+","+err.Error())
|
|
|
|
+ errs = err.Error()
|
|
|
|
+ if downtimes < 3 {
|
|
|
|
+ downtimes++
|
|
|
|
+ } else if errtimes == 0 || start == errPageNum+1 {
|
|
|
|
+ errtimes++
|
|
|
|
+ errPageNum = start
|
|
|
|
+ start++
|
|
|
|
+ downtimes = 0
|
|
|
|
+ }
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ lv := s.L.Get(-1)
|
|
|
|
+ s.L.Pop(1)
|
|
|
|
+ if tbl, ok := lv.(*lua.LTable); ok {
|
|
|
|
+ qu.Debug(tbl.Len())
|
|
|
|
+ if tabLen := tbl.Len(); tabLen > 0 { //列表页有数据,根据列表页信息下载三级页
|
|
|
|
+ repeatListNum := 0 // 当前列表页连接重复个数
|
|
|
|
+ isBreak := false
|
|
|
|
+ var publishtimeErrTimes int
|
|
|
|
+ var text string
|
|
|
|
+ num := 1
|
|
|
|
+ for ; num <= tabLen; num++ {
|
|
|
|
+ v := tbl.RawGetInt(num).(*lua.LTable)
|
|
|
|
+ tmp := util.TableToMap(v)
|
|
|
|
+ pTmp := qu.ObjToString(tmp["publishtime"])
|
|
|
|
+ title := qu.ObjToString(tmp["title"])
|
|
|
|
+ text += title
|
|
|
|
+ pTime, _ := time.ParseInLocation(qu.Date_Full_Layout, pTmp, time.Local)
|
|
|
|
+ publishtime := pTime.Unix()
|
|
|
|
+ if publishtime > 1000000000 && publishtime < Supplement_Publishtime { //正常退出
|
|
|
|
+ isBreak = true
|
|
|
|
+ break
|
|
|
|
+ } else if publishtime <= 1000000000 { //异常发布时间
|
|
|
|
+ publishtimeErrTimes++
|
|
|
|
+ }
|
|
|
|
+ tmp["dataging"] = 0 //数据中打标记dataging=0
|
|
|
|
+ s.DownloadDetailItem(tmp, &repeatListNum)
|
|
|
|
+ }
|
|
|
|
+ downloadAllNum += tabLen
|
|
|
|
+ repeatAllNum += repeatListNum
|
|
|
|
+ saveAllNum += num - 1 - repeatListNum
|
|
|
|
+ tmpPageTitleHash := pageTitleHash
|
|
|
|
+ pageTitleHash = util.HexText(text)
|
|
|
|
+ if tabLen == publishtimeErrTimes || tmpPageTitleHash == pageTitleHash { //当前页数据发布时间均异常;当前页与上页采集内容一致
|
|
|
|
+ //if errtimes == 0 || start == errPageNum+1 {
|
|
|
|
+ errtimes++
|
|
|
|
+ errPageNum = start
|
|
|
|
+ start++
|
|
|
|
+ //}
|
|
|
|
+ continue
|
|
|
|
+ } else if isBreak { //中断不再采集
|
|
|
|
+ start++
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ if downtimes < 3 {
|
|
|
|
+ downtimes++
|
|
|
|
+ } else if errtimes == 0 || start == errPageNum+1 {
|
|
|
|
+ errtimes++
|
|
|
|
+ errPageNum = start
|
|
|
|
+ start++
|
|
|
|
+ downtimes = 0
|
|
|
|
+ }
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ if downtimes < 3 {
|
|
|
|
+ downtimes++
|
|
|
|
+ } else if errtimes == 0 || start == errPageNum+1 {
|
|
|
|
+ errtimes++
|
|
|
|
+ errPageNum = start
|
|
|
|
+ start++
|
|
|
|
+ downtimes = 0
|
|
|
|
+ }
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ start++
|
|
|
|
+ downtimes = 0
|
|
|
|
+ errtimes = 0
|
|
|
|
+ errPageNum = 0
|
|
|
|
+ util.TimeSleepFunc(100*time.Millisecond, TimeSleepChan)
|
|
|
|
+ }
|
|
|
|
+ logger.Info(s.Code, "本轮列表页采集详情:", downloadAllNum, repeatAllNum, start, finishText)
|
|
|
|
+ if !util.Config.IsHistoryEvent && !s.Stop { //非历史节点统计下载率
|
|
|
|
+ save := map[string]interface{}{
|
|
|
|
+ "site": s.Name,
|
|
|
|
+ "channel": s.Channel,
|
|
|
|
+ "spidercode": s.Code,
|
|
|
|
+ "comeintime": time.Now().Unix(),
|
|
|
|
+ "modifyuser": s.MUserName,
|
|
|
|
+ "endpage": start,
|
|
|
|
+ "finish": finishText,
|
|
|
|
+ "num": saveAllNum,
|
|
|
|
+ }
|
|
|
|
+ MgoS.Save("spider_supplement", save)
|
|
|
|
+ }
|
|
|
|
+ return errs
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// 下载某一页数据
|
|
func (s *Spider) DownListOnePage(pagenum int) (downnum, repeatnum int) {
|
|
func (s *Spider) DownListOnePage(pagenum int) (downnum, repeatnum int) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
downtimes := 0
|
|
downtimes := 0
|
|
@@ -988,7 +1118,7 @@ func (s *Spider) DownListOnePage(pagenum int) (downnum, repeatnum int) {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
|
|
|
|
-//开启单独线程继续采集列表页
|
|
|
|
|
|
+// 开启单独线程继续采集列表页
|
|
func ContinueDownListPageItem(s *Spider) {
|
|
func ContinueDownListPageItem(s *Spider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
spTmp, errstr := CreateSpider(s.SCode, s.ScriptFile, true, true) //生成新爬虫
|
|
spTmp, errstr := CreateSpider(s.SCode, s.ScriptFile, true, true) //生成新爬虫
|
|
@@ -1004,7 +1134,7 @@ func ContinueDownListPageItem(s *Spider) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-//遍历,开启三级页下载(历史补漏)
|
|
|
|
|
|
+// 遍历,开启三级页下载(历史补漏)
|
|
func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
//qu.Debug("--------------历史下载-----------------")
|
|
//qu.Debug("--------------历史下载-----------------")
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
@@ -1121,7 +1251,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
//qu.Debug("--------------保存结束---------------")
|
|
//qu.Debug("--------------保存结束---------------")
|
|
}
|
|
}
|
|
|
|
|
|
-//遍历,开启三级页下载(增量)
|
|
|
|
|
|
+// 遍历,开启三级页下载(增量)
|
|
func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
var err interface{}
|
|
var err interface{}
|
|
@@ -1138,13 +1268,16 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
hashHref := util.HexText(href)
|
|
hashHref := util.HexText(href)
|
|
//列表页redis判重
|
|
//列表页redis判重
|
|
isExist := util.RedisExist("list", "list_"+hashHref)
|
|
isExist := util.RedisExist("list", "list_"+hashHref)
|
|
|
|
+ if Supplement { //补采,再进行全量redis判重
|
|
|
|
+ isExist, _ = util.ExistsBloomRedis("href", href)
|
|
|
|
+ }
|
|
if isExist {
|
|
if isExist {
|
|
*num++ //已采集
|
|
*num++ //已采集
|
|
return
|
|
return
|
|
}
|
|
}
|
|
id := "" //记录spider_listdata中保存的数据id,便于下载成功后更新状态
|
|
id := "" //记录spider_listdata中保存的数据id,便于下载成功后更新状态
|
|
//if util.Config.Modal == 1 || (util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history") { //列表页、详情页分开采集模式节点和7000节点新爬虫采集的数据数据
|
|
//if util.Config.Modal == 1 || (util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history") { //列表页、详情页分开采集模式节点和7000节点新爬虫采集的数据数据
|
|
- if util.Config.Modal == 1 || util.Config.IsHistoryEvent { //分开采集模式和历史节点(7000)
|
|
|
|
|
|
+ if util.Config.Modal == 1 || util.Config.IsHistoryEvent || Supplement { //分开采集模式和历史节点(7000)
|
|
SaveHighListPageData(paramdata, hashHref, num) //存表
|
|
SaveHighListPageData(paramdata, hashHref, num) //存表
|
|
return
|
|
return
|
|
} else {
|
|
} else {
|
|
@@ -1230,7 +1363,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
}
|
|
}
|
|
|
|
|
|
-//遍历下载名录
|
|
|
|
|
|
+// 遍历下载名录
|
|
func (s *Spider) DownloadDetailByNames(p interface{}) {
|
|
func (s *Spider) DownloadDetailByNames(p interface{}) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
var err interface{}
|
|
var err interface{}
|
|
@@ -1266,7 +1399,7 @@ func (s *Spider) DownloadDetailByNames(p interface{}) {
|
|
Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
}
|
|
}
|
|
|
|
|
|
-//下载解析详情页
|
|
|
|
|
|
+// 下载解析详情页
|
|
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
|
|
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
s.LastHeartbeat = time.Now().Unix()
|
|
s.LastHeartbeat = time.Now().Unix()
|
|
@@ -1321,7 +1454,7 @@ func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[strin
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-//高性能模式定时采集三级页信息
|
|
|
|
|
|
+// 高性能模式定时采集三级页信息
|
|
func DetailData() {
|
|
func DetailData() {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
<-InitAllLuaOver //脚本加载完毕,执行
|
|
<-InitAllLuaOver //脚本加载完毕,执行
|
|
@@ -1340,7 +1473,7 @@ func GetListDataDownloadDetail() {
|
|
})
|
|
})
|
|
}
|
|
}
|
|
|
|
|
|
-//高性能模式根据列表页数据下载三级页
|
|
|
|
|
|
+// 高性能模式根据列表页数据下载三级页
|
|
func (s *Spider) DownloadHighDetail(reload bool) {
|
|
func (s *Spider) DownloadHighDetail(reload bool) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
for {
|
|
for {
|
|
@@ -1353,7 +1486,7 @@ func (s *Spider) DownloadHighDetail(reload bool) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-//队列模式根据列表页数据下载三级页
|
|
|
|
|
|
+// 队列模式根据列表页数据下载三级页
|
|
func (s *Spider) DownloadListDetail(reload bool) {
|
|
func (s *Spider) DownloadListDetail(reload bool) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
s.DownloadDetail(reload, false)
|
|
s.DownloadDetail(reload, false)
|
|
@@ -1366,7 +1499,7 @@ func (s *Spider) DownloadListDetail(reload bool) {
|
|
CC2 <- s.L
|
|
CC2 <- s.L
|
|
}
|
|
}
|
|
|
|
|
|
-//下载详情页
|
|
|
|
|
|
+// 下载详情页
|
|
func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
|
|
func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
coll := "spider_highlistdata"
|
|
coll := "spider_highlistdata"
|
|
@@ -1600,7 +1733,7 @@ func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-//初始化sp对象
|
|
|
|
|
|
+// 初始化sp对象
|
|
func NewSpiderByScript(num int, code, script string, spChan chan *Spider) {
|
|
func NewSpiderByScript(num int, code, script string, spChan chan *Spider) {
|
|
for i := 1; i <= num; i++ {
|
|
for i := 1; i <= num; i++ {
|
|
spTmp, errstr := CreateSpider(code, script, true, true)
|
|
spTmp, errstr := CreateSpider(code, script, true, true)
|
|
@@ -1612,7 +1745,7 @@ func NewSpiderByScript(num int, code, script string, spChan chan *Spider) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-//detail含“详情请访问原网页!”且附件未下成功的,不计入下载成功
|
|
|
|
|
|
+// detail含“详情请访问原网页!”且附件未下成功的,不计入下载成功
|
|
func AnalysisProjectInfo(data map[string]interface{}) string {
|
|
func AnalysisProjectInfo(data map[string]interface{}) string {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
detail := qu.ObjToString(data["detail"])
|
|
detail := qu.ObjToString(data["detail"])
|
|
@@ -1641,14 +1774,14 @@ func AnalysisProjectInfo(data map[string]interface{}) string {
|
|
return ""
|
|
return ""
|
|
}
|
|
}
|
|
|
|
|
|
-//打印线程数
|
|
|
|
|
|
+// 打印线程数
|
|
func AllThreadLog() {
|
|
func AllThreadLog() {
|
|
logger.Info("List Download All Thread:", ListAllThreadNum)
|
|
logger.Info("List Download All Thread:", ListAllThreadNum)
|
|
logger.Info("Detail Download All Thread:", AllThreadNum)
|
|
logger.Info("Detail Download All Thread:", AllThreadNum)
|
|
time.AfterFunc(1*time.Minute, AllThreadLog)
|
|
time.AfterFunc(1*time.Minute, AllThreadLog)
|
|
}
|
|
}
|
|
|
|
|
|
-//获取hascode
|
|
|
|
|
|
+// 获取hascode
|
|
func GetHas1(data string) string {
|
|
func GetHas1(data string) string {
|
|
t := sha1.New()
|
|
t := sha1.New()
|
|
io.WriteString(t, data)
|
|
io.WriteString(t, data)
|