|
@@ -4,143 +4,23 @@ import (
|
|
|
"container/list"
|
|
|
"errors"
|
|
|
"fmt"
|
|
|
+ "github.com/chromedp/chromedp"
|
|
|
qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
|
be "spider_creator/backend"
|
|
|
"time"
|
|
|
-
|
|
|
- "github.com/chromedp/chromedp"
|
|
|
)
|
|
|
|
|
|
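// VerifySpiderConfig validates a spider configuration end to end (list page, detail pages, page turning); only the first 2 list items of each page are checked.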
|
|
|
-//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
-// qu.Debug("sc---", *sc)
|
|
|
-// verifyResult := list.New()
|
|
|
-// be.DataResults[sc.Code] = verifyResult
|
|
|
-// ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
|
|
|
-// _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
|
|
|
-// _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, false) //详情页使用
|
|
|
-// defer func() {
|
|
|
-// incCancelFn2()
|
|
|
-// baseCancelFn2()
|
|
|
-// incCancelFn()
|
|
|
-// baseCancelFn()
|
|
|
-// }()
|
|
|
-//
|
|
|
-// listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
|
-// //2. 执行JS代码,获取列表页信息
|
|
|
-// if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
|
|
|
-// listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
|
-// }
|
|
|
-// if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
|
|
|
-// contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
|
-// }
|
|
|
-// qu.Debug("获取列表页JS代码:", listRunJs)
|
|
|
-// qu.Debug("获取详情页JS代码:", contentRunJs)
|
|
|
-// //3.打开列表,获取条目清单
|
|
|
-// chromedp.Run(ctx, chromedp.Tasks{
|
|
|
-// chromedp.Navigate(sc.Href),
|
|
|
-// chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
-// //chromedp.Sleep(1000 * time.Millisecond),
|
|
|
-// chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
|
|
|
-// })
|
|
|
-// //初始化列表页信息
|
|
|
-// if !vm.InitListPage(ctx, sc) {
|
|
|
-// qu.Debug("初始化列表页失败,退出")
|
|
|
-// return ret, errors.New("初始化列表页失败")
|
|
|
-// }
|
|
|
-// no := 1
|
|
|
-//T:
|
|
|
-// for j := 0; j < 2; j++ { //最多检查2页
|
|
|
-// qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
|
|
|
-// listResult := make(be.ResultItems, 0)
|
|
|
-// err := chromedp.Run(ctx, chromedp.Tasks{
|
|
|
-// chromedp.Evaluate(listRunJs, &listResult),
|
|
|
-// })
|
|
|
-// if err != nil {
|
|
|
-// qu.Debug("执行列表页JS代码失败", err.Error())
|
|
|
-// continue
|
|
|
-// }
|
|
|
-// //TODO 5.操作详情页
|
|
|
-// qu.Debug("列表采集条数:", len(listResult))
|
|
|
-// for contentIndex, r := range listResult {
|
|
|
-// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
-// if contentIndex > 1 { //每页只校验2条
|
|
|
-// break
|
|
|
-// }
|
|
|
-// //打开详情页
|
|
|
-// err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
-// chromedp.Navigate(r.Href),
|
|
|
-// chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
-// //chromedp.Sleep(2000 * time.Millisecond),
|
|
|
-// chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
|
|
|
-// })
|
|
|
-// if err != nil {
|
|
|
-// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
|
|
|
-// continue
|
|
|
-// }
|
|
|
-// //获取详情页内容
|
|
|
-// err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
-// chromedp.Evaluate(contentRunJs, r),
|
|
|
-// })
|
|
|
-// if err != nil {
|
|
|
-// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
|
|
|
-// continue
|
|
|
-// }
|
|
|
-// //下载附件
|
|
|
-// if sc.AttachCss != "" {
|
|
|
-// downloadAttaches(r, vm.attachesDir)
|
|
|
-// }
|
|
|
-// r.Site = sc.Site
|
|
|
-// r.Channel = sc.Channel
|
|
|
-// if r.Title == "" {
|
|
|
-// r.Title = r.ListTitle
|
|
|
-// }
|
|
|
-// if r.PublishTime == "" {
|
|
|
-// r.PublishTime = r.ListPubTime
|
|
|
-// }
|
|
|
-// r.No = no
|
|
|
-// no += 1
|
|
|
-// //结果放入缓存
|
|
|
-// verifyResult.PushBack(r)
|
|
|
-// }
|
|
|
-// qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
|
|
|
-// //翻页
|
|
|
-// if verifyResult.Len() > 0 {
|
|
|
-// if sc.MaxPages == 1 { //最大页为1,不校验翻页
|
|
|
-// ret.ListTrunPage = true
|
|
|
-// break
|
|
|
-// } else if sc.MaxPages > 1 { //&& !ret.ListTrunPage {
|
|
|
-// if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
|
|
|
-// qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
|
|
|
-// break T
|
|
|
-// } else {
|
|
|
-// ret.ListTrunPage = true
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// //检查
|
|
|
-// for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
|
-// r, _ := el.Value.(*be.ResultItem)
|
|
|
-// ret.Title = r.Title != ""
|
|
|
-// ret.PublishUnit = r.PublishUnit != ""
|
|
|
-// ret.PublishTime = r.PublishTime != ""
|
|
|
-// ret.Content = r.Content != ""
|
|
|
-// ret.Attaches = len(r.AttachLinks) > 0
|
|
|
-// }
|
|
|
-// qu.Debug(verifyResult.Len())
|
|
|
-// ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
|
-// return ret, nil
|
|
|
-//}
|
|
|
-
|
|
|
-// VerifySpiderConfig 只验证列表标注
|
|
|
func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
qu.Debug("sc---", *sc)
|
|
|
verifyResult := list.New()
|
|
|
be.DataResults[sc.Code] = verifyResult
|
|
|
- ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
|
|
|
- _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
|
|
|
+ ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
|
|
|
+ _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
|
|
|
+ _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, false) //详情页使用
|
|
|
defer func() {
|
|
|
+ incCancelFn2()
|
|
|
+ baseCancelFn2()
|
|
|
incCancelFn()
|
|
|
baseCancelFn()
|
|
|
}()
|
|
@@ -153,7 +33,8 @@ func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyRes
|
|
|
if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
|
|
|
contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
|
}
|
|
|
- qu.Debug("列表页JS:", listRunJs)
|
|
|
+ qu.Debug("获取列表页JS代码:", listRunJs)
|
|
|
+ qu.Debug("获取详情页JS代码:", contentRunJs)
|
|
|
//3.打开列表,获取条目清单
|
|
|
chromedp.Run(ctx, chromedp.Tasks{
|
|
|
chromedp.Navigate(sc.Href),
|
|
@@ -161,7 +42,7 @@ func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyRes
|
|
|
//chromedp.Sleep(1000 * time.Millisecond),
|
|
|
chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
|
|
|
})
|
|
|
- //4.初始化列表页信息
|
|
|
+ //初始化列表页信息
|
|
|
if !vm.InitListPage(ctx, sc) {
|
|
|
qu.Debug("初始化列表页失败,退出")
|
|
|
return ret, errors.New("初始化列表页失败")
|
|
@@ -178,20 +59,41 @@ T:
|
|
|
qu.Debug("执行列表页JS代码失败", err.Error())
|
|
|
continue
|
|
|
}
|
|
|
- //5.操作详情页
|
|
|
+ //TODO 5.操作详情页
|
|
|
qu.Debug("列表采集条数:", len(listResult))
|
|
|
for contentIndex, r := range listResult {
|
|
|
+ qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
if contentIndex > 1 { //每页只校验2条
|
|
|
break
|
|
|
}
|
|
|
- qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
+ //打开详情页
|
|
|
+ err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
+ chromedp.Navigate(r.Href),
|
|
|
+ chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
+ //chromedp.Sleep(2000 * time.Millisecond),
|
|
|
+ chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
|
|
|
+ })
|
|
|
+ if err != nil {
|
|
|
+ qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ //获取详情页内容
|
|
|
+ err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
+ chromedp.Evaluate(contentRunJs, r),
|
|
|
+ })
|
|
|
+ if err != nil {
|
|
|
+ qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ //下载附件
|
|
|
+ if sc.AttachCss != "" {
|
|
|
+ downloadAttaches(r, vm.attachesDir)
|
|
|
+ }
|
|
|
r.Site = sc.Site
|
|
|
r.Channel = sc.Channel
|
|
|
- //qu.Debug(r.Title, r.ListTitle)
|
|
|
if r.Title == "" {
|
|
|
r.Title = r.ListTitle
|
|
|
}
|
|
|
- //qu.Debug(r.PublishTime, r.ListPubTime)
|
|
|
if r.PublishTime == "" {
|
|
|
r.PublishTime = r.ListPubTime
|
|
|
}
|
|
@@ -200,15 +102,15 @@ T:
|
|
|
//结果放入缓存
|
|
|
verifyResult.PushBack(r)
|
|
|
}
|
|
|
- qu.Debug("列表采集条数结果:", verifyResult.Len())
|
|
|
- //6.翻页
|
|
|
+ qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
|
|
|
+ //翻页
|
|
|
if verifyResult.Len() > 0 {
|
|
|
if sc.MaxPages == 1 { //最大页为1,不校验翻页
|
|
|
ret.ListTrunPage = true
|
|
|
break
|
|
|
- } else if sc.MaxPages > 1 { // && !ret.ListTrunPage {
|
|
|
+ } else if sc.MaxPages > 1 { //&& !ret.ListTrunPage {
|
|
|
if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
|
|
|
- qu.Debug("翻页失败:", err)
|
|
|
+ qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
|
|
|
break T
|
|
|
} else {
|
|
|
ret.ListTrunPage = true
|
|
@@ -220,13 +122,110 @@ T:
|
|
|
for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
|
r, _ := el.Value.(*be.ResultItem)
|
|
|
ret.Title = r.Title != ""
|
|
|
- qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
|
|
|
+ ret.PublishUnit = r.PublishUnit != ""
|
|
|
ret.PublishTime = r.PublishTime != ""
|
|
|
- qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
|
|
|
+ ret.Content = r.Content != ""
|
|
|
+ ret.Attaches = len(r.AttachLinks) > 0
|
|
|
}
|
|
|
- if ret.ListItems {
|
|
|
- ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
|
- }
|
|
|
-
|
|
|
+ qu.Debug(verifyResult.Len())
|
|
|
+ ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
|
return ret, nil
|
|
|
}
|
|
|
+
|
|
|
+// VerifySpiderConfig 只验证列表标注
|
|
|
+//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
+// qu.Debug("sc---", *sc)
|
|
|
+// verifyResult := list.New()
|
|
|
+// be.DataResults[sc.Code] = verifyResult
|
|
|
+// ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
|
|
|
+// _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
|
|
|
+// defer func() {
|
|
|
+// incCancelFn()
|
|
|
+// baseCancelFn()
|
|
|
+// }()
|
|
|
+//
|
|
|
+// listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
|
+// //2. 执行JS代码,获取列表页信息
|
|
|
+// if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
|
|
|
+// listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
|
+// }
|
|
|
+// if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
|
|
|
+// contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
|
+// }
|
|
|
+// qu.Debug("列表页JS:", listRunJs)
|
|
|
+// //3.打开列表,获取条目清单
|
|
|
+// chromedp.Run(ctx, chromedp.Tasks{
|
|
|
+// chromedp.Navigate(sc.Href),
|
|
|
+// chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
+// //chromedp.Sleep(1000 * time.Millisecond),
|
|
|
+// chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
|
|
|
+// })
|
|
|
+// //4.初始化列表页信息
|
|
|
+// if !vm.InitListPage(ctx, sc) {
|
|
|
+// qu.Debug("初始化列表页失败,退出")
|
|
|
+// return ret, errors.New("初始化列表页失败")
|
|
|
+// }
|
|
|
+// no := 1
|
|
|
+//T:
|
|
|
+// for j := 0; j < 2; j++ { //最多检查2页
|
|
|
+// qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
|
|
|
+// listResult := make(be.ResultItems, 0)
|
|
|
+// err := chromedp.Run(ctx, chromedp.Tasks{
|
|
|
+// chromedp.Evaluate(listRunJs, &listResult),
|
|
|
+// })
|
|
|
+// if err != nil {
|
|
|
+// qu.Debug("执行列表页JS代码失败", err.Error())
|
|
|
+// continue
|
|
|
+// }
|
|
|
+// //5.操作详情页
|
|
|
+// qu.Debug("列表采集条数:", len(listResult))
|
|
|
+// for contentIndex, r := range listResult {
|
|
|
+// if contentIndex > 1 { //每页只校验2条
|
|
|
+// break
|
|
|
+// }
|
|
|
+// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
+// r.Site = sc.Site
|
|
|
+// r.Channel = sc.Channel
|
|
|
+// //qu.Debug(r.Title, r.ListTitle)
|
|
|
+// if r.Title == "" {
|
|
|
+// r.Title = r.ListTitle
|
|
|
+// }
|
|
|
+// //qu.Debug(r.PublishTime, r.ListPubTime)
|
|
|
+// if r.PublishTime == "" {
|
|
|
+// r.PublishTime = r.ListPubTime
|
|
|
+// }
|
|
|
+// r.No = no
|
|
|
+// no += 1
|
|
|
+// //结果放入缓存
|
|
|
+// verifyResult.PushBack(r)
|
|
|
+// }
|
|
|
+// qu.Debug("列表采集条数结果:", verifyResult.Len())
|
|
|
+// //6.翻页
|
|
|
+// if verifyResult.Len() > 0 {
|
|
|
+// if sc.MaxPages == 1 { //最大页为1,不校验翻页
|
|
|
+// ret.ListTrunPage = true
|
|
|
+// break
|
|
|
+// } else if sc.MaxPages > 1 { // && !ret.ListTrunPage {
|
|
|
+// if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
|
|
|
+// qu.Debug("翻页失败:", err)
|
|
|
+// break T
|
|
|
+// } else {
|
|
|
+// ret.ListTrunPage = true
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// //检查
|
|
|
+// for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
|
+// r, _ := el.Value.(*be.ResultItem)
|
|
|
+// ret.Title = r.Title != ""
|
|
|
+// qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
|
|
|
+// ret.PublishTime = r.PublishTime != ""
|
|
|
+// qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
|
|
|
+// }
|
|
|
+// if ret.ListItems {
|
|
|
+// ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
|
+// }
|
|
|
+//
|
|
|
+// return ret, nil
|
|
|
+//}
|