|
@@ -10,13 +10,23 @@ import (
|
|
|
"time"
|
|
|
)
|
|
|
|
|
|
-// VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
|
|
|
+// VerifySpiderConfig verifies a spider configuration. It delegates to verifySpiderConfig4MainSite when be.Cfg.IsOnly4MainSite is true and to verifySpiderConfig4Prod otherwise, returning the delegate's result and error unchanged.
|
|
|
func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
+ if be.Cfg.IsOnly4MainSite {
|
|
|
+ return vm.verifySpiderConfig4MainSite(sc) // key (main) sites
|
|
|
+ } else {
|
|
|
+ return vm.verifySpiderConfig4Prod(sc) // production environment
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+// verifySpiderConfig4Prod verifies the spider configuration, including page turning; only 2 list items are extracted.
|
|
|
+// Used in the production environment.
|
|
|
+func (vm *VM) verifySpiderConfig4Prod(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
qu.Debug("sc---", *sc)
|
|
|
verifyResult := list.New()
|
|
|
be.DataResults[sc.Code] = verifyResult
|
|
|
ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
|
|
|
- _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false, sc.Href) //列表页使用
|
|
|
+ _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href) //列表页使用
|
|
|
_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, true, false, sc.Href) //详情页使用
|
|
|
defer func() {
|
|
|
incCancelFn2()
|
|
@@ -87,7 +97,6 @@ T:
|
|
|
continue
|
|
|
}
|
|
|
//下载附件
|
|
|
- qu.Debug(r.Title, r.ListTitle)
|
|
|
if sc.AttachCss != "" {
|
|
|
downloadAttaches(r, vm.attachesDir)
|
|
|
}
|
|
@@ -133,101 +142,102 @@ T:
|
|
|
return ret, nil
|
|
|
}
|
|
|
|
|
|
-// VerifySpiderConfig 只验证列表标注
|
|
|
-//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
-// qu.Debug("sc---", *sc)
|
|
|
-// verifyResult := list.New()
|
|
|
-// be.DataResults[sc.Code] = verifyResult
|
|
|
-// ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
|
|
|
-// _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
|
|
|
-// defer func() {
|
|
|
-// incCancelFn()
|
|
|
-// baseCancelFn()
|
|
|
-// }()
|
|
|
-//
|
|
|
-// listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
|
-// //2. 执行JS代码,获取列表页信息
|
|
|
-// if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
|
|
|
-// listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
|
-// }
|
|
|
-// if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
|
|
|
-// contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
|
-// }
|
|
|
-// qu.Debug("列表页JS:", listRunJs)
|
|
|
-// //3.打开列表,获取条目清单
|
|
|
-// chromedp.Run(ctx, chromedp.Tasks{
|
|
|
-// chromedp.Navigate(sc.Href),
|
|
|
-// chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
-// //chromedp.Sleep(1000 * time.Millisecond),
|
|
|
-// chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
|
|
|
-// })
|
|
|
-// //4.初始化列表页信息
|
|
|
-// if !vm.InitListPage(ctx, sc) {
|
|
|
-// qu.Debug("初始化列表页失败,退出")
|
|
|
-// return ret, errors.New("初始化列表页失败")
|
|
|
-// }
|
|
|
-// no := 1
|
|
|
-// ret.ListTrunPage = true
|
|
|
-//T:
|
|
|
-// for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ { //最多检查2页
|
|
|
-// qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
|
|
|
-// listResult := make(be.ResultItems, 0)
|
|
|
-// err := chromedp.Run(ctx, chromedp.Tasks{
|
|
|
-// chromedp.Evaluate(listRunJs, &listResult),
|
|
|
-// })
|
|
|
-// if err != nil {
|
|
|
-// qu.Debug("执行列表页JS代码失败", err.Error())
|
|
|
-// continue
|
|
|
-// }
|
|
|
-// //5.操作详情页
|
|
|
-// qu.Debug("列表采集条数:", len(listResult))
|
|
|
-// for contentIndex, r := range listResult {
|
|
|
-// if contentIndex > 1 { //每页只校验2条
|
|
|
-// break
|
|
|
-// }
|
|
|
-// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
-// r.Site = sc.Site
|
|
|
-// r.Channel = sc.Channel
|
|
|
-// //qu.Debug(r.Title, r.ListTitle)
|
|
|
-// if r.Title == "" {
|
|
|
-// r.Title = r.ListTitle
|
|
|
-// }
|
|
|
-// //qu.Debug(r.PublishTime, r.ListPubTime)
|
|
|
-// if r.PublishTime == "" {
|
|
|
-// r.PublishTime = r.ListPubTime
|
|
|
-// }
|
|
|
-// r.No = no
|
|
|
-// no += 1
|
|
|
-// //结果放入缓存
|
|
|
-// verifyResult.PushBack(r)
|
|
|
-// }
|
|
|
-// qu.Debug("列表采集条数结果:", verifyResult.Len())
|
|
|
-// //6.翻页
|
|
|
-// if verifyResult.Len() > 0 {
|
|
|
-// if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 { //&& !ret.ListTrunPage {
|
|
|
-// if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
|
|
|
-// qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
|
|
|
-// ret.ListTrunPage = false
|
|
|
-// break T
|
|
|
-// }
|
|
|
-// }
|
|
|
-// } else {
|
|
|
-// ret.ListTrunPage = false
|
|
|
-// break T
|
|
|
-// }
|
|
|
-// }
|
|
|
-// //检查
|
|
|
-// for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
|
-// r, _ := el.Value.(*be.ResultItem)
|
|
|
-// ret.Title = r.Title != ""
|
|
|
-// qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
|
|
|
-//
|
|
|
-// ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime)
|
|
|
-// qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
|
|
|
-// }
|
|
|
-// if ret.ListItems {
|
|
|
-// ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
|
-// }
|
|
|
-//
|
|
|
-// return ret, nil
|
|
|
-//}
|
|
|
+// verifySpiderConfig4MainSite verifies only the list-page annotations of a spider configuration: it loads sc.Href, runs the list JS on up to VERIVY_MAX_TRUN_PAGE pages, buffers at most 2 items per page, and derives the Title, PublishTime, ListItems and ListTrunPage flags from them. No detail page is opened here.
|
|
|
+// Used for the key-site (main site) test environment. Returns the verification flags; a non-nil error is returned only when vm.InitListPage reports failure.
|
|
|
+func (vm *VM) verifySpiderConfig4MainSite(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
+ qu.Debug("sc---", *sc)
|
|
|
+ verifyResult := list.New()
|
|
|
+ be.DataResults[sc.Code] = verifyResult
|
|
|
+ ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false} // positional literal: which flag each value presets depends on the field order of be.SpiderConfigVerifyResult (declared elsewhere)
|
|
|
+ _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href) // used for the list page
|
|
|
+ defer func() {
|
|
|
+ incCancelFn()
|
|
|
+ baseCancelFn()
|
|
|
+ }()
|
|
|
+
|
|
|
+ listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
|
+ //2. Choose the JS to run: when the configured list/content JS is empty after removing be.RegSpace matches, fall back to the template rendered by renderJavascriptCoder
|
|
|
+ if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
|
|
|
+ listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
|
+ }
|
|
|
+ if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
|
|
|
+ contentRunJs = renderJavascriptCoder(loadContentJS, sc) // NOTE(review): contentRunJs is not read again in this function
|
|
|
+ }
|
|
|
+ qu.Debug("列表页JS:", listRunJs)
|
|
|
+ //3. Open the list page, wait for document.body, then sleep before collecting the item list
|
|
|
+ chromedp.Run(ctx, chromedp.Tasks{ // NOTE(review): the error returned by Run is not checked
|
|
|
+ chromedp.Navigate(sc.Href),
|
|
|
+ chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
+ //chromedp.Sleep(1000 * time.Millisecond),
|
|
|
+ chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond), // sc.ListDelayTime is interpreted as milliseconds
|
|
|
+ })
|
|
|
+ //4. Initialize the list-page information
|
|
|
+ if !vm.InitListPage(ctx, sc) {
|
|
|
+ qu.Debug("初始化列表页失败,退出")
|
|
|
+ return ret, errors.New("初始化列表页失败")
|
|
|
+ }
|
|
|
+ no := 1
|
|
|
+ ret.ListTrunPage = true // assume page turning works until a failure below clears it
|
|
|
+T:
|
|
|
+ for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ { // checks at most VERIVY_MAX_TRUN_PAGE pages (original note: at most 2 pages)
|
|
|
+ qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
|
|
|
+ listResult := make(be.ResultItems, 0)
|
|
|
+ err := chromedp.Run(ctx, chromedp.Tasks{
|
|
|
+ chromedp.Evaluate(listRunJs, &listResult),
|
|
|
+ })
|
|
|
+ if err != nil {
|
|
|
+ qu.Debug("执行列表页JS代码失败", err.Error())
|
|
|
+ continue // proceeds to the next iteration without calling trunPage
|
|
|
+ }
|
|
|
+ //5. Original note: "operate on detail pages"; this loop only fills in fields on the list items and buffers them
|
|
|
+ qu.Debug("列表采集条数:", len(listResult))
|
|
|
+ for contentIndex, r := range listResult {
|
|
|
+ if contentIndex > 1 { // only the first 2 items of each page are verified
|
|
|
+ break
|
|
|
+ }
|
|
|
+ qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
+ r.Site = sc.Site
|
|
|
+ r.Channel = sc.Channel
|
|
|
+ //qu.Debug(r.Title, r.ListTitle)
|
|
|
+ if r.Title == "" {
|
|
|
+ r.Title = r.ListTitle
|
|
|
+ }
|
|
|
+ //qu.Debug(r.PublishTime, r.ListPubTime)
|
|
|
+ if r.PublishTime == "" {
|
|
|
+ r.PublishTime = r.ListPubTime
|
|
|
+ }
|
|
|
+ r.No = no
|
|
|
+ no += 1
|
|
|
+ // Append the item to verifyResult, the list registered in be.DataResults[sc.Code] above
|
|
|
+ verifyResult.PushBack(r)
|
|
|
+ }
|
|
|
+ qu.Debug("列表采集条数结果:", verifyResult.Len())
|
|
|
+ //6. Turn the page
|
|
|
+ if verifyResult.Len() > 0 {
|
|
|
+ if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 { //&& !ret.ListTrunPage {
|
|
|
+ if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { // page turn failed
|
|
|
+ qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
|
|
|
+ ret.ListTrunPage = false
|
|
|
+ break T
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ ret.ListTrunPage = false
|
|
|
+ break T
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // Check the buffered items
|
|
|
+ for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
|
+ r, _ := el.Value.(*be.ResultItem) // NOTE(review): the assertion result is discarded; r is nil if the element is not a *be.ResultItem
|
|
|
+ ret.Title = r.Title != "" // reassigned for every element, so the last one determines the final value
|
|
|
+ qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
|
|
|
+
|
|
|
+ ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime) // likewise reassigned per element; also requires a Reg_Date match
|
|
|
+ qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
|
|
|
+ }
|
|
|
+ if ret.ListItems { // only refined when already true: a single-page config needs more than 0 items, a multi-page config more than 2
|
|
|
+ ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
|
+ }
|
|
|
+
|
|
|
+ return ret, nil
|
|
|
+}
|