|
@@ -11,29 +11,144 @@ import (
|
|
)
|
|
)
|
|
|
|
|
|
// VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
|
|
// VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
|
|
|
|
+func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
|
+ qu.Debug("sc---", *sc)
|
|
|
|
+ verifyResult := list.New()
|
|
|
|
+ be.DataResults[sc.Code] = verifyResult
|
|
|
|
+ ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
|
|
|
|
+ _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
|
|
|
|
+ _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
|
|
|
|
+ defer func() {
|
|
|
|
+ incCancelFn2()
|
|
|
|
+ baseCancelFn2()
|
|
|
|
+ incCancelFn()
|
|
|
|
+ baseCancelFn()
|
|
|
|
+ }()
|
|
|
|
+
|
|
|
|
+ listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
|
|
+ //2. 执行JS代码,获取列表页信息
|
|
|
|
+ if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
|
|
|
|
+ listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
|
|
+ }
|
|
|
|
+ if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
|
|
|
|
+ contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
|
|
+ }
|
|
|
|
+ qu.Debug("获取列表页JS代码:", listRunJs)
|
|
|
|
+ qu.Debug("获取详情页JS代码:", contentRunJs)
|
|
|
|
+ //TODO 3.打开列表,获取条目清单
|
|
|
|
+ chromedp.Run(ctx, chromedp.Tasks{
|
|
|
|
+ chromedp.Navigate(sc.Href),
|
|
|
|
+ chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
|
+ //chromedp.Sleep(1000 * time.Millisecond),
|
|
|
|
+ chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
|
|
|
|
+ })
|
|
|
|
+ no := 1
|
|
|
|
+T:
|
|
|
|
+ for j := 0; j < 2; j++ { //最多检查2页
|
|
|
|
+ qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
|
|
|
|
+ listResult := make(be.ResultItems, 0)
|
|
|
|
+ err := chromedp.Run(ctx, chromedp.Tasks{
|
|
|
|
+ chromedp.Evaluate(listRunJs, &listResult),
|
|
|
|
+ })
|
|
|
|
+ if err != nil {
|
|
|
|
+ qu.Debug("执行列表页JS代码失败", err.Error())
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ //TODO 5.操作详情页
|
|
|
|
+ qu.Debug("列表采集条数:", len(listResult))
|
|
|
|
+ for contentIndex, r := range listResult {
|
|
|
|
+ qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
|
+ if contentIndex > 1 { //每页只校验2条
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ //打开详情页
|
|
|
|
+ err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
|
+ chromedp.Navigate(r.Href),
|
|
|
|
+ chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
|
+ //chromedp.Sleep(2000 * time.Millisecond),
|
|
|
|
+ chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
|
|
|
|
+ })
|
|
|
|
+ if err != nil {
|
|
|
|
+ qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ //获取详情页内容
|
|
|
|
+ err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
|
+ chromedp.Evaluate(contentRunJs, r),
|
|
|
|
+ })
|
|
|
|
+ if err != nil {
|
|
|
|
+ qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ //下载附件
|
|
|
|
+ if sc.AttachCss != "" {
|
|
|
|
+ downloadAttaches(r, vm.attachesDir)
|
|
|
|
+ }
|
|
|
|
+ r.Site = sc.Site
|
|
|
|
+ r.Channel = sc.Channel
|
|
|
|
+ if r.Title == "" {
|
|
|
|
+ r.Title = r.ListTitle
|
|
|
|
+ }
|
|
|
|
+ if r.PublishTime == "" {
|
|
|
|
+ r.PublishTime = r.ListPubTime
|
|
|
|
+ }
|
|
|
|
+ r.No = no
|
|
|
|
+ no += 1
|
|
|
|
+ //结果放入缓存
|
|
|
|
+ verifyResult.PushBack(r)
|
|
|
|
+ }
|
|
|
|
+ qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
|
|
|
|
+ //TODO 6.翻页
|
|
|
|
+ if verifyResult.Len() > 0 {
|
|
|
|
+ if sc.MaxPages == 1 { //最大页为1,不校验翻页
|
|
|
|
+ ret.ListTrunPage = true
|
|
|
|
+ break
|
|
|
|
+ } else if sc.MaxPages > 1 { //&& !ret.ListTrunPage {
|
|
|
|
+ if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
|
|
|
|
+ qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
|
|
|
|
+ break T
|
|
|
|
+ } else {
|
|
|
|
+ ret.ListTrunPage = true
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ //检查
|
|
|
|
+ for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
|
|
+ r, _ := el.Value.(*be.ResultItem)
|
|
|
|
+ ret.Title = r.Title != ""
|
|
|
|
+ ret.PublishUnit = r.PublishUnit != ""
|
|
|
|
+ ret.PublishTime = r.PublishTime != ""
|
|
|
|
+ ret.Content = r.Content != ""
|
|
|
|
+ ret.Attaches = len(r.AttachLinks) > 0
|
|
|
|
+ }
|
|
|
|
+ qu.Debug(verifyResult.Len())
|
|
|
|
+ ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
|
|
+
|
|
|
|
+ //TODO:每次验证结果存库、内存?
|
|
|
|
+ return ret, nil
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// VerifySpiderConfig 只验证列表标注
|
|
//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
// qu.Debug("sc---", *sc)
|
|
// qu.Debug("sc---", *sc)
|
|
// verifyResult := list.New()
|
|
// verifyResult := list.New()
|
|
-// ret := &be.SpiderConfigVerifyResult{true, true, true, true, true, true, true}
|
|
|
|
-// _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
|
|
|
|
-// _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
|
|
|
|
|
|
+// ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
|
|
|
|
+// _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
|
|
// defer func() {
|
|
// defer func() {
|
|
-// incCancelFn2()
|
|
|
|
-// baseCancelFn2()
|
|
|
|
// incCancelFn()
|
|
// incCancelFn()
|
|
// baseCancelFn()
|
|
// baseCancelFn()
|
|
// }()
|
|
// }()
|
|
//
|
|
//
|
|
// listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
// listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
// //TODO 2. 执行JS代码,获取列表页信息
|
|
// //TODO 2. 执行JS代码,获取列表页信息
|
|
-// if listRunJs == "" {
|
|
|
|
|
|
+// if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
|
|
// listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
// listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
// }
|
|
// }
|
|
-// if contentRunJs == "" {
|
|
|
|
|
|
+// if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
|
|
// contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
// contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
// }
|
|
// }
|
|
// qu.Debug("列表页JS:", listRunJs)
|
|
// qu.Debug("列表页JS:", listRunJs)
|
|
-// qu.Debug("详情页JS:", contentRunJs)
|
|
|
|
// //TODO 3.打开列表,获取条目清单
|
|
// //TODO 3.打开列表,获取条目清单
|
|
// chromedp.Run(ctx, chromedp.Tasks{
|
|
// chromedp.Run(ctx, chromedp.Tasks{
|
|
// chromedp.Navigate(sc.Href),
|
|
// chromedp.Navigate(sc.Href),
|
|
@@ -56,37 +171,17 @@ import (
|
|
// //TODO 5.操作详情页
|
|
// //TODO 5.操作详情页
|
|
// qu.Debug("列表采集条数:", len(listResult))
|
|
// qu.Debug("列表采集条数:", len(listResult))
|
|
// for contentIndex, r := range listResult {
|
|
// for contentIndex, r := range listResult {
|
|
-// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
|
-// if contentIndex > 1 { //每页只采集2条
|
|
|
|
|
|
+// if contentIndex > 1 { //每页只校验2条
|
|
// break
|
|
// break
|
|
// }
|
|
// }
|
|
-// //打开详情页
|
|
|
|
-// err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
|
-// chromedp.Navigate(r.Href),
|
|
|
|
-// chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
|
-// chromedp.Sleep(2000 * time.Millisecond),
|
|
|
|
-// //chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
|
|
|
|
-// })
|
|
|
|
-// if err != nil {
|
|
|
|
-// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
|
|
|
|
-// continue
|
|
|
|
-// }
|
|
|
|
-// //获取详情页内容
|
|
|
|
-// err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
|
-// chromedp.Evaluate(contentRunJs, r),
|
|
|
|
-// })
|
|
|
|
-// if err != nil {
|
|
|
|
-// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
|
|
|
|
-// continue
|
|
|
|
-// }
|
|
|
|
-// if sc.AttachCss != "" {
|
|
|
|
-// downloadAttaches(r, vm.attachesDir)
|
|
|
|
-// }
|
|
|
|
|
|
+// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
// r.Site = sc.Site
|
|
// r.Site = sc.Site
|
|
// r.Channel = sc.Channel
|
|
// r.Channel = sc.Channel
|
|
|
|
+// qu.Debug(r.Title, r.ListTitle)
|
|
// if r.Title == "" {
|
|
// if r.Title == "" {
|
|
// r.Title = r.ListTitle
|
|
// r.Title = r.ListTitle
|
|
// }
|
|
// }
|
|
|
|
+// qu.Debug(r.PublishTime, r.ListPubTime)
|
|
// if r.PublishTime == "" {
|
|
// if r.PublishTime == "" {
|
|
// r.PublishTime = r.ListPubTime
|
|
// r.PublishTime = r.ListPubTime
|
|
// }
|
|
// }
|
|
@@ -101,8 +196,9 @@ import (
|
|
// if sc.MaxPages == 1 { //最大页为1,不校验翻页
|
|
// if sc.MaxPages == 1 { //最大页为1,不校验翻页
|
|
// ret.ListTrunPage = true
|
|
// ret.ListTrunPage = true
|
|
// break
|
|
// break
|
|
-// } else if sc.MaxPages > 1 { //&& !ret.ListTrunPage {
|
|
|
|
|
|
+// } else if sc.MaxPages > 1 { // && !ret.ListTrunPage {
|
|
// if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
|
|
// if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
|
|
|
|
+// qu.Debug("翻页失败:", err)
|
|
// break T
|
|
// break T
|
|
// } else {
|
|
// } else {
|
|
// ret.ListTrunPage = true
|
|
// ret.ListTrunPage = true
|
|
@@ -113,119 +209,14 @@ import (
|
|
// //检查
|
|
// //检查
|
|
// for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
// for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
// r, _ := el.Value.(*be.ResultItem)
|
|
// r, _ := el.Value.(*be.ResultItem)
|
|
-// if ret.Title {
|
|
|
|
-// ret.Title = r.Title != ""
|
|
|
|
-// }
|
|
|
|
-// if ret.PublishUnit {
|
|
|
|
-// ret.PublishUnit = r.PublishUnit != ""
|
|
|
|
-// }
|
|
|
|
-// if ret.PublishTime {
|
|
|
|
-// ret.PublishTime = r.PublishTime != ""
|
|
|
|
-// }
|
|
|
|
-// if ret.Content {
|
|
|
|
-// ret.Content = r.Content != ""
|
|
|
|
-// }
|
|
|
|
-// if ret.Attaches {
|
|
|
|
-// ret.Attaches = len(r.AttachLinks) > 0
|
|
|
|
-// }
|
|
|
|
|
|
+// ret.Title = r.Title != ""
|
|
|
|
+// qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
|
|
|
|
+// ret.PublishTime = r.PublishTime != ""
|
|
|
|
+// qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
|
|
// }
|
|
// }
|
|
-// qu.Debug(verifyResult.Len())
|
|
|
|
// if ret.ListItems {
|
|
// if ret.ListItems {
|
|
// ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
// ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
// }
|
|
// }
|
|
//
|
|
//
|
|
-// //TODO:每次验证结果存库、内存?
|
|
|
|
// return ret, nil
|
|
// return ret, nil
|
|
//}
|
|
//}
|
|
-
|
|
|
|
-// VerifySpiderConfig 只验证列表标注
|
|
|
|
-func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
|
- qu.Debug("sc---", *sc)
|
|
|
|
- verifyResult := list.New()
|
|
|
|
- ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
|
|
|
|
- _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
|
|
|
|
- defer func() {
|
|
|
|
- incCancelFn()
|
|
|
|
- baseCancelFn()
|
|
|
|
- }()
|
|
|
|
-
|
|
|
|
- listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
|
|
- //TODO 2. 执行JS代码,获取列表页信息
|
|
|
|
- if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
|
|
|
|
- listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
|
|
- }
|
|
|
|
- if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
|
|
|
|
- contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
|
|
- }
|
|
|
|
- qu.Debug("列表页JS:", listRunJs)
|
|
|
|
- //TODO 3.打开列表,获取条目清单
|
|
|
|
- chromedp.Run(ctx, chromedp.Tasks{
|
|
|
|
- chromedp.Navigate(sc.Href),
|
|
|
|
- chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
|
- //chromedp.Sleep(1000 * time.Millisecond),
|
|
|
|
- chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
|
|
|
|
- })
|
|
|
|
- no := 1
|
|
|
|
-T:
|
|
|
|
- for j := 0; j < 2; j++ { //最多检查2页
|
|
|
|
- qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
|
|
|
|
- listResult := make(be.ResultItems, 0)
|
|
|
|
- err := chromedp.Run(ctx, chromedp.Tasks{
|
|
|
|
- chromedp.Evaluate(listRunJs, &listResult),
|
|
|
|
- })
|
|
|
|
- if err != nil {
|
|
|
|
- qu.Debug("执行列表页JS代码失败", err.Error())
|
|
|
|
- continue
|
|
|
|
- }
|
|
|
|
- //TODO 5.操作详情页
|
|
|
|
- qu.Debug("列表采集条数:", len(listResult))
|
|
|
|
- for contentIndex, r := range listResult {
|
|
|
|
- if contentIndex > 1 { //每页只校验2条
|
|
|
|
- break
|
|
|
|
- }
|
|
|
|
- qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
|
- r.Site = sc.Site
|
|
|
|
- r.Channel = sc.Channel
|
|
|
|
- qu.Debug(r.Title, r.ListTitle)
|
|
|
|
- if r.Title == "" {
|
|
|
|
- r.Title = r.ListTitle
|
|
|
|
- }
|
|
|
|
- qu.Debug(r.PublishTime, r.ListPubTime)
|
|
|
|
- if r.PublishTime == "" {
|
|
|
|
- r.PublishTime = r.ListPubTime
|
|
|
|
- }
|
|
|
|
- r.No = no
|
|
|
|
- no += 1
|
|
|
|
- //结果放入缓存
|
|
|
|
- verifyResult.PushBack(r)
|
|
|
|
- }
|
|
|
|
- qu.Debug("列表采集条数结果:", verifyResult.Len())
|
|
|
|
- //TODO 6.翻页
|
|
|
|
- if verifyResult.Len() > 0 {
|
|
|
|
- if sc.MaxPages == 1 { //最大页为1,不校验翻页
|
|
|
|
- ret.ListTrunPage = true
|
|
|
|
- break
|
|
|
|
- } else if sc.MaxPages > 1 { // && !ret.ListTrunPage {
|
|
|
|
- if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
|
|
|
|
- qu.Debug("翻页失败:", err)
|
|
|
|
- break T
|
|
|
|
- } else {
|
|
|
|
- ret.ListTrunPage = true
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- //检查
|
|
|
|
- for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
|
|
- r, _ := el.Value.(*be.ResultItem)
|
|
|
|
- ret.Title = r.Title != ""
|
|
|
|
- qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
|
|
|
|
- ret.PublishTime = r.PublishTime != ""
|
|
|
|
- qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
|
|
|
|
- }
|
|
|
|
- if ret.ListItems {
|
|
|
|
- ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- return ret, nil
|
|
|
|
-}
|
|
|