|
@@ -2,6 +2,7 @@ package vm
|
|
|
|
|
|
import (
|
|
import (
|
|
"container/list"
|
|
"container/list"
|
|
|
|
+ "fmt"
|
|
qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
be "spider_creator/backend"
|
|
be "spider_creator/backend"
|
|
"time"
|
|
"time"
|
|
@@ -10,15 +11,133 @@ import (
|
|
)
|
|
)
|
|
|
|
|
|
// VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
|
|
// VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
|
|
|
|
+//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
|
|
+// qu.Debug("sc---", *sc)
|
|
|
|
+// verifyResult := list.New()
|
|
|
|
+// ret := &be.SpiderConfigVerifyResult{true, true, true, true, true, true, true}
|
|
|
|
+// _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
|
|
|
|
+// _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
|
|
|
|
+// defer func() {
|
|
|
|
+// incCancelFn2()
|
|
|
|
+// baseCancelFn2()
|
|
|
|
+// incCancelFn()
|
|
|
|
+// baseCancelFn()
|
|
|
|
+// }()
|
|
|
|
+//
|
|
|
|
+// listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
|
|
|
|
+// //TODO 2. 执行JS代码,获取列表页信息
|
|
|
|
+// if listRunJs == "" {
|
|
|
|
+// listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
|
|
+// }
|
|
|
|
+// if contentRunJs == "" {
|
|
|
|
+// contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
|
|
+// }
|
|
|
|
+// qu.Debug("列表页JS:", listRunJs)
|
|
|
|
+// qu.Debug("详情页JS:", contentRunJs)
|
|
|
|
+// //TODO 3.打开列表,获取条目清单
|
|
|
|
+// chromedp.Run(ctx, chromedp.Tasks{
|
|
|
|
+// chromedp.Navigate(sc.Href),
|
|
|
|
+// chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
|
+// //chromedp.Sleep(1000 * time.Millisecond),
|
|
|
|
+// chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
|
|
|
|
+// })
|
|
|
|
+// no := 1
|
|
|
|
+//T:
|
|
|
|
+// for j := 0; j < 2; j++ { //最多检查2页
|
|
|
|
+// qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
|
|
|
|
+// listResult := make(be.ResultItems, 0)
|
|
|
|
+// err := chromedp.Run(ctx, chromedp.Tasks{
|
|
|
|
+// chromedp.Evaluate(listRunJs, &listResult),
|
|
|
|
+// })
|
|
|
|
+// if err != nil {
|
|
|
|
+// qu.Debug("执行列表页JS代码失败", err.Error())
|
|
|
|
+// continue
|
|
|
|
+// }
|
|
|
|
+// //TODO 5.操作详情页
|
|
|
|
+// qu.Debug("列表采集条数:", len(listResult))
|
|
|
|
+// for contentIndex, r := range listResult {
|
|
|
|
+// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
|
|
+// if contentIndex > 1 { //每页只采集2条
|
|
|
|
+// break
|
|
|
|
+// }
|
|
|
|
+// //打开详情页
|
|
|
|
+// err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
|
+// chromedp.Navigate(r.Href),
|
|
|
|
+// chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
|
+// chromedp.Sleep(2000 * time.Millisecond),
|
|
|
|
+// //chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
|
|
|
|
+// })
|
|
|
|
+// if err != nil {
|
|
|
|
+// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
|
|
|
|
+// continue
|
|
|
|
+// }
|
|
|
|
+// //获取详情页内容
|
|
|
|
+// err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
|
+// chromedp.Evaluate(contentRunJs, r),
|
|
|
|
+// })
|
|
|
|
+// if err != nil {
|
|
|
|
+// qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
|
|
|
|
+// continue
|
|
|
|
+// }
|
|
|
|
+// if sc.AttachCss != "" {
|
|
|
|
+// downloadAttaches(r, vm.attachesDir)
|
|
|
|
+// }
|
|
|
|
+// r.Site = sc.Site
|
|
|
|
+// r.Channel = sc.Channel
|
|
|
|
+// if r.Title == "" {
|
|
|
|
+// r.Title = r.ListTitle
|
|
|
|
+// }
|
|
|
|
+// if r.PublishTime == "" {
|
|
|
|
+// r.PublishTime = r.ListPubTime
|
|
|
|
+// }
|
|
|
|
+// r.No = no
|
|
|
|
+// no += 1
|
|
|
|
+// //结果放入缓存
|
|
|
|
+// verifyResult.PushBack(r)
|
|
|
|
+// }
|
|
|
|
+// qu.Debug("列表采集条数结果:", verifyResult.Len())
|
|
|
|
+// //TODO 6.翻页
|
|
|
|
+// if err = trunPage(sc, 2000, ctx); err != nil {
|
|
|
|
+// //if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil {
|
|
|
|
+// ret.ListTrunPage = false
|
|
|
|
+// break T
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+// //检查
|
|
|
|
+// for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
|
|
+// r, _ := el.Value.(*be.ResultItem)
|
|
|
|
+// if ret.Title {
|
|
|
|
+// ret.Title = r.Title != ""
|
|
|
|
+// }
|
|
|
|
+// if ret.PublishUnit {
|
|
|
|
+// ret.PublishUnit = r.PublishUnit != ""
|
|
|
|
+// }
|
|
|
|
+// if ret.PublishTime {
|
|
|
|
+// ret.PublishTime = r.PublishTime != ""
|
|
|
|
+// }
|
|
|
|
+// if ret.Content {
|
|
|
|
+// ret.Content = r.Content != ""
|
|
|
|
+// }
|
|
|
|
+// if ret.Attaches {
|
|
|
|
+// ret.Attaches = len(r.AttachLinks) > 0
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+// qu.Debug(verifyResult.Len())
|
|
|
|
+// if ret.ListItems {
|
|
|
|
+// ret.ListItems = verifyResult.Len() > 2
|
|
|
|
+// }
|
|
|
|
+//
|
|
|
|
+// //TODO:每次验证结果存库、内存?
|
|
|
|
+// return ret, nil
|
|
|
|
+//}
|
|
|
|
+
|
|
|
|
+// VerifySpiderConfig 只验证列表标注
|
|
func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
|
|
qu.Debug("sc---", *sc)
|
|
qu.Debug("sc---", *sc)
|
|
verifyResult := list.New()
|
|
verifyResult := list.New()
|
|
- ret := &be.SpiderConfigVerifyResult{true, true, true, true, true, true, true}
|
|
|
|
- _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
|
|
|
|
- _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
|
|
|
|
|
|
+ ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
|
|
|
|
+ _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
|
|
defer func() {
|
|
defer func() {
|
|
- incCancelFn2()
|
|
|
|
- baseCancelFn2()
|
|
|
|
incCancelFn()
|
|
incCancelFn()
|
|
baseCancelFn()
|
|
baseCancelFn()
|
|
}()
|
|
}()
|
|
@@ -31,6 +150,8 @@ func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyRes
|
|
if contentRunJs == "" {
|
|
if contentRunJs == "" {
|
|
contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
contentRunJs = renderJavascriptCoder(loadContentJS, sc)
|
|
}
|
|
}
|
|
|
|
+ qu.Debug("列表页JS:", listRunJs)
|
|
|
|
+ qu.Debug("详情页JS:", contentRunJs)
|
|
//TODO 3.打开列表,获取条目清单
|
|
//TODO 3.打开列表,获取条目清单
|
|
chromedp.Run(ctx, chromedp.Tasks{
|
|
chromedp.Run(ctx, chromedp.Tasks{
|
|
chromedp.Navigate(sc.Href),
|
|
chromedp.Navigate(sc.Href),
|
|
@@ -41,44 +162,29 @@ func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyRes
|
|
no := 1
|
|
no := 1
|
|
T:
|
|
T:
|
|
for j := 0; j < 2; j++ { //最多检查2页
|
|
for j := 0; j < 2; j++ { //最多检查2页
|
|
|
|
+ qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
|
|
listResult := make(be.ResultItems, 0)
|
|
listResult := make(be.ResultItems, 0)
|
|
err := chromedp.Run(ctx, chromedp.Tasks{
|
|
err := chromedp.Run(ctx, chromedp.Tasks{
|
|
chromedp.Evaluate(listRunJs, &listResult),
|
|
chromedp.Evaluate(listRunJs, &listResult),
|
|
})
|
|
})
|
|
if err != nil {
|
|
if err != nil {
|
|
- qu.Debug("执行JS代码失败", err.Error())
|
|
|
|
|
|
+ qu.Debug("执行列表页JS代码失败", err.Error())
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
//TODO 5.操作详情页
|
|
//TODO 5.操作详情页
|
|
|
|
+ qu.Debug("列表采集条数:", len(listResult))
|
|
for contentIndex, r := range listResult {
|
|
for contentIndex, r := range listResult {
|
|
if contentIndex > 1 { //每页只采集2条
|
|
if contentIndex > 1 { //每页只采集2条
|
|
break
|
|
break
|
|
}
|
|
}
|
|
- //打开详情页
|
|
|
|
- err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
|
- chromedp.Navigate(r.Href),
|
|
|
|
- chromedp.WaitReady("document.body", chromedp.ByJSPath),
|
|
|
|
- //chromedp.Sleep(1000 * time.Millisecond),
|
|
|
|
- chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
|
|
|
|
- })
|
|
|
|
- if err != nil {
|
|
|
|
- continue
|
|
|
|
- }
|
|
|
|
- //获取详情页内容
|
|
|
|
- err = chromedp.Run(ctx2, chromedp.Tasks{
|
|
|
|
- chromedp.Evaluate(contentRunJs, r),
|
|
|
|
- })
|
|
|
|
- if err != nil {
|
|
|
|
- continue
|
|
|
|
- }
|
|
|
|
- if sc.AttachCss != "" {
|
|
|
|
- downloadAttaches(r, vm.attachesDir)
|
|
|
|
- }
|
|
|
|
|
|
+ qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
|
|
r.Site = sc.Site
|
|
r.Site = sc.Site
|
|
r.Channel = sc.Channel
|
|
r.Channel = sc.Channel
|
|
|
|
+ qu.Debug(r.Title, r.ListTitle)
|
|
if r.Title == "" {
|
|
if r.Title == "" {
|
|
r.Title = r.ListTitle
|
|
r.Title = r.ListTitle
|
|
}
|
|
}
|
|
|
|
+ qu.Debug(r.PublishTime, r.ListPubTime)
|
|
if r.PublishTime == "" {
|
|
if r.PublishTime == "" {
|
|
r.PublishTime = r.ListPubTime
|
|
r.PublishTime = r.ListPubTime
|
|
}
|
|
}
|
|
@@ -87,38 +193,28 @@ T:
|
|
//结果放入缓存
|
|
//结果放入缓存
|
|
verifyResult.PushBack(r)
|
|
verifyResult.PushBack(r)
|
|
}
|
|
}
|
|
-
|
|
|
|
|
|
+ qu.Debug("列表采集条数结果:", verifyResult.Len())
|
|
//TODO 6.翻页
|
|
//TODO 6.翻页
|
|
- //if err = trunPage(sc, 2000, ctx); err != nil {
|
|
|
|
- if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil {
|
|
|
|
- ret.ListTrunPage = false
|
|
|
|
- break T
|
|
|
|
|
|
+ if len(listResult) > 0 && !ret.ListTrunPage {
|
|
|
|
+ if err = trunPage(sc, 2000, ctx); err != nil { //翻页失败
|
|
|
|
+ break T
|
|
|
|
+ } else {
|
|
|
|
+ ret.ListTrunPage = true
|
|
|
|
+ }
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//检查
|
|
//检查
|
|
for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
for el := verifyResult.Front(); el != nil; el = el.Next() {
|
|
r, _ := el.Value.(*be.ResultItem)
|
|
r, _ := el.Value.(*be.ResultItem)
|
|
- if ret.Title {
|
|
|
|
- ret.Title = r.Title != ""
|
|
|
|
- }
|
|
|
|
- if ret.PublishUnit {
|
|
|
|
- ret.PublishUnit = r.PublishUnit != ""
|
|
|
|
- }
|
|
|
|
- if ret.PublishTime {
|
|
|
|
- ret.PublishTime = r.PublishTime != ""
|
|
|
|
- }
|
|
|
|
- if ret.Content {
|
|
|
|
- ret.Content = r.Content != ""
|
|
|
|
- }
|
|
|
|
- if ret.Attaches {
|
|
|
|
- ret.Attaches = len(r.AttachLinks) > 0
|
|
|
|
- }
|
|
|
|
|
|
+ qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
|
|
|
|
+ ret.Title = r.Title != ""
|
|
|
|
+ qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
|
|
|
|
+ ret.PublishTime = r.PublishTime != ""
|
|
}
|
|
}
|
|
qu.Debug(verifyResult.Len())
|
|
qu.Debug(verifyResult.Len())
|
|
if ret.ListItems {
|
|
if ret.ListItems {
|
|
ret.ListItems = verifyResult.Len() > 2
|
|
ret.ListItems = verifyResult.Len() > 2
|
|
}
|
|
}
|
|
|
|
|
|
- //TODO:每次验证结果存库、内存?
|
|
|
|
return ret, nil
|
|
return ret, nil
|
|
}
|
|
}
|