|
@@ -5,7 +5,7 @@ import (
|
|
_ "embed"
|
|
_ "embed"
|
|
"fmt"
|
|
"fmt"
|
|
"github.com/chromedp/chromedp"
|
|
"github.com/chromedp/chromedp"
|
|
- "log"
|
|
|
|
|
|
+ qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
be "spider_creator/backend"
|
|
be "spider_creator/backend"
|
|
ai "spider_creator/backend/ai"
|
|
ai "spider_creator/backend/ai"
|
|
"strconv"
|
|
"strconv"
|
|
@@ -22,8 +22,9 @@ func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM {
|
|
// RunSpider
|
|
// RunSpider
|
|
func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
|
|
func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
|
|
sc, err := be.NewSpiderConfig(cssMark)
|
|
sc, err := be.NewSpiderConfig(cssMark)
|
|
|
|
+ qu.Debug("sc---", *sc)
|
|
if err != nil {
|
|
if err != nil {
|
|
- log.Println("标注信息传输失败!")
|
|
|
|
|
|
+ qu.Debug("标注信息传输失败!")
|
|
vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
|
|
vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
|
|
return
|
|
return
|
|
}
|
|
}
|
|
@@ -31,12 +32,12 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
|
|
sc.Href = url
|
|
sc.Href = url
|
|
}
|
|
}
|
|
_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
|
|
_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
|
|
- log.Println("1浏览器打开", *sc)
|
|
|
|
|
|
+ qu.Debug("1浏览器打开", *sc)
|
|
vm.dnf.Dispatch("debug_event", "1 浏览器打开")
|
|
vm.dnf.Dispatch("debug_event", "1 浏览器打开")
|
|
defer func() {
|
|
defer func() {
|
|
cancel()
|
|
cancel()
|
|
baseCancel()
|
|
baseCancel()
|
|
- log.Println("0浏览器已经销毁")
|
|
|
|
|
|
+ qu.Debug("0浏览器已经销毁")
|
|
vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
|
|
vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
|
|
close(exit)
|
|
close(exit)
|
|
}()
|
|
}()
|
|
@@ -46,24 +47,24 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
|
|
chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
|
|
chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
|
|
})
|
|
})
|
|
vm.dnf.Dispatch("debug_event", "2 页面已经打开")
|
|
vm.dnf.Dispatch("debug_event", "2 页面已经打开")
|
|
- log.Println("2页面打开")
|
|
|
|
|
|
+ qu.Debug("2页面打开")
|
|
var runJs string = sc.ListJSCode
|
|
var runJs string = sc.ListJSCode
|
|
listResult := make(be.ResultItems, 0)
|
|
listResult := make(be.ResultItems, 0)
|
|
//TODO 2. 执行JS代码,获取列表页信息
|
|
//TODO 2. 执行JS代码,获取列表页信息
|
|
if runJs == "" {
|
|
if runJs == "" {
|
|
runJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
runJs = renderJavascriptCoder(loadListItemsJS, sc)
|
|
}
|
|
}
|
|
- //log.Println("execute list jscode", runJs)
|
|
|
|
|
|
+ //qu.Debug("execute list jscode", runJs)
|
|
err = chromedp.Run(ctx, chromedp.Tasks{
|
|
err = chromedp.Run(ctx, chromedp.Tasks{
|
|
chromedp.Evaluate(runJs, &listResult),
|
|
chromedp.Evaluate(runJs, &listResult),
|
|
})
|
|
})
|
|
if err != nil {
|
|
if err != nil {
|
|
- log.Println("执行JS代码失败", err.Error())
|
|
|
|
|
|
+ qu.Debug("执行JS代码失败", err.Error())
|
|
vm.dnf.Dispatch("debug_event", "2 执行JS代码失败")
|
|
vm.dnf.Dispatch("debug_event", "2 执行JS代码失败")
|
|
return
|
|
return
|
|
}
|
|
}
|
|
vm.dnf.Dispatch("debug_event", "3 获取列表完成")
|
|
vm.dnf.Dispatch("debug_event", "3 获取列表完成")
|
|
- log.Println("3获取列表完成")
|
|
|
|
|
|
+ qu.Debug("3获取列表完成")
|
|
|
|
|
|
//TODO 3. 打开详情页 ,最多打开10条
|
|
//TODO 3. 打开详情页 ,最多打开10条
|
|
runJs = sc.ContentJSCode
|
|
runJs = sc.ContentJSCode
|
|
@@ -72,7 +73,7 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
|
|
}
|
|
}
|
|
currentResult := list.New()
|
|
currentResult := list.New()
|
|
be.DataResults[sc.Code] = currentResult
|
|
be.DataResults[sc.Code] = currentResult
|
|
- //log.Println("execute content js", runJs)
|
|
|
|
|
|
+ //qu.Debug("execute content js", runJs)
|
|
for _, v := range listResult {
|
|
for _, v := range listResult {
|
|
select {
|
|
select {
|
|
case <-exit:
|
|
case <-exit:
|
|
@@ -87,7 +88,7 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
|
|
chromedp.Evaluate(runJs, v),
|
|
chromedp.Evaluate(runJs, v),
|
|
})
|
|
})
|
|
if err != nil {
|
|
if err != nil {
|
|
- log.Println("执行JS代码失败", err.Error())
|
|
|
|
|
|
+ qu.Debug("执行JS代码失败", err.Error())
|
|
}
|
|
}
|
|
if len(v.AttachLinks) > 0 { //有附件
|
|
if len(v.AttachLinks) > 0 { //有附件
|
|
vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. 下载附件"))
|
|
vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. 下载附件"))
|
|
@@ -103,7 +104,7 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
|
|
}
|
|
}
|
|
}
|
|
}
|
|
vm.dnf.Dispatch("debug_event", "5 采集测试完成")
|
|
vm.dnf.Dispatch("debug_event", "5 采集测试完成")
|
|
- log.Println("5采集测试完成")
|
|
|
|
|
|
+ qu.Debug("5采集测试完成")
|
|
}
|
|
}
|
|
|
|
|
|
// CountYestodayArts 统计昨日信息发布量
|
|
// CountYestodayArts 统计昨日信息发布量
|
|
@@ -111,12 +112,12 @@ func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64
|
|
headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
|
|
headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
|
|
sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
|
|
sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
|
|
_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, "")
|
|
_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, "")
|
|
- log.Println("1浏览器打开")
|
|
|
|
|
|
+ qu.Debug("1浏览器打开")
|
|
vm.dnf.Dispatch("debug_event", "1 浏览器打开")
|
|
vm.dnf.Dispatch("debug_event", "1 浏览器打开")
|
|
defer func() {
|
|
defer func() {
|
|
cancel()
|
|
cancel()
|
|
baseCancel()
|
|
baseCancel()
|
|
- log.Println("0浏览器已经销毁")
|
|
|
|
|
|
+ qu.Debug("0浏览器已经销毁")
|
|
vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
|
|
vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
|
|
vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
|
|
vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
|
|
close(exit)
|
|
close(exit)
|
|
@@ -135,7 +136,7 @@ func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64
|
|
chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
|
|
chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
|
|
})
|
|
})
|
|
vm.dnf.Dispatch("debug_event", "2 页面已经打开")
|
|
vm.dnf.Dispatch("debug_event", "2 页面已经打开")
|
|
- log.Println("2页面打开")
|
|
|
|
|
|
+ qu.Debug("2页面打开")
|
|
//TODO 2. 执行JS代码,获取列表页信息
|
|
//TODO 2. 执行JS代码,获取列表页信息
|
|
runJs := renderJavascriptCoder(loadListItemsJS, sc)
|
|
runJs := renderJavascriptCoder(loadListItemsJS, sc)
|
|
tmp := map[string]bool{}
|
|
tmp := map[string]bool{}
|
|
@@ -151,7 +152,7 @@ func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64
|
|
chromedp.Evaluate(runJs, &listResult),
|
|
chromedp.Evaluate(runJs, &listResult),
|
|
})
|
|
})
|
|
if err != nil {
|
|
if err != nil {
|
|
- log.Println("执行JS代码失败", err.Error())
|
|
|
|
|
|
+ qu.Debug("执行JS代码失败", err.Error())
|
|
vm.dnf.Dispatch("debug_event", "3 执行JS代码失败")
|
|
vm.dnf.Dispatch("debug_event", "3 执行JS代码失败")
|
|
return
|
|
return
|
|
}
|
|
}
|