Kaynağa Gözat

列表页面初始化功能

mxs 9 ay önce
ebeveyn
işleme
8f94e34657

+ 45 - 3
backend/browser.go

@@ -2,8 +2,11 @@ package backend
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
+	"io/ioutil"
 	"math/rand"
+	"net/http"
 
 	"github.com/chromedp/cdproto/page"
 
@@ -61,7 +64,7 @@ var (
 	}
 )
 
-func NewBrowser(headless bool, showImage bool, proxyServe string) (
+func NewBrowser(headless bool, showImage bool, proxyServe bool) (
 	context.Context, context.CancelFunc,
 	context.Context, context.CancelFunc,
 	context.Context, context.CancelFunc,
@@ -86,9 +89,10 @@ func NewBrowser(headless bool, showImage bool, proxyServe string) (
 		chromedp.Flag("mute-audio", false),
 		chromedp.Flag("accept-language", `zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6`),
 	)
-	if proxyServe != "" {
+	if proxyServe {
 		chromeOptions = append(chromeOptions,
-			chromedp.ProxyServer(fmt.Sprintf("socks5://%s", proxyServe)),
+			//chromedp.ProxyServer(fmt.Sprintf("socks5://%s", proxyServe)),
+			chromedp.ProxyServer(GetProxyAddr()),
 		)
 	}
 	if showImage {
@@ -114,3 +118,41 @@ func NewBrowser(headless bool, showImage bool, proxyServe string) (
 	)
 	return ctx, cancelFn, allocCtx, allocCancelFn, incCtx, incCancelFn
 }
+
+// GetProxyAddr asks the proxy service for a proxy address and returns it.
+//
+// The service is expected to answer with a JSON object whose "data" member is
+// an object carrying an "http" and/or "https" string; "http" is preferred.
+// An empty string is returned on any failure (request error, non-200 status,
+// unreadable or malformed body, missing fields). Failures are printed to
+// stdout.
+func GetProxyAddr() string {
+	proxyAddr := "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+	// NOTE(review): the Authorization value is identical to the fetch URL, which
+	// looks like a placeholder rather than a real credential - confirm.
+	proxyAuthor := "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+	// Build the request.
+	req, err := http.NewRequest(http.MethodGet, proxyAddr, nil)
+	if err != nil {
+		fmt.Println("get proxy request err:", err)
+		return ""
+	}
+	req.Header.Add("Authorization", proxyAuthor)
+	// NOTE(review): the client has no timeout, so a stalled proxy service blocks
+	// the caller indefinitely; set http.Client.Timeout once a limit is agreed.
+	client := http.Client{}
+	resp, err := client.Do(req)
+	if err != nil {
+		fmt.Println("get proxy client err:", err)
+		return ""
+	}
+	defer resp.Body.Close()
+	// An error page must not be parsed as if it were a proxy answer.
+	if resp.StatusCode != http.StatusOK {
+		fmt.Println("get proxy unexpected status:", resp.Status)
+		return ""
+	}
+	bodyByte, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		fmt.Println("get proxy read body err:", err)
+		return ""
+	}
+	tmp := map[string]interface{}{}
+	if err := json.Unmarshal(bodyByte, &tmp); err != nil {
+		fmt.Println("get proxy decode body err:", err)
+		return ""
+	}
+	data, ok := tmp["data"].(map[string]interface{})
+	if !ok {
+		return ""
+	}
+	if httpProxy, ok := data["http"].(string); ok {
+		return httpProxy
+	}
+	if httpsProxy, ok := data["https"].(string); ok {
+		return httpsProxy
+	}
+	return ""
+}

+ 47 - 20
backend/types.go

@@ -9,6 +9,18 @@ import (
 const (
 	JOB_RUNNING_EVENT_PROGRESS = 1
 	JOB_RUNNING_EVENT_DEBUG    = 0
+
+	// Result codes for running an action.
+	RUN_ACTION_NOTRUN  = 0 // not run
+	RUN_ACTION_SUCCESS = 1 // ran successfully
+	RUN_ACTION_ERROR   = 2 // failed with an error
+	RUN_ACTION_TIMEOUT = 3 // timed out
+	// Result codes for checking the outcome of an action.
+	CHECK_ACTION_NOTCHECK = 0
+	CHECK_ACTION_SUCCESS  = 1
+	CHECK_ACTION_ERROR    = 2
+	CHECK_ACTION_TIMEOUT  = 3
+	CHECH_RESULT          = "true" // value a check script's result is compared against to count as passed
 )
 
 var (
@@ -20,26 +32,27 @@ var (
 type (
 	//爬虫配置信息
 	SpiderConfig struct {
-		Site               string `json:"site"`
-		Channel            string `json:"channel"`
-		ModifyUser         string `json:"modifyuser"`
-		Href               string `json:"href"`
-		Code               string `json:"code"`
-		ListBodyCss        string `json:"listBodyCss"` //用于判断是否翻页成功
-		ListItemCss        string `json:"listItemCss"`
-		ListLinkCss        string `json:"listLinkCss"`
-		ListPubtimeCss     string `json:"listPublishTimeCss"`
-		ListNextPageCss    string `json:"listNextPageCss"`
-		TitleCss           string `json:"titleCss"`
-		PublishUnitCss     string `json:"publishUnitCss"`
-		PublishTimeCss     string `json:"publishTimeCss"`
-		ContentCss         string `json:"contentCss"`
-		AttachCss          string `json:"attachCss"`
-		ListJSCode         string `json:"listJs"`
-		ContentJSCode      string `json:"contentJs"`
-		AttachJSCode       string `json:"attachJs"` //无效
-		ListTurnPageJSCode string `json:"listTurnPageJs"`
-		MaxPages           int64  `json:"maxPages"`
+		Site               string     `json:"site"`
+		Channel            string     `json:"channel"`
+		ModifyUser         string     `json:"modifyuser"`
+		Href               string     `json:"href"`
+		Code               string     `json:"code"`
+		InitList           []*Actions `json:"initList"`
+		ListBodyCss        string     `json:"listBodyCss"` //用于判断是否翻页成功
+		ListItemCss        string     `json:"listItemCss"`
+		ListLinkCss        string     `json:"listLinkCss"`
+		ListPubtimeCss     string     `json:"listPublishTimeCss"`
+		ListNextPageCss    string     `json:"listNextPageCss"`
+		TitleCss           string     `json:"titleCss"`
+		PublishUnitCss     string     `json:"publishUnitCss"`
+		PublishTimeCss     string     `json:"publishTimeCss"`
+		ContentCss         string     `json:"contentCss"`
+		AttachCss          string     `json:"attachCss"`
+		ListJSCode         string     `json:"listJs"`
+		ContentJSCode      string     `json:"contentJs"`
+		AttachJSCode       string     `json:"attachJs"` //无效
+		ListTurnPageJSCode string     `json:"listTurnPageJs"`
+		MaxPages           int64      `json:"maxPages"`
 		//延时
 		ListDelayTime     int64 `json:"listDelayTime"`
 		ListTurnDelayTime int64 `json:"listTurnDelayTime"`
@@ -54,6 +67,20 @@ type (
 		FileSize string `json:"fileSize"`
 		FilePath string `json:"filePath"`
 	}
+	// Actions describes one list-page initialisation step.
+	Actions struct {
+		ActionJs  string `json:"actionJs"`  // JavaScript evaluated in the page to perform the step
+		CheckJs   string `json:"checkJs"`   // optional JavaScript evaluated afterwards; its result is compared with CHECH_RESULT
+		SleepTime int64  `json:"sleepTime"` // pause after ActionJs, in milliseconds
+	}
+	// ActionRunResult records how an action ran and how its check went.
+	ActionRunResult struct {
+		ActionNum   int    `json:"actionNum"`
+		ActionJs    string `json:"actionJs"` // closing quote restored; the malformed tag made this field serialise as "ActionJs"
+		CheckJs     string `json:"checkJs"`
+		Result      bool   `json:"result"`      // overall verdict for the action
+		RunResult   int    `json:"runResult"`   // one of the RUN_ACTION_* codes
+		CheckResult int    `json:"checkResult"` // one of the CHECK_ACTION_* codes
+	}
 	//爬取结果信息
 	ResultItem struct {
 		No          int           `json:"no"` //编号

+ 145 - 135
backend/vm/check.go

@@ -2,6 +2,7 @@ package vm
 
 import (
 	"container/list"
+	"errors"
 	"fmt"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	be "spider_creator/backend"
@@ -11,16 +12,135 @@ import (
 )
 
 // VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
+//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
+//	qu.Debug("sc---", *sc)
+//	verifyResult := list.New()
+//	be.DataResults[sc.Code] = verifyResult
+//	ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
+//	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false)    //列表页使用
+//	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, false) //详情页使用
+//	defer func() {
+//		incCancelFn2()
+//		baseCancelFn2()
+//		incCancelFn()
+//		baseCancelFn()
+//	}()
+//
+//	listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
+//	//2. 执行JS代码,获取列表页信息
+//	if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
+//		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
+//	}
+//	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
+//		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
+//	}
+//	qu.Debug("获取列表页JS代码:", listRunJs)
+//	qu.Debug("获取详情页JS代码:", contentRunJs)
+//	//3.打开列表,获取条目清单
+//	chromedp.Run(ctx, chromedp.Tasks{
+//		chromedp.Navigate(sc.Href),
+//		chromedp.WaitReady("document.body", chromedp.ByJSPath),
+//		//chromedp.Sleep(1000 * time.Millisecond),
+//		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
+//	})
+//	//初始化列表页信息
+//	if !vm.InitListPage(ctx, sc) {
+//		qu.Debug("初始化列表页失败,退出")
+//		return ret, errors.New("初始化列表页失败")
+//	}
+//	no := 1
+//T:
+//	for j := 0; j < 2; j++ { //最多检查2页
+//		qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
+//		listResult := make(be.ResultItems, 0)
+//		err := chromedp.Run(ctx, chromedp.Tasks{
+//			chromedp.Evaluate(listRunJs, &listResult),
+//		})
+//		if err != nil {
+//			qu.Debug("执行列表页JS代码失败", err.Error())
+//			continue
+//		}
+//		//TODO 5.操作详情页
+//		qu.Debug("列表采集条数:", len(listResult))
+//		for contentIndex, r := range listResult {
+//			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
+//			if contentIndex > 1 { //每页只校验2条
+//				break
+//			}
+//			//打开详情页
+//			err = chromedp.Run(ctx2, chromedp.Tasks{
+//				chromedp.Navigate(r.Href),
+//				chromedp.WaitReady("document.body", chromedp.ByJSPath),
+//				//chromedp.Sleep(2000 * time.Millisecond),
+//				chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
+//			})
+//			if err != nil {
+//				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
+//				continue
+//			}
+//			//获取详情页内容
+//			err = chromedp.Run(ctx2, chromedp.Tasks{
+//				chromedp.Evaluate(contentRunJs, r),
+//			})
+//			if err != nil {
+//				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
+//				continue
+//			}
+//			//下载附件
+//			if sc.AttachCss != "" {
+//				downloadAttaches(r, vm.attachesDir)
+//			}
+//			r.Site = sc.Site
+//			r.Channel = sc.Channel
+//			if r.Title == "" {
+//				r.Title = r.ListTitle
+//			}
+//			if r.PublishTime == "" {
+//				r.PublishTime = r.ListPubTime
+//			}
+//			r.No = no
+//			no += 1
+//			//结果放入缓存
+//			verifyResult.PushBack(r)
+//		}
+//		qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
+//		//翻页
+//		if verifyResult.Len() > 0 {
+//			if sc.MaxPages == 1 { //最大页为1,不校验翻页
+//				ret.ListTrunPage = true
+//				break
+//			} else if sc.MaxPages > 1 { //&& !ret.ListTrunPage {
+//				if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
+//					qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
+//					break T
+//				} else {
+//					ret.ListTrunPage = true
+//				}
+//			}
+//		}
+//	}
+//	//检查
+//	for el := verifyResult.Front(); el != nil; el = el.Next() {
+//		r, _ := el.Value.(*be.ResultItem)
+//		ret.Title = r.Title != ""
+//		ret.PublishUnit = r.PublishUnit != ""
+//		ret.PublishTime = r.PublishTime != ""
+//		ret.Content = r.Content != ""
+//		ret.Attaches = len(r.AttachLinks) > 0
+//	}
+//	qu.Debug(verifyResult.Len())
+//	ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
+//	return ret, nil
+//}
+
+// VerifySpiderConfig 只验证列表标注
 func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
 	qu.Debug("sc---", *sc)
 	verifyResult := list.New()
 	be.DataResults[sc.Code] = verifyResult
-	ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
-	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "")    //列表页使用
-	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
+	ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
+	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
 	defer func() {
-		incCancelFn2()
-		baseCancelFn2()
 		incCancelFn()
 		baseCancelFn()
 	}()
@@ -33,15 +153,19 @@ func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyRes
 	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
 		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
 	}
-	qu.Debug("获取列表页JS代码:", listRunJs)
-	qu.Debug("获取详情页JS代码:", contentRunJs)
-	//TODO 3.打开列表,获取条目清单
+	qu.Debug("列表页JS:", listRunJs)
+	//3.打开列表,获取条目清单
 	chromedp.Run(ctx, chromedp.Tasks{
 		chromedp.Navigate(sc.Href),
 		chromedp.WaitReady("document.body", chromedp.ByJSPath),
 		//chromedp.Sleep(1000 * time.Millisecond),
 		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
 	})
+	//4.初始化列表页信息
+	if !vm.InitListPage(ctx, sc) {
+		qu.Debug("初始化列表页失败,退出")
+		return ret, errors.New("初始化列表页失败")
+	}
 	no := 1
 T:
 	for j := 0; j < 2; j++ { //最多检查2页
@@ -54,41 +178,20 @@ T:
 			qu.Debug("执行列表页JS代码失败", err.Error())
 			continue
 		}
-		//TODO 5.操作详情页
+		//5.操作详情页
 		qu.Debug("列表采集条数:", len(listResult))
 		for contentIndex, r := range listResult {
-			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
 			if contentIndex > 1 { //每页只校验2条
 				break
 			}
-			//打开详情页
-			err = chromedp.Run(ctx2, chromedp.Tasks{
-				chromedp.Navigate(r.Href),
-				chromedp.WaitReady("document.body", chromedp.ByJSPath),
-				//chromedp.Sleep(2000 * time.Millisecond),
-				chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
-			})
-			if err != nil {
-				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
-				continue
-			}
-			//获取详情页内容
-			err = chromedp.Run(ctx2, chromedp.Tasks{
-				chromedp.Evaluate(contentRunJs, r),
-			})
-			if err != nil {
-				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
-				continue
-			}
-			//下载附件
-			if sc.AttachCss != "" {
-				downloadAttaches(r, vm.attachesDir)
-			}
+			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
 			r.Site = sc.Site
 			r.Channel = sc.Channel
+			//qu.Debug(r.Title, r.ListTitle)
 			if r.Title == "" {
 				r.Title = r.ListTitle
 			}
+			//qu.Debug(r.PublishTime, r.ListPubTime)
 			if r.PublishTime == "" {
 				r.PublishTime = r.ListPubTime
 			}
@@ -97,15 +200,15 @@ T:
 			//结果放入缓存
 			verifyResult.PushBack(r)
 		}
-		qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
-		//TODO 6.翻页
+		qu.Debug("列表采集条数结果:", verifyResult.Len())
+		//6.翻页
 		if verifyResult.Len() > 0 {
 			if sc.MaxPages == 1 { //最大页为1,不校验翻页
 				ret.ListTrunPage = true
 				break
-			} else if sc.MaxPages > 1 { //&& !ret.ListTrunPage {
+			} else if sc.MaxPages > 1 { // && !ret.ListTrunPage {
 				if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
-					qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
+					qu.Debug("翻页失败", err)
 					break T
 				} else {
 					ret.ListTrunPage = true
@@ -117,106 +220,13 @@ T:
 	for el := verifyResult.Front(); el != nil; el = el.Next() {
 		r, _ := el.Value.(*be.ResultItem)
 		ret.Title = r.Title != ""
-		ret.PublishUnit = r.PublishUnit != ""
+		qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
 		ret.PublishTime = r.PublishTime != ""
-		ret.Content = r.Content != ""
-		ret.Attaches = len(r.AttachLinks) > 0
+		qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
+	}
+	if ret.ListItems {
+		ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
 	}
-	qu.Debug(verifyResult.Len())
-	ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
 
-	//TODO:每次验证结果存库、内存?
 	return ret, nil
 }
-
-// VerifySpiderConfig 只验证列表标注
-//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
-//	qu.Debug("sc---", *sc)
-//	verifyResult := list.New()
-//	ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
-//	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
-//	defer func() {
-//		incCancelFn()
-//		baseCancelFn()
-//	}()
-//
-//	listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
-//	//TODO 2. 执行JS代码,获取列表页信息
-//	if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
-//		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
-//	}
-//	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
-//		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
-//	}
-//	qu.Debug("列表页JS:", listRunJs)
-//	//TODO 3.打开列表,获取条目清单
-//	chromedp.Run(ctx, chromedp.Tasks{
-//		chromedp.Navigate(sc.Href),
-//		chromedp.WaitReady("document.body", chromedp.ByJSPath),
-//		//chromedp.Sleep(1000 * time.Millisecond),
-//		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
-//	})
-//	no := 1
-//T:
-//	for j := 0; j < 2; j++ { //最多检查2页
-//		qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
-//		listResult := make(be.ResultItems, 0)
-//		err := chromedp.Run(ctx, chromedp.Tasks{
-//			chromedp.Evaluate(listRunJs, &listResult),
-//		})
-//		if err != nil {
-//			qu.Debug("执行列表页JS代码失败", err.Error())
-//			continue
-//		}
-//		//TODO 5.操作详情页
-//		qu.Debug("列表采集条数:", len(listResult))
-//		for contentIndex, r := range listResult {
-//			if contentIndex > 1 { //每页只校验2条
-//				break
-//			}
-//			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
-//			r.Site = sc.Site
-//			r.Channel = sc.Channel
-//			qu.Debug(r.Title, r.ListTitle)
-//			if r.Title == "" {
-//				r.Title = r.ListTitle
-//			}
-//			qu.Debug(r.PublishTime, r.ListPubTime)
-//			if r.PublishTime == "" {
-//				r.PublishTime = r.ListPubTime
-//			}
-//			r.No = no
-//			no += 1
-//			//结果放入缓存
-//			verifyResult.PushBack(r)
-//		}
-//		qu.Debug("列表采集条数结果:", verifyResult.Len())
-//		//TODO 6.翻页
-//		if verifyResult.Len() > 0 {
-//			if sc.MaxPages == 1 { //最大页为1,不校验翻页
-//				ret.ListTrunPage = true
-//				break
-//			} else if sc.MaxPages > 1 { // && !ret.ListTrunPage {
-//				if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
-//					qu.Debug("翻页失败:", err)
-//					break T
-//				} else {
-//					ret.ListTrunPage = true
-//				}
-//			}
-//		}
-//	}
-//	//检查
-//	for el := verifyResult.Front(); el != nil; el = el.Next() {
-//		r, _ := el.Value.(*be.ResultItem)
-//		ret.Title = r.Title != ""
-//		qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
-//		ret.PublishTime = r.PublishTime != ""
-//		qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
-//	}
-//	if ret.ListItems {
-//		ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
-//	}
-//
-//	return ret, nil
-//}

+ 2 - 2
backend/vm/jobs.go

@@ -56,8 +56,8 @@ func (vm *VM) RunJob(code string) {
 	vm.dnf.Dispatch("run_job_event", &be.JobRunningEvent{Code: job.Code, Act: be.JOB_RUNNING_EVENT_DEBUG, Msg: "加载作业完成"})
 	no := 1
 	//加载参数
-	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(true, false, "")    //列表页使用
-	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(true, false, "") //详情页使用
+	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(true, false, false)    //列表页使用
+	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(true, false, false) //详情页使用
 	defer func() {
 		job.State = 0
 		job.Progress = 0

+ 167 - 15
backend/vm/single.go

@@ -2,6 +2,7 @@ package vm
 
 import (
 	"container/list"
+	"context"
 	_ "embed"
 	"fmt"
 	"github.com/chromedp/chromedp"
@@ -19,7 +20,7 @@ func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM {
 	}
 }
 
-func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
+func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, contentDelay int64, headless bool, showImage bool, proxyServe bool, exit chan bool, cssMark map[string]interface{}) {
 	sc, err := be.NewSpiderConfig(cssMark)
 	if err != nil {
 		qu.Debug("标注信息传输失败!")
@@ -47,7 +48,12 @@ func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, c
 	vm.dnf.Dispatch("debug_event", "2 页面已经打开")
 	qu.Debug("2页面打开")
 	var runJs string = sc.ListJSCode
-
+	//列表页信息初始化
+	vm.dnf.Dispatch("debug_event", "3 初始化列表页信息")
+	if !vm.InitListPage(ctx, sc) {
+		vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出")
+		return
+	}
 	//TODO 2. 执行JS代码,获取列表页信息
 	if be.RegSpace.ReplaceAllString(runJs, "") == "" {
 		runJs = renderJavascriptCoder(loadListItemsJS, sc)
@@ -100,7 +106,7 @@ func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, c
 }
 
 // RunSpider 适用于测试1页数据
-func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
+func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe bool, exit chan bool, cssMark map[string]interface{}) {
 	sc, err := be.NewSpiderConfig(cssMark)
 	if err != nil {
 		qu.Debug("标注信息传输失败!")
@@ -126,10 +132,15 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
 		chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
 	})
 	vm.dnf.Dispatch("debug_event", "2 页面已经打开")
-	qu.Debug("2页面打开")
+	vm.dnf.Dispatch("debug_event", "3 初始化列表页信息")
+	//1、列表页信息初始化
+	if !vm.InitListPage(ctx, sc) {
+		vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出")
+		return
+	}
 	var runJs string = sc.ListJSCode
 	listResult := make(be.ResultItems, 0)
-	//TODO 2. 执行JS代码,获取列表页信息
+	//2、执行JS代码,获取列表页信息
 	if be.RegSpace.ReplaceAllString(runJs, "") == "" {
 		runJs = renderJavascriptCoder(loadListItemsJS, sc)
 	}
@@ -139,13 +150,12 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
 	})
 	if err != nil {
 		qu.Debug("执行JS代码失败", err.Error())
-		vm.dnf.Dispatch("debug_event", "2 执行JS代码失败")
+		vm.dnf.Dispatch("debug_event", "4 执行JS代码失败")
 		return
 	}
-	vm.dnf.Dispatch("debug_event", "3 获取列表完成")
+	vm.dnf.Dispatch("debug_event", "4 获取列表完成")
 	qu.Debug("3获取列表完成", len(listResult))
-
-	//TODO 3. 打开详情页 ,最多打开10条
+	//3、打开详情页
 	runJs = sc.ContentJSCode
 	if be.RegSpace.ReplaceAllString(runJs, "") == "" {
 		runJs = renderJavascriptCoder(loadContentJS, sc)
@@ -160,7 +170,7 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
 			return
 		default:
 			qu.Debug(v.No, v.Href, v.ListTitle, v.ListPubTime)
-			vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
+			vm.dnf.Dispatch("debug_event", fmt.Sprintf("5. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
 			var result string = ""
 			err = chromedp.Run(ctx, chromedp.Tasks{
 				chromedp.Navigate(v.Href),
@@ -176,27 +186,169 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
 				qu.Debug("执行JS代码失败", err.Error())
 			}
 			if len(v.AttachLinks) > 0 { //有附件
-				vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. 下载附件"))
-				//TODO 下载附件
+				vm.dnf.Dispatch("debug_event", fmt.Sprintf("6. 下载附件"))
+				//4、下载附件
 				downloadAttaches(v, vm.attachesDir)
 			}
 			//关闭当前TAB页
 			chromedp.Run(ctx, chromedp.Tasks{
 				chromedp.Evaluate(`var ret="";window.close();ret`, &result),
 			})
-			vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
+			vm.dnf.Dispatch("debug_event", fmt.Sprintf("5. %d- 下载详情页 %s 完成", v.No, v.Title))
 			currentResult.PushBack(v)
 		}
 	}
-	vm.dnf.Dispatch("debug_event", "5 采集测试完成")
+	vm.dnf.Dispatch("debug_event", "7 采集测试完成")
 	qu.Debug("5采集测试完成")
 }
 
+// InitListPage runs the configured list-page initialisation actions
+// (sc.InitList) in order on the page behind ctx.
+//
+// It returns true when there are no actions or every action succeeds, and
+// false as soon as one action fails; the remaining actions are then not run.
+// Nil entries in sc.InitList are skipped.
+//
+// Example action (clicks a tab, then waits one second):
+//
+//	&be.Actions{
+//		ActionJs:  `var clicklabel = document.querySelector("#catefirst > a:nth-child(3)");if(clicklabel)clicklabel.click();"";`,
+//		SleepTime: 1000,
+//	}
+func (vm *VM) InitListPage(ctx context.Context, sc *be.SpiderConfig) (initPage bool) {
+	for j, ac := range sc.InitList {
+		if ac == nil { // RunAction dereferences the action, so a nil entry would panic
+			continue
+		}
+		// A failed action aborts the sequence: later actions depend on earlier ones.
+		if arc := vm.RunAction(ctx, ac, j); !arc.Result {
+			return false
+		}
+	}
+	return true
+}
+
+// RunAction runs one list-page initialisation action and reports the outcome.
+//
+// ac.ActionJs is evaluated in the page, followed by a pause of ac.SleepTime
+// milliseconds. If ac.CheckJs is set it is evaluated afterwards and the action
+// counts as successful only when that script yields be.CHECH_RESULT; without a
+// check script, an action that evaluates cleanly counts as successful.
+//
+// The whole step is bounded by ac.SleepTime+5000 ms. The deadline lives on a
+// context derived from ctx and is handed to chromedp, so a script that never
+// returns is interrupted instead of blocking the caller forever.
+// NOTE(review): this assumes the browser tab behind ctx was already started by
+// an earlier chromedp.Run (the visible callers navigate first) - confirm for
+// any new caller.
+//
+// num is the zero-based position of the action; messages and ActionNum use
+// num+1.
+func (vm *VM) RunAction(ctx context.Context, ac *be.Actions, num int) *be.ActionRunResult {
+	ctxTmp, cancel := context.WithTimeout(ctx, time.Duration(ac.SleepTime+5000)*time.Millisecond)
+	defer cancel()
+	seq := fmt.Sprint(num + 1)
+	// newResult fills in the fields that are the same for every outcome.
+	newResult := func(ok bool, runResult, checkResult int) *be.ActionRunResult {
+		return &be.ActionRunResult{
+			ActionNum:   num + 1,
+			ActionJs:    ac.ActionJs,
+			CheckJs:     ac.CheckJs,
+			Result:      ok,
+			RunResult:   runResult,
+			CheckResult: checkResult,
+		}
+	}
+	// timedOut tells a deadline hit apart from any other chromedp failure.
+	timedOut := func() bool { return ctxTmp.Err() == context.DeadlineExceeded }
+	onTimeout := func(runResult int) *be.ActionRunResult {
+		vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+seq+"个动作JS,超时")
+		qu.Debug("3.3 初始化列表页,执行第" + seq + "个动作JS,超时")
+		return newResult(false, runResult, be.CHECK_ACTION_TIMEOUT)
+	}
+
+	// Run the action script, then give the page time to settle.
+	vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+seq+"个动作JS")
+	qu.Debug("3.1 初始化列表页,执行第"+seq+"个动作JS:", ac.ActionJs)
+	var result string
+	err := chromedp.Run(ctxTmp, chromedp.Tasks{
+		chromedp.Evaluate(ac.ActionJs, &result),
+		chromedp.Sleep(time.Duration(ac.SleepTime) * time.Millisecond),
+	})
+	if err != nil {
+		if timedOut() {
+			return onTimeout(be.RUN_ACTION_TIMEOUT)
+		}
+		vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+seq+"个动作JS,异常")
+		qu.Debug("3.1 初始化列表页,执行第" + seq + "个动作JS,异常")
+		return newResult(false, be.RUN_ACTION_ERROR, be.CHECK_ACTION_NOTCHECK)
+	}
+
+	// Without a check script the action is accepted as soon as it has run.
+	if ac.CheckJs == "" {
+		vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+seq+"个动作JS结果:true")
+		qu.Debug("3.3 初始化列表页,执行第" + seq + "个动作JS结果:true")
+		return newResult(true, be.RUN_ACTION_SUCCESS, be.CHECK_ACTION_NOTCHECK)
+	}
+
+	// Evaluate the check script and compare its answer with the expected value.
+	vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+seq+"个动作检查JS")
+	qu.Debug("3.2 初始化列表页,执行第"+seq+"个动作检查JS:", ac.CheckJs)
+	var checkResult string
+	err = chromedp.Run(ctxTmp, chromedp.Tasks{
+		chromedp.Evaluate(ac.CheckJs, &checkResult),
+	})
+	if err != nil {
+		if timedOut() {
+			return onTimeout(be.RUN_ACTION_SUCCESS)
+		}
+		vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+seq+"个动作检查JS,异常")
+		qu.Debug("3.2 初始化列表页,执行第" + seq + "个动作检查JS,异常")
+		return newResult(false, be.RUN_ACTION_SUCCESS, be.CHECK_ACTION_ERROR)
+	}
+	vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+seq+"个动作JS结果:"+checkResult)
+	qu.Debug("3.3 初始化列表页,执行第" + seq + "个动作JS结果:" + checkResult)
+	return newResult(checkResult == be.CHECH_RESULT, be.RUN_ACTION_SUCCESS, be.CHECK_ACTION_SUCCESS)
+}
+
+// InitPageTmp clicks the primary form button on the page and then polls, once
+// per second, until the ".ant-list-items" element yields non-empty text or
+// timeout milliseconds have elapsed.
+//
+// It returns true when text appeared in time, and false when the click script
+// fails or the timeout expires first.
+func (vm *VM) InitPageTmp(ctx context.Context, timeout int) bool {
+	const clickJs = `var clicklabel = document.querySelector("#app-base > div > div.IndexContent > div > div > form > div > div.ant-col.ant-col-4 > button.ant-btn.ant-btn-primary > span");if(clicklabel)clicklabel.click();"";`
+	const probeJs = `var label = document.querySelector(".ant-list-items");if(label)label.outerText;`
+
+	// Step 1: trigger the page event that loads the list.
+	var text string
+	if err := chromedp.Run(ctx, chromedp.Tasks{chromedp.Evaluate(clickJs, &text)}); err != nil {
+		qu.Debug("初始化页面JS执行失败", err.Error())
+		return false
+	}
+
+	// Step 2: poll for the list until the deadline passes.
+	deadline, stop := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond)
+	defer stop()
+	for {
+		if deadline.Err() != nil {
+			return false
+		}
+		// Poll errors are ignored; only the captured text decides the outcome.
+		_ = chromedp.Run(ctx, chromedp.Tasks{
+			chromedp.Evaluate(probeJs, &text),
+		})
+		if text != "" {
+			return true
+		}
+		time.Sleep(1 * time.Second) // pause between polls
+	}
+}
+
 // CountYestodayArts 统计昨日信息发布量
 func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
 	headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
 	sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
-	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, "")
+	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, false)
 	qu.Debug("1浏览器打开")
 	vm.dnf.Dispatch("debug_event", "1 浏览器打开")
 	defer func() {

+ 0 - 1
backend/vm/vm.go

@@ -157,7 +157,6 @@ func trunPage(sc *be.SpiderConfig, delay int64, ctx context.Context) error {
 	} else {
 		return errors.New("trun page error ")
 	}
-	qu.Debug("--------------------------")
 	//获取翻页后内容
 	err = chromedp.Run(ctx, chromedp.Tasks{
 		chromedp.Evaluate(checkRunJs, &result2),

+ 7 - 3
backend/vm/worker.go

@@ -22,7 +22,7 @@ func (w *Worker) Destory() {
 }
 
 // NewWorker
-func NewWorker(headless bool, showImage bool, proxyServe string, contentDelay int64, js string, vm *VM) *Worker {
+func NewWorker(headless bool, showImage bool, proxyServe bool, contentDelay int64, js string, vm *VM) *Worker {
 	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
 	return &Worker{
 		baseCancel:   baseCancel,
@@ -64,7 +64,7 @@ func (w *Worker) Run(v *be.ResultItem, ch chan *Worker, wg *sync.WaitGroup) {
 }
 
 // RunSpiderMulThreads
-func (vm *VM) RunSpiderMulThreads(url string, maxPages int, listDealy int64, trunPageDelay int64, contentDelay int64, headless bool, showImage bool, proxyServe string, threads int, exit chan bool,
+func (vm *VM) RunSpiderMulThreads(url string, maxPages int, listDealy int64, trunPageDelay int64, contentDelay int64, headless bool, showImage bool, proxyServe bool, threads int, exit chan bool,
 	cssMark map[string]interface{}) {
 	//sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
 	sc, err := be.NewSpiderConfig(cssMark)
@@ -111,7 +111,6 @@ func (vm *VM) RunSpiderMulThreads(url string, maxPages int, listDealy int64, tru
 			}
 		}
 	}()
-
 	no := 0
 	//TODO 1.翻页操作,需要在外层打开列表页
 	chromedp.Run(ctx, chromedp.Tasks{
@@ -121,6 +120,11 @@ func (vm *VM) RunSpiderMulThreads(url string, maxPages int, listDealy int64, tru
 	})
 	vm.dnf.Dispatch("debug_event", "2 页面已经打开")
 	qu.Debug("2页面打开")
+	vm.dnf.Dispatch("debug_event", "3 初始化列表页信息")
+	if !vm.InitListPage(ctx, sc) {
+		vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出")
+		return
+	}
 	currentResult := list.New()
 	be.DataResults[sc.Code] = currentResult
 	for i := 0; i < maxPages; i++ {

+ 7 - 8
bind4spider.go

@@ -10,17 +10,17 @@ import (
 )
 
 // DebugSpider 调试爬虫
-func (a *App) DebugSpider(url string, proxyServe string, maxPages int, listDealy int64, trunPageDelay int64, contentDelay int64, headless bool,
+func (a *App) DebugSpider(url string, proxyServe bool, maxPages int, listDealy int64, trunPageDelay int64, contentDelay int64, headless bool,
 	showImage bool, threads int, cssMark map[string]interface{}) {
 	exitCh = make(chan bool, 1)
 	qu.Debug(url, proxyServe, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, threads)
 	qu.Debug("cssMark---", cssMark)
-	//vm.RunSpiderTmp(url, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, proxyServe, exitCh, cssMark)
-	if maxPages == 1 && threads == 1 {
-		vm.RunSpider(url, maxPages, listDealy, contentDelay, headless, showImage, proxyServe, exitCh, cssMark)
-	} else { //多页下载强制使用多线程模式
-		vm.RunSpiderMulThreads(url, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, proxyServe, threads, exitCh, cssMark)
-	}
+	vm.RunSpiderTmp(url, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, proxyServe, exitCh, cssMark)
+	//if maxPages == 1 && threads == 1 {
+	//	vm.RunSpider(url, maxPages, listDealy, contentDelay, headless, showImage, proxyServe, exitCh, cssMark)
+	//} else { //多页下载强制使用多线程模式
+	//	vm.RunSpiderMulThreads(url, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, proxyServe, threads, exitCh, cssMark)
+	//}
 }
 
 // VerifySpiderConfig 验证
@@ -79,7 +79,6 @@ func (a *App) ViewResultItemAll(code string) be.ResultItems {
 			ret = append(ret, v)
 		}
 	}
-	qu.Debug(len(ret))
 	return ret
 }
 

+ 1 - 1
frontend/wailsjs/go/main/App.d.ts

@@ -6,7 +6,7 @@ import {main} from '../models';
 
 export function CountYestodayArts(arg1:string,arg2:number,arg3:number,arg4:boolean,arg5:boolean):Promise<void>;
 
-export function DebugSpider(arg1:string,arg2:string,arg3:number,arg4:number,arg5:number,arg6:number,arg7:boolean,arg8:boolean,arg9:number,arg10:{[key: string]: any}):Promise<void>;
+export function DebugSpider(arg1:string,arg2:boolean,arg3:number,arg4:number,arg5:number,arg6:number,arg7:boolean,arg8:boolean,arg9:number,arg10:{[key: string]: any}):Promise<void>;
 
 export function DeleteJob(arg1:string):Promise<string>;