Selaa lähdekoodia

正式版本列表页详情页均标注

mxs 9 kuukautta sitten
vanhempi
commit
57dad30652

+ 141 - 142
backend/vm/check.go

@@ -4,143 +4,23 @@ import (
 	"container/list"
 	"errors"
 	"fmt"
+	"github.com/chromedp/chromedp"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	be "spider_creator/backend"
 	"time"
-
-	"github.com/chromedp/chromedp"
 )
 
 // VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
-//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
-//	qu.Debug("sc---", *sc)
-//	verifyResult := list.New()
-//	be.DataResults[sc.Code] = verifyResult
-//	ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
-//	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false)    //列表页使用
-//	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, false) //详情页使用
-//	defer func() {
-//		incCancelFn2()
-//		baseCancelFn2()
-//		incCancelFn()
-//		baseCancelFn()
-//	}()
-//
-//	listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
-//	//2. 执行JS代码,获取列表页信息
-//	if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
-//		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
-//	}
-//	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
-//		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
-//	}
-//	qu.Debug("获取列表页JS代码:", listRunJs)
-//	qu.Debug("获取详情页JS代码:", contentRunJs)
-//	//3.打开列表,获取条目清单
-//	chromedp.Run(ctx, chromedp.Tasks{
-//		chromedp.Navigate(sc.Href),
-//		chromedp.WaitReady("document.body", chromedp.ByJSPath),
-//		//chromedp.Sleep(1000 * time.Millisecond),
-//		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
-//	})
-//	//初始化列表页信息
-//	if !vm.InitListPage(ctx, sc) {
-//		qu.Debug("初始化列表页失败,退出")
-//		return ret, errors.New("初始化列表页失败")
-//	}
-//	no := 1
-//T:
-//	for j := 0; j < 2; j++ { //最多检查2页
-//		qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
-//		listResult := make(be.ResultItems, 0)
-//		err := chromedp.Run(ctx, chromedp.Tasks{
-//			chromedp.Evaluate(listRunJs, &listResult),
-//		})
-//		if err != nil {
-//			qu.Debug("执行列表页JS代码失败", err.Error())
-//			continue
-//		}
-//		//TODO 5.操作详情页
-//		qu.Debug("列表采集条数:", len(listResult))
-//		for contentIndex, r := range listResult {
-//			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
-//			if contentIndex > 1 { //每页只校验2条
-//				break
-//			}
-//			//打开详情页
-//			err = chromedp.Run(ctx2, chromedp.Tasks{
-//				chromedp.Navigate(r.Href),
-//				chromedp.WaitReady("document.body", chromedp.ByJSPath),
-//				//chromedp.Sleep(2000 * time.Millisecond),
-//				chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
-//			})
-//			if err != nil {
-//				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
-//				continue
-//			}
-//			//获取详情页内容
-//			err = chromedp.Run(ctx2, chromedp.Tasks{
-//				chromedp.Evaluate(contentRunJs, r),
-//			})
-//			if err != nil {
-//				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
-//				continue
-//			}
-//			//下载附件
-//			if sc.AttachCss != "" {
-//				downloadAttaches(r, vm.attachesDir)
-//			}
-//			r.Site = sc.Site
-//			r.Channel = sc.Channel
-//			if r.Title == "" {
-//				r.Title = r.ListTitle
-//			}
-//			if r.PublishTime == "" {
-//				r.PublishTime = r.ListPubTime
-//			}
-//			r.No = no
-//			no += 1
-//			//结果放入缓存
-//			verifyResult.PushBack(r)
-//		}
-//		qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
-//		//翻页
-//		if verifyResult.Len() > 0 {
-//			if sc.MaxPages == 1 { //最大页为1,不校验翻页
-//				ret.ListTrunPage = true
-//				break
-//			} else if sc.MaxPages > 1 { //&& !ret.ListTrunPage {
-//				if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
-//					qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
-//					break T
-//				} else {
-//					ret.ListTrunPage = true
-//				}
-//			}
-//		}
-//	}
-//	//检查
-//	for el := verifyResult.Front(); el != nil; el = el.Next() {
-//		r, _ := el.Value.(*be.ResultItem)
-//		ret.Title = r.Title != ""
-//		ret.PublishUnit = r.PublishUnit != ""
-//		ret.PublishTime = r.PublishTime != ""
-//		ret.Content = r.Content != ""
-//		ret.Attaches = len(r.AttachLinks) > 0
-//	}
-//	qu.Debug(verifyResult.Len())
-//	ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
-//	return ret, nil
-//}
-
-// VerifySpiderConfig 只验证列表标注
 func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
 	qu.Debug("sc---", *sc)
 	verifyResult := list.New()
 	be.DataResults[sc.Code] = verifyResult
-	ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
-	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
+	ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
+	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false)    //列表页使用
+	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, false) //详情页使用
 	defer func() {
+		incCancelFn2()
+		baseCancelFn2()
 		incCancelFn()
 		baseCancelFn()
 	}()
@@ -153,7 +33,8 @@ func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyRes
 	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
 		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
 	}
-	qu.Debug("列表页JS:", listRunJs)
+	qu.Debug("获取列表页JS代码:", listRunJs)
+	qu.Debug("获取详情页JS代码:", contentRunJs)
 	//3.打开列表,获取条目清单
 	chromedp.Run(ctx, chromedp.Tasks{
 		chromedp.Navigate(sc.Href),
@@ -161,7 +42,7 @@ func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyRes
 		//chromedp.Sleep(1000 * time.Millisecond),
 		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
 	})
-	//4.初始化列表页信息
+	//初始化列表页信息
 	if !vm.InitListPage(ctx, sc) {
 		qu.Debug("初始化列表页失败,退出")
 		return ret, errors.New("初始化列表页失败")
@@ -178,20 +59,41 @@ T:
 			qu.Debug("执行列表页JS代码失败", err.Error())
 			continue
 		}
-		//5.操作详情页
+		//TODO 5.操作详情页
 		qu.Debug("列表采集条数:", len(listResult))
 		for contentIndex, r := range listResult {
+			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
 			if contentIndex > 1 { //每页只校验2条
 				break
 			}
-			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
+			//打开详情页
+			err = chromedp.Run(ctx2, chromedp.Tasks{
+				chromedp.Navigate(r.Href),
+				chromedp.WaitReady("document.body", chromedp.ByJSPath),
+				//chromedp.Sleep(2000 * time.Millisecond),
+				chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
+			})
+			if err != nil {
+				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
+				continue
+			}
+			//获取详情页内容
+			err = chromedp.Run(ctx2, chromedp.Tasks{
+				chromedp.Evaluate(contentRunJs, r),
+			})
+			if err != nil {
+				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
+				continue
+			}
+			//下载附件
+			if sc.AttachCss != "" {
+				downloadAttaches(r, vm.attachesDir)
+			}
 			r.Site = sc.Site
 			r.Channel = sc.Channel
-			//qu.Debug(r.Title, r.ListTitle)
 			if r.Title == "" {
 				r.Title = r.ListTitle
 			}
-			//qu.Debug(r.PublishTime, r.ListPubTime)
 			if r.PublishTime == "" {
 				r.PublishTime = r.ListPubTime
 			}
@@ -200,15 +102,15 @@ T:
 			//结果放入缓存
 			verifyResult.PushBack(r)
 		}
-		qu.Debug("列表采集条数结果:", verifyResult.Len())
-		//6.翻页
+		qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
+		//翻页
 		if verifyResult.Len() > 0 {
 			if sc.MaxPages == 1 { //最大页为1,不校验翻页
 				ret.ListTrunPage = true
 				break
-			} else if sc.MaxPages > 1 { // && !ret.ListTrunPage {
+			} else if sc.MaxPages > 1 { //&& !ret.ListTrunPage {
 				if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
-					qu.Debug("翻页失败", err)
+					qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
 					break T
 				} else {
 					ret.ListTrunPage = true
@@ -220,13 +122,110 @@ T:
 	for el := verifyResult.Front(); el != nil; el = el.Next() {
 		r, _ := el.Value.(*be.ResultItem)
 		ret.Title = r.Title != ""
-		qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
+		ret.PublishUnit = r.PublishUnit != ""
 		ret.PublishTime = r.PublishTime != ""
-		qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
+		ret.Content = r.Content != ""
+		ret.Attaches = len(r.AttachLinks) > 0
 	}
-	if ret.ListItems {
-		ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
-	}
-
+	qu.Debug(verifyResult.Len())
+	ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
 	return ret, nil
 }
+
+// VerifySpiderConfig 只验证列表标注
+//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
+//	qu.Debug("sc---", *sc)
+//	verifyResult := list.New()
+//	be.DataResults[sc.Code] = verifyResult
+//	ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
+//	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
+//	defer func() {
+//		incCancelFn()
+//		baseCancelFn()
+//	}()
+//
+//	listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
+//	//2. 执行JS代码,获取列表页信息
+//	if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
+//		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
+//	}
+//	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
+//		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
+//	}
+//	qu.Debug("列表页JS:", listRunJs)
+//	//3.打开列表,获取条目清单
+//	chromedp.Run(ctx, chromedp.Tasks{
+//		chromedp.Navigate(sc.Href),
+//		chromedp.WaitReady("document.body", chromedp.ByJSPath),
+//		//chromedp.Sleep(1000 * time.Millisecond),
+//		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
+//	})
+//	//4.初始化列表页信息
+//	if !vm.InitListPage(ctx, sc) {
+//		qu.Debug("初始化列表页失败,退出")
+//		return ret, errors.New("初始化列表页失败")
+//	}
+//	no := 1
+//T:
+//	for j := 0; j < 2; j++ { //最多检查2页
+//		qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
+//		listResult := make(be.ResultItems, 0)
+//		err := chromedp.Run(ctx, chromedp.Tasks{
+//			chromedp.Evaluate(listRunJs, &listResult),
+//		})
+//		if err != nil {
+//			qu.Debug("执行列表页JS代码失败", err.Error())
+//			continue
+//		}
+//		//5.操作详情页
+//		qu.Debug("列表采集条数:", len(listResult))
+//		for contentIndex, r := range listResult {
+//			if contentIndex > 1 { //每页只校验2条
+//				break
+//			}
+//			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
+//			r.Site = sc.Site
+//			r.Channel = sc.Channel
+//			//qu.Debug(r.Title, r.ListTitle)
+//			if r.Title == "" {
+//				r.Title = r.ListTitle
+//			}
+//			//qu.Debug(r.PublishTime, r.ListPubTime)
+//			if r.PublishTime == "" {
+//				r.PublishTime = r.ListPubTime
+//			}
+//			r.No = no
+//			no += 1
+//			//结果放入缓存
+//			verifyResult.PushBack(r)
+//		}
+//		qu.Debug("列表采集条数结果:", verifyResult.Len())
+//		//6.翻页
+//		if verifyResult.Len() > 0 {
+//			if sc.MaxPages == 1 { //最大页为1,不校验翻页
+//				ret.ListTrunPage = true
+//				break
+//			} else if sc.MaxPages > 1 { // && !ret.ListTrunPage {
+//				if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
+//					qu.Debug("翻页失败:", err)
+//					break T
+//				} else {
+//					ret.ListTrunPage = true
+//				}
+//			}
+//		}
+//	}
+//	//检查
+//	for el := verifyResult.Front(); el != nil; el = el.Next() {
+//		r, _ := el.Value.(*be.ResultItem)
+//		ret.Title = r.Title != ""
+//		qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
+//		ret.PublishTime = r.PublishTime != ""
+//		qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
+//	}
+//	if ret.ListItems {
+//		ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
+//	}
+//
+//	return ret, nil
+//}

+ 2 - 2
backend/vm/single.go

@@ -249,7 +249,7 @@ func (vm *VM) RunAction(ctx context.Context, ac *be.Actions, num int) *be.Action
 						CheckResult: be.CHECK_ACTION_NOTCHECK,
 					}
 					vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS,异常")
-					qu.Debug("3.1 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS,异常")
+					qu.Debug("3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS,异常", err)
 					return
 				}
 				//检查结果
@@ -262,7 +262,7 @@ func (vm *VM) RunAction(ctx context.Context, ac *be.Actions, num int) *be.Action
 					})
 					if err != nil {
 						vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS,异常")
-						qu.Debug("3.2 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作检查JS,异常")
+						qu.Debug("3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS,异常", err)
 						done <- &be.ActionRunResult{
 							Result:      false,
 							RunResult:   be.RUN_ACTION_SUCCESS,

+ 6 - 6
bind4spider.go

@@ -15,12 +15,12 @@ func (a *App) DebugSpider(url string, proxyServe bool, maxPages int, listDealy i
 	exitCh = make(chan bool, 1)
 	qu.Debug(url, proxyServe, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, threads)
 	qu.Debug("cssMark---", cssMark)
-	vm.RunSpiderTmp(url, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, proxyServe, exitCh, cssMark)
-	//if maxPages == 1 && threads == 1 {
-	//	vm.RunSpider(url, maxPages, listDealy, contentDelay, headless, showImage, proxyServe, exitCh, cssMark)
-	//} else { //多页下载强制使用多线程模式
-	//	vm.RunSpiderMulThreads(url, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, proxyServe, threads, exitCh, cssMark)
-	//}
+	//vm.RunSpiderTmp(url, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, proxyServe, exitCh, cssMark)
+	if maxPages == 1 && threads == 1 {
+		vm.RunSpider(url, maxPages, listDealy, contentDelay, headless, showImage, proxyServe, exitCh, cssMark)
+	} else { //多页下载强制使用多线程模式
+		vm.RunSpiderMulThreads(url, maxPages, listDealy, trunPageDelay, contentDelay, headless, showImage, proxyServe, threads, exitCh, cssMark)
+	}
 }
 
 // VerifySpiderConfig 验证

+ 10 - 2
frontend/src/components/spider/EditSpider.vue

@@ -564,8 +564,16 @@ const handleSave = () => {
 
 
     // 如果有js,则需要二次确认
-    if (formData.value.contentJs || formData.value.listJs) {
-        ElMessageBox.confirm('是否保存列表页JS代码或详情页JS代码?', '提示',
+    let jswarn = "";
+    if (formData.value.contentJs && formData.value.listJs){
+      jswarn = "是否保存列表页JS代码和详情页JS代码?";
+    }else if (formData.value.contentJs){
+      jswarn = "是否保存详情页JS代码?";
+    }else if (formData.value.listJs){
+      jswarn = "是否保存列表页JS代码?";
+    }
+    if (jswarn != "") {
+        ElMessageBox.confirm(jswarn, '提示',
             {
                 customClass: 'j-confirm-message-box',
                 type: 'warning',

+ 0 - 3
frontend/src/components/spider/RunSpider.vue

@@ -205,7 +205,6 @@ const truncateString = (str, maxLength) => {
 const handleRefersh = () => {
     ViewResultItemAll(formData.value.code).then(result => {
         //result = result.slice(-20);
-      console.log("---------",result)
         result.forEach((v, i) => {
             v.contentShort = truncateString(v.content, 50)
         })
@@ -248,7 +247,6 @@ const handleExportJson = () => {
     cancelButtonText: '取消',
   }).then(({ value }) => {
     SelectSaveFilePath("", value,"json").then(save2file => {
-      console.log("json",save2file)
       if (save2file == "") {
         console.log("无效的文件存储路径", save2file)
         return
@@ -278,7 +276,6 @@ const handleExportExcel = () => {
     cancelButtonText: '取消',
   }).then(({ value }) => {
     SelectSaveFilePath("", value,"xlsx").then(save2file => {
-      console.log("excel",save2file)
       if (save2file == "") {
         console.log("无效的文件存储路径", save2file)
         return

+ 0 - 2
frontend/src/store/modules/rulesList.js

@@ -31,7 +31,6 @@ export default {
     // 获取爬虫列表
     async getCodeList({ commit }, payload) {
       const { pageSize, pageNum } = payload
-      console.log(payload)
       const r = await ServerActionCodeList({
         modifyuser: payload.modifyuser, // 维护人
         state: payload.state, // 爬虫状态
@@ -41,7 +40,6 @@ export default {
         start: (pageNum - 1) * pageSize,
         limit: pageSize
       })
-      console.log(r)
       return r
     },
     // 爬虫认领

+ 0 - 5
frontend/src/views/CodeList.vue

@@ -502,16 +502,13 @@ const dialogEvents = {
         const payload = data.value
         const code = rowData.code
         // 保存逻辑
-        console.log(data, payload, code)
         dialogEvents.saveRequest(code, payload)
     },
     runSpiderConfigSave(data) {
-        console.log(data)
         const rowData = data._originData
         const payload = data.value
         const code = rowData.code
         // 保存逻辑
-        console.log(data, payload, code)
         dialogEvents.saveRequest(code, payload)
     },
     async saveRequest(code, payload) {
@@ -743,7 +740,6 @@ const tableEvents = {
         })
     },
     copyAction(row, value) {
-        console.log(row, value)
         const params = {
             code: row.code,
             copycode: value,
@@ -804,7 +800,6 @@ const tableActionDisabled = {
 
 //Wails事件绑定
 EventsOn("spiderConfigChange", data => {
-    console.log(data)
     const { key, css, url } = data
     refreshAndAsyncEditDialog(key, css)
     // 当触发修改时候,同步给客户端一份(此处不需要同步,因为在编辑弹窗中,检查到数据发生变化会自动同步)

+ 0 - 2
frontend/src/views/ReviewList.vue

@@ -532,7 +532,6 @@ const dialogEvents = {
             stype: 'save',
             update: updateRule
         }
-        console.log("change data:", data, params)
         try {
             const r = await store.dispatch('rulesList/editCodeItem', params)
             const { msg, err } = r
@@ -826,7 +825,6 @@ const tableEvents = {
     },
     // 管理退回
     adminRollback(_, row) {
-        console.log('管理退回', row)
         onlyClickHighlight(row, '_action_clicked_rollback')
         ElMessageBox.confirm('确认退回?', '提示',
             {

+ 2 - 3
server.go

@@ -12,9 +12,8 @@ import (
 	"time"
 )
 
-const HREF = "http://127.0.0.1:8091/%s"
-
-//const HREF = "http://visualize.spdata.jianyu360.com/%s"
+// const HREF = "http://127.0.0.1:8091/%s"
+const HREF = "http://visualizeld.spdata.jianyu360.com/%s"
 
 type Result struct {
 	Msg  string `json:"msg"`