Răsfoiți Sursa

功能修改

mxs 7 luni în urmă
părinte
comite
16bafaed32

+ 24 - 0
backend/browser.go

@@ -4,10 +4,13 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"github.com/chromedp/cdproto/fetch"
+	"github.com/chromedp/cdproto/network"
 	"io/ioutil"
 	"math/rand"
 	"net/http"
 	"strings"
+	"time"
 
 	"github.com/chromedp/cdproto/page"
 
@@ -123,8 +126,29 @@ func NewBrowser(headless bool, showImage bool, proxyServe bool, baseUrl string)
 	// 创建一个浏览器实例
 	incCtx, incCancelFn := chromedp.NewContext(allocCtx,
 		chromedp.WithLogf(nil))
+	// TODO: set a network load timeout for the browser.
+	// Listen for network requests and apply a timeout to them.
+	chromedp.ListenTarget(ctx, func(ev interface{}) {
+		switch ev := ev.(type) {
+		case *network.EventRequestWillBeSent:
+			// Timeout read from Cfg.BrowserLoadResourceTimeout, interpreted as seconds.
+			timeout := time.Duration(Cfg.BrowserLoadResourceTimeout) * time.Second
+			// TODO: decide which resources should be subject to timeout interruption; the intended default is resource-type requests only.
+			// Per-type configuration is too cumbersome, so for now this applies globally to every request type; note this is a different concept from chromedp's context timeout.
+			// Start a timer and cancel the request once the timeout elapses. The goroutine has no cancellation path, so it fires for every observed request, finished or not.
+			go func() {
+				select {
+				case <-time.After(timeout):
+					fid := fetch.RequestID(ev.RequestID) // NOTE(review): reuses the network-domain request ID as a fetch-domain ID — confirm the two are interchangeable
+					fetch.FailRequest(fid, network.ErrorReasonTimedOut).Do(ctx) // NOTE(review): the result of Do is discarded — confirm failures may be ignored
+				}
+			}()
+		}
+	})
+
 	//
 	chromedp.Run(ctx,
+		fetch.Enable(),
 		chromedp.ActionFunc(func(cxt context.Context) error {
 			_, err := page.AddScriptToEvaluateOnNewDocument("Object.defineProperty(navigator, 'webdriver', { get: () => false, });").Do(cxt)
 			return err

+ 62 - 43
backend/script/script.go

@@ -37,31 +37,33 @@ const (
 	qlm_detail_lua = "/script/qlm_detail.lua"
 )
 
-type GLVm struct {
-	attachesDir string
-	dnf         backend.EventNotifyFace
-	Headless    bool
-	ShowImage   bool
-	ProxyServer bool
-	ProxyAddr   string
-	B           *GLBrowser
-}
+var (
+	DataCache = make(chan map[string]interface{}, 2000) // buffered queue (capacity 2000) that the browser_savedata Lua binding sends records into
+	Datas     []map[string]interface{}                  // records that the browser_getdata Lua binding reads from and returns to the script
+)
 
-type GLTask struct {
-	glvm     *GLVm
-	recordid string //记录id
+type GLVm struct { // GLVm carries the settings and browser handle used to run the Lua scripts.
+	AttachesDir   string                  // directory prefix that LoadScript prepends to the Lua script paths
+	Dnf           backend.EventNotifyFace // event-notification hook supplied to NewGLVM
+	Headless      bool                    // forwarded to backend.NewBrowser by ResetBrowser
+	ShowImage     bool                    // forwarded to backend.NewBrowser by ResetBrowser
+	ProxyServer   bool                    // forwarded to backend.NewBrowser by ResetBrowser
+	ProxyAddr     string
+	B             *GLBrowser // browser wrapper whose context is cancelled when RunScript returns
+	ScriptRunning bool //ensures only one script is executed at a time
+	DataSaveOver  chan bool // created with capacity 1 in NewGLVM; NOTE(review): presumably signals that saving has finished — confirm against its users
 }
 
 type GLBrowser struct {
-	BaseCancelFn context.CancelFunc
-	Ctx          context.Context
-	CancelFn     context.CancelFunc
+	Ctx      context.Context
+	CancelFn context.CancelFunc
 }
 
 func NewGLVM(attachesDir string, dnf be.EventNotifyFace) *GLVm {
 	return &GLVm{
-		attachesDir: attachesDir,
-		dnf:         dnf,
+		AttachesDir:  attachesDir,
+		Dnf:          dnf,
+		DataSaveOver: make(chan bool, 1),
 	}
 }
 
@@ -69,9 +71,9 @@ func NewGLVM(attachesDir string, dnf be.EventNotifyFace) *GLVm {
 func (glvm *GLVm) LoadScript(page string) string {
 	var path string
 	if page == "list" {
-		path = glvm.attachesDir + qlm_list_lua
+		path = glvm.AttachesDir + qlm_list_lua
 	} else if page == "detail" {
-		path = glvm.attachesDir + qlm_detail_lua
+		path = glvm.AttachesDir + qlm_detail_lua
 	}
 
 	bs, err := os.ReadFile(path)
@@ -82,21 +84,19 @@ func (glvm *GLVm) LoadScript(page string) string {
 }
 
 // RunScript 执行lua代码
-func (glvm *GLVm) RunScript(script string) error {
+func (glvm *GLVm) RunScript(script, recordId string) error {
 	defer Catch()
-	var state *lua.LState = lua.NewState()
-	defer state.Close()
+	var s *lua.LState = lua.NewState()
+	defer s.Close()
 	//方法绑定
-	glvm.ResetBrowser()      //先创建浏览器对象
-	glvm.BindLuaState(state) //绑定虚拟机函数
-	glvm.B.BindLuaState(state)
+	glvm.ResetBrowser()  //先创建浏览器对象
+	glvm.BindLuaState(s) //绑定虚拟机函数
+	glvm.B.BindLuaState(s, recordId)
 	defer func() {
 		if b := glvm.B; b != nil {
 			b.CancelFn()
 			b.Ctx = nil
 			b.CancelFn = nil
-			b.BaseCancelFn()
-			b.BaseCancelFn = nil
 		}
 	}()
 
@@ -109,27 +109,24 @@ func (glvm *GLVm) RunScript(script string) error {
 	if err != nil {
 		return err
 	}
-	lfunc := state.NewFunctionFromProto(proto)
-	state.Push(lfunc)
-	state.Call(0, 0)
+	lfunc := s.NewFunctionFromProto(proto)
+	s.Push(lfunc)
+	s.Call(0, 0)
 
 	return nil
 }
 
 // ResetBrowser 重置浏览器
 func (glvm *GLVm) ResetBrowser() {
-	if glvm.B != nil && glvm.B.CancelFn != nil && glvm.B.BaseCancelFn != nil {
+	if glvm.B != nil && glvm.B.CancelFn != nil {
 		glvm.B.CancelFn()
-		glvm.B.BaseCancelFn()
 		glvm.B.Ctx = nil
 		glvm.B.CancelFn = nil
-		glvm.B.BaseCancelFn = nil
 	}
-	_, baseCancelFn, _, _, ctx, incCancelFn := backend.NewBrowser(glvm.Headless, glvm.ShowImage, glvm.ProxyServer, "https://")
+	_, _, _, _, ctx, incCancelFn := backend.NewBrowser(glvm.Headless, glvm.ShowImage, glvm.ProxyServer, "http://")
 	b := &GLBrowser{
-		BaseCancelFn: baseCancelFn,
-		Ctx:          ctx,
-		CancelFn:     incCancelFn,
+		Ctx:      ctx,
+		CancelFn: incCancelFn,
 	}
 
 	if glvm.B == nil {
@@ -404,7 +401,7 @@ func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selecto
 }
 
 // BindLuaState
-func (b *GLBrowser) BindLuaState(s *lua.LState) {
+func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
 	//执行暂停
 	s.SetGlobal("browser_sleep", s.NewFunction(func(l *lua.LState) int {
 		fmt.Println("---browser_sleep---")
@@ -573,12 +570,34 @@ func (b *GLBrowser) BindLuaState(s *lua.LState) {
 		}
 		return 1
 	}))
-	//保存
+	// Save data: browser_savedata queues one record produced by the Lua script.
 	s.SetGlobal("browser_savedata", s.NewFunction(func(l *lua.LState) int {
-		//fmt.Println("---browser_upsertdata---")
-		//param := l.ToTable(-1)
-		//upset := TableToMap(param)
-
+		fmt.Println("---browser_upsertdata---")
+		page := l.ToString(-2) // page kind, taken from the second-from-top Lua argument
+		data := l.ToTable(-1) // record table, taken from the top Lua argument
+		result := TableToMap(data)
+		if page == "list" {
+			result["recordid"] = recordId // list-page records are tagged with the record id given to BindLuaState
+		}
+		DataCache <- result // the send blocks while the channel buffer is full
 		return 1
 	}))
+	// Get data: browser_getdata returns a status string plus either an error message or a table of records.
+	s.SetGlobal("browser_getdata", s.NewFunction(func(l *lua.LState) int {
+		fmt.Println("---browser_getdata---")
+		num := l.ToInt(-1) // how many records the script asks for
+		count := len(Datas) // NOTE(review): Datas is a package-level slice read here without synchronization — confirm there is no concurrent writer
+		if count == 0 {
+			l.Push(lua.LString("err"))
+			l.Push(lua.LString("当前可下载量为0"))
+		} else {
+			resultTable := &lua.LTable{} // NOTE(review): zero-value table rather than one from l.NewTable() — confirm gopher-lua supports this
+			for i := 0; i < num && i < count; i++ { // always starts at index 0 and never removes entries, so each call returns the same leading records
+				resultTable.Append(MapToTable(Datas[i]))
+			}
+			l.Push(lua.LString("ok"))
+			l.Push(resultTable)
+		}
+		return 2 // both branches push two values
+	}))
 }

+ 111 - 101
backend/vm/check.go

@@ -10,13 +10,23 @@ import (
 	"time"
 )
 
-// VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
+// VerifySpiderConfig verifies a spider configuration, choosing the verifier according to be.Cfg.IsOnly4MainSite.
 func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
+	if be.Cfg.IsOnly4MainSite {
+		return vm.verifySpiderConfig4MainSite(sc) // key-site variant
+	} else {
+		return vm.verifySpiderConfig4Prod(sc) // production variant
+	}
+}
+
+// verifySpiderConfig4Prod 验证爬虫配置,支持翻页,列表项数据只提取2条
+// 正式环境
+func (vm *VM) verifySpiderConfig4Prod(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
 	qu.Debug("sc---", *sc)
 	verifyResult := list.New()
 	be.DataResults[sc.Code] = verifyResult
 	ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
-	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false, sc.Href)   //列表页使用
+	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href)    //列表页使用
 	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, true, false, sc.Href) //详情页使用
 	defer func() {
 		incCancelFn2()
@@ -87,7 +97,6 @@ T:
 				continue
 			}
 			//下载附件
-			qu.Debug(r.Title, r.ListTitle)
 			if sc.AttachCss != "" {
 				downloadAttaches(r, vm.attachesDir)
 			}
@@ -133,101 +142,102 @@ T:
 	return ret, nil
 }
 
-// VerifySpiderConfig 只验证列表标注
-//func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
-//	qu.Debug("sc---", *sc)
-//	verifyResult := list.New()
-//	be.DataResults[sc.Code] = verifyResult
-//	ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
-//	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, false) //列表页使用
-//	defer func() {
-//		incCancelFn()
-//		baseCancelFn()
-//	}()
-//
-//	listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
-//	//2. 执行JS代码,获取列表页信息
-//	if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
-//		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
-//	}
-//	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
-//		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
-//	}
-//	qu.Debug("列表页JS:", listRunJs)
-//	//3.打开列表,获取条目清单
-//	chromedp.Run(ctx, chromedp.Tasks{
-//		chromedp.Navigate(sc.Href),
-//		chromedp.WaitReady("document.body", chromedp.ByJSPath),
-//		//chromedp.Sleep(1000 * time.Millisecond),
-//		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
-//	})
-//	//4.初始化列表页信息
-//	if !vm.InitListPage(ctx, sc) {
-//		qu.Debug("初始化列表页失败,退出")
-//		return ret, errors.New("初始化列表页失败")
-//	}
-//	no := 1
-//	ret.ListTrunPage = true
-//T:
-//	for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ { //最多检查2页
-//		qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
-//		listResult := make(be.ResultItems, 0)
-//		err := chromedp.Run(ctx, chromedp.Tasks{
-//			chromedp.Evaluate(listRunJs, &listResult),
-//		})
-//		if err != nil {
-//			qu.Debug("执行列表页JS代码失败", err.Error())
-//			continue
-//		}
-//		//5.操作详情页
-//		qu.Debug("列表采集条数:", len(listResult))
-//		for contentIndex, r := range listResult {
-//			if contentIndex > 1 { //每页只校验2条
-//				break
-//			}
-//			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
-//			r.Site = sc.Site
-//			r.Channel = sc.Channel
-//			//qu.Debug(r.Title, r.ListTitle)
-//			if r.Title == "" {
-//				r.Title = r.ListTitle
-//			}
-//			//qu.Debug(r.PublishTime, r.ListPubTime)
-//			if r.PublishTime == "" {
-//				r.PublishTime = r.ListPubTime
-//			}
-//			r.No = no
-//			no += 1
-//			//结果放入缓存
-//			verifyResult.PushBack(r)
-//		}
-//		qu.Debug("列表采集条数结果:", verifyResult.Len())
-//		//6.翻页
-//		if verifyResult.Len() > 0 {
-//			if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 { //&& !ret.ListTrunPage {
-//				if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
-//					qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
-//					ret.ListTrunPage = false
-//					break T
-//				}
-//			}
-//		} else {
-//			ret.ListTrunPage = false
-//			break T
-//		}
-//	}
-//	//检查
-//	for el := verifyResult.Front(); el != nil; el = el.Next() {
-//		r, _ := el.Value.(*be.ResultItem)
-//		ret.Title = r.Title != ""
-//		qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
-//
-//		ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime)
-//		qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
-//	}
-//	if ret.ListItems {
-//		ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
-//	}
-//
-//	return ret, nil
-//}
+// verifySpiderConfig4MainSite verifies only the list-page annotations of a spider configuration.
+// It is the variant used for the key-site test environment. It returns the verification flags and an error when the list page cannot be initialized.
+func (vm *VM) verifySpiderConfig4MainSite(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
+	qu.Debug("sc---", *sc)
+	verifyResult := list.New()
+	be.DataResults[sc.Code] = verifyResult
+	ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
+	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href) // browser used for the list page
+	defer func() {
+		incCancelFn()
+		baseCancelFn()
+	}()
+
+	listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
+	// 2. Prepare the JS used to extract list-page information; blank configured code falls back to the rendered template.
+	if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
+		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
+	}
+	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
+		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
+	}
+	qu.Debug("列表页JS:", listRunJs)
+	// 3. Open the list page to obtain the item list. The error returned by chromedp.Run is not checked here.
+	chromedp.Run(ctx, chromedp.Tasks{
+		chromedp.Navigate(sc.Href),
+		chromedp.WaitReady("document.body", chromedp.ByJSPath),
+		//chromedp.Sleep(1000 * time.Millisecond),
+		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
+	})
+	// 4. Initialize the list-page information.
+	if !vm.InitListPage(ctx, sc) {
+		qu.Debug("初始化列表页失败,退出")
+		return ret, errors.New("初始化列表页失败")
+	}
+	no := 1
+	ret.ListTrunPage = true
+T:
+	for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ { // check at most 2 pages (per the original note; the bound is VERIVY_MAX_TRUN_PAGE)
+		qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
+		listResult := make(be.ResultItems, 0)
+		err := chromedp.Run(ctx, chromedp.Tasks{
+			chromedp.Evaluate(listRunJs, &listResult),
+		})
+		if err != nil {
+			qu.Debug("执行列表页JS代码失败", err.Error())
+			continue
+		}
+		// 5. Process the list items (the original note says "operate on detail pages", but no detail page is opened in this function).
+		qu.Debug("列表采集条数:", len(listResult))
+		for contentIndex, r := range listResult {
+			if contentIndex > 1 { // only validate 2 items per page
+				break
+			}
+			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
+			r.Site = sc.Site
+			r.Channel = sc.Channel
+			//qu.Debug(r.Title, r.ListTitle)
+			if r.Title == "" {
+				r.Title = r.ListTitle
+			}
+			//qu.Debug(r.PublishTime, r.ListPubTime)
+			if r.PublishTime == "" {
+				r.PublishTime = r.ListPubTime
+			}
+			r.No = no
+			no += 1
+			// Store the item in the verification result list.
+			verifyResult.PushBack(r)
+		}
+		qu.Debug("列表采集条数结果:", verifyResult.Len())
+		// 6. Turn to the next page.
+		if verifyResult.Len() > 0 {
+			if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 { //&& !ret.ListTrunPage {
+				if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { // page turn failed
+					qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
+					ret.ListTrunPage = false
+					break T
+				}
+			}
+		} else {
+			ret.ListTrunPage = false
+			break T
+		}
+	}
+	// Check the collected items.
+	for el := verifyResult.Front(); el != nil; el = el.Next() {
+		r, _ := el.Value.(*be.ResultItem) // NOTE(review): a failed assertion would leave r nil and the next line would panic — confirm only *be.ResultItem values are pushed
+		ret.Title = r.Title != "" // overwritten on every iteration, so the last item decides the final value
+		qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
+
+		ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime) // likewise overwritten on every iteration
+		qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
+	}
+	if ret.ListItems {
+		ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
+	}
+
+	return ret, nil
+}

+ 4 - 8
frontend/src/components/spider/EditSpider.vue

@@ -580,12 +580,8 @@ const handleSave = () => {
 
   // 如果有js,则需要二次确认
   let jswarn = "";
-  if (formData.value.contentJs && formData.value.listJs){
-    jswarn = "是否保存列表页JS代码和详情页JS代码?";
-  }else if (formData.value.contentJs){
-    jswarn = "是否保存详情页JS代码?";
-  }else if (formData.value.listJs){
-    jswarn = "是否保存列表页JS代码?";
+  if (!formData.value.listJs || !formData.value.listTurnPageJs||!formData.value.contentJs){
+    jswarn = "部分模块JS代码为空,请生成JS代码后,再进行保存!"
   }
   if (jswarn != "") {
     ElMessageBox.confirm(jswarn, '提示',
@@ -596,9 +592,9 @@ const handleSave = () => {
           cancelButtonText: '取消',
           showCancelButton: false,
         }
-    ).then(() => {
+    )/*.then(() => {
       emitSaveEvent()
-    })
+    })*/
   } else {
     emitSaveEvent()
   }

+ 69 - 3
frontend/src/components/spider/jscodetpl.js

@@ -240,13 +240,19 @@ if ("{{.ContentCss}}" != "") {//正文内容
   }
 }
 if("{{.AttachCss}}"!=""){//附件
-	tmp = document.querySelectorAll("{{.AttachCss}} a") || document.querySelector("第二套CSS选择器,请修改")
+	tmp = document.querySelectorAll("{{.AttachCss}} a")  
 	let attach=[]
 	if(tmp){
 		tmp.forEach((v,i)=>{
 			attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
 		})
 	}
+    tmp = document.querySelectorAll("第二套CSS选择器,请修改 a")
+    if(tmp){
+		tmp.forEach((v,i)=>{
+			attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
+		})
+	}
 	ret["attachLinks"]=attach
 }
 //检查中文字符个数,少于20,修正正文内容
@@ -256,9 +262,69 @@ let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0;
 if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
 ret 
     `,
-        },
-        {
+        }, {
             "name": "模版3",
+            "tooltip": "详情页存在2套不同CSS选择器,一套是微信公众号",
+            "code": `
+    var ret = {}
+    var tmp = null
+    
+    if ("{{.TitleCss}}" != "") {//标题
+    tmp = document.querySelector("{{.TitleCss}}") || document.querySelector("#activity-name")
+    if (tmp) ret["title"] = tmp.getAttribute("title") || tmp.innerText
+    }
+    if ("{{.PublishUnitCss}}" != "") {//采购单位
+    tmp = document.querySelector("{{.PublishUnitCss}}") || document.querySelector("#js_name")
+    if (tmp) ret["publishUnit"] = tmp.getAttribute("title") || tmp.innerText
+    }
+    if ("{{.PublishTimeCss}}" != "") {//发布时间
+    tmp = document.querySelector("{{.PublishTimeCss}}") || document.querySelector("#publish_time")
+    if (tmp) ret["publishTime"] = tmp.getAttribute("title") || tmp.innerText
+    }
+    if ("{{.ContentCss}}" != "") {//正文内容
+    tmp = document.querySelector("{{.ContentCss}}") || document.querySelector("#js_content")
+    if (tmp) {
+    ret["content"] = tmp.innerText
+    ret["contentHtml"] = tmp.innerHTML
+    var patchContent = false
+    //处理详情页中的大图,大图作为附件使用
+    const images = tmp.querySelectorAll("img");
+    images.forEach((img, i) => {
+      if (img.width > 300) {
+        patchContent = true
+        const a = document.createElement("a");
+        a.href = img.src;
+        a.innerText = img.src;
+        tmp.appendChild(a);
+      }
+    })
+    }
+    }
+    if("{{.AttachCss}}"!=""){//附件
+    tmp = document.querySelectorAll("{{.AttachCss}} a")
+    let attach=[]
+    if(tmp){
+        tmp.forEach((v,i)=>{
+            attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
+        })
+    }
+    tmp = document.querySelectorAll("#js_content a")
+    if(tmp){
+        tmp.forEach((v,i)=>{
+            attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
+        })
+    }
+    ret["attachLinks"]=attach
+    }
+    //检查中文字符个数,少于20,修正正文内容
+    let regex = /[\\u4e00-\\u9fa5]/g;
+    let chineseCharacters = ret["content"]?ret["content"].match(regex):[];
+    let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0;
+    if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
+    ret 
+    `,
+        }, {
+            "name": "模版4",
             "tooltip": "详情页正文包含其他元素,需要清洗",
             "code": `
 var ret = {}

+ 1 - 1
frontend/src/store/index.js

@@ -25,7 +25,7 @@ export default createStore({
             [USER_ROLE_ADMIN]: [
                 { title: '爬虫列表', icon: 'List', path: '/code/list' },
                 { title: '审核列表', icon: 'Checked', path: '/review/list' },
-                { title: '千里马采集', icon: 'Checked', path: '/third/collection/list' },
+                { title: '千里马采集', icon: 'Document', path: '/third/collection/list' },
                 // { title: '系统设置', icon: 'Help', path: '/setting' },
             ],
             // 开发者菜单

+ 35 - 57
frontend/src/views/CollectionList.vue

@@ -6,8 +6,8 @@
             <div class="action-bar-container">
                 <el-space class="action-bar-item-container action-bar-action-left">
                     <el-button-group class="ml-4">
-                        <!-- <el-button type="primary" :icon="Refresh" @click="resetFilterAndRefreshTableList">刷新</el-button> -->
-                        <el-button type="primary" :icon="DocumentAdd" :loading="recordInfo.loading" @click="addRecord">新增记录</el-button>
+                        <el-button type="success" :icon="DocumentAdd" :loading="recordInfo.loading" @click="addRecord">新增采集记录</el-button>
+                         <el-button type="primary" :icon="Refresh" @click="resetFilterAndRefreshTableList">刷新</el-button>
                     </el-button-group>
                 </el-space>
                 <el-space class="action-bar-item-container action-bar-action-right"></el-space>
@@ -28,16 +28,16 @@
                         <div class="highlight-main">{{scope.row.stateText}}</div>
                     </template>
                 </el-table-column>
-                <el-table-column label="功能" width="200" align="center">
+                <el-table-column label="功能" width="220" align="center">
                     <template #default="scope">
                         <el-tooltip content="列表页采集" placement="top">
                             <el-button size="small" :class="{ active: scope.row._action_clicked_list_collect }" @click="tableEvents.handleListCollect(scope.$index, scope.row)">
-                                <el-icon><Edit /></el-icon>
+                                <el-icon><Help /></el-icon>
                             </el-button>
                         </el-tooltip>
-                        <el-tooltip content="去重" placement="top">
+                        <el-tooltip content="列表去重" placement="top">
                             <el-button size="small" :class="{ active: scope.row._action_clicked_duplicate_remove }" @click="tableEvents.handleDuplicateRemove(scope.$index, scope.row)">
-                                <el-icon><SetUp /></el-icon>
+                                <el-icon><DocumentDelete /></el-icon>
                             </el-button>
                         </el-tooltip>
                         <el-tooltip content="详情页采集" placement="top">
@@ -47,24 +47,23 @@
                         </el-tooltip>
                         <el-tooltip content="推送" placement="top">
                             <el-button size="small" :class="{ active: scope.row._action_clicked_pushed }" @click="tableEvents.handlePushed(scope.$index, scope.row)">
-                                <el-icon><DocumentCopy /></el-icon>
+                                <el-icon><Position /></el-icon>
                             </el-button>
                         </el-tooltip>
                     </template>
                 </el-table-column>
-                <el-table-column label="操作" width="150" align="center">
+                <el-table-column label="操作" width="120" align="center">
                     <template #default="scope">
                         <el-tooltip content="清除记录" placement="top">
                             <el-button size="small" :class="{ active: scope.row._action_clicked_remove_history }" @click="tableEvents.handleRemoveHistory(scope.$index, scope.row)">
-                                <el-icon><DArrowLeft /></el-icon>
+                                <el-icon><Delete /></el-icon>
                             </el-button>
                         </el-tooltip>
 
                         <el-dropdown>
-                            <el-button size="small">结果导出</el-button>
+                            <el-button size="small">导出</el-button>
                             <template #dropdown>
                                 <el-dropdown-menu>
-                                    <el-dropdown-item @click="handleExportEpub(scope.row)">导出EPUB格式文件</el-dropdown-item>
                                     <el-dropdown-item @click="handleExportJson(scope.row)">导出JSON格式文件</el-dropdown-item>
                                     <el-dropdown-item @click="handleExportExcel(scope.row)">导出Excel格式文件</el-dropdown-item>
                                     <!-- <el-dropdown-item>补录/上推至平台</el-dropdown-item> -->
@@ -91,10 +90,10 @@
                     <el-radio :value="false">否</el-radio>
                 </el-radio-group>
             </el-form-item>
-            <el-form-item label="无头浏览器" label-width="100px">
+            <el-form-item label="浏览器" label-width="100px">
                 <el-radio-group v-model="recordInfo.form.headless">
-                    <el-radio :value="true">是</el-radio>
-                    <el-radio :value="false">否</el-radio>
+                    <el-radio :value="false">显示</el-radio>
+                    <el-radio :value="true">无头</el-radio>
                 </el-radio-group>
             </el-form-item>
             <el-form-item label="显示图像" label-width="100px">
@@ -119,7 +118,7 @@ import { ref, computed, reactive, watch } from 'vue'
 import { useStore } from 'vuex';
 import { ElMessage, ElMessageBox } from 'element-plus'
 import { ServerActionQlmAddRecord, QlmListDataDownload, QlmDetailDataDownload, ServerActionQlmRemoveRepeat, ServerActionQlmPushData, ServerActionQlmClearData } from "../../wailsjs/go/main/App"
-import { SelectSaveFilePath, ExportJsonFile, ExportEpubFile, ExportExcelFile } from "../../wailsjs/go/main/App"
+import { SelectSaveFilePath, QlmRunExportJsonFile, QlmRunExportExcelFile } from "../../wailsjs/go/main/App"
 
 import Breadcrumb from "../components/Breadcrumb.vue"
 import { USER_ROLE_ADMIN, USER_ROLE_DEVELOPER, USER_ROLE_REVIEWER } from '../data/user'
@@ -182,6 +181,11 @@ const recordInfo = reactive({
     dialog: false,
     loading: false,
     type: '', // list/detail
+    defaultForm: {
+      proxy: false,
+      headless: false,
+      image: true,
+    },
     form: {
         proxy: false,
         headless: false,
@@ -286,7 +290,7 @@ function refreshTableList() {
 }
 // 刷新列表(并重置选择器)
 function resetFilterAndRefreshTableList() {
-    resetFilterState()
+    // resetFilterState()
     refreshTableList()
 }
 
@@ -332,14 +336,14 @@ const addRecord = () => {
 
 const getActionCommonParams = row => {
     const param = {
-        id: row._id,
+        recordId: row._id,
         state: row.state
     }
     const other = {
-        headless: row.headless,
-        showImage: row.image,
-        proxyServer: row.proxy,
-        id: row._id,
+        headless: recordInfo.form.headless,
+        showImage: recordInfo.form.image,
+        proxyServer: recordInfo.form.proxy,
+        recordid: row._id,
     }
     return {
         param,
@@ -364,9 +368,9 @@ const confirmDialog = (conf = {}) => {
 
 const setRecordDialogState = (row) => {
     Object.assign(recordInfo.form, {
-        proxy: row.proxy || false,
-        headless: row.headless || false,
-        image: row.image || false,
+        proxy: recordInfo.defaultForm.proxy,
+        headless: recordInfo.defaultForm.headless,
+        image: recordInfo.defaultForm.image,
     })
 }
 
@@ -397,8 +401,10 @@ const tableEvents = {
                     type: 'success',
                     duration: 3000,
                 })
+                recordInfo.dialog = false
                 getTableList()
             } else {
+                recordInfo.dialog = false
                 return ElMessage({
                     message: r.msg || '操作失败',
                     type: 'error',
@@ -470,8 +476,10 @@ const tableEvents = {
                     type: 'success',
                     duration: 3000,
                 })
+                recordInfo.dialog = false
                 getTableList()
             } else {
+                recordInfo.dialog = false
                 return ElMessage({
                     message: r.msg || '操作失败',
                     type: 'error',
@@ -528,7 +536,7 @@ const tableEvents = {
           success: () => {
             loading.value = true
             const payload = {
-              id: row._id,
+              recordid: row._id,
             }
             ServerActionQlmClearData(payload).then(r => {
               if (r.err === 1) {
@@ -579,36 +587,6 @@ const tableActionDisabled = {
     },
 }
 
-//handleExportEpub导出文件
-const handleExportEpub = (row) => {
-  ElMessageBox.prompt('请输入文件名称', '文件名', {
-    confirmButtonText: '确定',
-    cancelButtonText: '取消',
-  }).then(({ value }) => {
-    SelectSaveFilePath("", value,"epub").then(save2file => {
-      if (save2file == "") {
-        console.log("无效的文件存储路径", save2file)
-        return
-      }
-      ExportEpubFile(value, save2file, row._id).then(d => {
-        if (d.err === 1) {
-          ElMessage({
-            message: d.msg || `导出epub文件${save2file}完成!`,
-            type: 'success',
-            duration: 3000,
-          });
-        } else {
-          ElMessage({
-            message: d.msg || `导出epub文件${save2file}失败!`,
-            type: 'error',
-            duration: 3000,
-          });
-        }
-      })
-    })
-  })
-}
-
 const handleExportJson = (row) => {
   ElMessageBox.prompt('请输入文件名称', '文件名', {
     confirmButtonText: '确定',
@@ -619,7 +597,7 @@ const handleExportJson = (row) => {
         console.log("无效的文件存储路径", save2file)
         return
       }
-      ExportJsonFile(save2file, row._id).then(d => {
+      QlmRunExportJsonFile(save2file, row._id).then(d => {
         if (d.err === 1) {
           ElMessage({
             message: d.msg || `导出excel文件${save2file}完成!`,
@@ -648,7 +626,7 @@ const handleExportExcel = (row) => {
         console.log("无效的文件存储路径", save2file)
         return
       }
-      ExportExcelFile(save2file, row._id).then(d => {
+      QlmRunExportExcelFile(save2file, row._id).then(d => {
         if (d.err === 1) {
             ElMessage({
                 message: d.msg || `导出excel文件${save2file}完成!`,

+ 4 - 0
frontend/wailsjs/go/main/App.d.ts

@@ -36,6 +36,10 @@ export function QlmDetailDataDownload(arg1:{[key: string]: any},arg2:{[key: stri
 
 export function QlmListDataDownload(arg1:{[key: string]: any},arg2:{[key: string]: any}):Promise<main.Result>;
 
+export function QlmRunExportExcelFile(arg1:string,arg2:string):Promise<void>;
+
+export function QlmRunExportJsonFile(arg1:string,arg2:string):Promise<void>;
+
 export function RunExportEpubFile(arg1:string,arg2:string,arg3:list.List):Promise<void>;
 
 export function RunExportExcelFile(arg1:string,arg2:string,arg3:list.List):Promise<void>;

+ 8 - 0
frontend/wailsjs/go/main/App.js

@@ -66,6 +66,14 @@ export function QlmListDataDownload(arg1, arg2) {
   return window['go']['main']['App']['QlmListDataDownload'](arg1, arg2);
 }
 
+export function QlmRunExportExcelFile(arg1, arg2) {
+  return window['go']['main']['App']['QlmRunExportExcelFile'](arg1, arg2);
+}
+
+export function QlmRunExportJsonFile(arg1, arg2) {
+  return window['go']['main']['App']['QlmRunExportJsonFile'](arg1, arg2);
+}
+
 export function RunExportEpubFile(arg1, arg2, arg3) {
   return window['go']['main']['App']['RunExportEpubFile'](arg1, arg2, arg3);
 }

+ 21 - 0
main.go

@@ -3,11 +3,13 @@ package main
 import (
 	"container/list"
 	"embed"
+	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	be "spider_creator/backend"
 	bdb "spider_creator/backend/db"
 	"spider_creator/backend/script"
 	bvm "spider_creator/backend/vm"
 	bws "spider_creator/backend/webservice"
+	"strconv"
 
 	"github.com/wailsapp/wails/v2"
 	"github.com/wailsapp/wails/v2/pkg/options"
@@ -27,9 +29,28 @@ var (
 	vm                   *bvm.VM
 	glvm                 *script.GLVm
 	ws                   *bws.WebService
+	//重点网站和正式环境
+	isOnly4MainSite            string = "false"
+	BrowserLoadResourceTimeout        = "5"
+	//serverAddress          = "http://visualizeld.spdata.jianyu360.com/%s" //正式环境
+	serverAddress = "http://127.0.0.1:8091/%s" //正式环境
 )
 
+//build
+// wails build -ldflags="-X 'main.isOnly4MainSite=false'" -o="剑鱼可视化爬虫开发工具_正式.exe"
+
+func init() { // init fills be.Cfg and serverAddress from the package-level string settings.
+	//be.LoadConfig("./config.yaml")
+	be.Cfg.IsOnly4MainSite = isOnly4MainSite == "true"
+	if be.Cfg.IsOnly4MainSite {
+		serverAddress = "http://visualize.spdata.jianyu360.com/%s" // key-site server
+	}
+	be.Cfg.BrowserLoadResourceTimeout, _ = strconv.ParseInt(BrowserLoadResourceTimeout, 10, 64) // NOTE(review): the parse error is discarded; on a syntax error ParseInt returns 0, so a malformed value yields a zero timeout
+	qu.Debug("重点网站:", be.Cfg.IsOnly4MainSite, serverAddress)
+}
+
 func main() {
+	go updateData() //数据保存
 	// Create an instance of the app structure
 	app = NewApp()
 

+ 206 - 25
qianlima.go

@@ -1,7 +1,17 @@
 package main
 
 import (
+	"bufio"
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"github.com/xuri/excelize/v2"
+	"io"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	"net/http"
+	"os"
+	"spider_creator/backend/script"
+	"time"
 )
 
 // QlmListDataDownload 千里马列表页数据下载
@@ -9,9 +19,19 @@ func (a *App) QlmListDataDownload(param map[string]interface{}, record map[strin
 	qu.Debug(param, record)
 	r := &Result{}
 	if User != nil {
-		getResult(map[string]interface{}{"param": param, "user": User}, r, "qlm/updateRecord")
-		if r.Err == 1 {
-			go DownloadListData(record) //下载
+		if !glvm.ScriptRunning {
+			page := "list"
+			detailScript := glvm.LoadScript("list")
+			if detailScript != "" {
+				//getResult(map[string]interface{}{"param": param, "user": User}, r, "qlm/updateRecord")
+				//if r.Err == 1 {
+				go DownloadData(record, detailScript, page) //下载
+				//}
+			} else {
+				r.Msg = "详情页采集脚本加载失败!"
+			}
+		} else {
+			r.Msg = "同时只能执行一个脚本,请稍后再试!"
 		}
 	} else {
 		r.Msg = "用户登录异常,请重新登录!"
@@ -25,9 +45,24 @@ func (a *App) QlmDetailDataDownload(param map[string]interface{}, record map[str
 	qu.Debug(param, record)
 	r := &Result{}
 	if User != nil {
-		getResult(map[string]interface{}{"param": param, "user": User}, r, "qlm/updateRecord")
-		if r.Err == 1 {
-			go DownloadDetailData(record) //下载
+		if !glvm.ScriptRunning {
+			page := "detail"
+			detailScript := glvm.LoadScript("detail")
+			if detailScript != "" {
+				script.Datas = []map[string]interface{}{}
+				getData(nil, qu.ObjToString(record["recordId"]), "json", "download", &script.Datas)
+				if len(script.Datas) > 0 {
+					r.Err = 1
+					go DownloadData(record, detailScript, page) //下载
+				} else {
+					r.Msg = "未获取到列表页数据!"
+				}
+				//getResult(map[string]interface{}{"param": param, "user": User}, r, "qlm/updateRecord")
+			} else {
+				r.Msg = "详情页采集脚本加载失败!"
+			}
+		} else {
+			r.Msg = "同时只能执行一个脚本,请稍后再试!"
 		}
 	} else {
 		r.Msg = "用户登录异常,请重新登录!"
@@ -36,28 +71,174 @@ func (a *App) QlmDetailDataDownload(param map[string]interface{}, record map[str
 	return r
 }
 
-// DownloadListData 列表页下载
-func DownloadListData(record map[string]interface{}) {
-	scriptText := glvm.LoadScript("list")
-	if scriptText == "" {
-		return
-	}
-	glvm.ProxyServer, _ = record["proxy"].(bool)
+// DownloadData 执行脚本下载数据
+func DownloadData(record map[string]interface{}, scriptText, page string) {
+	glvm.ScriptRunning = true
+	defer func() {
+		glvm.ScriptRunning = false
+	}()
+	glvm.ProxyServer, _ = record["proxyServer"].(bool)
 	glvm.Headless, _ = record["headless"].(bool)
-	glvm.ShowImage, _ = record["image"].(bool)
-	glvm.RunScript(scriptText)
-	//采集完成更新记录
-	//go qlmUpdateRecord(map[string]interface{}{"id": record["id"], "state": 2})
+	glvm.ShowImage, _ = record["showImage"].(bool)
+	recordId := qu.ObjToString(record["recordId"])
+	//执行脚本
+	glvm.RunScript(scriptText, recordId)
+	for len(script.DataCache) > 0 {
+		time.Sleep(time.Second * 1)
+	}
+	//state := 2
+	//if page == "detail" {
+	//	state = 5
+	//}
+	//r := &Result{}
+	//getResult(map[string]interface{}{"param": map[string]interface{}{"recordid": recordId, "state": state}}, r, "qlm/updateRecord")
 }
 
-// DownloadDetailData 详情页下载
-func DownloadDetailData(record map[string]interface{}) {
-	glvm.LoadScript("detail")
-	//采集完成更新记录
-	qlmUpdateRecord(map[string]interface{}{"id": record["id"], "state": 5})
+// QlmRunExportExcelFile exports the data of the given record to an Excel
+// workbook saved at filepath.
+//
+// It writes the header row into Sheet1, lets getData fill the data rows
+// starting at row 2, and returns whatever error SaveAs reports.
+func (a *App) QlmRunExportExcelFile(filepath, recordId string) error {
+	qu.Debug("filepath---", filepath)
+	book := excelize.NewFile()
+	defer book.Close()
+	// Header cells of row 1, listed in column order.
+	headers := [][2]string{
+		{"A1", "ID"},
+		{"B1", "标题"},
+		{"C1", "链接"},
+		{"D1", "发布时间"},
+		{"E1", "重复"},
+		{"F1", "详情页采集"},
+		{"G1", "采集账号"},
+		{"H1", "推送状态"},
+		{"I1", "正文"},
+	}
+	for _, h := range headers {
+		book.SetCellStr("Sheet1", h[0], h[1])
+	}
+	getData(book, recordId, "excel", "export", nil)
+	return book.SaveAs(filepath)
 }
 
-// 更新记录状态
-func qlmUpdateRecord(record map[string]interface{}) {
-	getResult(map[string]interface{}{"param": record}, nil, "qlm/updateRecord")
+// QlmRunExportJsonFile exports the data of the given record as a
+// tab-indented JSON document written to filepath.
+//
+// The rows are collected through getData in "json" mode. The function returns
+// an error when marshalling, creating, writing or closing the file fails.
+func (a *App) QlmRunExportJsonFile(filepath, recordId string) error {
+	qu.Debug("filepath---", filepath)
+	var result []map[string]interface{}
+	getData(nil, recordId, "json", "export", &result)
+	jsonData, err := json.MarshalIndent(result, "", "	")
+	if err != nil {
+		return err
+	}
+	fo, err := os.Create(filepath)
+	if err != nil {
+		return err
+	}
+	if _, err := fo.Write(jsonData); err != nil {
+		fo.Close() // best effort; the write error is the one worth reporting
+		return fmt.Errorf("failed to write data to file: %w", err)
+	}
+	// Close explicitly instead of deferring: for a file that was just written,
+	// a failing Close can mean the data never reached the disk, so the caller
+	// must not be told the export succeeded.
+	if err := fo.Close(); err != nil {
+		return fmt.Errorf("failed to close file: %w", err)
+	}
+	return nil
+}
+
+// getData requests the rows of a record from the server's qlm/getData route
+// and consumes the response as a stream of JSON objects.
+//
+// Parameters:
+//   - file: workbook that receives one row per object when exportStype is
+//     "excel" (rows start at 2, below the header row); unused otherwise.
+//   - recordId, from: sent to the server as "recordid" and "from".
+//   - exportStype: "excel" writes into file, "json" appends to *result; any
+//     other value only counts the objects.
+//   - result: destination slice for "json" mode; unused otherwise.
+//
+// The function has no return value: every failure is logged with qu.Debug and
+// ends the call, leaving whatever was collected so far in file / *result.
+func getData(file *excelize.File, recordId, exportStype, from string, result *[]map[string]interface{}) {
+	// Reject mode/sink combinations that would dereference nil while decoding.
+	if exportStype == "excel" && file == nil {
+		qu.Debug("getData: excel export requires a non-nil file")
+		return
+	}
+	if exportStype == "json" && result == nil {
+		qu.Debug("getData: json export requires a non-nil result")
+		return
+	}
+	// Encode the request parameters as JSON.
+	param := map[string]interface{}{
+		"recordid": recordId, "from": from,
+	}
+	jsonData, err := json.Marshal(map[string]interface{}{"param": param})
+	if err != nil {
+		qu.Debug(err)
+		return
+	}
+	// Build the HTTP POST request.
+	req, err := http.NewRequest("POST", fmt.Sprintf(serverAddress, "qlm/getData"), bytes.NewBuffer(jsonData))
+	if err != nil {
+		// req is nil here; continuing would panic on req.Header.
+		qu.Debug("Error creating request:", err)
+		return
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	// NOTE(review): the client has no timeout, so a stalled server blocks the
+	// caller indefinitely — confirm whether a deadline is acceptable for large exports.
+	client := &http.Client{}
+	resp, err := client.Do(req)
+	if err != nil {
+		// resp is nil on error; it must not be closed or read.
+		qu.Debug("Error making request:", err)
+		return
+	}
+	defer resp.Body.Close()
+
+	// An error response is not a data stream; do not try to decode it.
+	if resp.StatusCode != http.StatusOK {
+		qu.Debug("Error: server returned status:", resp.StatusCode)
+		return
+	}
+	// The server is assumed to send one JSON object after another.
+	decoder := json.NewDecoder(bufio.NewReader(resp.Body))
+	n := 0 // number of objects decoded so far
+	for {
+		var tmp map[string]interface{}
+		if err := decoder.Decode(&tmp); err != nil {
+			// io.EOF means the whole stream has been consumed.
+			if err == io.EOF {
+				break
+			}
+			qu.Debug(err)
+			// A value of the wrong JSON type is skipped and decoding moves on
+			// to the next value. Syntax and read errors are remembered by the
+			// decoder and returned again on every call, so retrying would
+			// loop forever: stop instead.
+			if _, ok := err.(*json.UnmarshalTypeError); ok {
+				continue
+			}
+			break
+		}
+		n++
+		switch exportStype {
+		case "excel":
+			// Row 1 holds the headers, so the n-th object goes to row n+1.
+			row := fmt.Sprint(n + 1)
+			file.SetCellStr("Sheet1", "A"+row, qu.ObjToString(tmp["_id"]))
+			file.SetCellStr("Sheet1", "B"+row, qu.ObjToString(tmp["title"]))
+			file.SetCellStr("Sheet1", "C"+row, qu.ObjToString(tmp["href"]))
+			publishtime := qu.Int64All(tmp["publishtime"])
+			if publishtime == 0 {
+				file.SetCellStr("Sheet1", "D"+row, "")
+			} else {
+				file.SetCellStr("Sheet1", "D"+row, qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout))
+			}
+			// The cell stays empty when the object carries no "repeat" value.
+			repeatText := ""
+			if repeat := tmp["repeat"]; repeat != nil {
+				if repeatTmp, ok := repeat.(bool); ok && repeatTmp {
+					repeatText = "重复"
+				} else {
+					repeatText = "不重复"
+				}
+			}
+			file.SetCellStr("Sheet1", "E"+row, repeatText)
+			// state: 1 = collected, -1 = failed, anything else = not collected.
+			stateText := "未采集"
+			switch qu.IntAll(tmp["state"]) {
+			case 1:
+				stateText = "采集成功"
+			case -1:
+				stateText = "采集失败"
+			}
+			file.SetCellStr("Sheet1", "F"+row, stateText)
+			file.SetCellStr("Sheet1", "G"+row, qu.ObjToString(tmp["username"]))
+			pushstateText := "未推送"
+			if qu.IntAll(tmp["pushstate"]) == 1 {
+				pushstateText = "推送成功"
+			}
+			file.SetCellStr("Sheet1", "H"+row, pushstateText)
+			file.SetCellStr("Sheet1", "I"+row, qu.ObjToString(tmp["detail"]))
+		case "json":
+			*result = append(*result, tmp)
+		}
+	}
+	qu.Debug(recordId, "共获取数据量:", n)
+}
+
+// updateData saves collected data: it receives items from script.DataCache in
+// an endless loop, posts each one to the qlm/updateData route and logs whether
+// the server reported success (Result.Err == 1).
+func updateData() {
+	for {
+		item := <-script.DataCache
+		resp := &Result{}
+		getResult(map[string]interface{}{"param": item}, resp, "qlm/updateData")
+		outcome := "保存失败:"
+		if resp.Err == 1 {
+			outcome = "保存成功:"
+		}
+		qu.Debug(outcome, item["href"], item["title"])
+	}
 }

+ 1 - 5
server.go

@@ -12,10 +12,6 @@ import (
 	"time"
 )
 
-const HREF = "http://127.0.0.1:8091/%s" //线下测试环境
-// const HREF = "http://visualizeld.spdata.jianyu360.com/%s" //正式库
-//const HREF = "http://visualize.spdata.jianyu360.com/%s" //临时库
-
 type Result struct {
 	Msg  string `json:"msg"`
 	Err  int    `json:"err"`
@@ -208,7 +204,7 @@ func getResult(param, result interface{}, route string) {
 		qu.Debug("Error marshaling request:", err)
 		return
 	}
-	req, err := http.NewRequest("POST", fmt.Sprintf(HREF, route), bytes.NewBuffer(jsonData))
+	req, err := http.NewRequest("POST", fmt.Sprintf(serverAddress, route), bytes.NewBuffer(jsonData))
 	if err != nil {
 		qu.Debug("Error creating request:", err)
 		return