Przeglądaj źródła

新增资源过滤配置

mxs 7 miesięcy temu
rodzic
commit
c5ca758110

+ 3 - 2
backend/browser.go

@@ -69,7 +69,7 @@ var (
 	}
 )
 
-func NewBrowser(headless bool, showImage bool, proxyServe bool, baseUrl string) (context.Context, context.CancelFunc, context.Context, context.CancelFunc, context.Context, context.CancelFunc) {
+func NewBrowser(headless bool, showImage bool, proxyServe bool, baseUrl string, filterResource string) (context.Context, context.CancelFunc, context.Context, context.CancelFunc, context.Context, context.CancelFunc) {
 	ignoreCertificateErrors := false
 	if strings.HasPrefix(baseUrl, "https") {
 		ignoreCertificateErrors = true
@@ -130,7 +130,8 @@ func NewBrowser(headless bool, showImage bool, proxyServe bool, baseUrl string)
 	trie := NewTrie()
 	//TODO 这里默认构建通用的资源加载排除,最好是单个网站可以定制,
 	// 对于纯后端渲染网站,可以屏蔽所有资源加载,达到平台最高性能目的
-	trie.BatchInsert(Cfg.DisableLoadResource)
+	trie.BatchInsert(Cfg.DisableLoadResource) //全局过滤
+	trie.BatchInsert(filterResource)          //指定过滤
 	chromedp.ListenTarget(incCtx, func(event interface{}) {
 		switch ev := event.(type) {
 		case *fetch.EventRequestPaused:

+ 3 - 1
backend/prefixtree.go

@@ -107,6 +107,8 @@ func (t *Trie) HasKeyword(text string) bool {
 // BatchInsert
 func (t *Trie) BatchInsert(words string) {
 	for _, s := range strings.Split(words, ";") {
-		t.Insert(s)
+		if s != "" {
+			t.Insert(s)
+		}
 	}
 }

+ 1 - 1
backend/script/script.go

@@ -143,7 +143,7 @@ func (glvm *GLVm) ResetBrowser() {
 		glvm.B.Ctx = nil
 		glvm.B.CancelFn = nil
 	}
-	_, _, _, _, ctx, incCancelFn := backend.NewBrowser(glvm.Headless, glvm.ShowImage, glvm.ProxyServer, "http://")
+	_, _, _, _, ctx, incCancelFn := backend.NewBrowser(glvm.Headless, glvm.ShowImage, glvm.ProxyServer, "https://", "")
 	b := &GLBrowser{
 		Ctx:      ctx,
 		CancelFn: incCancelFn,

+ 1 - 0
backend/types.go

@@ -53,6 +53,7 @@ type (
 		AttachJSCode       string     `json:"attachJs"` //无效
 		ListTurnPageJSCode string     `json:"listTurnPageJs"`
 		MaxPages           int64      `json:"maxPages"`
+		FilterResource     string     `json:"filterResource"` //要过滤的资源
 		//延时
 		ListDelayTime     int64 `json:"listDelayTime"`
 		ListTurnDelayTime int64 `json:"listTurnDelayTime"`

+ 3 - 3
backend/vm/check.go

@@ -26,8 +26,8 @@ func (vm *VM) verifySpiderConfig4Prod(sc *be.SpiderConfig) (*be.SpiderConfigVeri
 	verifyResult := list.New()
 	be.DataResults[sc.Code] = verifyResult
 	ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
-	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href)    //列表页使用
-	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, true, false, sc.Href) //详情页使用
+	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource)    //列表页使用
+	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource) //详情页使用
 	defer func() {
 		incCancelFn2()
 		baseCancelFn2()
@@ -149,7 +149,7 @@ func (vm *VM) verifySpiderConfig4MainSite(sc *be.SpiderConfig) (*be.SpiderConfig
 	verifyResult := list.New()
 	be.DataResults[sc.Code] = verifyResult
 	ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
-	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href) //列表页使用
+	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource) //列表页使用
 	defer func() {
 		incCancelFn()
 		baseCancelFn()

+ 2 - 2
backend/vm/jobs.go

@@ -56,8 +56,8 @@ func (vm *VM) RunJob(code string) {
 	vm.dnf.Dispatch("run_job_event", &be.JobRunningEvent{Code: job.Code, Act: be.JOB_RUNNING_EVENT_DEBUG, Msg: "加载作业完成"})
 	no := 1
 	//加载参数
-	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(true, false, false, "https://")    //列表页使用
-	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(true, false, false, "https://") //详情页使用
+	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(true, false, false, "https://", "")    //列表页使用
+	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(true, false, false, "https://", "") //详情页使用
 	defer func() {
 		job.State = 0
 		job.Progress = 0

+ 3 - 3
backend/vm/single.go

@@ -30,7 +30,7 @@ func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, c
 	if url != "" {
 		sc.Href = url
 	}
-	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe, sc.Href)
+	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe, sc.Href, sc.FilterResource)
 	qu.Debug("1浏览器打开", *sc)
 	vm.dnf.Dispatch("debug_event", "1 浏览器打开")
 	defer func() {
@@ -116,7 +116,7 @@ func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay
 	if url != "" {
 		sc.Href = url
 	}
-	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe, sc.Href)
+	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe, sc.Href, sc.FilterResource)
 	qu.Debug("1浏览器打开", *sc)
 	vm.dnf.Dispatch("debug_event", "1 浏览器打开")
 	defer func() {
@@ -336,7 +336,7 @@ func (vm *VM) InitPageTmp(ctx context.Context, timeout int) bool {
 func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
 	headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
 	sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
-	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, false, sc.Href)
+	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, false, sc.Href, sc.FilterResource)
 	qu.Debug("1浏览器打开")
 	vm.dnf.Dispatch("debug_event", "1 浏览器打开")
 	defer func() {

+ 4 - 4
backend/vm/worker.go

@@ -22,8 +22,8 @@ func (w *Worker) Destory() {
 }
 
 // NewWorker
-func NewWorker(headless bool, showImage bool, proxyServe bool, contentDelay int64, js string, vm *VM) *Worker {
-	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe, "https://")
+func NewWorker(headless bool, showImage bool, proxyServe bool, contentDelay int64, js string, vm *VM, filterResource string) *Worker {
+	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe, "https://", filterResource)
 	return &Worker{
 		baseCancel:   baseCancel,
 		incCancel:    cancel,
@@ -76,7 +76,7 @@ func (vm *VM) RunSpiderMulThreads(url string, maxPages int, listDealy int64, tru
 	if url != "" {
 		sc.Href = url
 	}
-	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe, sc.Href)
+	_, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe, sc.Href, sc.FilterResource)
 	qu.Debug("1浏览器打开")
 	vm.dnf.Dispatch("debug_event", "1 浏览器打开")
 	defer func() {
@@ -99,7 +99,7 @@ func (vm *VM) RunSpiderMulThreads(url string, maxPages int, listDealy int64, tru
 	ch := make(chan *Worker, threads)
 	wg := new(sync.WaitGroup)
 	for i := 0; i < threads; i++ {
-		w := NewWorker(headless, showImage, proxyServe, contentDelay, runContentJs, vm)
+		w := NewWorker(headless, showImage, proxyServe, contentDelay, runContentJs, vm, sc.FilterResource)
 		wts = append(wts, w)
 		ch <- w
 	}

+ 0 - 4
bind4spider.go

@@ -168,7 +168,6 @@ func (a *App) LoadAllJobs() be.Jobs {
 	return jobs
 }
 
-// SaveJob
 func (a *App) SaveJob(job *be.Job) string {
 	err := bdb.SaveEntity[be.Job]("jobs", job.Code, job)
 	if err != nil {
@@ -177,7 +176,6 @@ func (a *App) SaveJob(job *be.Job) string {
 	return "ok"
 }
 
-// DeleteJob
 func (a *App) DeleteJob(code string) string {
 	err := bdb.DeleteEntity[be.Job]("jobs", code)
 	if err != nil {
@@ -186,13 +184,11 @@ func (a *App) DeleteJob(code string) string {
 	return "ok"
 }
 
-// LoadJob
 func (a *App) LoadJob(code string) *be.Job {
 	job, _ := bdb.LoadEntity[be.Job]("jobs", code)
 	return job
 }
 
-// RunJob
 func (a *App) RunJob(code string) string {
 	go vm.RunJob(code)
 	return "ok"

+ 5 - 5
frontend/src/components/spider/EditSpider.vue

@@ -28,26 +28,26 @@
       </el-space>
     </div>
     <div class="space" />
-    <el-form ref="form0" label-width="115px">
+    <el-form ref="form0" label-width="120px">
       <el-row>
-        <el-col :span="5">
+        <el-col :span="12">
           <el-form-item label="列表页延迟(MS)">
             <el-input v-model="formData.listDelayTime" placeholder="1000"></el-input>
           </el-form-item>
         </el-col>
-        <el-col :span="5">
+        <el-col :span="12">
           <el-form-item label="翻页延迟(MS)">
             <el-input v-model="formData.listTurnDelayTime" placeholder="1000"></el-input>
           </el-form-item>
         </el-col>
       </el-row>
       <el-row>
-        <el-col :span="5">
+        <el-col :span="12">
           <el-form-item label="详情页延迟(MS)">
             <el-input v-model="formData.contentDelayTime" placeholder="1000"></el-input>
           </el-form-item>
         </el-col>
-        <el-col :span="5">
+        <el-col :span="12">
           <el-form-item label="采集最大页">
             <el-input v-model="formData.maxPages" :placeholder="defaultFormValue.maxPages + ''"></el-input>
           </el-form-item>