Răsfoiți Sursa

新增了正文中无效附件过滤

mxs 5 luni în urmă
părinte
comite
42cb1a17d9
2 fișiere modificate cu 22 adăugiri și 4 ștergeri
  1. 21 3
      backend/vm/vm.go
  2. 1 1
      frontend/src/components/spider/EditSpider.vue

+ 21 - 3
backend/vm/vm.go

@@ -29,7 +29,9 @@ const (
 )
 
 var (
-	Reg_Date = regexp.MustCompile(`\d`)
+	Reg_Date             = regexp.MustCompile(`\d`)
+	Reg_File_ContentType = regexp.MustCompile(`(?i)^(application/(vnd\.(openxmlformats-officedocument|ms-excel)|msword|pdf)|image/(png|jpeg))`)
+	Reg_File_Type        = regexp.MustCompile(`(?i)\.(pdf|doc|docx|xls|xlsx|ppt|pptx|jpg|jpeg|png|gif|bmp|zip|rar|7z|gz|csv|swf)$`)
 )
 
 type (
@@ -78,7 +80,23 @@ func downloadAttaches(v *be.ResultItem, attachesDir string) {
 	}
 	for _, attach := range v.AttachLinks {
 		qu.Debug("准备下载附件,", attach.Href, attach.Title)
-		req, err := http.NewRequest("GET", attach.Href, nil)
+		//if !Reg_File_Type.MatchString(attach.Title) {
+		req, err := http.NewRequest("HEAD", attach.Href, nil)
+		if err != nil {
+			continue
+		}
+		resp, err := client.Do(req)
+		if err != nil || resp.StatusCode != http.StatusOK {
+			continue
+		}
+		ft := resp.Header.Get("Content-Type")
+		fl := resp.Header.Get("Content-Length")
+		qu.Debug("------------", ft, qu.IntAll(fl), qu.IntAll(fl)/1024)
+		if !Reg_File_ContentType.MatchString(ft) || qu.IntAll(fl) < 1024*5 {
+			continue
+		}
+		//}
+		req, err = http.NewRequest("GET", attach.Href, nil)
 		if err != nil {
 			qu.Debug(" 下载附件 构建req 出错:", attach.Href, attach.FileName, err.Error())
 			continue
@@ -91,7 +109,7 @@ func downloadAttaches(v *be.ResultItem, attachesDir string) {
 		req.Header.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
 		req.Header.Add("host", hostName)
 		req.Header.Add("referer", v.Href)
-		resp, err := client.Do(req)
+		resp, err = client.Do(req)
 		if err != nil {
 			qu.Debug(" 下载附件 发送请求 出错:", attach.Href, attach.FileName, err.Error())
 			continue

+ 1 - 1
frontend/src/components/spider/EditSpider.vue

@@ -398,7 +398,7 @@ const fastKeyDownMap = {
 }
 // 背景色map
 const cssInputBg = {
-  listItemCss: { color: "#fff9c4", label: "列表条目", formLabel: '条目区域块CSS' },
+  listItemCss: { color: "#fff9c4", label: "列表条目", formLabel: '条目CSS' },
   listLinkCss: { color: "#bbdefb", label: "列表标题", formLabel: '条目链接CSS' },
   listPublishTimeCss: { color: "#c8e6c9", label: "列表发布时间", formLabel: '条目发布时间CSS' },
   listNextPageCss: { color: "#dcedc8", label: "列表下一页", formLabel: '翻页下一页CSS' },