فهرست منبع

Merge remote-tracking branch 'origin/dev1.1' into dev1.1

mxs 5 ماه پیش
والد
کامیت
097d7ce416
1 فایلهای تغییر یافته به همراه 65 افزوده شده و 4 حذف شده
  1. 65 4
      frontend/src/components/spider/jscodetpl.js

+ 65 - 4
frontend/src/components/spider/jscodetpl.js

@@ -204,11 +204,72 @@ let regex = /[\\u4e00-\\u9fa5]/g;
 let chineseCharacters = ret["content"]?ret["content"].match(regex):[];
 let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0;
 if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
+ret 
+    `,
+        },{
+            "name": "模版2",
+            "tooltip": "基础模版,信息来源需要分割",
+            "code": `
+var ret = {}
+var tmp = null
+
+if ("{{.TitleCss}}" != "") {//标题
+	tmp = document.querySelector("{{.TitleCss}}")
+	if (tmp) ret["title"] = tmp.getAttribute("title") || tmp.innerText
+}
+if ("{{.PublishUnitCss}}" != "") {//采购单位
+	tmp = document.querySelector("{{.PublishUnitCss}}")
+	if (tmp) ret["publishUnit"] = tmp.getAttribute("title") || tmp.innerText.split(':').at(-1)
+}
+if ("{{.PublishTimeCss}}" != "") {//发布时间
+	tmp = document.querySelector("{{.PublishTimeCss}}")
+	if (tmp) ret["publishTime"] = tmp.getAttribute("title") || tmp.innerText.split(':').at(-1)
+}
+if ("{{.ContentCss}}" != "") {//正文内容
+  tmp = document.querySelector("{{.ContentCss}}")
+  if (tmp) {
+    ret["content"] = tmp.innerText
+    ret["contentHtml"] = tmp.innerHTML
+    var patchContent = false
+    //处理详情页中的大图,大图作为附件使用
+    const images = tmp.querySelectorAll("img");
+    images.forEach((img, i) => {
+      if (img.width > 300) {
+        patchContent = true
+        const a = document.createElement("a");
+        a.href = img.src;
+        a.innerText = img.src;
+        tmp.appendChild(a);
+      }
+    })
+  }
+}
+if("{{.AttachCss}}"!=""){//附件
+	let attach=[]
+	//先处理正文附件
+	if("{{.AttachCss}}"!="{{.ContentCss}}"){
+		document.querySelectorAll("{{.ContentCss}} a").forEach(v=>{
+			attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
+		})
+	}
+	tmp = document.querySelectorAll("{{.AttachCss}} a")
+	if(tmp){
+		tmp.forEach((v,i)=>{
+			attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
+		})
+	}
+	ret["attachLinks"]=attach
+}
+//检查中文字符个数,少于20,修正正文内容
+let regex = /[\\u4e00-\\u9fa5]/g;
+let chineseCharacters = ret["content"]?ret["content"].match(regex):[];
+let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0;
+if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
 ret 
     `,
         },
         {
-            "name": "模版2",
+            "name": "模版3",
             "tooltip": "详情页存在2套不同CSS选择器",
             "code": `
 var ret = {}
@@ -275,7 +336,7 @@ if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length
 ret 
     `,
         }, {
-            "name": "模版3",
+            "name": "模版4",
             "tooltip": "详情页存在2套不同CSS选择器,一套是微信公众号",
             "code": `
     var ret = {}
@@ -342,7 +403,7 @@ ret
     ret 
     `,
         }, {
-            "name": "模版4",
+            "name": "模版5",
             "tooltip": "详情页正文包含其他元素,需要清洗",
             "code": `
 var ret = {}
@@ -410,7 +471,7 @@ let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0;
 if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
 ret 
     `}, {
-            "name": "模版5",
+            "name": "模版6",
             "tooltip": "详情页日期提取,需要用正则提取",
             "code": `
 var ret = {}