Browse Source

详情页模板调整

mxs 8 tháng trước cách đây
mục cha
commit
08b72300c5

+ 20 - 2
backend/vm/load_content.js

@@ -1,4 +1,4 @@
-//执行JS代码
+
 var ret = {}
 var tmp = null
 
@@ -19,6 +19,18 @@ if ("{{.ContentCss}}" != "") {//正文内容
 	if (tmp) {
 		ret["content"] = tmp.innerText
 		ret["contentHtml"] = tmp.innerHTML
+		var patchContent = false // NOTE(review): set to true below but never read in the visible lines — confirm whether it is still needed
+		// Handle large images in the detail page: each large image is exposed as an attachment-style link. The links are appended after content/contentHtml were captured above, so those two fields do not include them.
+		const images = tmp.querySelectorAll("img");
+		images.forEach((img, i) => {
+			if (img.width > 300) { // only images whose width exceeds 300 are treated as "large"
+				patchContent = true
+				const a = document.createElement("a");
+				a.href = img.src;
+				a.innerText = img.src; // link text is the image URL itself
+				tmp.appendChild(a); // presumably picked up later by the "{{.AttachCss}} a" query — verify that AttachCss covers this container
+			}
+		})
 	}
 }
 if("{{.AttachCss}}"!=""){//附件
@@ -31,4 +43,10 @@ if("{{.AttachCss}}"!=""){//附件
 	}
 	ret["attachLinks"]=attach
 }
-ret
+// Count the Chinese characters in the content; when there are fewer than 20 and at least one attachment link exists, replace the content with a fixed placeholder text.
+let regex = /[\u4e00-\u9fa5]/g;
+let chineseCharacters = ret["content"]?ret["content"].match(regex):[]; // match() with a /g regex yields null when nothing matches
+let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0; // guards the null result from match()
+if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
+ret // final expression; presumably the value handed back to whoever evaluates this script — confirm against the caller
+    

+ 30 - 22
frontend/src/components/spider/EditSpider.vue

@@ -32,19 +32,19 @@
       <el-row>
         <el-col :span="12">
           <el-form-item label="列表页延迟时间(MS)">
-            <el-input v-model="formData.listDelayTime" placeholder="500"></el-input>
+            <el-input v-model="formData.listDelayTime" placeholder="1000"></el-input>
           </el-form-item>
         </el-col>
         <el-col :span="12">
           <el-form-item label="列表翻页延迟时间(MS)">
-            <el-input v-model="formData.listTurnDelayTime" placeholder="500"></el-input>
+            <el-input v-model="formData.listTurnDelayTime" placeholder="1000"></el-input>
           </el-form-item>
         </el-col>
       </el-row>
       <el-row>
         <el-col :span="12">
           <el-form-item label="详情页延迟时间(MS)">
-            <el-input v-model="formData.contentDelayTime" placeholder="500"></el-input>
+            <el-input v-model="formData.contentDelayTime" placeholder="1000"></el-input>
           </el-form-item>
         </el-col>
         <el-col :span="12">
@@ -179,8 +179,12 @@
         </template>
         <el-divider>
           手写列表页提取JS代码
-          <!-- <el-button type="primary" @click='editorHandle.ImportListCode'>导入模板</el-button> -->
-          <el-button type="primary" @click='editorHandle.createImportListCode'>生成JS代码</el-button>
+          <el-button-group>
+            <el-tooltip v-for="item,index in TemplateJsCode.ListJsCodes" :key="index" class="box-item" effect="dark" :content="item.tooltip"
+                        placement="top-start">
+              <el-button size="small" type="primary" @click='editorHandle.createImportListCode(index)'>{{item.name}}</el-button>
+            </el-tooltip>
+          </el-button-group>
         </el-divider>
         <el-row>
           <el-input v-model="formData.listJs" class="codeEditor" :rows="6" type="textarea"
@@ -198,10 +202,12 @@
         </template>
         <el-divider>
           手写列表页翻页JS代码
-          <!-- <el-button type="primary" @click='editorHandle.ImportListTrunPageCode'>导入模板</el-button> -->
-          <el-button type="primary" @click='editorHandle.createImportListTrunPageCode(0)'>生成JS代码</el-button>
-          <el-button type="primary" @click='editorHandle.createImportListTrunPageCode(2)'>生成JS代码2</el-button>
-
+          <el-button-group>
+            <el-tooltip v-for="item,index in TemplateJsCode.ListTurnPageJsCodes" :key="index" class="box-item" effect="dark" :content="item.tooltip"
+                        placement="top-start">
+              <el-button size="small" type="primary" @click='editorHandle.createImportListTrunPageCode(index)'>{{item.name}}</el-button>
+            </el-tooltip>
+          </el-button-group>
         </el-divider>
         <el-row>
           <el-input v-model="formData.listTurnPageJs" class="codeEditor" :rows="6" type="textarea"
@@ -220,8 +226,12 @@
         </template>
         <el-divider>
           手写附件下载/上传JS代码
-          <!-- <el-button type="primary" @click='editorHandle.ImportContentCode'>导入模板</el-button> -->
-          <el-button type="primary" @click='editorHandle.createImportContentCode'>生成JS代码</el-button>
+          <el-button-group>
+            <el-tooltip v-for="item,index in TemplateJsCode.ContentJsCodes" :key="index" class="box-item" effect="dark" :content="item.tooltip"
+                        placement="top-start">
+              <el-button size="small" type="primary" @click='editorHandle.createImportContentCode(index)'>{{item.name}}</el-button>
+            </el-tooltip>
+          </el-button-group>
         </el-divider>
         <el-row><el-input v-model="formData.contentJs" class="codeEditor" :rows="6" type="textarea"
                           placeholder="Please input" />
@@ -259,13 +269,13 @@ const dialogTitle = ref('仅编辑 CSS选择器部分')
 const store = useStore()
 
 const defaultFormValue = {
-  delayTime: 500,
+  delayTime: 1000,
   maxPages: 2,
 }
 
 // 定义tabData.initList数据结构
 class InitListItem {
-  constructor(actionJs = '', checkJs = '', sleepTime = 500) {
+  constructor(actionJs = '', checkJs = '', sleepTime = 1000) {
     this.actionJs = actionJs //动作JS
     this.checkJs = checkJs //检查JS
     this.sleepTime = sleepTime //等待时长
@@ -280,7 +290,7 @@ const tabData = reactive({
     // {
     //     actionJs: '', //动作JS
     //     checkJs: '', //检查JS
-    //     sleepTime: 500, //等待时长
+    //     sleepTime: 1000, //等待时长
     // },
   ]
 })
@@ -381,10 +391,9 @@ const editorHandle = {
   ImportListTrunPageCode:()=>{
     formData.value.listTurnPageJs=TemplateJsCode.ListTurnPageJsCode
   },
-  createImportListCode() {
+  createImportListCode(mode) {
     // 数据替换
-    const originString = TemplateJsCode.ListJsCode
-    let replaceString = originString
+    let replaceString = TemplateJsCode.ListJsCodes[mode].code
     if (formData.value.listItemCss) {
       replaceString = replaceString.replaceAll('{{.ListItemCss}}', formData.value.listItemCss)
     } else {
@@ -403,9 +412,8 @@ const editorHandle = {
     formData.value.listJs = replaceString
     return replaceString
   },
-  createImportContentCode() {
-    const originString = TemplateJsCode.ContentJsCode
-    let replaceString = originString
+  createImportContentCode(mode) {
+    let replaceString = TemplateJsCode.ContentJsCodes[mode].code
     if (formData.value.titleCss) {
       replaceString = replaceString.replaceAll('{{.TitleCss}}', formData.value.titleCss)
     } else {
@@ -434,8 +442,8 @@ const editorHandle = {
     formData.value.contentJs = replaceString
   },
   createImportListTrunPageCode(mode) {
-    const originString = mode==2?TemplateJsCode.ListTurnPageJsCode2:TemplateJsCode.ListTurnPageJsCode
-    let replaceString = originString
+    //const originString = mode == 2 ? TemplateJsCode.ListTurnPageJsCode2 : TemplateJsCode.ListTurnPageJsCode
+    let replaceString = TemplateJsCode.ListTurnPageJsCodes[mode].code
     if (formData.value.listNextPageCss) {
       replaceString = replaceString.replaceAll('{{.ListNextPageCss}}', formData.value.listNextPageCss)
     } else {

+ 2 - 2
frontend/src/components/spider/RunSpider.vue

@@ -366,8 +366,8 @@ EventsOn("debug_event", data => {
 //加载当前爬虫配置
 // ViewCurrentSpiderConfig().then(result => {
 //     console.log(result)
-//     // result['listDelay'] = 500
-//     // result['contentDelay'] = 500
+//     // result['listDelay'] = 1000
+//     // result['contentDelay'] = 1000
 //     // result['proxyServe'] = ''
 //     // result['showImage'] = 'false'
 //     // result['headless'] = 'false'

+ 230 - 14
frontend/src/components/spider/jscodetpl.js

@@ -1,6 +1,11 @@
 //模板
-export const TemplateJsCode={
-    ListJsCode:`
+export const TemplateJsCode = {
+    //列表页代码集合
+    ListJsCodes: [
+        {
+            "name": "模版1",
+            "tooltip": "基础模版",
+            "code": `
 var ret = []
 document.querySelectorAll("{{.ListItemCss}}").forEach((v, i) => {
     let item = {}
@@ -25,8 +30,72 @@ document.querySelectorAll("{{.ListItemCss}}").forEach((v, i) => {
 })
 ret
     `,
-    //执行JS代码
-    ContentJsCode:`
+        }, {
+            "name": "模版2",
+            "tooltip": "发布日期需要加工,年月/日分开,需要拼装",
+            "code": `
+var ret = []
+document.querySelectorAll("{{.ListItemCss}}").forEach((v, i) => {
+    let item = {}
+    if ("{{.ListLinkCss}}" != "") {
+        let link = v.querySelector("{{.ListLinkCss}}")
+        if (link) {
+            var href = link.href
+            if (!href.startsWith("http")) href = window.location.origin + "/" + href
+            let title = link.getAttribute("title") || link.innerText
+            item = { "listTitle": title, "href": href, "no": i }
+        } else {
+            item = { "no": i }
+        }
+    }
+    if ("{{.ListPubtimeCss}}" != "") { //发布日期CSS选择器定位到条目日期外层
+        let pubtime = v.querySelector("{{.ListPubtimeCss}}")
+        if (pubtime) {
+            //TODO 修改这里进行拼装
+            item["listPublishTime"] = pubtime.querySelector("年月CSS选择器").innerText+'-'+pubtime.querySelector("日CSS选择器").innerText
+        }
+    }
+    ret.push(item)
+})
+ret
+    `,
+        }, {
+            "name": "模版3",
+            "tooltip": "发布日期有干扰,需要清洗",
+            "code": `
+var ret = []
+document.querySelectorAll("{{.ListItemCss}}").forEach((v, i) => {
+    let item = {}
+    if ("{{.ListLinkCss}}" != "") {
+        let link = v.querySelector("{{.ListLinkCss}}")
+        if (link) {
+            var href = link.href
+            if (!href.startsWith("http")) href = window.location.origin + "/" + href
+            let title = link.getAttribute("title") || link.innerText
+            item = { "listTitle": title, "href": href, "no": i }
+        } else {
+            item = { "no": i }
+        }
+    }
+    if ("{{.ListPubtimeCss}}" != "") { 
+        let pubtime = v.querySelector("{{.ListPubtimeCss}}")
+        if (pubtime) {
+            //TODO 修改这里进行清洗
+            item["listPublishTime"] = pubtime.innerText.length>15?pubtime.innerText.slice(0,15):""
+        }
+    }
+    ret.push(item)
+})
+ret
+    `,
+        },
+    ],
+    //详情页代码集合
+    ContentJsCodes: [
+        {
+            "name": "模版1",
+            "tooltip": "基础模版",
+            "code": `
 var ret = {}
 var tmp = null
 
@@ -76,24 +145,171 @@ let regex = /[\\u4e00-\\u9fa5]/g;
 let chineseCharacters = ret["content"]?ret["content"].match(regex):[];
 let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0;
 if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
-ret
+ret 
+    `,
+        },
+        {
+            "name": "模版2",
+            "tooltip": "详情页存在2套不同CSS选择器",
+            "code": `
+var ret = {}
+var tmp = null
+
+if ("{{.TitleCss}}" != "") {//标题
+	tmp = document.querySelector("{{.TitleCss}}") || document.querySelector("第二套CSS选择器,请修改")
+	if (tmp) ret["title"] = tmp.getAttribute("title") || tmp.innerText
+}
+if ("{{.PublishUnitCss}}" != "") {//采购单位
+	tmp = document.querySelector("{{.PublishUnitCss}}") || document.querySelector("第二套CSS选择器,请修改")
+	if (tmp) ret["publishUnit"] = tmp.getAttribute("title") || tmp.innerText
+}
+if ("{{.PublishTimeCss}}" != "") {//发布时间
+	tmp = document.querySelector("{{.PublishTimeCss}}") || document.querySelector("第二套CSS选择器,请修改")
+	if (tmp) ret["publishTime"] = tmp.getAttribute("title") || tmp.innerText
+}
+if ("{{.ContentCss}}" != "") {//正文内容
+  tmp = document.querySelector("{{.ContentCss}}") || document.querySelector("第二套CSS选择器,请修改")
+  if (tmp) {
+    ret["content"] = tmp.innerText
+    ret["contentHtml"] = tmp.innerHTML
+    var patchContent = false
+    //处理详情页中的大图,大图作为附件使用
+    const images = tmp.querySelectorAll("img");
+    images.forEach((img, i) => {
+      if (img.width > 300) {
+        patchContent = true
+        const a = document.createElement("a");
+        a.href = img.src;
+        a.innerText = img.src;
+        tmp.appendChild(a);
+      }
+    })
+  }
+}
+if("{{.AttachCss}}"!=""){//附件
+	tmp = document.querySelectorAll("{{.AttachCss}} a") || document.querySelector("第二套CSS选择器,请修改")
+	let attach=[]
+	if(tmp){
+		tmp.forEach((v,i)=>{
+			attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
+		})
+	}
+	ret["attachLinks"]=attach
+}
+//检查中文字符个数,少于20,修正正文内容
+let regex = /[\\u4e00-\\u9fa5]/g;
+let chineseCharacters = ret["content"]?ret["content"].match(regex):[];
+let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0;
+if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
+ret 
     `,
-    AttachJsCode:`
+        },
+        {
+            "name": "模版3",
+            "tooltip": "详情页正文包含其他元素,需要清洗",
+            "code": `
+var ret = {}
+var tmp = null
+
+if ("{{.TitleCss}}" != "") {//标题
+	tmp = document.querySelector("{{.TitleCss}}")
+	if (tmp) ret["title"] = tmp.getAttribute("title") || tmp.innerText
+}
+if ("{{.PublishUnitCss}}" != "") {//采购单位
+	tmp = document.querySelector("{{.PublishUnitCss}}")
+	if (tmp) ret["publishUnit"] = tmp.getAttribute("title") || tmp.innerText
+}
+if ("{{.PublishTimeCss}}" != "") {//发布时间
+	tmp = document.querySelector("{{.PublishTimeCss}}")
+	if (tmp) ret["publishTime"] = tmp.getAttribute("title") || tmp.innerText
+}
+if ("{{.ContentCss}}" != "") {//正文内容
+  tmp = document.querySelector("{{.ContentCss}}")
+  if (tmp) {
+    //TODO在这里写清洗逻辑
+    tmp.removeChild(tmp.children[0])
+    //或者
+    //tmp.removeChild(tmp.querySelector("CSS选择器"))
+    ret["content"] = tmp.innerText
+    ret["contentHtml"] = tmp.innerHTML
+    var patchContent = false
+    //处理详情页中的大图,大图作为附件使用
+    const images = tmp.querySelectorAll("img");
+    images.forEach((img, i) => {
+      if (img.width > 300) {
+        patchContent = true
+        const a = document.createElement("a");
+        a.href = img.src;
+        a.innerText = img.src;
+        tmp.appendChild(a);
+      }
+    })
+  }
+}
+if("{{.AttachCss}}"!=""){//附件
+	tmp = document.querySelectorAll("{{.AttachCss}} a")
+	let attach=[]
+	if(tmp){
+		tmp.forEach((v,i)=>{
+			attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
+		})
+	}
+	ret["attachLinks"]=attach
+}
+//检查中文字符个数,少于20,修正正文内容
+let regex = /[\\u4e00-\\u9fa5]/g;
+let chineseCharacters = ret["content"]?ret["content"].match(regex):[];
+let chineseCharactersLen=chineseCharacters ? chineseCharacters.length : 0;
+if (chineseCharactersLen < 20 && ret["attachLinks"] && ret["attachLinks"].length>0) ret["content"] = '详情请访问原网页!'
+ret 
+    `},
+    ],
+
+    AttachJsCode: `
 //附件下载以及提交
     
 `,
-    //列表页翻页代码
-    ListTurnPageJsCode: `
+    //列表页翻页代码集合
+    ListTurnPageJsCodes: [
+        {
+            "name": "模版1",
+            "tooltip": "可以直接定位到翻页链接/按钮",
+            "code": `
 var link=document.querySelector("{{.ListNextPageCss}}");
 if(link)link.click();
-"";
-`,//列表页翻页代码
-    ListTurnPageJsCode2: `
+""   
+    `,
+        },
+        {
+            "name": "模版2",
+            "tooltip": "选择器匹配多个 翻页链接/按钮,可根据文本精准匹配",
+            "code": `
 document.querySelectorAll("{{.ListNextPageCss}}").forEach(link=>{
   if(link.innerText==="下一页")link.click();
 })
-"";
-`,
-
+""
+    `,
+        },
+        {
+            "name": "模版3",
+            "tooltip": "选择器匹配多个 翻页链接/按钮,可根据文本模糊/包含匹配",
+            "code": `
+document.querySelectorAll("{{.ListNextPageCss}}").forEach(link=>{
+  if(link.innerText.indexOf("下一页")>-1)link.click();
+})
+""
+    `,
+        },
+        {
+            "name": "模版4",
+            "tooltip": "选择器匹配多个 翻页链接/按钮,可根据检测是否包含子对象匹配",
+            "code": `
+document.querySelectorAll("{{.ListNextPageCss}}").forEach(link=>{
+  if(link.querySelector("自对象CSS选择器,请修改"))link.click();
+})
+""
+    `,
+        },
+    ],
 
 }

+ 0 - 34
tpl/load_content.js

@@ -1,34 +0,0 @@
-//执行JS代码
-var ret = {}
-var tmp = null
-
-if ("{{.TitleCss}}" != "") {//标题
-	tmp = document.querySelector("{{.TitleCss}}")
-	if (tmp) ret["title"] = tmp.getAttribute("title") || tmp.innerText
-}
-if ("{{.PublishUnitCss}}" != "") {//采购单位
-	tmp = document.querySelector("{{.PublishUnitCss}}")
-	if (tmp) ret["publishUnit"] = tmp.getAttribute("title") || tmp.innerText
-}
-if ("{{.PublishTimeCss}}" != "") {//发布时间
-	tmp = document.querySelector("{{.PublishTimeCss}}")
-	if (tmp) ret["publishTime"] = tmp.getAttribute("title") || tmp.innerText
-}
-if ("{{.ContentCss}}" != "") {//正文内容
-	tmp = document.querySelector("{{.ContentCss}}")
-	if (tmp) {
-		ret["content"] = tmp.innerText
-		ret["contentHtml"] = tmp.innerHTML
-	}
-}
-if("{{.AttachCss}}"!=""){//附件
-	tmp = document.querySelectorAll("{{.AttachCss}} a")
-	let attach=[]
-	if(tmp){
-		tmp.forEach((v,i)=>{
-			attach.push({title:v.getAttribute("title")||v.innerText,href:v.href})
-		})
-	}
-	ret["attachLinks"]=attach
-}
-ret

+ 0 - 23
tpl/load_list_items.js

@@ -1,23 +0,0 @@
-var ret = []
-document.querySelectorAll("{{.ListItemCss}}").forEach((v, i) => {
-    let item = {}
-    if ("{{.ListLinkCss}}" != "") {
-        let link = v.querySelector("{{.ListLinkCss}}")
-        if (link) {
-            var href = link.href
-            if (!href.startsWith("http")) href = window.location.origin + "/" + href
-            let title = link.getAttribute("title") || link.innerText
-            item = { "listTitle": title, "href": href, "no": i }
-        } else {
-            item = { "no": i }
-        }
-    }
-    if ("{{.ListPubtimeCss}}" != "") {
-        let pubtime = v.querySelector("{{.ListPubtimeCss}}")
-        if (pubtime) {
-            item["listPublishTime"] = pubtime.innerText
-        }
-    }
-    ret.push(item)
-})
-ret