瀏覽代碼

拟建标题过滤,YSAJ反爬方法

lizongze 1 年之前
父節點
當前提交
bb14bad18e
共有 2 個文件被更改,包括 62 次插入和 7 次删除
  1. 60 0
      FworkSpider/crawl_func/YunSuoAutoJump.py
  2. 2 7
      FworkSpider/untils/check_data.py

+ 60 - 0
FworkSpider/crawl_func/YunSuoAutoJump.py

@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-09-12 
+---------
+@summary: cookies -> security_session_mid_verify
+---------
+@author: Lzz
+"""
+import requests
+import execjs
+import time
+
+
+
+def get_mid_code(security_verify_data_url, proxies=False):
+    """Fetch the cookies for a page guarded by the YunSuo auto-jump check.
+
+    Requests the URL once, then re-requests it with the
+    ``security_verify_data`` query value that the ``YunSuoAutoJump`` script
+    builds, retrying until ``security_session_mid_verify`` is set.
+
+    :param security_verify_data_url: URL of the protected page.
+    :param proxies: requests-style proxy mapping; a falsy value means none.
+    :return: dict of the session cookies after the last attempt; it may lack
+        ``security_session_mid_verify`` if every attempt failed.
+    """
+    max_attempts = 10
+    retry_delay = 2  # seconds between attempts
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Cache-Control": "no-cache",
+        "Connection": "keep-alive",
+        "Pragma": "no-cache",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
+    }
+
+    # The script hex-encodes "<screen width>,<screen height>"; every character
+    # of this ASCII string maps to two lowercase hex digits, so bytes.hex()
+    # yields the same value without needing a JavaScript runtime.
+    screen_hex = "1536,864".encode("ascii").hex()
+    yz_url = security_verify_data_url + "?security_verify_data=" + screen_hex
+
+    cookies = {}
+    # Close the session when done so pooled connections are released.
+    with requests.session() as session:
+        if proxies:
+            session.proxies = proxies
+        # First visit; the response body itself is not used.
+        session.get(security_verify_data_url, headers=headers, timeout=60, verify=False)
+        for attempt in range(max_attempts):
+            session.get(yz_url, headers=headers, timeout=60, verify=False)
+            cookies = session.cookies.get_dict()
+            if cookies.get('security_session_mid_verify'):
+                break
+            if attempt < max_attempts - 1:
+                time.sleep(retry_delay)  # no pointless wait after the last try
+    return cookies
+

+ 2 - 7
FworkSpider/untils/check_data.py

@@ -19,13 +19,8 @@ class CheckData:
                                  '审核', '审批', '批复', '批后', '批前', '核准',
                                  '备案', '立项', '规划设计', '环评', }
 
-    __plan_to_build_channel_set = {"通知公告", "公示公告", "部门文件",
-                                   "发布公示", "公告信息",
-                                   "公示公开", "公开公示", "公示通知",
-                                   "公示信息", "公告信息", "公示专区",
-                                   "公告专区", "公司公告", "公司通知",
-                                   "公司新闻", "其他公示", "通知公示",
-                                   "最新公告", "最新公示", "最新资讯"}
+    __plan_to_build_channel_set = {"通知", "部门文件", "公告", "公示",
+                                    "办件" ,"公司", "新闻", "最新资讯"}
 
     @classmethod
     def title(cls, name: str, group=None):