Browse Source

动态js处理

dongzhaorui 3 years ago
parent
commit
325a423bed
1 changed files with 173 additions and 0 deletions
  1. 173 0
      qlm/utils/sessions_521.py

+ 173 - 0
qlm/utils/sessions_521.py

@@ -0,0 +1,173 @@
+import copy
+import re
+import subprocess
+
+import execjs
+import jsbeautifier
+import requests
+from requests.utils import dict_from_cookiejar
+
+from config.load import node_module_path
+
+
+def save_js_script(js_code: str, allow_beautify_code=False):
+    """Write *js_code* to ``etx.js`` in the current working directory.
+
+    Args:
+        js_code: JavaScript source text to store.
+        allow_beautify_code: When true, the source is passed through
+            ``jsbeautifier.beautify`` and only the beautified text is
+            written; otherwise the source is written unchanged.
+    """
+    # Pick exactly one representation: the beautified and the raw source
+    # must never both end up in the file.
+    if allow_beautify_code:
+        content = jsbeautifier.beautify(js_code)
+    else:
+        content = js_code
+    with open('etx.js', 'w', encoding='utf-8') as f:
+        f.write(content)
+
+
+def load_js_script():
+    """Return the full text of ``etx.js`` from the current working directory.
+
+    The file is decoded as UTF-8.
+    """
+    with open('etx.js', encoding='utf-8') as script_file:
+        content = script_file.read()
+    return content
+
+
+def modify_go_func(repl_js: str, js_func: str):
+    """Replace a snippet of *js_func* with a direct assignment and return.
+
+    The first ``document[...] = ...;`` assignment found in *repl_js* is kept
+    and followed by a ``return document[...];`` statement. That pair, each
+    statement indented by six spaces, is substituted for every occurrence of
+    *repl_js* inside *js_func*.
+
+    Args:
+        repl_js: The snippet to replace. It must contain a
+            ``document[...] = ...;`` assignment (``.`` does not match
+            newlines in the patterns used here).
+        js_func: JavaScript source that contains *repl_js*.
+
+    Returns:
+        *js_func* with each occurrence of *repl_js* replaced.
+
+    Raises:
+        AttributeError: If *repl_js* contains no such assignment.
+    """
+    document_code = re.search(r'document\[.*?\]\s{0,1}=.*?;', repl_js).group()
+    # The trailing lazy ``\s{0,1}?`` consumes nothing, so this yields the bare
+    # ``document[...]`` expression without any whitespace after it.
+    property_name = re.search(r'document\[.*?\]\s{0,1}?', document_code).group()
+    indent = ' ' * 6
+    new_js = '\n{0}{1}\n{0}return {2};\n{0}'.format(
+        indent,
+        document_code,
+        property_name,
+    )
+    return js_func.replace(repl_js, new_js)
+
+
+def execute_js_script(script_js: str):
+    """Rewrite a challenge script so it returns its result, then run it.
+
+    Steps, in order:
+
+    1. Prefix the script with a jsdom bootstrap that defines ``window`` and
+       ``document``.
+    2. Beautify the combined source with ``jsbeautifier``; the patterns
+       below rely on the resulting line layout.
+    3. Replace every occurrence of the text ``alert`` with ``console.log``.
+    4. Turn the first ``go({...})`` call into ``return go({...})``.
+    5. Inside the text matched from ``function go(`` to the last ``};``,
+       replace each ``setTimeout`` snippet with the output of
+       ``modify_go_func`` (a direct assignment plus ``return``).
+    6. Wrap everything in ``function getCookies(){...}``, compile it with
+       ``execjs`` using ``node_module_path`` as working directory, and call
+       ``getCookies``.
+
+    Args:
+        script_js: JavaScript source containing a ``go({...})`` call and a
+            ``function go(...)`` definition.
+
+    Returns:
+        Whatever the ``getCookies`` call returns.
+
+    Raises:
+        AttributeError: If the ``go({...})`` call or the ``function go``
+            definition cannot be found, or if a matched snippet contains no
+            ``document[...] = ...;`` assignment.
+    """
+    js_header = '''
+            const jsdom = require("jsdom");
+            const { JSDOM } = jsdom;
+            const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
+            window = dom.window;
+            document = window.document;
+        '''
+    # Beautify first: every pattern below depends on the formatted layout.
+    beautify_js = jsbeautifier.beautify(js_header + script_js)
+    # Redirect popup calls to console output.
+    js_script = beautify_js.replace('alert', 'console.log')
+    # Make the go({...}) invocation hand its result back to the wrapper.
+    go_etx = re.search(r'go\(\{.*?\}\)', js_script, flags=re.S).group()
+    js_script = js_script.replace(go_etx, 'return ' + go_etx)
+
+    # Strip the setTimeout handling from the body of function go.
+    go_func = re.search(r'function go\(.*\};', js_script, flags=re.S).group()
+    # Order matters. The last two patterns are matched with DOTALL, the first
+    # two are not; the original author noted that the first pattern can
+    # misfire on input meant for the third.
+    patterns = (
+        (r'\n[ ]+.{10,100}\(setTimeout.*\n[ ]+document.*\n[ ]+location.*?\n.*?\n[ ]+', 0),
+        (r'\n[ ]+setTimeout.*\n[ ]+.*document.*\n[ ]+location.*?\n.*?\n[ ]+', 0),
+        (r'[ ]+.{10,100}\(setTimeout.*[ ]+if \(.*\) \{.*\}.*, _\w{8}\);', re.S),
+        (r'[ ]+setTimeout.*_\w{8}\);', re.S),
+    )
+    go_func_new = go_func
+    for pattern, flags in patterns:
+        # Matches are collected once per pattern, before any replacement.
+        for obj_js in re.findall(pattern, go_func_new, flags=flags):
+            go_func_new = modify_go_func(obj_js, go_func_new)
+
+    js_script = js_script.replace(go_func, go_func_new)
+    js_script = 'function getCookies(){' + js_script + '}'
+    etx = execjs.compile(js_script, cwd=node_module_path)
+    return etx.call('getCookies')
+
+
+def extract_clearance(js_code: str):
+    """Pull the clearance value out of a cookie string.
+
+    The text from the first ``_clearance=`` up to the last ``%3D`` on the
+    same line is taken, and every ``_clearance=`` inside it is removed.
+    For example ``__jsl_clearance=1641259382.821|0|abc%3D;max-age=3600``
+    yields ``1641259382.821|0|abc%3D``.
+
+    NOTE(review): the pattern needs ``=`` directly after ``_clearance``, so
+    a ``__jsl_clearance_s=...`` cookie does not match and gives ``None``.
+    Confirm whether that is intended.
+
+    Args:
+        js_code: Text expected to contain the cookie assignment.
+
+    Returns:
+        The extracted value, or ``None`` when nothing matches.
+    """
+    match = re.search('_clearance=.*%3D', js_code)
+    if match is not None:
+        return match.group().replace('_clearance=', '')
+    return None
+
+
+def extract_cookies_js(js_html: str):
+    """Extract the JavaScript payload from a ``<script>`` element.
+
+    Two patterns are tried in order. The first matches a script of the form
+    ``document.cookie=<expr>;location.href=location.pathname+location.search``
+    and yields only ``<expr>``. The second matches any single-line
+    ``<script>...</script>`` and yields its whole content.
+
+    Args:
+        js_html: Response text to search.
+
+    Returns:
+        The captured JavaScript text, or ``None`` when neither pattern
+        matches.
+    """
+    patterns = (
+        r'<script>document\.cookie=(.*);location\.href=location\.pathname\+location\.search</script>',
+        r'<script>(.*)</script>',
+    )
+    for pattern in patterns:
+        result = re.search(pattern, js_html)
+        if result is not None:
+            return result.group(1)
+    return None
+
+
+def http_session_521(session, url: str, headers: dict, cookies: dict, **kwargs):
+    """POST to *url* and work through a two-stage HTTP 521 script challenge.
+
+    Flow:
+
+    1. POST the request. A status other than 521 is treated as success and
+       the cookies of that response are returned.
+    2. Otherwise evaluate the script from the 521 response with
+       ``execjs.eval``, extract the clearance value, store it in *cookies*
+       under ``__jsl_clearance`` and POST again.
+    3. Run the script from the second response through
+       ``execute_js_script``, extract the clearance value and store it in
+       *cookies* under ``__jsl_clearance``.
+
+    NOTE: the scripts come from the remote server and are executed locally.
+    Only use this against hosts you are prepared to run code from.
+
+    Args:
+        session: Object providing ``post(url, **params)``, e.g. a
+            ``requests.Session``.
+        url: Target URL.
+        headers: Request headers.
+        cookies: Request cookies. This dict is updated in place.
+        **kwargs: Extra keyword arguments forwarded to ``session.post``.
+            ``proxies`` defaults to ``None``; a missing or falsy ``timeout``
+            becomes 60.
+
+    Returns:
+        A ``(ok, session, cookies)`` tuple. On failure ``ok`` is ``False``
+        and the third item is ``None``.
+    """
+    failure = (False, session, None)
+    request_params = {
+        'headers': headers,
+        'cookies': cookies,
+        'proxies': kwargs.pop('proxies', None),
+        'timeout': kwargs.pop('timeout', None) or 60,
+    }
+    request_params.update(kwargs)
+
+    try:
+        resp1 = session.post(url, **request_params)
+    except requests.RequestException:
+        # Typically a proxy timeout.
+        return failure
+    if resp1.status_code != 521:
+        return True, session, dict_from_cookiejar(resp1.cookies)
+
+    cookies_js1 = extract_cookies_js(resp1.text)
+    if cookies_js1 is None:
+        return failure
+
+    try:
+        resp1_jsl_clearance = execjs.eval(cookies_js1)
+        _jsl_clearance = extract_clearance(resp1_jsl_clearance)
+    except Exception:
+        # Best effort: a script that cannot be evaluated is reported as a
+        # failed attempt, like every other failure in this function.
+        return failure
+    if _jsl_clearance is None:
+        return failure
+
+    # request_params['cookies'] is this same dict, so the second request
+    # carries the new value.
+    cookies['__jsl_clearance'] = _jsl_clearance
+    try:
+        resp2 = session.post(url, **request_params)
+    except requests.RequestException:
+        # Typically a proxy timeout.
+        return failure
+
+    cookies_js2 = extract_cookies_js(resp2.text)
+    if cookies_js2 is None:
+        return failure
+    cookies_js2 = cookies_js2.replace('失败', '')
+
+    try:
+        resp2_jsl_clearance_s = execute_js_script(cookies_js2)
+        clearance2 = extract_clearance(resp2_jsl_clearance_s)
+    except Exception:
+        # Fully sojson-obfuscated scripts occasionally cannot be rewritten,
+        # which surfaces here as an exception.
+        return failure
+    if clearance2 is None:
+        # Do not report success with an empty clearance cookie.
+        return failure
+
+    cookies['__jsl_clearance'] = clearance2
+    return True, session, cookies
+
+
+# if __name__ == '__main__':
+#     with open('cookies2.js') as rp:
+#         js_script = rp.read()
+#         cookies_js2 = extract_cookies_js(js_script)
+#
+#         if '失败' in cookies_js2:
+#             js_script = cookies_js2.replace('失败', '')
+#         resp2_jsl_clearance_s = execute_js_script(js_script)
+#         print(resp2_jsl_clearance_s)
+#         # clearance2 = extract_clearance(resp2_jsl_clearance_s)
+#         # resp2_cookies = dict_from_cookiejar(resp2.cookies)
+#         # resp2_cookies.update({'__jsl_clearance_s': clearance2})