|
@@ -0,0 +1,173 @@
|
|
|
+import copy
|
|
|
+import re
|
|
|
+import subprocess
|
|
|
+
|
|
|
+import execjs
|
|
|
+import jsbeautifier
|
|
|
+import requests
|
|
|
+from requests.utils import dict_from_cookiejar
|
|
|
+
|
|
|
+from config.load import node_module_path
|
|
|
+
|
|
|
+
|
|
|
def save_js_script(js_code: str, allow_beautify_code=False, path: str = 'etx.js'):
    """Write JavaScript source text to a file, optionally beautified first.

    Args:
        js_code: JavaScript source text to persist.
        allow_beautify_code: When true, ``js_code`` is passed through
            ``jsbeautifier.beautify`` and only the beautified text is written.
            When false, ``js_code`` is written unchanged.
        path: Destination file. Defaults to ``'etx.js'`` (relative to the
            current working directory), which is the historical location.
    """
    # Pick exactly one representation so the file never ends up containing
    # the beautified text followed by a second, raw copy of the same script.
    if allow_beautify_code:
        content = jsbeautifier.beautify(js_code)
    else:
        content = js_code

    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)
|
|
|
+
|
|
|
+
|
|
|
def load_js_script(path: str = 'etx.js'):
    """Read a JavaScript file and return its full text.

    Args:
        path: File to read. Defaults to ``'etx.js'`` (relative to the current
            working directory), which is the historical location.

    Returns:
        The file content decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(path, encoding='utf-8') as f:
        return f.read()
|
|
|
+
|
|
|
+
|
|
|
def modify_go_func(repl_js: str, js_func: str):
    """Replace a JS fragment with its ``document[...]`` assignment plus a return.

    The first ``document[...] = ...;`` statement found inside ``repl_js`` is
    kept, followed by ``return document[...];`` for the same assignment
    target. Every occurrence of ``repl_js`` inside ``js_func`` is then
    replaced by those two statements.

    Args:
        repl_js: The fragment of ``js_func`` to be replaced; it must contain a
            ``document[...] = ...;`` statement.
        js_func: The JavaScript text in which the replacement is performed.

    Returns:
        ``js_func`` with ``repl_js`` replaced.

    Raises:
        AttributeError: If ``repl_js`` contains no ``document[...] = ...;``
            statement (the regex search yields ``None``).
    """
    # Raw string avoids invalid-escape warnings for ``\[`` and ``\s``. The
    # capture group yields the assignment target from the same match, so the
    # returned expression is always exactly what was assigned to.
    match = re.search(r'(document\[.*?\])\s{0,1}=.*?;', repl_js)
    document_code = match.group()
    property_name = match.group(1)

    indent = ' ' * 6
    new_js = '\n{a}{b}\n{a}{c}\n{a}'.format(
        a=indent,
        b=document_code,
        c='return {};'.format(property_name),
    )
    return js_func.replace(repl_js, new_js)
|
|
|
+
|
|
|
+
|
|
|
def execute_js_script(script_js: str):
    """Rewrite an obfuscated cookie script so it returns its value, then run it.

    Steps performed on ``script_js``:

    1. Prepend a jsdom bootstrap that defines ``window`` and ``document``.
    2. Beautify the script and replace every ``alert`` with ``console.log``.
    3. Prefix the ``go({...})`` call with ``return``.
    4. Inside ``function go(...)``, replace ``setTimeout`` fragments with the
       ``document[...]`` assignment plus a ``return`` (via ``modify_go_func``).
    5. Wrap everything in ``function getCookies(){...}``, compile it with
       ``execjs`` (working directory ``node_module_path``) and call it.

    Args:
        script_js: JavaScript source containing a ``go({...})`` call and a
            ``function go(...)`` definition.

    Returns:
        Whatever ``getCookies`` returns in the JS runtime.

    Raises:
        AttributeError: If the ``go({...})`` call or the ``function go``
            definition cannot be located (the regex search yields ``None``).
    """
    js_header = '''
    const jsdom = require("jsdom");
    const { JSDOM } = jsdom;
    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
    window = dom.window;
    document = window.document;
    '''
    # Format the JS code; several patterns below match on newlines and
    # leading spaces.
    js_script = jsbeautifier.beautify(js_header + script_js)
    # Turn pop-up alerts into console output.
    js_script = js_script.replace('alert', 'console.log')

    # Change the go(...) invocation in the source script into a `return`.
    go_etx = re.search(r'go\(\{.*?\}\)', js_script, flags=re.S).group()
    js_script = js_script.replace(go_etx, 'return ' + go_etx)

    # Remove the setTimeout handling from the go function.
    go_func = re.search(r'function go\(.*\};', js_script, flags=re.S).group()
    # (pattern, regex flags), applied in this order. The last two use
    # re.DOTALL. Note from the original author: p1 can mis-detect the p3 case.
    patterns = (
        # p1
        (r'\n[ ]+.{10,100}\(setTimeout.*\n[ ]+document.*\n[ ]+location.*?\n.*?\n[ ]+', 0),
        # p2
        (r'\n[ ]+setTimeout.*\n[ ]+.*document.*\n[ ]+location.*?\n.*?\n[ ]+', 0),
        # p3
        (r'[ ]+.{10,100}\(setTimeout.*[ ]+if \(.*\) \{.*\}.*, _\w{8}\);', re.S),
        # p4
        (r'[ ]+setTimeout.*_\w{8}\);', re.S),
    )
    # Strings are immutable, so a plain rebinding is enough (no deepcopy).
    go_func_new = go_func
    for pattern, flags in patterns:
        # Matches are collected before any replacement for this pattern.
        for obj_js in re.findall(pattern, go_func_new, flags=flags):
            go_func_new = modify_go_func(obj_js, go_func_new)

    js_script = js_script.replace(go_func, go_func_new)
    js_script = 'function getCookies(){' + js_script + '}'
    etx = execjs.compile(js_script, cwd=node_module_path)
    return etx.call('getCookies')
|
|
|
+
|
|
|
+
|
|
|
def extract_clearance(js_code: str):
    """Extract the clearance cookie value from a cookie string.

    Both cookie names are recognised, e.g.::

        '__jsl_clearance=1641259382.821|0|ThOeD4stO5usoh9oC0MP5%2Fx3SPc%3D'
        '__jsl_clearance_s=1641259814.553|-1|LlKSd3QgHj0KliuCI5cEMbwU7HU%3D;max-age=3600;path=/'

    Args:
        js_code: Text containing ``..._clearance=<value>`` or
            ``..._clearance_s=<value>`` where the value ends in ``%3D``.

    Returns:
        The value up to and including the last ``%3D`` on the same line, or
        ``None`` when the input is empty or no such cookie is present.
    """
    if not js_code:
        return None
    # The optional ``_s`` covers the ``__jsl_clearance_s`` variant; the greedy
    # ``.*`` keeps everything up to the final ``%3D`` (URL-encoded ``=``).
    match = re.search(r'_clearance(?:_s)?=(.*%3D)', js_code)
    if match is None:
        return None
    return match.group(1)
|
|
|
+
|
|
|
+
|
|
|
def extract_cookies_js(js_html: str):
    """Pull the JavaScript payload out of a single-line ``<script>`` response.

    Two shapes are tried in order:

    1. ``<script>document.cookie=<expr>;location.href=location.pathname+location.search</script>``
       - returns ``<expr>``.
    2. ``<script><body></script>`` - returns ``<body>``.

    Args:
        js_html: Response text to search.

    Returns:
        The captured JavaScript text from the first shape that matches, or
        ``None`` when neither matches.
    """
    patterns = (
        r'<script>document\.cookie=(.*);location\.href=location\.pathname\+location\.search</script>',
        r'<script>(.*)</script>',
    )
    for pattern in patterns:
        match = re.search(pattern, js_html)
        if match is not None:
            return match.group(1)
    # Only give up after every pattern has been tried.
    return None
|
|
|
+
|
|
|
+
|
|
|
def http_session_521(session, url: str, headers: dict, cookies: dict, **kwargs):
    """POST to ``url`` and, on an HTTP 521 challenge, compute the clearance cookie.

    Flow:

    1. POST once. Any status other than 521 is treated as success and the
       response cookies are returned.
    2. On 521, evaluate the cookie expression from the first response, store
       the resulting value under ``'__jsl_clearance'`` and POST again.
    3. Run the script from the second response through ``execute_js_script``
       and store that value under ``'__jsl_clearance'`` as well.

    Args:
        session: Object exposing ``post(url, **params)``; presumably a
            ``requests.Session`` - verify against callers.
        url: Target URL.
        headers: Request headers.
        cookies: Request cookies. NOTE: this dict is updated in place and is
            the object returned on the challenge path.
        **kwargs: Extra arguments forwarded to ``session.post``. ``proxies``
            defaults to ``None``; ``timeout`` defaults to 60 when missing or
            falsy.

    Returns:
        A ``(ok, session, cookies)`` tuple. ``cookies`` is ``None`` when
        ``ok`` is false.
    """
    proxies = kwargs.pop('proxies', None)
    timeout = kwargs.pop('timeout', None) or 60
    request_params = {
        'headers': headers,
        'cookies': cookies,
        'proxies': proxies,
        'timeout': timeout,
    }
    request_params.update(kwargs)

    try:
        resp1 = session.post(url, **request_params)
    except requests.RequestException:
        # e.g. the proxy timed out
        return False, session, None

    if resp1.status_code != 521:
        return True, session, dict_from_cookiejar(resp1.cookies)

    cookies_js1 = extract_cookies_js(resp1.text)
    if cookies_js1 is None:
        return False, session, None

    # SECURITY NOTE: this evaluates JavaScript received from the remote server.
    resp1_jsl_clearance = execjs.eval(cookies_js1)
    _jsl_clearance = extract_clearance(resp1_jsl_clearance)
    cookies.update({'__jsl_clearance': _jsl_clearance})
    request_params.update({'cookies': cookies})

    try:
        resp2 = session.post(url, **request_params)
    except requests.RequestException:
        # e.g. the proxy timed out
        return False, session, None

    cookies_js2 = extract_cookies_js(resp2.text)
    if cookies_js2 is None:
        return False, session, None

    if '失败' in cookies_js2:
        cookies_js2 = cookies_js2.replace('失败', '')

    try:
        resp2_jsl_clearance_s = execute_js_script(cookies_js2)
    except Exception:
        # Occasionally, with fully sojson-obfuscated code, the JS cannot be
        # rewritten correctly and an exception is raised. Treat that as a
        # failed attempt, but let KeyboardInterrupt/SystemExit propagate.
        return False, session, None

    clearance2 = extract_clearance(resp2_jsl_clearance_s)
    cookies.update({'__jsl_clearance': clearance2})
    return True, session, cookies
|
|
|
+
|
|
|
+
|
|
|
+# if __name__ == '__main__':
|
|
|
+# with open('cookies2.js') as rp:
|
|
|
+# js_script = rp.read()
|
|
|
+# cookies_js2 = extract_cookies_js(js_script)
|
|
|
+#
|
|
|
+# if '失败' in cookies_js2:
|
|
|
+# js_script = cookies_js2.replace('失败', '')
|
|
|
+# resp2_jsl_clearance_s = execute_js_script(js_script)
|
|
|
+# print(resp2_jsl_clearance_s)
|
|
|
+# # clearance2 = extract_clearance(resp2_jsl_clearance_s)
|
|
|
+# # resp2_cookies = dict_from_cookiejar(resp2.cookies)
|
|
|
+# # resp2_cookies.update({'__jsl_clearance_s': clearance2})
|