123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- import copy
- import re
- import time
- import execjs
- import jsbeautifier
- import requests
- from requests.utils import dict_from_cookiejar
- from config.load import node_module_path
def modify_go_func(repl_js: str, js_func: str) -> str:
    """Replace a JS fragment with its bare ``document[...]`` assignment plus a return.

    The first ``document[...] = ...;`` statement found in ``repl_js`` is kept,
    a ``return document[...];`` for the same property is appended, and every
    occurrence of ``repl_js`` inside ``js_func`` is swapped for that pair.

    Args:
        repl_js: The fragment to remove from ``js_func``; it must contain a
            ``document[...] = ...;`` assignment.
        js_func: The JavaScript source in which the fragment is replaced.

    Returns:
        ``js_func`` with ``repl_js`` replaced by the assignment and the
        ``return`` statement, each on its own line indented by six spaces.

    Raises:
        AttributeError: If ``repl_js`` contains no ``document[...] = ...;``
            assignment (the regex search yields ``None``).
    """
    # Raw strings keep ``\[`` and ``\s`` as regex escapes rather than invalid
    # Python string escapes.
    document_code = re.search(r'document\[.*?\]\s{0,1}=.*?;', repl_js).group()
    # The trailing lazy ``\s{0,1}?`` never consumes whitespace, so this yields
    # exactly ``document[...]``.
    property_name = re.search(r'document\[.*?\]\s{0,1}?', document_code).group()
    return_back = 'return {};'.format(property_name)

    indent = ' ' * 6
    new_js = '\n{a}{b}\n{a}{c}\n{a}'.format(
        a=indent,
        b=document_code,
        c=return_back,
    )
    return js_func.replace(repl_js, new_js)
def execute_js_script(script_js: str):
    """Rewrite a challenge script so it returns its cookie, then run it in Node.

    The script is prefixed with a jsdom bootstrap that provides ``window`` and
    ``document``, beautified, patched so that the ``go({...})`` call and the
    ``setTimeout`` sections of ``function go`` return the cookie value
    instead of scheduling a redirect, wrapped in ``getCookies()`` and executed
    through execjs.

    Args:
        script_js: JavaScript source containing a ``function go(...)``
            definition and a ``go({...})`` call.

    Returns:
        Whatever ``getCookies()`` returns when evaluated by execjs.

    Raises:
        AttributeError: If the script has no ``go({...})`` call or no
            ``function go(...)`` definition, or a matched ``setTimeout``
            fragment has no ``document[...] = ...;`` assignment.
    """
    js_header = '''
    const jsdom = require("jsdom");
    const { JSDOM } = jsdom;
    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
    window = dom.window;
    document = window.document;
    '''
    # Beautify first: the cleanup patterns below rely on the line layout
    # produced by the formatter.
    js_script = jsbeautifier.beautify(js_header + script_js)
    # Turn alert popups into console output.
    js_script = js_script.replace('alert', 'console.log')

    # Make the top-level go({...}) invocation return its result.
    go_call = re.search(r'go\(\{.*?\}\)', js_script, flags=re.S).group()
    js_script = js_script.replace(go_call, 'return ' + go_call)

    # Strip the setTimeout handling out of the go function.
    go_func = re.search(r'function go\(.*\};', js_script, flags=re.S).group()
    # (rule name, pattern, regex flags), applied in this order. Only p3/p4 use
    # DOTALL; the original author notes that p1 would otherwise misfire on the
    # p3 case.
    cleanup_rules = (
        ('p1', r'\n[ ]+.{10,100}\(setTimeout.*\n[ ]+document.*\n[ ]+location.*?\n.*?\n[ ]+', 0),
        ('p2', r'\n[ ]+setTimeout.*\n[ ]+.*document.*\n[ ]+location.*?\n.*?\n[ ]+', 0),
        ('p3', r'[ ]+.{10,100}\(setTimeout.*[ ]+if \(.*\) \{.*\}.*, _\w{8}\);', re.S),
        ('p4', r'[ ]+setTimeout.*_\w{8}\);', re.S),
    )
    # Strings are immutable, so a plain rebinding is enough (no deepcopy).
    go_func_new = go_func
    for _rule_name, pattern, flags in cleanup_rules:
        # Matches are collected once per rule, then rewritten one by one.
        for fragment in re.findall(pattern, go_func_new, flags=flags):
            go_func_new = modify_go_func(fragment, go_func_new)
    js_script = js_script.replace(go_func, go_func_new)

    # Wrap everything in a callable so the patched `return` statements are legal.
    js_script = 'function getCookies(){' + js_script + '}'
    # execjs runs in this directory; presumably it holds the jsdom install -- confirm.
    runtime = execjs.compile(js_script, cwd=node_module_path)
    return runtime.call('getCookies')
def extract_clearance(js_code: str):
    """Extract the clearance cookie value from a cookie string.

    Handles both the ``__jsl_clearance=`` and ``__jsl_clearance_s=`` cookie
    names, for example::

        '__jsl_clearance_s=1641259814.553|-1|LlKSd3QgHj0KliuCI5cEMbwU7HU%3D;max-age=3600;path=/'
        '__jsl_clearance_s=1641259382.821|0|ThOeD4stO5usoh9oC0MP5%2Fx3SPc%3D'

    Args:
        js_code: Text containing the cookie assignment.

    Returns:
        The text between the cookie name's ``=`` and the last ``%3D``
        (inclusive) on the same line, or ``None`` when ``js_code`` is not a
        string or holds no such cookie.
    """
    # The JS runtime may hand back a non-string (e.g. None); treat as no match.
    if not isinstance(js_code, str):
        return None
    # The optional ``_s`` suffix covers the ``__jsl_clearance_s`` variant,
    # which a bare ``_clearance=`` pattern cannot match.
    found = re.search(r'_clearance(?:_s)?=(.*%3D)', js_code)
    if found is None:
        return None
    return found.group(1)
def extract_js_script(js_html: str):
    """Pull the JavaScript payload out of a challenge page.

    Two page layouts are tried in order: an inline ``document.cookie=...``
    assignment followed by a reload, then any single-line ``<script>`` body.

    Args:
        js_html: HTML text of the response.

    Returns:
        The captured JavaScript from the first layout that matches, or
        ``None`` when neither matches.
    """
    patterns = (
        r'<script>document\.cookie=(.*);location\.href=location\.pathname\+location\.search</script>',
        r'<script>(.*)</script>',
    )
    for pattern in patterns:
        found = re.search(pattern, js_html)
        if found is not None:
            return found.group(1)
    # Give up only after every layout has been tried.
    return None
def http_session_521(session, url: str, headers: dict, cookies: dict, **kwargs):
    """POST to ``url`` and, on a 521 response, solve the JavaScript cookie challenge.

    Args:
        session: Object exposing a requests-style ``post(url, **params)``.
        url: Target URL.
        headers: Request headers.
        cookies: Request cookies. This dict is updated in place with the
            computed ``__jsl_clearance`` value.
        **kwargs: Extra keyword arguments forwarded to ``session.post``.
            ``proxies`` defaults to ``None``; ``timeout`` defaults to 60 when
            missing or falsy.

    Returns:
        A ``(ok, session, cookies)`` tuple. When the first response is not a
        521, ``ok`` is True and ``cookies`` holds that response's cookies.
        When the challenge is solved, ``ok`` is True and ``cookies`` is the
        updated input dict. On a request error, a page without a script, or a
        failure while running the challenge script, ``ok`` is False and
        ``cookies`` is the input dict.
    """
    request_params = {
        'headers': headers,
        'cookies': cookies,
        'proxies': kwargs.pop('proxies', None),
        'timeout': kwargs.pop('timeout', None) or 60,
    }
    for key, val in kwargs.items():
        request_params.setdefault(key, val)

    try:
        resp1 = session.post(url, **request_params)
    except requests.RequestException:
        # e.g. proxy timeout
        return False, session, cookies

    if resp1.status_code != 521:
        return True, session, dict_from_cookiejar(resp1.cookies)

    cookies_js1 = extract_js_script(resp1.text)
    if cookies_js1 is None:
        return False, session, cookies

    time.sleep(0.05)
    if cookies_js1.startswith('var'):
        # The first response already carries the full challenge script.
        cookies_js2 = cookies_js1
    else:
        # NOTE(review): this evaluates JavaScript supplied by the remote server.
        resp1_jsl_clearance = execjs.eval(cookies_js1)
        _jsl_clearance = extract_clearance(resp1_jsl_clearance)
        cookies.update({'__jsl_clearance': _jsl_clearance})
        request_params.update({'cookies': cookies})
        try:
            resp2 = session.post(url, **request_params)
        except requests.RequestException:
            # e.g. proxy timeout
            return False, session, cookies
        cookies_js2 = extract_js_script(resp2.text)

    if cookies_js2 is None:
        return False, session, cookies

    # str.replace is a no-op when the marker is absent, so no membership test.
    cookies_js2 = cookies_js2.replace('失败', '')
    try:
        resp2_jsl_clearance_s = execute_js_script(cookies_js2)
    except Exception:
        # Fully sojson-obfuscated scripts occasionally cannot be patched, which
        # raises while rewriting or running the JS. Catching Exception (not a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        return False, session, cookies

    clearance2 = extract_clearance(resp2_jsl_clearance_s)
    # NOTE(review): clearance2 may be None when no cookie was found; it is
    # still stored and reported as success, as before -- confirm intended.
    cookies.update({'__jsl_clearance': clearance2})
    return True, session, cookies
|