import copy
import re
import time

import execjs
import jsbeautifier
import requests
from requests.utils import dict_from_cookiejar

from config.load import node_module_path

# setTimeout clean-up rules applied, in this order, to the ``go`` function.
# Each entry is (rule name, regex, flags).  p3/p4 need DOTALL; p1/p2 must NOT
# use it, otherwise p1 would also swallow the code that p3 is meant to handle.
_SET_TIMEOUT_RULES = (
    ('p1', r'\n[ ]+.{10,100}\(setTimeout.*\n[ ]+document.*\n[ ]+location.*?\n.*?\n[ ]+', 0),
    ('p2', r'\n[ ]+setTimeout.*\n[ ]+.*document.*\n[ ]+location.*?\n.*?\n[ ]+', 0),
    ('p3', r'[ ]+.{10,100}\(setTimeout.*[ ]+if \(.*\) \{.*\}.*, _\w{8}\);', re.S),
    ('p4', r'[ ]+setTimeout.*_\w{8}\);', re.S),
)

# NOTE(review): in the reviewed source both patterns were EMPTY strings (they
# look like HTML-tag text that was stripped somewhere upstream).  An empty
# pattern has no capture group, so ``group(1)`` always raised IndexError.
# The patterns below are reconstructed from how http_session_521 consumes the
# result (either an expression evaluated by ``execjs.eval`` or a script body
# starting with ``var``) -- confirm against the upstream project.
_SCRIPT_PATTERNS = (
    r'<script>document\.cookie=(.*?);location',
    r'<script>(.*?)</script>',
)


def modify_go_func(repl_js: str, js_func: str):
    """Replace a setTimeout snippet in ``js_func`` with a plain ``return``.

    The ``document[...] = ...;`` assignment found inside ``repl_js`` is kept,
    followed by ``return document[...]``; that pair replaces every occurrence
    of ``repl_js`` in ``js_func``.

    :param repl_js: snippet (a regex match from the ``go`` function) to replace.
    :param js_func: source text of the ``go`` function.
    :return: ``js_func`` with ``repl_js`` substituted.
    :raises AttributeError: if ``repl_js`` has no ``document[...] = ...;``
        assignment (``re.search`` returns ``None``).
    """
    document_code = re.search(r'document\[.*?\]\s{0,1}=.*?;', repl_js).group()
    property_name = re.search(r'document\[.*?\]\s{0,1}?', document_code).group()
    return_back = 'return {};'.format(property_name)
    # Six-space indent mirrors the layout of the beautified source.
    new_js = '\n{a}{b}\n{a}{c}\n{a}'.format(
        a=' ' * 6,
        b=document_code,
        c=return_back,
    )
    return js_func.replace(repl_js, new_js)


def execute_js_script(script_js: str):
    """Rewrite the challenge script so it returns its value, then run it.

    The script is prefixed with a jsdom bootstrap, beautified, patched
    (``alert`` -> ``console.log``, ``go({...})`` -> ``return go({...})``,
    setTimeout snippets inside ``function go`` replaced by a ``return``),
    wrapped in ``function getCookies(){...}`` and executed through execjs
    with ``cwd=node_module_path``.

    :param script_js: JavaScript source of the challenge.
    :return: whatever ``getCookies()`` returns in the JS runtime.
    :raises AttributeError: if the ``go({...})`` call or ``function go`` cannot
        be located in the beautified script.
    """
    # Runtime string kept exactly as it appears in the reviewed source.
    js_header = (
        ' const jsdom = require("jsdom"); const { JSDOM } = jsdom;'
        ' const dom = new JSDOM(`\nHello world\n`);'
        ' window = dom.window; document = window.document; '
    )
    # Beautify first: the clean-up regexes depend on the beautified layout.
    js_script = jsbeautifier.beautify(js_header + script_js)
    # Turn alert pop-ups into console output.
    js_script = js_script.replace('alert', 'console.log')

    # Make the go({...}) invocation return its result.
    go_call = re.search(r'go\(\{.*?\}\)', js_script, flags=re.S).group()
    js_script = js_script.replace(go_call, 'return ' + go_call)

    # Strip the setTimeout logic out of the go function.
    go_func = re.search(r'function go\(.*\};', js_script, flags=re.S).group()
    go_func_new = go_func  # str is immutable, no copy needed
    for _name, pattern, flags in _SET_TIMEOUT_RULES:
        for snippet in re.findall(pattern, go_func_new, flags=flags):
            go_func_new = modify_go_func(snippet, go_func_new)
    js_script = js_script.replace(go_func, go_func_new)

    js_script = 'function getCookies(){' + js_script + '}'
    ctx = execjs.compile(js_script, cwd=node_module_path)
    return ctx.call('getCookies')


def extract_clearance(js_code: str):
    """Extract the clearance value from a cookie string.

    Searches for ``_clearance=...%3D`` and returns the match with the
    ``_clearance=`` prefix removed.

    Example cookie strings noted by the original author:
        '__jsl_clearance_s=1641259814.553|-1|LlKSd3QgHj0KliuCI5cEMbwU7HU%3D;max-age=3600;path=/'
        '__jsl_clearance_s=1641259382.821|0|ThOeD4stO5usoh9oC0MP5%2Fx3SPc%3D'

    NOTE(review): the regex requires ``=`` right after ``_clearance``, so the
    ``__jsl_clearance_s=`` form shown above does not match as written --
    confirm which cookie name the target sites actually emit.

    :param js_code: text produced by the executed challenge script.
    :return: the clearance value, or ``None`` if ``js_code`` is not a string
        or contains no match.
    """
    if not isinstance(js_code, str):
        # The JS runtime may hand back a non-string (e.g. None).
        return None
    found = re.search('_clearance=.*%3D', js_code)
    if found is None:
        return None
    return found.group().replace('_clearance=', '')


def extract_js_script(js_html: str):
    """Return the first capture of the first matching script pattern.

    Every pattern in ``_SCRIPT_PATTERNS`` is tried in order.

    :param js_html: response body of the challenge page.
    :return: captured JavaScript text, or ``None`` if no pattern matches.
    """
    for pattern in _SCRIPT_PATTERNS:
        found = re.search(pattern, js_html)
        if found is not None:
            return found.group(1)
    return None


def http_session_521(session, url: str, headers: dict, cookies: dict, **kwargs):
    """POST to ``url`` and, on HTTP 521, compute the ``__jsl_clearance`` cookie.

    Flow:
      1. POST once.  Any status other than 521 is a success and the response
         cookies are returned.
      2. On 521, extract the script.  If it does not start with ``var`` it is
         evaluated, the resulting clearance is stored in ``cookies`` and the
         request is repeated to obtain the second script.
      3. The second script is executed and its clearance stored in ``cookies``.

    ``cookies`` is updated in place (and also returned).

    :param session: object exposing ``post(url, **params)``
        (presumably a ``requests.Session``).
    :param url: target URL.
    :param headers: request headers.
    :param cookies: request cookies; mutated with ``__jsl_clearance``.
    :param kwargs: extra arguments forwarded to ``session.post``; ``proxies``
        and ``timeout`` (default 60) are recognised explicitly.
    :return: ``(ok, session, cookies)``.  ``ok`` is ``False`` on request
        errors, when no script can be extracted, when script execution fails,
        or when no clearance value can be extracted.
    """
    request_params = {
        'headers': headers,
        'cookies': cookies,
        'proxies': kwargs.pop('proxies', None),
        'timeout': kwargs.pop('timeout', None) or 60,
    }
    for key, val in kwargs.items():
        request_params.setdefault(key, val)

    try:
        resp1 = session.post(url, **request_params)
    except requests.RequestException:
        # e.g. proxy timeout
        return False, session, cookies
    if resp1.status_code != 521:
        return True, session, dict_from_cookiejar(resp1.cookies)

    cookies_js1 = extract_js_script(resp1.text)
    if cookies_js1 is None:
        return False, session, cookies
    time.sleep(0.05)

    if cookies_js1.startswith('var'):
        # The first response already carries the second-stage script.
        cookies_js2 = cookies_js1
    else:
        # SECURITY: this executes JavaScript received from the remote server
        # in the local JS runtime; only use against hosts you trust.
        try:
            resp1_jsl_clearance = execjs.eval(cookies_js1)
        except Exception:
            return False, session, cookies
        first_clearance = extract_clearance(resp1_jsl_clearance)
        if first_clearance is None:
            return False, session, cookies
        cookies.update({'__jsl_clearance': first_clearance})
        request_params.update({'cookies': cookies})
        try:
            resp2 = session.post(url, **request_params)
        except requests.RequestException:
            # e.g. proxy timeout
            return False, session, cookies
        cookies_js2 = extract_js_script(resp2.text)
        if cookies_js2 is None:
            return False, session, cookies

    if '失败' in cookies_js2:
        cookies_js2 = cookies_js2.replace('失败', '')
    try:
        resp2_jsl_clearance_s = execute_js_script(cookies_js2)
    except Exception:
        # Fully sojson-obfuscated scripts occasionally cannot be patched,
        # which makes the rewrite or the execution raise.
        return False, session, cookies

    clearance2 = extract_clearance(resp2_jsl_clearance_s)
    if clearance2 is None:
        # Do not report success with an empty clearance cookie.
        return False, session, cookies
    cookies.update({'__jsl_clearance': clearance2})
    return True, session, cookies