"""sessions_521.py

Helpers for requesting URLs that answer with an HTTP 521 JavaScript cookie
challenge: the challenge script is extracted from the response, executed via
execjs, and the resulting ``__jsl_clearance`` value is added to the request
cookies.
"""
import copy
import re
import time

import execjs
import jsbeautifier
import requests
from requests.utils import dict_from_cookiejar

from config.load import node_module_path
  9. def modify_go_func(repl_js: str, js_func: str):
  10. document_code = re.search('document\[.*?\]\s{0,1}=.*?;', repl_js).group()
  11. property_name = re.search('document\[.*?\]\s{0,1}?', document_code).group()
  12. return_back = 'return {};'.format(property_name)
  13. new_js = '\n{a}{b}\n{a}{c}\n{a}'.format(
  14. a=' ' * 6,
  15. b=document_code,
  16. c=return_back
  17. )
  18. return js_func.replace(repl_js, new_js)
  19. def execute_js_script(script_js: str):
  20. js_header = '''
  21. const jsdom = require("jsdom");
  22. const { JSDOM } = jsdom;
  23. const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
  24. window = dom.window;
  25. document = window.document;
  26. '''
  27. js_script = js_header + script_js
  28. # 格式化js代码
  29. beautify_js = jsbeautifier.beautify(js_script)
  30. # 将js代码弹窗事件修改为控制台输出
  31. js_script = beautify_js.replace('alert', 'console.log')
  32. # 将源js脚本代码中go函数回调逻辑修改为 return
  33. go_etx = re.search('go\(\{.*?\}\)', js_script, flags=re.S).group()
  34. return_go_etx = 'return ' + go_etx
  35. js_script = js_script.replace(go_etx, return_go_etx)
  36. # 删除js代码中setTimeout事件
  37. go_func = re.search('function go\(.*\};', js_script, flags=re.S).group()
  38. patterns = {
  39. 'p1': '\n[ ]+.{10,100}\(setTimeout.*\n[ ]+document.*\n[ ]+location.*?\n.*?\n[ ]+',
  40. 'p2': '\n[ ]+setTimeout.*\n[ ]+.*document.*\n[ ]+location.*?\n.*?\n[ ]+',
  41. 'p3': '[ ]+.{10,100}\(setTimeout.*[ ]+if \(.*\) \{.*\}.*, _\w{8}\);',
  42. 'p4': '[ ]+setTimeout.*_\w{8}\);',
  43. }
  44. go_func_new = copy.deepcopy(go_func)
  45. for p, pattern in patterns.items():
  46. if p in ['p3', 'p4']:
  47. # p1 会误判 p3 情况
  48. results = re.findall(pattern, go_func_new, flags=re.S)
  49. else:
  50. results = re.findall(pattern, go_func_new)
  51. if len(results) > 0:
  52. # print(f"清洗规则:{p}")
  53. for obj_js in results:
  54. go_func_new = modify_go_func(obj_js, go_func_new)
  55. js_script = js_script.replace(go_func, go_func_new)
  56. js_script = 'function getCookies(){' + js_script + '}'
  57. # print(js_script)
  58. cwd = node_module_path
  59. etx = execjs.compile(js_script, cwd=cwd)
  60. return etx.call('getCookies')
  61. def extract_clearance(js_code: str):
  62. result = re.search('_clearance=.*%3D', js_code)
  63. '''
  64. __jsl_clearance=
  65. '__jsl_clearance_s=1641259814.553|-1|LlKSd3QgHj0KliuCI5cEMbwU7HU%3D;max-age=3600;path=/'
  66. '__jsl_clearance_s=1641259382.821|0|ThOeD4stO5usoh9oC0MP5%2Fx3SPc%3D'
  67. '''
  68. if result is None:
  69. return None
  70. result = result.group()
  71. result = result.replace('_clearance=', '')
  72. return result
  73. def extract_js_script(js_html: str):
  74. patterns = [
  75. '<script>document\.cookie=(.*);location\.href=location\.pathname\+location\.search</script>',
  76. '<script>(.*)</script>'
  77. ]
  78. for pattern in patterns:
  79. result = re.search(pattern, js_html)
  80. if result is not None:
  81. return result.group(1)
  82. else:
  83. return None
  84. def http_session_521(session, url: str, headers: dict, cookies: dict, **kwargs):
  85. request_params = {}
  86. request_params.setdefault('headers', headers)
  87. request_params.setdefault('cookies', cookies)
  88. request_params.setdefault('proxies', kwargs.pop('proxies', None))
  89. request_params.setdefault('timeout', kwargs.pop('timeout', None) or 60)
  90. for key, val in kwargs.items():
  91. request_params.setdefault(key, val)
  92. try:
  93. resp1 = session.post(url, **request_params)
  94. except requests.RequestException:
  95. # print("代理超时")
  96. return False, session, cookies
  97. else:
  98. if resp1.status_code != 521:
  99. return True, session, dict_from_cookiejar(resp1.cookies)
  100. cookies_js1 = extract_js_script(resp1.text)
  101. if cookies_js1 is None:
  102. return False, session, cookies
  103. time.sleep(0.05)
  104. if cookies_js1.startswith('var'):
  105. cookies_js2 = cookies_js1
  106. else:
  107. resp1_jsl_clearance = execjs.eval(cookies_js1)
  108. _jsl_clearance = extract_clearance(resp1_jsl_clearance)
  109. cookies.update({'__jsl_clearance': _jsl_clearance})
  110. request_params.update({'cookies': cookies})
  111. try:
  112. resp2 = session.post(url, **request_params)
  113. except requests.RequestException:
  114. # print("代理超时")
  115. return False, session, cookies
  116. else:
  117. cookies_js2 = extract_js_script(resp2.text)
  118. if cookies_js2 is None:
  119. return False, session, cookies
  120. if '失败' in cookies_js2:
  121. cookies_js2 = cookies_js2.replace('失败', '')
  122. try:
  123. resp2_jsl_clearance_s = execute_js_script(cookies_js2)
  124. except:
  125. # 偶尔会在sojson全混淆的代码中,无法正常修改js,导致异常发生
  126. return False, session, cookies
  127. clearance2 = extract_clearance(resp2_jsl_clearance_s)
  128. cookies.update({'__jsl_clearance': clearance2})
  129. return True, session, cookies