sessions_521.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. import copy
  2. import json
  3. import re
  4. import time
  5. from pathlib import Path
  6. import execjs
  7. import jsbeautifier
  8. import requests
  9. from requests.utils import dict_from_cookiejar
  10. from config.load import node_module_path
  11. from jsl import jsl
  12. def save_js_script(js_code: str, allow_beautify_code=False):
  13. with open('etx.js', 'w', encoding='utf-8') as f:
  14. if allow_beautify_code:
  15. f.write(jsbeautifier.beautify(js_code)) # 解压缩js代码
  16. f.write(js_code)
  17. def load_js_script():
  18. with open('etx.js', encoding='utf-8') as f:
  19. return f.read()
  20. def modify_go_func(repl_js: str, js_func: str):
  21. document_code = re.search('document\[.*?\]\s{0,1}=.*?;', repl_js).group()
  22. property_name = re.search('document\[.*?\]\s{0,1}?', document_code).group()
  23. return_back = 'return {};'.format(property_name)
  24. new_js = '\n{a}{b}\n{a}{c}\n{a}'.format(
  25. a=' ' * 6,
  26. b=document_code,
  27. c=return_back
  28. )
  29. return js_func.replace(repl_js, new_js)
  30. def execute_js_script(script_js: str):
  31. js_header = '''
  32. const jsdom = require("jsdom");
  33. const { JSDOM } = jsdom;
  34. const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
  35. window = dom.window;
  36. document = window.document;
  37. '''
  38. js_script = js_header + script_js
  39. # 格式化js代码
  40. beautify_js = jsbeautifier.beautify(js_script)
  41. # 将js代码弹窗事件修改为控制台输出
  42. js_script = beautify_js.replace('alert', 'console.log')
  43. # 将源js脚本代码中go函数回调逻辑修改为 return
  44. go_etx = re.search('go\(\{.*?\}\)', js_script, flags=re.S).group()
  45. return_go_etx = 'return ' + go_etx
  46. js_script = js_script.replace(go_etx, return_go_etx)
  47. # 替换js代码中setTimeout事件
  48. go_func = re.search('function go\(.*\};', js_script, flags=re.S).group()
  49. patterns = {
  50. 'p1': '\n[ ]+.{10,100}\(setTimeout.*\n[ ]+document.*\n[ ]+location.*?\n.*?\n[ ]+',
  51. 'p2': '\n[ ]+setTimeout.*\n[ ]+.*document.*\n[ ]+location.*?\n.*?\n[ ]+',
  52. 'p3': '[ ]+.{10,100}\(setTimeout.*[ ]+if \(.*\) \{.*\}.*, _\w{8}\);',
  53. 'p4': '[ ]+setTimeout.*_\w{8}\);',
  54. }
  55. go_func_new = copy.deepcopy(go_func)
  56. for p, pattern in patterns.items():
  57. if p in ['p3', 'p4']:
  58. # p1 会误判 p3 情况
  59. results = re.findall(pattern, go_func_new, flags=re.S)
  60. else:
  61. results = re.findall(pattern, go_func_new)
  62. if len(results) > 0:
  63. # print(f"清洗规则:{p}")
  64. for obj_js in results:
  65. go_func_new = modify_go_func(obj_js, go_func_new)
  66. js_script = js_script.replace(go_func, go_func_new)
  67. js_script = 'function getCookies(){' + js_script + '}'
  68. # print(js_script)
  69. cwd = node_module_path
  70. etx = execjs.compile(js_script, cwd=cwd)
  71. return etx.call('getCookies')
  72. def extract_clearance(js_code: str):
  73. result = re.search('_s=.*%3D', js_code)
  74. '''
  75. '__jsl_clearance_s=1641259814.553|-1|LlKSd3QgHj0KliuCI5cEMbwU7HU%3D;max-age=3600;path=/'
  76. '__jsl_clearance_s=1641259382.821|0|ThOeD4stO5usoh9oC0MP5%2Fx3SPc%3D'
  77. '''
  78. if result is None:
  79. return None
  80. result = result.group()
  81. result = result.replace('_s=', '')
  82. return result
  83. def extract_cookies_js(js_html: str):
  84. patterns = [
  85. '<script>document\.cookie=(.*);location\.href=location\.pathname\+location\.search</script>',
  86. '<script>(.*)</script>'
  87. ]
  88. for pattern in patterns:
  89. result = re.search(pattern, js_html)
  90. if result is not None:
  91. return result.group(1)
  92. else:
  93. return None
  94. def http_session_521_old(url: str, headers: dict, **kwargs):
  95. if 'Cookie' in headers:
  96. del headers['Cookie']
  97. if 'cookies' in kwargs:
  98. del kwargs['cookies']
  99. request_params = {}
  100. request_params.setdefault('proxies', kwargs.get('proxies'))
  101. request_params.setdefault('timeout', kwargs.get('timeout') or 60)
  102. http_session = requests.Session()
  103. try:
  104. resp1 = http_session.get(url, headers=headers, **request_params)
  105. except requests.RequestException:
  106. # print("代理超时")
  107. return False, http_session, None
  108. else:
  109. if resp1.status_code != 521:
  110. # print(dict_from_cookiejar(resp1.cookies))
  111. return True, http_session, dict_from_cookiejar(resp1.cookies)
  112. cookies_js1 = extract_cookies_js(resp1.text)
  113. if cookies_js1 is None:
  114. return False, http_session, None
  115. resp1_jsl_clearance_s = execjs.eval(cookies_js1)
  116. clearance1 = extract_clearance(resp1_jsl_clearance_s)
  117. resp1_cookies = dict_from_cookiejar(resp1.cookies)
  118. resp1_cookies.update({'__jsl_clearance_s': clearance1})
  119. try:
  120. resp2 = http_session.get(url, headers=headers, cookies=resp1_cookies, **request_params)
  121. except requests.RequestException:
  122. # print("代理超时")
  123. return False, http_session, None
  124. else:
  125. cookies_js2 = extract_cookies_js(resp2.text)
  126. if cookies_js2 is None:
  127. return False, http_session, None
  128. # save_js_script(cookies_js2)
  129. js_script = cookies_js2.replace('失败', '')
  130. resp2_jsl_clearance_s = execute_js_script(js_script)
  131. clearance2 = extract_clearance(resp2_jsl_clearance_s)
  132. resp2_cookies = dict_from_cookiejar(resp2.cookies)
  133. resp2_cookies.update({'__jsl_clearance_s': clearance2})
  134. return True, http_session, resp2_cookies
  135. def create_cookie_old(page_url, headers, proxies=None, is_save_js=False):
  136. retry = 0
  137. while (retry := retry + 1) < 10:
  138. try:
  139. session = requests.Session()
  140. session.proxies = proxies
  141. start_url = page_url
  142. res = session.get(start_url, headers=headers, timeout=60, verify=False)
  143. js_func = "".join(re.findall("document.cookie=(.*?)location.href", res.text))
  144. js_func = 'function sd() { return ' + js_func + "}"
  145. ctx = execjs.compile(js_func)
  146. sss = ctx.call("sd")
  147. cookie = {}
  148. for temp, index in res.cookies.get_dict().items():
  149. cookie[temp] = index
  150. for item in sss.split(";"):
  151. if '=' in item:
  152. cookie[item.split("=")[0]] = item.split("=")[-1]
  153. res = session.get(start_url, cookies=cookie, headers=headers, timeout=60, verify=False)
  154. html_str = res.content.decode()
  155. if "<!DOCTYPE html>" in html_str:
  156. html_str = re.sub("<!DOCTYPE html>[\s\S]*?</html>", "", html_str.strip(), re.S)
  157. if is_save_js:
  158. with open('./source_code.js', 'w+', encoding='utf-8') as f:
  159. f.write(html_str)
  160. js_do_data = "".join(re.findall('};go\((.*?)\)', html_str))
  161. js_func = re.sub("<(/*?)script>", "", html_str)
  162. location = re.compile('location(.*?)}}else')
  163. location2 = re.compile('location(.*?)}else')
  164. setTimeout = re.compile('0x5dc;}}(.*?)setTimeout,function\(\)\{')
  165. setTimeout2 = re.compile('0x5dc;}(.*?)setTimeout\(function\(\)\{')
  166. gox = re.compile('};go(.*?)\)')
  167. js_func = re.sub(location, "}}else", js_func)
  168. js_func = re.sub(location2, "}else", js_func)
  169. js_func = re.sub(setTimeout, "0x5dc;}}", js_func)
  170. js_func = re.sub(setTimeout2, "0x5dc;}", js_func)
  171. js_func = re.sub(gox, "return document['cookie']\n};", js_func)
  172. js_func = '''const jsdom = require("jsdom");
  173. const {JSDOM} = jsdom;
  174. const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`,
  175. {
  176. url: "https://example.org/",
  177. referrer: "https://example.com/",
  178. contentType: "text/html",
  179. });
  180. window = dom.window;
  181. document = window.document;
  182. location = window.location;
  183. ''' + js_func
  184. ctx = execjs.compile(js_func, cwd=node_module_path)
  185. if is_save_js:
  186. with open('./clean_code.js', 'w+', encoding='utf-8') as f:
  187. f.write(js_func)
  188. ss = ctx.call("go", json.loads(js_do_data))
  189. for item in ss.split(";"):
  190. if '=' in item:
  191. session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
  192. session.get(start_url, headers=headers, timeout=60, verify=False)
  193. cookies = session.cookies.get_dict()
  194. return cookies
  195. except Exception as e:
  196. print("cookie生产错误!")
  197. time.sleep(3)
  198. def create_cookie(page_url, headers, **kwargs):
  199. try:
  200. return jsl.get_jsl_cookies(page_url, headers, **kwargs)
  201. except IndexError:
  202. raise requests.exceptions.ContentDecodingError('jsl cookies fetch failed')
  203. except AssertionError:
  204. return {}
  205. def http_session_521(url, headers, proxies=None, storage=True):
  206. jsl_cookies = create_cookie(page_url=url, headers=headers, proxies=proxies)
  207. if storage:
  208. root = Path(__file__).parent.parent
  209. file = root.joinpath("config/jsl_ck.json").resolve()
  210. with file.open('w', encoding='utf-8') as fw:
  211. fw.write(json.dumps(jsl_cookies))
  212. return jsl_cookies