# defaults.py — HTTP GET helpers with JSL ("加速乐") 521 anti-bot cookie handling.
import json
from pathlib import Path
import requests
import urllib3
from requests.models import Response, Request
from crawler.sessions_521 import http_session_521

# Suppress urllib3's InsecureRequestWarning noise; requests below may run
# with verify=False against anti-bot-protected hosts.
urllib3.disable_warnings()
  8. def prepare_request(
  9. headers: dict = None,
  10. proxies: dict = None,
  11. timeout: int = None,
  12. verify: bool = None,
  13. cookies=None,
  14. ):
  15. request_params = {}
  16. request_params.setdefault('headers', headers)
  17. request_params.setdefault('timeout', timeout or 60)
  18. request_params.setdefault('proxies', proxies)
  19. if cookies is not None:
  20. request_params.setdefault('cookies', cookies)
  21. if verify is not None:
  22. request_params.setdefault('verify', verify)
  23. return request_params
  24. def get_jsl_cookies(url, headers, proxies=None):
  25. root = Path(__file__).parent.parent
  26. file = root.joinpath("config/jsl_ck.json").resolve()
  27. if not file.exists():
  28. http_session_521(url, headers, proxies)
  29. with file.open('r', encoding='utf-8') as fr:
  30. cookies = fr.read()
  31. return json.loads(cookies.replace("'", '"'))
  32. def http_request_get(url, login=False, **kwargs):
  33. request_params = prepare_request(**kwargs)
  34. headers = request_params.get('headers')
  35. proxies = request_params.get('proxies')
  36. retries = 0
  37. response = Response()
  38. while retries < 3:
  39. try:
  40. if not login:
  41. jsl_cookies = get_jsl_cookies(url, headers, proxies)
  42. request_params['cookies'] = jsl_cookies
  43. response = requests.get(url, **request_params)
  44. if response.status_code == 200:
  45. response.encoding = response.apparent_encoding
  46. return True, response
  47. elif response.status_code == 521:
  48. print("****** 521 ******")
  49. response.status_code = 10521
  50. _jsl_clearance_s = http_session_521(url, headers, proxies)
  51. # TODO 验证是否需要将这个临时变量写入登录cookies,临时写入内存,如有必要再添加
  52. # if account is not None:
  53. # update_login_cookies(account, _jsl_clearance_s)
  54. if 'cookies' not in request_params:
  55. request_params.setdefault('cookies', _jsl_clearance_s)
  56. else:
  57. _cookies: dict = request_params.get('cookies')
  58. _cookies.update(_jsl_clearance_s)
  59. request_params.update({'cookies': _cookies})
  60. continue
  61. elif response.status_code in [404, 301]:
  62. response.status_code = response.status_code
  63. response.reason = '网站反爬'
  64. response.request = Request(
  65. method='get',
  66. url=url,
  67. headers=request_params.get('headers'),
  68. )
  69. return False, response
  70. elif 500 <= response.status_code < 521:
  71. response.status_code = response.status_code
  72. response.reason = '网站页面无法访问'
  73. response.request = Request(
  74. method='get',
  75. url=url,
  76. headers=request_params.get('headers'),
  77. )
  78. return False, response
  79. else:
  80. retries += 1
  81. except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
  82. response.status_code = 10000
  83. response.reason = e.__class__.__name__
  84. response.request = Request(
  85. method='get',
  86. url=url,
  87. headers=request_params.get('headers'),
  88. )
  89. return False, response
  90. except requests.RequestException as e:
  91. response.status_code = 10001
  92. response.reason = e.__class__.__name__
  93. response.request = Request(
  94. method='get',
  95. url=url,
  96. headers=request_params.get('headers'),
  97. )
  98. retries += 1
  99. return False, response