defaults.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. import requests
  2. import urllib3
  3. import os
  4. import json
  5. from requests.models import Response, Request
  6. from crawler.sessions_521 import http_session_521
  7. urllib3.disable_warnings()
  8. def prepare_request(
  9. headers: dict = None,
  10. proxies: dict = None,
  11. timeout: int = None,
  12. verify: bool = None,
  13. cookies=None,
  14. ):
  15. request_params = {}
  16. request_params.setdefault('headers', headers)
  17. request_params.setdefault('timeout', timeout or 60)
  18. request_params.setdefault('proxies', proxies)
  19. if cookies is not None:
  20. request_params.setdefault('cookies', cookies)
  21. if verify is not None:
  22. request_params.setdefault('verify', verify)
  23. return request_params
  24. def get_cookies(url, headers, proxies=None):
  25. if not os.path.isfile(f'./zbytb_ck.json'):
  26. http_session_521(url, headers, proxies)
  27. with open(f'./zbytb_ck.json', 'r', encoding='utf-8') as fr:
  28. cks = fr.read()
  29. ck = json.loads(cks.replace("'", '"'))
  30. return ck
  31. def http_request_get(url, **kwargs):
  32. request_params = prepare_request(**kwargs)
  33. headers = request_params.get('headers')
  34. proxies = request_params.get('proxies')
  35. retries = 0
  36. response = Response()
  37. while retries < 3:
  38. try:
  39. cks = get_cookies(url,headers, proxies)
  40. response = requests.get(url, cookies=cks, **request_params)
  41. if response.status_code == 200:
  42. response.encoding = response.apparent_encoding
  43. return True, response
  44. elif response.status_code == 521:
  45. print("****** 521 ******")
  46. response.status_code = 10521
  47. _jsl_clearance_s = http_session_521(url, headers, proxies)
  48. # TODO 验证是否需要将这个临时变量写入登录cookies,临时写入内存,如有必要再添加
  49. # if account is not None:
  50. # update_login_cookies(account, _jsl_clearance_s)
  51. if 'cookies' not in request_params:
  52. request_params.setdefault('cookies', _jsl_clearance_s)
  53. else:
  54. _cookies: dict = request_params.get('cookies')
  55. _cookies.update(_jsl_clearance_s)
  56. request_params.update({'cookies': _cookies})
  57. continue
  58. elif response.status_code in [404, 301]:
  59. response.status_code = response.status_code
  60. response.reason = '网站反爬'
  61. response.request = Request(
  62. method='get',
  63. url=url,
  64. headers=request_params.get('headers'),
  65. )
  66. return False, response
  67. elif 500 <= response.status_code < 521:
  68. response.status_code = response.status_code
  69. response.reason = '网站页面无法访问'
  70. response.request = Request(
  71. method='get',
  72. url=url,
  73. headers=request_params.get('headers'),
  74. )
  75. return False, response
  76. else:
  77. retries += 1
  78. except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
  79. response.status_code = 10000
  80. response.reason = e.__class__.__name__
  81. response.request = Request(
  82. method='get',
  83. url=url,
  84. headers=request_params.get('headers'),
  85. )
  86. return False, response
  87. except requests.RequestException as e:
  88. response.status_code = 10001
  89. response.reason = e.__class__.__name__
  90. response.request = Request(
  91. method='get',
  92. url=url,
  93. headers=request_params.get('headers'),
  94. )
  95. retries += 1
  96. return False, response