123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105 |
import ast
import json
import os

import requests
import urllib3
from requests.models import Response, Request

from crawler.sessions_521 import http_session_521
- urllib3.disable_warnings()
def prepare_request(
        headers: dict = None,
        proxies: dict = None,
        timeout: int = None,
        verify: bool = None,
        cookies=None,
):
    """Assemble the keyword arguments for a ``requests`` call.

    ``headers``, ``timeout`` and ``proxies`` are always present in the result
    (``headers``/``proxies`` may be ``None``). A falsy ``timeout`` (``None``
    or ``0``) is replaced by 60. ``cookies`` and ``verify`` are included only
    when the caller passed something other than ``None``.

    Returns:
        dict: keyword arguments, in the order headers, timeout, proxies,
        then cookies and verify when present.
    """
    params = {
        'headers': headers,
        'timeout': timeout if timeout else 60,
        'proxies': proxies,
    }
    # Optional entries: left out entirely when not supplied.
    optional = (('cookies', cookies), ('verify', verify))
    params.update({key: value for key, value in optional if value is not None})
    return params
def get_cookies(url, headers, proxies=None):
    """Load the cached cookies from ``./zbytb_ck.json``.

    If the cache file does not exist, ``http_session_521(url, headers,
    proxies)`` is called first; the file is then opened unconditionally.
    NOTE(review): this assumes ``http_session_521`` writes the cache file as
    a side effect — confirm against its implementation.

    The file content is parsed leniently, because it may be either real JSON
    or the ``repr`` of a Python dict (single-quoted):

    1. strict JSON;
    2. a Python literal via ``ast.literal_eval``;
    3. the legacy fallback of swapping ``'`` for ``"`` and parsing as JSON.

    Parsing the raw text first avoids corrupting values that legitimately
    contain a quote character, which the blanket quote swap used to break.

    Args:
        url: forwarded to ``http_session_521`` when the cache is missing.
        headers: forwarded to ``http_session_521`` when the cache is missing.
        proxies: forwarded to ``http_session_521`` when the cache is missing.

    Returns:
        The parsed cookie data (a dict for the expected file contents).

    Raises:
        FileNotFoundError: the cache file is still missing after the
            ``http_session_521`` call.
        json.JSONDecodeError: the content cannot be parsed by any strategy.
    """
    cookie_file = './zbytb_ck.json'
    if not os.path.isfile(cookie_file):
        http_session_521(url, headers, proxies)
    with open(cookie_file, 'r', encoding='utf-8') as fr:
        raw = fr.read()

    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        # Last resort: the historical behaviour, kept for inputs that mix
        # single quotes with JSON-only literals such as true/null.
        return json.loads(raw.replace("'", '"'))
def _failed_response(response, url, headers, status_code=None, reason=None):
    """Stamp ``response`` as a failure and return it.

    Overwrites ``status_code`` / ``reason`` when given and always attaches a
    fresh ``Request`` describing the attempted GET, so callers can inspect
    ``response.request`` even when no real request object is available.
    """
    if status_code is not None:
        response.status_code = status_code
    if reason is not None:
        response.reason = reason
    response.request = Request(method='get', url=url, headers=headers)
    return response


def http_request_get(url, **kwargs):
    """GET ``url`` with cached anti-bot cookies, retrying up to three times.

    ``kwargs`` are passed to ``prepare_request`` (headers, proxies, timeout,
    verify, cookies). On every attempt the cookies returned by
    ``get_cookies`` are merged with the caller's cookies and with any cookies
    obtained from ``http_session_521`` after a 521 response; the merged dict
    is sent as the single ``cookies`` argument. Precedence, lowest to highest:
    cached file cookies, caller cookies, refreshed 521 cookies.

    Outcome per response status:

    * 200 -- encoding is set from ``apparent_encoding``; returns
      ``(True, response)``.
    * 521 -- status is rewritten to 10521, ``http_session_521`` is called to
      refresh cookies, and the request is retried (counts as an attempt).
    * 404 / 301 -- returns ``(False, response)`` with reason ``'网站反爬'``.
    * 500..520 -- returns ``(False, response)`` with reason
      ``'网站页面无法访问'``.
    * anything else -- retried.

    Exceptions:

    * ``ConnectionError`` / ``Timeout`` -- returns immediately with status
      10000 and the exception class name as reason.
    * other ``requests.RequestException`` -- status 10001, exception class
      name as reason, then retried.

    Args:
        url: address to fetch.
        **kwargs: see ``prepare_request``. ``cookies`` is expected to be a
            mapping; it is copied, never mutated.

    Returns:
        tuple[bool, Response]: success flag and the last response object
        (a blank ``Response`` stamped with a synthetic status on failure
        before any reply was received).
    """
    request_params = prepare_request(**kwargs)
    headers = request_params.get('headers')
    proxies = request_params.get('proxies')
    # Take cookies out of the shared kwargs: they are passed explicitly below,
    # and leaving them in would hand requests.get() two ``cookies`` keywords.
    caller_cookies = request_params.pop('cookies', None)
    # Copy so that neither the caller's dict nor a later 521 refresh aliases it.
    extra_cookies = dict(caller_cookies) if caller_cookies else {}

    response = Response()
    for _ in range(3):
        try:
            cookies = dict(get_cookies(url, headers, proxies) or {})
            cookies.update(extra_cookies)
            response = requests.get(url, cookies=cookies, **request_params)
            status = response.status_code

            if status == 200:
                response.encoding = response.apparent_encoding
                return True, response

            if status == 521:
                print("****** 521 ******")
                response.status_code = 10521
                # NOTE(review): http_session_521 is assumed to return a
                # mapping of cookie names to values — confirm.
                refreshed = http_session_521(url, headers, proxies)
                # TODO: verify whether these temporary cookies also need to be
                # persisted into the login cookies; for now they are kept in
                # memory only and can be persisted later if necessary.
                extra_cookies.update(refreshed or {})
                continue

            if status in (404, 301):
                return False, _failed_response(
                    response, url, headers, reason='网站反爬')

            if 500 <= status < 521:
                return False, _failed_response(
                    response, url, headers, reason='网站页面无法访问')

            # Any other status: fall through and use up one more attempt.
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as e:
            return False, _failed_response(
                response, url, headers, 10000, e.__class__.__name__)
        except requests.RequestException as e:
            _failed_response(
                response, url, headers, 10001, e.__class__.__name__)
    return False, response
|