123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- import json
- from pathlib import Path
- import requests
- import urllib3
- from requests.models import Response, Request
- from crawler.sessions_521 import http_session_521
- urllib3.disable_warnings()
def prepare_request(
        headers: dict = None,
        proxies: dict = None,
        timeout: int = None,
        verify: bool = None,
        cookies=None,
):
    """Build the keyword arguments passed on to ``requests.get``.

    ``headers``, ``timeout`` and ``proxies`` are always present in the result
    (``timeout`` falls back to 60 when it is ``None`` or otherwise falsy).
    ``cookies`` and ``verify`` are included only when the caller supplied a
    value other than ``None``.

    Returns:
        A new dict of request keyword arguments.
    """
    params = {
        'headers': headers,
        'timeout': timeout if timeout else 60,
        'proxies': proxies,
    }
    # Optional entries are omitted entirely when unset, rather than
    # being forwarded as explicit ``None`` values.
    optional = (('cookies', cookies), ('verify', verify))
    params.update({key: value for key, value in optional if value is not None})
    return params
def get_jsl_cookies(url, headers, proxies=None):
    """Load the cached cookies stored in ``config/jsl_ck.json``.

    The cache file is resolved relative to the parent of this module's
    directory. When it does not exist, ``http_session_521(url, headers,
    proxies)`` is called first; that helper is presumably responsible for
    creating the file (TODO confirm in ``crawler.sessions_521``).

    The file content is parsed as JSON. If that fails, single quotes are
    swapped for double quotes and parsing is retried, which accepts simple
    Python-``repr`` style dicts. Trying plain JSON first keeps a valid JSON
    file containing apostrophes from being corrupted by the quote swap.

    Args:
        url: Target URL, forwarded to ``http_session_521`` on a cache miss.
        headers: Request headers, forwarded to ``http_session_521``.
        proxies: Optional proxies mapping, forwarded to ``http_session_521``.

    Returns:
        The decoded content of the cache file.

    Raises:
        FileNotFoundError: The cache file is still missing after the
            ``http_session_521`` call.
        json.JSONDecodeError: The content cannot be parsed either way.
    """
    cache_file = Path(__file__).parent.parent.joinpath("config/jsl_ck.json").resolve()
    if not cache_file.exists():
        http_session_521(url, headers, proxies)
    raw = cache_file.read_text(encoding='utf-8')
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Legacy format: a dict written with single-quoted keys and values.
        return json.loads(raw.replace("'", '"'))
def _mark_failed(response, url, headers, reason, status_code=None):
    """Stamp *response* as a failed GET and return it.

    Overwrites ``reason`` (and ``status_code`` when one is given) and replaces
    ``response.request`` with an unprepared ``Request`` describing the
    attempted GET (url and headers).
    """
    if status_code is not None:
        response.status_code = status_code
    response.reason = reason
    response.request = Request(method='get', url=url, headers=headers)
    return response


def http_request_get(url, login=False, **kwargs):
    """Send a GET request with retries and HTTP 521 challenge handling.

    Args:
        url: The URL to fetch.
        login: When false, the cookies returned by ``get_jsl_cookies`` replace
            any ``cookies`` in the request parameters before every attempt.
            When true, the cookies passed by the caller are used.
        **kwargs: Forwarded to ``prepare_request`` (``headers``, ``proxies``,
            ``timeout``, ``verify``, ``cookies``).

    Returns:
        A ``(ok, response)`` tuple. ``ok`` is ``True`` only for HTTP 200, in
        which case ``response.encoding`` is set to the apparent encoding.
        Otherwise ``ok`` is ``False`` and ``response`` describes the failure:

        * 404 / 301 and 500-520: returned immediately with a custom ``reason``.
        * 521: the challenge is re-solved through ``http_session_521`` and the
          request repeated, at most three times; after that the response is
          returned with ``status_code`` 10521.
        * ``ConnectionError`` / ``Timeout``: returned immediately with
          ``status_code`` 10000 and the exception class name as ``reason``.
        * Any other ``RequestException``: ``status_code`` 10001, retried.
        * Any other status code: retried.

        Retried failures give up after three attempts and return the last
        response.
    """
    request_params = prepare_request(**kwargs)
    headers = request_params.get('headers')
    proxies = request_params.get('proxies')
    max_retries = 3
    # 521 answers are counted separately and bounded; previously they were
    # not counted at all, so a server that always answered 521 looped forever.
    max_challenges = 3
    retries = 0
    challenges = 0
    response = Response()
    while retries < max_retries:
        try:
            if not login:
                request_params['cookies'] = get_jsl_cookies(url, headers, proxies)
            response = requests.get(url, **request_params)
            status = response.status_code
            if status == 200:
                response.encoding = response.apparent_encoding
                return True, response
            if status == 521:
                print("****** 521 ******")
                response.status_code = 10521
                if challenges >= max_challenges:
                    return False, response
                challenges += 1
                _jsl_clearance_s = http_session_521(url, headers, proxies)
                # TODO: verify whether this temporary value also has to be
                # written into the login cookies. For now it is only kept in
                # memory; persist it (e.g. update_login_cookies(account,
                # _jsl_clearance_s)) if that turns out to be necessary.
                if _jsl_clearance_s:
                    current = request_params.get('cookies')
                    if current is None:
                        request_params['cookies'] = _jsl_clearance_s
                    else:
                        # Updates the existing cookies object in place, as the
                        # original code did (this may be the caller's dict).
                        current.update(_jsl_clearance_s)
                continue
            if status in (404, 301):
                return False, _mark_failed(response, url, headers, '网站反爬')
            if 500 <= status < 521:
                return False, _mark_failed(response, url, headers, '网站页面无法访问')
            retries += 1
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            # Connection-level failures are not retried.
            return False, _mark_failed(
                response, url, headers, e.__class__.__name__, status_code=10000,
            )
        except requests.RequestException as e:
            _mark_failed(response, url, headers, e.__class__.__name__, status_code=10001)
            retries += 1
    return False, response
|