request.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2018-07-25 11:49:08
  4. ---------
  5. @summary: 请求结构体
  6. ---------
  7. @author: Boris
  8. @email: boris_liu@foxmail.com
  9. """
  10. import copy
  11. import re
  12. import requests
  13. from requests.adapters import HTTPAdapter
  14. from requests.cookies import RequestsCookieJar
  15. from requests.packages.urllib3.exceptions import InsecureRequestWarning
  16. from requests.packages.urllib3.util.ssl_ import create_urllib3_context
  17. import feapder.setting as setting
  18. import feapder.utils.tools as tools
  19. from feapder.db.redisdb import RedisDB
  20. from feapder.network import user_agent
  21. from feapder.network.response import Response
  22. from feapder.utils.log import log
  23. from feapder.utils.webdriver import WebDriverPool
  24. # 屏蔽warning信息
  25. requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
  26. class DESAdapter(HTTPAdapter):
  27. def __init__(self, *args, **kwargs):
  28. """
  29. A TransportAdapter that re-enables 3DES support in Requests.
  30. """
  31. ciphers = ":".join(setting.JA3_REQUEST_CIPHERS).split(':')
  32. tools.random.shuffle(ciphers)
  33. ciphers = ':'.join(ciphers)
  34. self.ciphers = ciphers + ':!aNULL:!eNULL:!MD5'
  35. super().__init__(*args, **kwargs)
  36. def init_poolmanager(self, *args, **kwargs):
  37. context = create_urllib3_context(ciphers=self.ciphers)
  38. kwargs['ssl_context'] = context
  39. return super(DESAdapter, self).init_poolmanager(*args, **kwargs)
  40. def proxy_manager_for(self, *args, **kwargs):
  41. context = create_urllib3_context(ciphers=self.ciphers)
  42. kwargs['ssl_context'] = context
  43. return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)
class Request(object):
    """Spider request structure.

    Wraps a URL together with framework scheduling options (priority, retry
    count, callback, dedup flag, ...) and the keyword arguments that are
    forwarded verbatim to ``requests``.
    """

    # --- class-level shared, lazily created resources ---
    session = None  # shared requests.Session, created on first use (see _session)
    webdriver_pool: WebDriverPool = None  # shared browser pool for render=True (see _webdriver_pool)
    user_agent_pool = user_agent
    cache_db = None  # redis / pika
    cached_redis_key = None  # key prefix of cached responses: response_cached:cached_redis_key:md5
    cached_expire_time = 1200  # cached-response TTL in seconds
    local_filepath = None
    oss_handler = None

    # Names of **kwargs that are forwarded to requests.request() / session.request()
    __REQUEST_ATTRS__ = {
        # "method" and "url" are required and passed explicitly, not via **kwargs
        "params",
        "data",
        "headers",
        "cookies",
        "files",
        "auth",
        "timeout",
        "allow_redirects",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "json",
    }

    # Default values of the framework attributes; to_dict omits attributes that
    # still hold these defaults to keep serialized requests small.
    DEFAULT_KEY_VALUE = dict(
        url="",
        retry_times=0,
        priority=300,
        parser_name=None,
        callback=None,
        filter_repeat=True,
        auto_request=True,
        request_sync=False,
        use_session=None,
        use_ja3_session=None,
        random_user_agent=True,
        download_midware=None,
        is_abandoned=False,
        render=False,
        render_time=0,
    )

    def __init__(
        self,
        url="",
        retry_times=0,
        priority=300,
        parser_name=None,
        callback=None,
        filter_repeat=True,
        auto_request=True,
        request_sync=False,
        use_session=None,
        use_ja3_session=None,
        random_user_agent=True,
        download_midware=None,
        is_abandoned=False,
        render=False,
        render_time=0,
        splash=False,
        iframes=0,
        page=None,
        **kwargs,
    ):
        """
        @summary: Request parameters
        ---------
        Framework parameters
        @param url: URL to crawl
        @param retry_times: current retry count
        @param priority: priority, smaller is served first, default 300
        @param callback: callback, either a callable or a method name (for
            cross-class callbacks set parser_name to the target class and
            callback to the method name on that class)
        @param parser_name: name of the class owning the callback, defaults to the current class
        @param filter_repeat: deduplicate this request (True/False); only
            effective when setting.REQUEST_FILTER_ENABLE is True. Default True
        @param auto_request: auto-download the page. Default True; when False
            the response is empty and you must fetch the page yourself
        @param request_sync: download synchronously instead of queueing;
            useful for short-lived URLs. Default False (async)
        @param use_session: use a shared requests session
        @param use_ja3_session: use the JA3 (cipher-shuffling) session
        @param random_user_agent: randomize the User-Agent (True/False); only
            effective when setting.RANDOM_HEADERS is True. Default True
        @param download_midware: download middleware; defaults to the parser's download_midware
        @param is_abandoned: give up retrying on exception (True/False). Default False
        @param render: render the page in a browser
        @param render_time: seconds to wait after page load before grabbing the source
        --
        The following parameters behave exactly like their requests counterparts
        @param method: request method such as POST or GET; defaults based on whether data is set
        @param params: query-string parameters
        @param data: request body
        @param json: JSON request body, same as json.dumps(data)
        @param headers:
        @param cookies: dict or CookieJar object
        @param files:
        @param auth:
        @param timeout: float or (connect timeout, read timeout) tuple
        @param allow_redirects: Boolean; True follows redirects for POST/PUT/DELETE
        @param proxies: proxies, e.g. {"http":"http://xxx", "https":"https://xxx"}
        @param verify: True verifies the SSL certificate
        @param stream: False downloads the response body immediately
        @param cert:
        --
        Custom additions
        @param splash: render via the splash rendering service
        @param iframes: splash: fetch embedded iframe content, 0=no, 1=yes
        @param page: page number for listing pages
        --
        @param **kwargs: any extra value, e.g. Request(item=item) exposes request.item
        ---------
        @result:
        """
        self.url = url
        self.retry_times = retry_times
        self.priority = priority
        self.parser_name = parser_name
        self.callback = callback
        self.filter_repeat = filter_repeat
        self.auto_request = auto_request
        self.request_sync = request_sync
        self.use_session = use_session
        self.use_ja3_session = use_ja3_session
        self.random_user_agent = random_user_agent
        self.download_midware = download_midware
        self.is_abandoned = is_abandoned
        self.render = render
        # Explicit render_time wins; otherwise fall back to the WEBDRIVER setting
        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
        self.splash = splash
        self.iframes = iframes
        self.page = page

        self.requests_kwargs = {}
        for key, value in kwargs.items():
            if key in self.__class__.__REQUEST_ATTRS__:  # collect requests parameters
                self.requests_kwargs[key] = value

            # every extra kwarg also becomes a plain attribute (request.item etc.)
            self.__dict__[key] = value

    def __repr__(self):
        try:
            return "<Request {}>".format(self.url)
        # NOTE(review): bare except — presumably guards against attribute errors
        # during early construction; a narrower Exception would be safer
        except:
            return "<Request {}>".format(str(self.to_dict)[:40])

    def __setattr__(self, key, value):
        """
        For the request.xxx = xxx form: update the request attribute and, when
        the name is a requests parameter, keep requests_kwargs in sync.
        @param key:
        @param value:
        @return:
        """
        self.__dict__[key] = value
        if key in self.__class__.__REQUEST_ATTRS__:
            self.requests_kwargs[key] = value

    def __lt__(self, other):
        # Ordering used by the priority queue: smaller priority first
        return self.priority < other.priority

    @property
    def _session(self):
        # Lazily create and cache one shared Session on the class.
        use_session = (
            setting.USE_SESSION if self.use_session is None else self.use_session
        )  # self.use_session takes precedence over the setting
        use_ja3_session = (
            setting.USE_JA3_SESSION if self.use_ja3_session is None else self.use_ja3_session
        )  # self.use_ja3_session takes precedence over the setting
        use_session = use_session or use_ja3_session
        if use_session and not self.__class__.session:
            self.__class__.session = requests.Session()
            if use_ja3_session:
                # pool_connections – number of cached urllib3 connection pools;
                # pool_maxsize – maximum connections kept per pool
                des_adapter = DESAdapter(pool_connections=1000, pool_maxsize=1000)
                # Any HTTP/HTTPS request through this session whose URL starts
                # with the given prefix uses this transport adapter.
                self.__class__.session.mount("https://", des_adapter)
                self.__class__.session.mount("http://", des_adapter)
            else:
                # pool_connections – number of cached urllib3 connection pools;
                # pool_maxsize – maximum connections kept per pool
                http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
                # Any HTTP request through this session whose URL starts with
                # the given prefix uses this transport adapter.
                self.__class__.session.mount("http", http_adapter)

        return self.__class__.session

    @property
    def _webdriver_pool(self):
        # Lazily create and cache the shared webdriver pool on the class.
        if not self.__class__.webdriver_pool:
            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)

        return self.__class__.webdriver_pool

    @property
    def to_dict(self):
        """Serialize this request to a plain dict (for queueing/persistence).

        Callables are replaced by their names; non-primitive values are
        pickled via tools.dumps_obj; attributes still at their defaults and
        requests_kwargs itself are omitted.
        """
        request_dict = {}

        self.callback = (
            getattr(self.callback, "__name__")
            if callable(self.callback)
            else self.callback
        )
        self.download_midware = (
            getattr(self.download_midware, "__name__")
            if callable(self.download_midware)
            else self.download_midware
        )

        for key, value in self.__dict__.items():
            if (
                key in self.__class__.DEFAULT_KEY_VALUE
                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
                or key == "requests_kwargs"
            ):
                continue

            if key in self.__class__.__REQUEST_ATTRS__:
                if not isinstance(
                    value, (bytes, bool, float, int, str, tuple, list, dict)
                ):
                    value = tools.dumps_obj(value)
            else:
                if not isinstance(value, (bytes, bool, float, int, str)):
                    value = tools.dumps_obj(value)

            request_dict[key] = value

        return request_dict

    @property
    def callback_name(self):
        # Callback as a string name, whether it was stored as callable or str
        return (
            getattr(self.callback, "__name__")
            if callable(self.callback)
            else self.callback
        )

    def get_response(self, save_cached=False):
        """
        Download the page and return a Response with selector support.
        @param save_cached: save the response to cache so debugging runs do
            not re-download every time
        @return:
        """
        # Default timeout
        self.requests_kwargs.setdefault(
            "timeout", setting.REQUEST_TIMEOUT
        )  # connect=22 read=22

        # stream handling:
        # By default the response body is downloaded immediately.
        # With stream=True only the headers are fetched; the body is downloaded
        # when Response.content is accessed.
        # Drawback: with stream=True requests cannot release the connection back
        # to the pool unless all data is consumed or Response.close is called,
        # which hurts connection efficiency.
        self.requests_kwargs.setdefault("stream", True)

        # Disable certificate verification
        self.requests_kwargs.setdefault("verify", False)

        # Request method: default POST when a body is present, else GET
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # Random user-agent (only when none was supplied)
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.render:  # when rendering, prefer the UA configured in WEBDRIVER
                ua = setting.WEBDRIVER.get(
                    "user_agent"
                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
            else:
                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)

            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update({"User-Agent": ua})
                self.requests_kwargs.update(headers=headers)
            else:
                self.requests_kwargs.setdefault(
                    "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
                )

        # Proxy: -1 sentinel means "not explicitly set"; poll the extract API
        # until a proxy is available
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
            while True:
                proxies = self.get_proxy()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        log.debug(
            """
            -------------- %srequest for ----------------
            url = %s
            method = %s
            body = %s
            """
            % (
                ""
                if not self.parser_name
                else "%s.%s "
                % (
                    self.parser_name,
                    (
                        self.callback
                        and callable(self.callback)
                        and getattr(self.callback, "__name__")
                        or self.callback
                    )
                    or "parse",
                ),
                self.url,
                method,
                self.requests_kwargs,
            )
        )

        use_session = (
            setting.USE_SESSION if self.use_session is None else self.use_session
        )  # self.use_session takes precedence over the setting
        use_ja3_session = (
            setting.USE_JA3_SESSION if self.use_ja3_session is None else self.use_ja3_session
        )  # self.use_ja3_session takes precedence over the setting
        use_session = use_session or use_ja3_session

        if self.render:
            # Reuse this request's user_agent, cookies and proxy in the browser
            user_agent = headers.get("User-Agent") or headers.get("user-agent")
            cookies = self.requests_kwargs.get("cookies")
            if cookies and isinstance(cookies, RequestsCookieJar):
                cookies = cookies.get_dict()

            if not cookies:
                cookie_str = headers.get("Cookie") or headers.get("cookie")
                if cookie_str:
                    cookies = tools.get_cookies_from_str(cookie_str)

            proxy = None
            if proxies and proxies != -1:
                # NOTE(review): str.strip("http://") strips the character set
                # {h,t,p,:,/} from both ends, not the literal prefix — hosts
                # that start/end with those characters would be corrupted;
                # re.sub("^https?://", ...) would be the safe form (cf. proxy())
                proxy = proxies.get("http", "").strip("http://") or proxies.get(
                    "https", ""
                ).strip("https://")

            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)

            try:
                browser.get(self.url)
                if cookies:
                    browser.cookies = cookies
                if self.render_time:
                    tools.delay_time(self.render_time)

                html = browser.page_source
                response = Response.from_dict({
                    "url": browser.current_url,
                    "cookies": browser.cookies,
                    "_content": html.encode(),
                    "status_code": 200,
                    "elapsed": 666,  # placeholder; browser does not report elapsed time
                    "headers": {
                        "User-Agent": browser.execute_script(
                            "return navigator.userAgent"
                        ),
                        "Cookie": tools.cookies2str(browser.cookies),
                    },
                })

                # keep the driver reachable from the response for further interaction
                response.browser = browser
            except Exception as e:
                # a failed driver is removed from the pool, not recycled
                self._webdriver_pool.remove(browser)
                raise e

        elif use_session:
            response = self._session.request(method, self.url, **self.requests_kwargs)
            response = Response(response)
        elif self.splash:
            headers = self.requests_kwargs.get('headers')
            if not headers:
                headers = {'User-Agent': self.user_agent()}
            # splash expects headers as a list of (name, value) pairs
            headers = [(key, val) for key, val in headers.items()]
            proxy = None
            if proxies and proxies != -1:
                # NOTE(review): same str.strip prefix-removal pitfall as above
                proxy = proxies.get("http", "").strip("http://") or proxies.get(
                    "https", ""
                ).strip("https://")
            params = {
                'iframes': self.iframes,
                'wait': self.render_time,
                'html': 1,
                'proxy': proxy,
                'url': self.url,
            }
            data = {'headers': headers}
            splash_url = setting.SWORDFISH_RENDER_URL
            resp = requests.get(splash_url, params=params, json=data)
            response = Response(resp)
        else:
            response = requests.request(method, self.url, **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response, expire_time=self.__class__.cached_expire_time)

        return response

    def proxies(self):
        """
        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
        """
        return self.requests_kwargs.get("proxies")

    def proxy(self):
        """
        Returns: ip:port
        """
        proxies = self.proxies()
        if proxies:
            return re.sub(
                "http.*?//", "", proxies.get("http", "") or proxies.get("https", "")
            )

    def get_proxy(self):
        # Fetch one proxy from the swordfish proxy API
        headers = {"Authorization": setting.SWORDFISH_PROXY_AUTHOR}
        proxy = requests.get(setting.SWORDFISH_PROXY_URL, headers=headers).json()
        return proxy.get("data")

    def user_agent(self):
        # User-Agent currently set on this request's headers, if any
        headers = self.requests_kwargs.get("headers")
        if headers:
            return headers.get("user_agent") or headers.get("User-Agent")

    @property
    def fingerprint(self):
        """
        Unique identity of this request: md5 over the canonicalized url plus
        any body-affecting requests parameters.
        @return:
        """
        url = self.__dict__.get("url", "")
        # url canonicalization
        url = tools.canonicalize_url(url)
        args = [url]
        for arg in ["params", "data", "files", "auth", "cert", "json"]:
            if self.requests_kwargs.get(arg):
                args.append(self.requests_kwargs.get(arg))
        return tools.get_md5(*args)

    @property
    def _cache_db(self):
        # Lazily create the shared redis connection on the class
        if not self.__class__.cache_db:
            self.__class__.cache_db = RedisDB()

        return self.__class__.cache_db

    @property
    def _cached_redis_key(self):
        # Redis key of this request's cached response
        if self.__class__.cached_redis_key:
            return (
                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
            )
        else:
            return f"response_cached:test:{self.fingerprint}"

    def save_cached(self, response, expire_time=1200):
        """
        Save the response in redis for debugging, so it need not be
        re-downloaded every run.
        @param response:
        @param expire_time: TTL in seconds
        @return:
        """
        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)

    def get_response_from_cached(self, save_cached=True):
        """
        Fetch the response from the cache; download when no cached copy exists.
        Note:
            Attributes that are empty on a restored response:
                - raw : urllib3.response.HTTPResponse
                - connection: requests.adapters.HTTPAdapter
                - history
            Attributes whose meaning changes:
                - request: becomes this framework's Request instead of requests'
        @param: save_cached  when there is no cache and we download, whether to save the result
        @return:
        """
        response_dict = self._cache_db.strget(self._cached_redis_key)
        if not response_dict:
            log.info("无response缓存 重新下载")
            response_obj = self.get_response(save_cached=save_cached)
        else:
            # NOTE(review): eval on data read back from redis executes arbitrary
            # expressions; safe only if the cache is trusted — ast.literal_eval
            # may not fit (cached dicts can contain non-literal reprs), but this
            # deserves review
            response_dict = eval(response_dict)
            response_obj = Response.from_dict(response_dict)
        return response_obj

    def del_response_cached(self):
        # Drop this request's cached response
        self._cache_db.clear(self._cached_redis_key)

    @classmethod
    def from_dict(cls, request_dict):
        """Rebuild a Request from a dict produced by to_dict."""
        for key, value in request_dict.items():
            if isinstance(value, bytes):  # deserialize pickled values such as item
                request_dict[key] = tools.loads_obj(value)

        return cls(**request_dict)

    def copy(self):
        # Deep copy via the dict round-trip so no state is shared
        return self.__class__.from_dict(copy.deepcopy(self.to_dict))