request.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. # -*- coding: utf-8 -*-
"""
Created on 2018-07-25 11:49:08
---------
@summary: Request structure (the object describing a single crawl request)
---------
@author: Boris
@email: boris_liu@foxmail.com
"""
  10. import copy
  11. import re
  12. import requests
  13. from requests.adapters import HTTPAdapter
  14. from requests.cookies import RequestsCookieJar
  15. from requests.packages.urllib3.exceptions import InsecureRequestWarning
  16. from requests.packages.urllib3.util.ssl_ import create_urllib3_context
  17. import feapder.setting as setting
  18. import feapder.utils.tools as tools
  19. from feapder.db.redisdb import RedisDB
  20. from feapder.network import user_agent, proxy_pool
  21. from feapder.network.response import Response
  22. from feapder.utils.log import log
  23. from feapder.utils.webdriver import WebDriverPool
# Suppress urllib3's InsecureRequestWarning messages
  25. requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
  26. class DESAdapter(HTTPAdapter):
  27. def __init__(self, *args, **kwargs):
  28. """
  29. A TransportAdapter that re-enables 3DES support in Requests.
  30. """
  31. ciphers = ":".join(setting.JA3_REQUEST_CIPHERS).split(":")
  32. tools.random.shuffle(ciphers)
  33. ciphers = ":".join(ciphers)
  34. self.ciphers = ciphers + ":!aNULL:!eNULL:!MD5"
  35. super().__init__(*args, **kwargs)
  36. def init_poolmanager(self, *args, **kwargs):
  37. context = create_urllib3_context(ciphers=self.ciphers)
  38. kwargs["ssl_context"] = context
  39. return super(DESAdapter, self).init_poolmanager(*args, **kwargs)
  40. def proxy_manager_for(self, *args, **kwargs):
  41. context = create_urllib3_context(ciphers=self.ciphers)
  42. kwargs["ssl_context"] = context
  43. return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)
  44. class Request(object):
  45. session = None
  46. webdriver_pool: WebDriverPool = None
  47. user_agent_pool = user_agent
  48. cache_db = None # redis / pika
  49. cached_redis_key = None # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
  50. cached_expire_time = 1200 # 缓存过期时间
  51. local_filepath = None
  52. oss_handler = None
  53. __REQUEST_ATTRS__ = {
  54. # "method", "url", 必须传递 不加入**kwargs中
  55. "params",
  56. "data",
  57. "headers",
  58. "cookies",
  59. "files",
  60. "auth",
  61. "timeout",
  62. "allow_redirects",
  63. "proxies",
  64. "hooks",
  65. "stream",
  66. "verify",
  67. "cert",
  68. "json",
  69. }
  70. DEFAULT_KEY_VALUE = dict(
  71. url="",
  72. retry_times=0,
  73. priority=300,
  74. parser_name=None,
  75. callback=None,
  76. filter_repeat=True,
  77. auto_request=True,
  78. request_sync=False,
  79. use_session=None,
  80. use_ja3_session=None,
  81. random_user_agent=True,
  82. download_midware=None,
  83. is_abandoned=False,
  84. render=False,
  85. render_time=0,
  86. )
  87. def __init__(
  88. self,
  89. url="",
  90. retry_times=0,
  91. priority=300,
  92. parser_name=None,
  93. callback=None,
  94. filter_repeat=True,
  95. auto_request=True,
  96. request_sync=False,
  97. use_session=None,
  98. use_ja3_session=None,
  99. random_user_agent=True,
  100. download_midware=None,
  101. is_abandoned=False,
  102. render=False,
  103. render_time=0,
  104. splash=False,
  105. iframes=0,
  106. page=None,
  107. **kwargs,
  108. ):
  109. """
  110. @summary: Request参数
  111. ---------
  112. 框架参数
  113. @param url: 待抓取url
  114. @param retry_times: 当前重试次数
  115. @param priority: 优先级 越小越优先 默认300
  116. @param parser_name: 回调函数所在的类名 默认为当前类
  117. @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可)
  118. @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True
  119. @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
  120. @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队
  121. @param use_session: 是否使用session方式
  122. @param use_ja3_session: 是否使用ja3_session方式
  123. @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
  124. @param download_midware: 下载中间件。默认为parser中的download_midware
  125. @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
  126. @param render: 是否用浏览器渲染
  127. @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码
  128. --
  129. 以下参数与requests参数使用方式一致
  130. @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断
  131. @param params: 请求参数
  132. @param data: 请求body
  133. @param json: 请求json字符串,同 json.dumps(data)
  134. @param headers:
  135. @param cookies: 字典 或 CookieJar 对象
  136. @param files:
  137. @param auth:
  138. @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组
  139. @param allow_redirects : Boolean. True 表示允许跟踪 POST/PUT/DELETE 方法的重定向
  140. @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
  141. @param verify: 为 True 时将会验证 SSL 证书
  142. @param stream: 如果为 False,将会立即下载响应内容
  143. @param cert:
  144. --
  145. 自定义新增参数
  146. @param splash: 是否使用 splash 渲染服务
  147. @param iframes: splash 获取页面嵌入的 iframe 内容, 0=不获取,1=获取
  148. @param page: 请求列表页页码数
  149. --
  150. @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出
  151. ---------
  152. @result:
  153. """
  154. self.url = url
  155. self.retry_times = retry_times
  156. self.priority = priority
  157. self.parser_name = parser_name
  158. self.callback = callback
  159. self.filter_repeat = filter_repeat
  160. self.auto_request = auto_request
  161. self.request_sync = request_sync
  162. self.use_session = use_session
  163. self.use_ja3_session = use_ja3_session
  164. self.random_user_agent = random_user_agent
  165. self.download_midware = download_midware
  166. self.is_abandoned = is_abandoned
  167. self.render = render
  168. self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
  169. self.splash = splash
  170. self.iframes = iframes
  171. self.page = page
  172. self.requests_kwargs = {}
  173. for key, value in kwargs.items():
  174. if key in self.__class__.__REQUEST_ATTRS__: # 取requests参数
  175. self.requests_kwargs[key] = value
  176. self.__dict__[key] = value
  177. def __repr__(self):
  178. try:
  179. return "<Request {}>".format(self.url)
  180. except:
  181. return "<Request {}>".format(str(self.to_dict)[:40])
  182. def __setattr__(self, key, value):
  183. """
  184. 针对 request.xxx = xxx 的形式,更新reqeust及内部参数值
  185. @param key:
  186. @param value:
  187. @return:
  188. """
  189. self.__dict__[key] = value
  190. if key in self.__class__.__REQUEST_ATTRS__:
  191. self.requests_kwargs[key] = value
  192. def __lt__(self, other):
  193. return self.priority < other.priority
  194. @property
  195. def _session(self):
  196. use_session = (
  197. setting.USE_SESSION if self.use_session is None else self.use_session
  198. ) # self.use_session 优先级高
  199. use_ja3_session = (
  200. setting.USE_JA3_SESSION if self.use_ja3_session is None else self.use_ja3_session
  201. ) # self.use_ja3_session 优先级高
  202. use_session = use_session or use_ja3_session
  203. if use_session and not self.__class__.session:
  204. self.__class__.session = requests.Session()
  205. if use_ja3_session:
  206. # pool_connections – 缓存的 urllib3 连接池个数 pool_maxsize – 连接池中保存的最大连接数
  207. des_adapter = DESAdapter(pool_connections=1000, pool_maxsize=1000)
  208. # 任何使用该session会话的 HTTP/HTTPS 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
  209. self.__class__.session.mount("https://", des_adapter)
  210. self.__class__.session.mount("http://", des_adapter)
  211. else:
  212. # pool_connections – 缓存的 urllib3 连接池个数 pool_maxsize – 连接池中保存的最大连接数
  213. http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
  214. # 任何使用该session会话的 HTTP 请求,只要其 URL 是以给定的前缀开头,该传输适配器就会被使用到。
  215. self.__class__.session.mount("http", http_adapter)
  216. return self.__class__.session
  217. @property
  218. def _webdriver_pool(self):
  219. if not self.__class__.webdriver_pool:
  220. self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
  221. return self.__class__.webdriver_pool
  222. @property
  223. def to_dict(self):
  224. request_dict = {}
  225. self.callback = (
  226. getattr(self.callback, "__name__")
  227. if callable(self.callback)
  228. else self.callback
  229. )
  230. self.download_midware = (
  231. getattr(self.download_midware, "__name__")
  232. if callable(self.download_midware)
  233. else self.download_midware
  234. )
  235. for key, value in self.__dict__.items():
  236. if (
  237. key in self.__class__.DEFAULT_KEY_VALUE
  238. and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
  239. or key == "requests_kwargs"
  240. ):
  241. continue
  242. if key in self.__class__.__REQUEST_ATTRS__:
  243. if not isinstance(
  244. value, (bytes, bool, float, int, str, tuple, list, dict)
  245. ):
  246. value = tools.dumps_obj(value)
  247. else:
  248. if not isinstance(value, (bytes, bool, float, int, str)):
  249. value = tools.dumps_obj(value)
  250. request_dict[key] = value
  251. return request_dict
  252. @property
  253. def callback_name(self):
  254. return (
  255. getattr(self.callback, "__name__")
  256. if callable(self.callback)
  257. else self.callback
  258. )
  259. def get_response(self, save_cached=False, show_log=True):
  260. """
  261. 获取带有selector功能的response
  262. @param save_cached: 保存缓存 方便调试时不用每次都重新下载
  263. @param show_log: 展示日志
  264. @return:
  265. """
  266. # 设置超时默认时间
  267. self.requests_kwargs.setdefault(
  268. "timeout", setting.REQUEST_TIMEOUT
  269. ) # connect=22 read=22
  270. # 设置stream
  271. # 默认情况下,当你进行网络请求后,响应体会立即被下载。
  272. # stream=True是,调用Response.content 才会下载响应体,默认只返回header。
  273. # 缺点: stream 设为 True,Requests 无法将连接释放回连接池,除非消耗了所有的数据,或者调用了 Response.close。 这样会带来连接效率低下的问题。
  274. self.requests_kwargs.setdefault("stream", True)
  275. # 关闭证书验证
  276. self.requests_kwargs.setdefault("verify", False)
  277. # 设置请求方法
  278. method = self.__dict__.get("method")
  279. if not method:
  280. if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
  281. method = "POST"
  282. else:
  283. method = "GET"
  284. # 随机user—agent
  285. headers = self.requests_kwargs.get("headers", {})
  286. if "user-agent" not in headers and "User-Agent" not in headers:
  287. if self.render: # 如果是渲染默认,优先使用WEBDRIVER中配置的ua
  288. ua = setting.WEBDRIVER.get(
  289. "user_agent"
  290. ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
  291. else:
  292. ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
  293. if self.random_user_agent and setting.RANDOM_HEADERS:
  294. headers.update({"User-Agent": ua})
  295. self.requests_kwargs.update(headers=headers)
  296. else:
  297. self.requests_kwargs.setdefault(
  298. "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
  299. )
  300. # 代理
  301. proxies = self.requests_kwargs.get("proxies", -1)
  302. if proxies == -1 and setting.PROXY_ENABLE and setting.JY_PROXY_URL:
  303. while True:
  304. proxies = proxy_pool.get_proxy_from_jyapi()
  305. if proxies:
  306. self.requests_kwargs.update(proxies=proxies)
  307. break
  308. else:
  309. log.debug("暂无可用代理 ...")
  310. if show_log:
  311. log.debug(
  312. """
  313. -------------- %srequest for ----------------
  314. url = %s
  315. method = %s
  316. body = %s
  317. """
  318. % (
  319. ""
  320. if not self.parser_name
  321. else "%s.%s "
  322. % (
  323. self.parser_name,
  324. (
  325. self.callback
  326. and callable(self.callback)
  327. and getattr(self.callback, "__name__")
  328. or self.callback
  329. )
  330. or "parse",
  331. ),
  332. self.url,
  333. method,
  334. self.requests_kwargs,
  335. )
  336. )
  337. use_session = (
  338. setting.USE_SESSION if self.use_session is None else self.use_session
  339. ) # self.use_session 优先级高
  340. use_ja3_session = (
  341. setting.USE_JA3_SESSION if self.use_ja3_session is None else self.use_ja3_session
  342. ) # self.use_ja3_session 优先级高
  343. use_session = use_session or use_ja3_session
  344. if self.render:
  345. # 使用request的user_agent、cookies、proxy
  346. user_agent = headers.get("User-Agent") or headers.get("user-agent")
  347. cookies = self.requests_kwargs.get("cookies")
  348. if cookies and isinstance(cookies, RequestsCookieJar):
  349. cookies = cookies.get_dict()
  350. if not cookies:
  351. cookie_str = headers.get("Cookie") or headers.get("cookie")
  352. if cookie_str:
  353. cookies = tools.get_cookies_from_str(cookie_str)
  354. browser_kwargs = dict(user_agent=user_agent, proxy=self.proxy())
  355. browser = self._webdriver_pool.get(**browser_kwargs)
  356. try:
  357. browser.get(self.url)
  358. if cookies:
  359. browser.cookies = cookies
  360. if self.render_time:
  361. tools.delay_time(self.render_time)
  362. html = browser.page_source
  363. response = Response.from_dict({
  364. "url": browser.current_url,
  365. "cookies": browser.cookies,
  366. "_content": html.encode(),
  367. "status_code": 200,
  368. "elapsed": 666,
  369. "headers": {
  370. "User-Agent": browser.execute_script("return navigator.userAgent"),
  371. "Cookie": tools.cookies2str(browser.cookies),
  372. },
  373. })
  374. response.browser = browser
  375. except Exception as e:
  376. self._webdriver_pool.remove(browser)
  377. raise e
  378. elif use_session:
  379. response = self._session.request(method, self.url, **self.requests_kwargs)
  380. response = Response(response)
  381. elif self.splash:
  382. headers = self.requests_kwargs.get("headers")
  383. if not headers:
  384. headers = {"User-Agent": self.user_agent()}
  385. params = {
  386. "iframes": self.iframes,
  387. "wait": self.render_time,
  388. "html": 1,
  389. "proxy": self.proxy(),
  390. "url": self.url,
  391. }
  392. data = {"headers": [(key, val) for key, val in headers.items()]}
  393. resp = requests.get(setting.SPLASH_API, params=params, json=data)
  394. response = Response(resp)
  395. else:
  396. response = requests.request(method, self.url, **self.requests_kwargs)
  397. response = Response(response)
  398. if save_cached:
  399. self.save_cached(response, expire_time=self.__class__.cached_expire_time)
  400. return response
  401. def proxies(self):
  402. """
  403. Returns: {"https": "https://ip:port", "http": "http://ip:port"}
  404. """
  405. return self.requests_kwargs.get("proxies")
  406. def proxy(self):
  407. """
  408. Returns: ip:port
  409. """
  410. proxies = self.requests_kwargs.get("proxies")
  411. if proxies:
  412. return re.sub(
  413. "http.*?//", "", proxies.get("http", "") or proxies.get("https", "")
  414. )
  415. def user_agent(self):
  416. headers = self.requests_kwargs.get("headers")
  417. if headers:
  418. return headers.get("user_agent") or headers.get("User-Agent")
  419. @property
  420. def fingerprint(self):
  421. """
  422. request唯一表识
  423. @return:
  424. """
  425. url = self.__dict__.get("url", "")
  426. # url 归一化
  427. url = tools.canonicalize_url(url)
  428. args = [url]
  429. for arg in ["params", "data", "files", "auth", "cert", "json"]:
  430. if self.requests_kwargs.get(arg):
  431. args.append(self.requests_kwargs.get(arg))
  432. return tools.get_md5(*args)
  433. @property
  434. def _cache_db(self):
  435. if not self.__class__.cache_db:
  436. self.__class__.cache_db = RedisDB()
  437. return self.__class__.cache_db
  438. @property
  439. def _cached_redis_key(self):
  440. if self.__class__.cached_redis_key:
  441. return (
  442. f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
  443. )
  444. else:
  445. return f"response_cached:test:{self.fingerprint}"
  446. def save_cached(self, response, expire_time=1200):
  447. """
  448. 使用redis保存response 用于调试 不用每回都下载
  449. @param response:
  450. @param expire_time: 过期时间
  451. @return:
  452. """
  453. self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
  454. def get_response_from_cached(self, save_cached=True):
  455. """
  456. 从缓存中获取response
  457. 注意:
  458. 属性值为空:
  459. -raw : urllib3.response.HTTPResponse
  460. -connection:requests.adapters.HTTPAdapter
  461. -history
  462. 属性含义改变:
  463. - request 由requests 改为Request
  464. @param: save_cached 当无缓存 直接下载 下载完是否保存缓存
  465. @return:
  466. """
  467. response_dict = self._cache_db.strget(self._cached_redis_key)
  468. if not response_dict:
  469. log.info("无response缓存 重新下载")
  470. response_obj = self.get_response(save_cached=save_cached)
  471. else:
  472. response_dict = eval(response_dict)
  473. response_obj = Response.from_dict(response_dict)
  474. return response_obj
  475. def del_response_cached(self):
  476. self._cache_db.clear(self._cached_redis_key)
  477. @classmethod
  478. def from_dict(cls, request_dict):
  479. for key, value in request_dict.items():
  480. if isinstance(value, bytes): # 反序列化 如item
  481. request_dict[key] = tools.loads_obj(value)
  482. return cls(**request_dict)
  483. def copy(self):
  484. return self.__class__.from_dict(copy.deepcopy(self.to_dict))