# request.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2018-07-25 11:49:08
  4. ---------
  5. @summary: 请求结构体
  6. ---------
  7. @author: Boris
  8. @email: boris_liu@foxmail.com
  9. """
  10. import requests
  11. from func_timeout import func_set_timeout, FunctionTimedOut
  12. from requests.adapters import HTTPAdapter
  13. from requests.cookies import RequestsCookieJar
  14. from requests.packages.urllib3.exceptions import InsecureRequestWarning
  15. import feapder.setting as setting
  16. import feapder.utils.tools as tools
  17. from feapder.db.redisdb import RedisDB
  18. from feapder.network import user_agent
  19. from feapder.network.proxy_pool import ProxyPool
  20. from feapder.network.response import Response
  21. from feapder.utils.log import Log
  22. from feapder.utils.webdriver import WebDriverPool
  23. log = Log()
  24. # 屏蔽warning信息
  25. requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class Request(object):
    # Shared requests.Session, created lazily in _session and reused by all instances
    session = None
    # Shared selenium driver pool, created lazily in _webdriver_pool
    webdriver_pool: WebDriverPool = None
    # Module providing random User-Agent strings (feapder.network.user_agent)
    user_agent_pool = user_agent
    cache_db = None  # redis / pika
    cached_redis_key = None  # namespace for cached responses: response_cached:cached_redis_key:md5
    cached_expire_time = 1200  # cache TTL in seconds
    local_filepath = None
    oss_handler = None

    # Keyword arguments forwarded verbatim to requests.request().
    __REQUEST_ATTRS__ = {
        # 'method', 'url' are required and passed explicitly, never via **kwargs
        "params",
        "data",
        "headers",
        "cookies",
        "files",
        "auth",
        "timeout",
        "allow_redirects",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "json",
    }

    # Default values for framework-level attributes; to_dict omits attributes
    # that still equal these defaults to keep the serialized form compact.
    DEFAULT_KEY_VALUE = dict(
        url="",
        retry_times=0,
        priority=300,
        parser_name=None,
        callback=None,
        filter_repeat=True,
        auto_request=True,
        request_sync=False,
        use_session=None,
        random_user_agent=True,
        download_midware=None,
        is_abandoned=False,
        render=False,
        render_time=0,
    )
    def __init__(
        self,
        url="",
        retry_times=0,
        priority=300,
        parser_name=None,
        callback=None,
        filter_repeat=True,
        auto_request=True,
        request_sync=False,
        use_session=None,
        random_user_agent=True,
        download_midware=None,
        is_abandoned=False,
        render=False,
        render_time=0,
        splash=False,
        iframes=0,
        **kwargs,
    ):
        """
        @summary: Request parameters
        ---------
        framework parameters
        @param url: url to crawl
        @param retry_times: current retry count
        @param priority: priority, smaller runs sooner, default 300
        @param parser_name: name of the class holding the callback, defaults to the current class
        @param callback: callback, a function or a function name (for a cross-class callback set parser_name to the target class and callback to that class' method name)
        @param filter_repeat: deduplicate or not (True/False); only effective when REQUEST_FILTER_ENABLE is True in setting; default True
        @param auto_request: download the page automatically, default True; when False the response is empty and you fetch the page yourself
        @param request_sync: download synchronously; default asynchronous. Useful when the url expires quickly: the yielded request is served immediately instead of queueing
        @param use_session: whether to use a session
        @param random_user_agent: randomize the User-Agent (True/False); only effective when RANDOM_HEADERS is True in setting; default True
        @param download_midware: download middleware; defaults to the parser's download_midware
        @param is_abandoned: give up retrying when an exception occurs, True/False, default False
        @param render: render with a browser or not
        @param render_time: render duration — wait this long after opening the page before grabbing the source
        @param splash: render through the splash service instead (see get_response)
        @param iframes: splash option: also fetch child iframes
        --
        the parameters below behave exactly like their requests counterparts
        @param method: request method, e.g. POST or GET; by default inferred from whether data is present
        @param params: query parameters
        @param data: request body
        @param json: request json string, same as json.dumps(data)
        @param headers:
        @param cookies: dict or CookieJar object
        @param files:
        @param auth:
        @param timeout: (float or tuple) timeout waiting for server data: a float, or a (connect timeout, read timeout) tuple
        @param allow_redirects: Boolean. True allows following redirects of POST/PUT/DELETE
        @param proxies: proxy, {"http":"http://xxx", "https":"https://xxx"}
        @param verify: verify the SSL certificate when True
        @param stream: when False the response body is downloaded immediately
        @param cert:
        --
        @param **kwargs: anything else, e.g. Request(item=item) makes request.item available
        ---------
        @result:
        """
        # NOTE: every plain assignment below goes through __setattr__, which
        # mirrors requests-level attributes into self.requests_kwargs.
        self.url = url
        self.retry_times = retry_times
        self.priority = priority
        self.parser_name = parser_name
        self.callback = callback
        self.filter_repeat = filter_repeat
        self.auto_request = auto_request
        self.request_sync = request_sync
        self.use_session = use_session
        self.random_user_agent = random_user_agent
        self.download_midware = download_midware
        self.is_abandoned = is_abandoned
        self.render = render
        self.splash = splash
        self.iframes = iframes
        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)

        self.requests_kwargs = {}
        for key, value in kwargs.items():
            if key in self.__class__.__REQUEST_ATTRS__:  # pick out requests-level kwargs
                self.requests_kwargs[key] = value
            # everything in kwargs becomes an attribute (bypasses __setattr__)
            self.__dict__[key] = value
  148. def __repr__(self):
  149. try:
  150. return "<Request {}>".format(self.url)
  151. except:
  152. return "<Request {}>".format(str(self.to_dict)[:40])
  153. def __setattr__(self, key, value):
  154. """
  155. 针对 request.xxx = xxx 的形式,更新reqeust及内部参数值
  156. @param key:
  157. @param value:
  158. @return:
  159. """
  160. self.__dict__[key] = value
  161. if key in self.__class__.__REQUEST_ATTRS__:
  162. self.requests_kwargs[key] = value
  163. def __lt__(self, other):
  164. return self.priority < other.priority
    @property
    def _session(self):
        # self.use_session takes precedence over the global setting
        use_session = (
            setting.USE_SESSION if self.use_session is None else self.use_session
        )
        if use_session and not self.__class__.session:
            # One session shared by every Request instance
            self.__class__.session = requests.Session()
            # pool_connections: number of cached urllib3 connection pools;
            # pool_maxsize: max connections kept in each pool
            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
            # The adapter serves every request on this session whose URL starts
            # with the given prefix. NOTE: the prefix "http" also matches
            # "https://..." URLs, so this single mount covers both schemes.
            self.__class__.session.mount("http", http_adapter)

        return self.__class__.session
  177. @property
  178. def _webdriver_pool(self):
  179. if not self.__class__.webdriver_pool:
  180. self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
  181. return self.__class__.webdriver_pool
    @property
    def to_dict(self):
        """
        Serialize this request into a plain dict (for the redis task queue).
        NOTE: has side effects — self.callback and self.download_midware are
        replaced by their __name__ strings so they can be stored.
        """
        request_dict = {}

        self.callback = (
            getattr(self.callback, "__name__")
            if callable(self.callback)
            else self.callback
        )
        self.download_midware = (
            getattr(self.download_midware, "__name__")
            if callable(self.download_midware)
            else self.download_midware
        )

        for key, value in self.__dict__.items():
            # Skip attributes still equal to their default, and the internal
            # requests_kwargs mirror (its entries are stored as attributes).
            # Precedence: (key in defaults and default == value) or key == ...
            if (
                key in self.__class__.DEFAULT_KEY_VALUE
                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
                or key == "requests_kwargs"
            ):
                continue

            if key in self.__class__.__REQUEST_ATTRS__:
                # requests kwargs may keep richer containers as-is
                if not isinstance(
                    value, (bytes, bool, float, int, str, tuple, list, dict)
                ):
                    value = tools.dumps_obj(value)  # serialize everything else
            else:
                if not isinstance(value, (bytes, bool, float, int, str)):
                    value = tools.dumps_obj(value)

            request_dict[key] = value

        return request_dict
  212. @property
  213. def callback_name(self):
  214. return (
  215. getattr(self.callback, "__name__")
  216. if callable(self.callback)
  217. else self.callback
  218. )
    @func_set_timeout(30)  # hard 30s cap on the whole download (raises FunctionTimedOut)
    def get_response(self, save_cached=False):
        """
        Download the page and return a Response with selector support.
        @param save_cached: cache the response so debugging runs need not re-download
        @return: Response
        """
        # Default timeout
        self.requests_kwargs.setdefault(
            "timeout", setting.REQUEST_TIMEOUT
        )  # connect=22 read=22

        # stream: by default requests downloads the body immediately. stream=True
        # postpones the download until Response.content is accessed (only headers
        # are fetched). Drawback: the connection is not returned to the pool until
        # all data is consumed or Response.close is called.
        self.requests_kwargs.setdefault("stream", True)

        # Disable certificate verification
        self.requests_kwargs.setdefault("verify", False)

        # Request method: default POST when body data is present, otherwise GET
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"

        # Random user-agent (only when the caller supplied none)
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.render:  # rendering: prefer the ua configured in WEBDRIVER
                ua = setting.WEBDRIVER.get(
                    "user_agent"
                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
            else:
                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update({"User-Agent": ua})
                self.requests_kwargs.update(headers=headers)
            else:
                self.requests_kwargs.setdefault(
                    "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
                )

        # Proxy. -1 is a sentinel for "caller passed no proxies at all".
        proxies = self.requests_kwargs.get("proxies", -1)
        if not self.render:
            if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
                # Busy-wait until the proxy service hands one out
                while True:
                    proxies = self.get_proxy()
                    if proxies:
                        self.requests_kwargs.update(proxies=proxies)
                        break
                    else:
                        log.debug("暂无可用代理 ...")

        log.debug(
            """
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """
            % (
                ""
                if not self.parser_name
                else "%s.%s "
                % (
                    self.parser_name,
                    (
                        self.callback
                        and callable(self.callback)
                        and getattr(self.callback, "__name__")
                        or self.callback
                    )
                    or "parse",
                ),
                self.url,
                method,
                self.requests_kwargs,
            )
        )

        # self.use_session takes precedence over the global setting
        use_session = (
            setting.USE_SESSION if self.use_session is None else self.use_session
        )

        if self.render:
            # Reuse the request's user_agent / cookies / proxy in the browser
            user_agent = headers.get("User-Agent") or headers.get("user-agent")

            cookies = self.requests_kwargs.get("cookies")
            print(cookies)  # NOTE(review): leftover debug print — consider log.debug
            if cookies and isinstance(cookies, RequestsCookieJar):
                cookies = cookies.get_dict()

            if not cookies:
                # Fall back to a Cookie header string, if any
                cookie_str = headers.get("Cookie") or headers.get("cookie")
                if cookie_str:
                    cookies = tools.get_cookies_from_str(cookie_str)

            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=False)
            try:
                if proxies:
                    # NOTE(review): in the render branch proxies may still be the
                    # -1 sentinel (extraction above is skipped when render=True),
                    # and -1 is truthy — confirm this is intended.
                    self.chage_ip(browser)
                browser.get(self.url)
                if cookies:
                    browser.cookies = cookies
                if self.render_time:
                    tools.delay_time(self.render_time)
                html = browser.page_source
                response = Response.from_dict(
                    {
                        "url": browser.current_url,
                        "cookies": browser.cookies,
                        "_content": html.encode(),
                        "status_code": 200,
                        "elapsed": 666,  # placeholder; no real timing for rendered pages
                        "headers": {
                            "User-Agent": browser.execute_script(
                                "return navigator.userAgent"
                            ),
                            "Cookie": tools.cookies2str(browser.cookies),
                        },
                    }
                )
                response.browser = browser
            except Exception as e:
                # Drop the (possibly broken) driver so the pool replaces it
                self._webdriver_pool.remove(browser)
                raise e

        elif use_session:
            response = self._session.request(method, self.url, **self.requests_kwargs)
            response = Response(response)

        elif self.splash:
            # Render through the splash HTTP service instead of a local browser
            resp = requests.get(setting.JIANYU_SPLASH_URL, params={
                'iframes': self.iframes,
                'wait': self.render_time,
                'html': 1,
                'proxy': self.get_proxy().get("http"),
                'url': self.url
            })
            response = Response(resp)
            # (removed: a large commented-out alternative that parsed the splash
            # JSON payload and stitched child-iframe html into the response)

        else:
            response = requests.request(method, self.url, **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response, expire_time=self.__class__.cached_expire_time)

        log.info("requests", extra={"url": response.url, "code": response.status_code})
        return response
  389. def proxies(self):
  390. """
  391. Returns: {"https": "https://ip:port", "http": "http://ip:port"}
  392. """
  393. return self.requests_kwargs.get("proxies")
  394. def proxy(self):
  395. """
  396. Returns: ip:port
  397. """
  398. proxies = self.proxies()
  399. if proxies:
  400. return proxies.get("http", "").strip("http://") or proxies.get(
  401. "https", ""
  402. ).strip("https://")
  403. def get_proxy(self):
  404. headers = {
  405. "Authorization": setting.JIANYU_PROXY_AUTHOR
  406. }
  407. proxy = requests.get(setting.JIANYU_PROXY_URL, headers=headers).json()
  408. print(f"切换代理:{proxy.get('data')}")
  409. return proxy.get("data")
    def chage_ip(self, browser):
        """
        (sic: "change_ip") Switch the browser onto a freshly extracted proxy by
        rewriting Firefox's proxy preferences through about:config.
        @param browser: a selenium Firefox driver; uses find_element_by_id,
                        i.e. the selenium < 4 API — TODO confirm version
        @return: None
        """
        ip = self.get_proxy().get("http")  # ip format "127.0.0.1:80"
        ip = ip.split("//")[-1]  # drop the "http://" scheme if present
        browser.get("about:config")
        browser.find_element_by_id("warningButton").click()  # dismiss the warning page
        # JS that rewrites the proxy prefs in place.
        # NOTE(review): the port is passed as a quoted "%s" (a JS string) to
        # setIntPref — Firefox appears to coerce it, but confirm.
        setupScript = '''var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
                            prefs.setIntPref("network.proxy.type", 1);
                            prefs.setCharPref("network.proxy.socks", "%s");
                            prefs.setIntPref("network.proxy.socks_port", "%s");
                            ''' % (
            ip.split(':')[0], ip.split(':')[1])
        # Execute the js
        browser.execute_script(setupScript)
  424. def user_agent(self):
  425. headers = self.requests_kwargs.get("headers")
  426. if headers:
  427. return headers.get("user_agent") or headers.get("User-Agent")
  428. @property
  429. def fingerprint(self):
  430. """
  431. request唯一表识
  432. @return:
  433. """
  434. url = self.__dict__.get("url", "")
  435. # url 归一化
  436. url = tools.canonicalize_url(url)
  437. args = [url]
  438. for arg in ["params", "data", "files", "auth", "cert", "json"]:
  439. if self.requests_kwargs.get(arg):
  440. args.append(self.requests_kwargs.get(arg))
  441. return tools.get_md5(*args)
  442. @property
  443. def _cache_db(self):
  444. if not self.__class__.cache_db:
  445. self.__class__.cache_db = RedisDB() # .from_url(setting.pika_spider_1_uri)
  446. return self.__class__.cache_db
  447. @property
  448. def _cached_redis_key(self):
  449. if self.__class__.cached_redis_key:
  450. return (
  451. f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
  452. )
  453. else:
  454. return f"response_cached:test:{self.fingerprint}"
    def save_cached(self, response, expire_time=1200):
        """
        Cache the response in redis for debugging, so repeated runs
        don't have to re-download.
        @param response: Response to cache (stored via its to_dict)
        @param expire_time: TTL in seconds
        @return:
        """
        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
    def get_response_from_cached(self, save_cached=True):
        """
        Fetch the response from the cache; on a miss, download it instead.
        Note:
            attributes that are empty on a cached response:
                - raw: urllib3.response.HTTPResponse
                - connection: requests.adapters.HTTPAdapter
                - history
            attributes whose meaning changes:
                - request: is a (feapder) Request rather than requests' request
        @param save_cached: on a cache miss, whether to cache after downloading
        @return: Response, or None when the download timed out
        """
        response_dict = self._cache_db.strget(self._cached_redis_key)
        if not response_dict:
            log.info("无response缓存	重新下载")
            try:
                # get_response is wrapped with @func_set_timeout(30)
                response_obj = self.get_response(save_cached=save_cached)
            except FunctionTimedOut:
                response_obj = None
                log.info("请求超时")
                log.info("requests", extra={"url": self.url, "code": 0})
        else:
            # NOTE(review): eval() on cached data — only acceptable because the
            # cache is written by save_cached from our own to_dict; never point
            # this at redis content you don't control.
            response_dict = eval(response_dict)
            response_obj = Response.from_dict(response_dict)
        return response_obj
    def del_response_cached(self):
        # Remove this request's cached response from redis
        self._cache_db.clear(self._cached_redis_key)
  491. @classmethod
  492. def from_dict(cls, request_dict):
  493. for key, value in request_dict.items():
  494. if isinstance(value, bytes): # 反序列化 如item
  495. request_dict[key] = tools.loads_obj(value)
  496. return cls(**request_dict)
  497. def copy(self):
  498. return self.__class__.from_dict(self.to_dict)