# -*- coding: utf-8 -*-
# cookie_pool.py
"""
Created on 2018/12/27 11:32 AM
---------
@summary: cookie pool
---------
@author: Boris
@email: boris_liu@foxmail.com
"""
import abc
import datetime
import random
import time
import warnings
from enum import Enum, unique

try:
    # The ABC aliases were removed from ``collections`` in Python 3.10.
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable

import feapder.utils.tools as tools
from feapder import setting
from feapder.db.mysqldb import MysqlDB
from feapder.db.redisdb import RedisDB
from feapder.utils import metrics
from feapder.utils.log import log
from feapder.utils.redis_lock import RedisLock
from feapder.utils.tools import send_msg
from feapder.utils.webdriver import WebDriver


  26. class CookiePoolInterface(metaclass=abc.ABCMeta):
  27. """
  28. cookie pool interface
  29. """
  30. @abc.abstractmethod
  31. def create_cookie(self, *args, **kwargs):
  32. raise NotImplementedError
  33. @abc.abstractmethod
  34. def get_cookie(self, *args, **kwargs):
  35. raise NotImplementedError
  36. @abc.abstractmethod
  37. def del_cookie(self, *args, **kwargs):
  38. raise NotImplementedError
  39. @abc.abstractmethod
  40. def run(self):
  41. raise NotImplementedError
  42. class PageCookiePool(CookiePoolInterface):
  43. """
  44. 由页面产生的cookie 不需要用户登陆
  45. """
  46. def __init__(
  47. self,
  48. redis_key,
  49. page_url=None,
  50. min_cookies=10000,
  51. must_contained_keys=(),
  52. keep_alive=False,
  53. **kwargs,
  54. ):
  55. """
  56. @param redis_key: 项目名
  57. @param page_url: 生产cookie的url
  58. @param min_cookies: 最小cookie数
  59. @param must_contained_keys: cookie 必须包含的key
  60. @param keep_alive: 当cookie数量足够是是否保持随时待命,生产cookie的状态。False为否,满足则退出
  61. ---
  62. @param kwargs: WebDriver的一些参数
  63. load_images: 是否加载图片
  64. user_agent_pool: user-agent池 为None时不使用
  65. proxies_pool: ;代理池 为None时不使用
  66. headless: 是否启用无头模式
  67. driver_type: web driver 类型
  68. timeout: 请求超时时间 默认16s
  69. window_size: 屏幕分辨率 (width, height)
  70. """
  71. self._redisdb = RedisDB()
  72. self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
  73. self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
  74. redis_key
  75. ) # 存储上一次统计cookie 数量的时间,格式为 时间戳:数量
  76. self._page_url = page_url
  77. self._min_cookies = min_cookies
  78. self._must_contained_keys = must_contained_keys
  79. self._keep_alive = keep_alive
  80. self._kwargs = kwargs
  81. self._kwargs.setdefault("load_images", False)
  82. self._kwargs.setdefault("headless", True)
  83. def create_cookie(self):
  84. """
  85. 可能会重写
  86. @return:
  87. """
  88. with WebDriver(**self._kwargs) as driver:
  89. driver.get(self._page_url)
  90. cookies = driver.get_cookies()
  91. cookies_json = {}
  92. for cookie in cookies:
  93. cookies_json[cookie["name"]] = cookie["value"]
  94. for key in self._must_contained_keys:
  95. if key not in cookies_json:
  96. break
  97. else:
  98. return cookies_json
  99. log.error("获取cookie失败 cookies = {}".format(cookies_json))
  100. return None
  101. def add_cookies(self, cookies):
  102. log.info("添加cookie {}".format(cookies))
  103. self._redisdb.lpush(self._tab_cookie_pool, cookies)
  104. def run(self):
  105. while True:
  106. try:
  107. now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
  108. need_cookie_count = self._min_cookies - now_cookie_count
  109. if need_cookie_count > 0:
  110. log.info(
  111. "当前cookie数为 {} 小于 {}, 生产cookie".format(
  112. now_cookie_count, self._min_cookies
  113. )
  114. )
  115. try:
  116. cookies = self.create_cookie()
  117. if cookies:
  118. self.add_cookies(cookies)
  119. except Exception as e:
  120. log.exception(e)
  121. else:
  122. log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
  123. # 判断cookie池近一分钟数量是否有变化,无变化则认为爬虫不再用了,退出
  124. last_count_info = self._redisdb.strget(
  125. self._tab_cookie_pool_last_count
  126. )
  127. if not last_count_info:
  128. self._redisdb.strset(
  129. self._tab_cookie_pool_last_count,
  130. "{}:{}".format(time.time(), now_cookie_count),
  131. )
  132. else:
  133. last_time, last_count = last_count_info.split(":")
  134. last_time = float(last_time)
  135. last_count = int(last_count)
  136. if time.time() - last_time > 60:
  137. if now_cookie_count == last_count:
  138. log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
  139. break
  140. else:
  141. self._redisdb.strset(
  142. self._tab_cookie_pool_last_count,
  143. "{}:{}".format(time.time(), now_cookie_count),
  144. )
  145. if self._keep_alive:
  146. log.info("sleep 10")
  147. tools.delay_time(10)
  148. else:
  149. break
  150. except Exception as e:
  151. log.exception(e)
  152. tools.delay_time(1)
  153. def get_cookie(self, wait_when_null=True):
  154. while True:
  155. try:
  156. cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
  157. if not cookie_info and wait_when_null:
  158. log.info("暂无cookie 生产中...")
  159. self._keep_alive = False
  160. self._min_cookies = 1
  161. with RedisLock(
  162. key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
  163. ) as _lock:
  164. if _lock.locked:
  165. self.run()
  166. continue
  167. return eval(cookie_info) if cookie_info else {}
  168. except Exception as e:
  169. log.exception(e)
  170. tools.delay_time(1)
  171. def del_cookie(self, cookies):
  172. self._redisdb.lrem(self._tab_cookie_pool, cookies)
  173. class User:
  174. def __init__(self, username, cookie):
  175. self.username = username
  176. self.cookie = cookie
  177. class LoginCookiePool(CookiePoolInterface):
  178. """
  179. 需要登陆的cookie池, 用户账号密码等信息用mysql保存
  180. """
  181. def __init__(
  182. self,
  183. redis_key,
  184. *,
  185. table_userbase,
  186. login_state_key="login_state",
  187. lock_state_key="lock_state",
  188. username_key="username",
  189. password_key="password",
  190. login_retry_times=10,
  191. ):
  192. """
  193. @param redis_key: 项目名
  194. @param table_userbase: 用户表名
  195. @param login_state_key: 登录状态列名
  196. @param lock_state_key: 封锁状态列名
  197. @param username_key: 登陆名列名
  198. @param password_key: 密码列名
  199. @param login_retry_times: 登陆失败重试次数
  200. """
  201. self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
  202. self._login_retry_times = login_retry_times
  203. self._table_userbase = table_userbase
  204. self._login_state_key = login_state_key
  205. self._lock_state_key = lock_state_key
  206. self._username_key = username_key
  207. self._password_key = password_key
  208. self._redisdb = RedisDB()
  209. self._mysqldb = ()
  210. self.create_userbase()
  211. def create_userbase(self):
  212. sql = f"""
  213. CREATE TABLE IF NOT EXISTS `{self._table_userbase}` (
  214. `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  215. `{self._username_key}` varchar(50) DEFAULT NULL COMMENT '用户名',
  216. `{self._password_key}` varchar(255) DEFAULT NULL COMMENT '密码',
  217. `{self._login_state_key}` int(11) DEFAULT '0' COMMENT '登录状态(0未登录 1已登录)',
  218. `{self._lock_state_key}` int(11) DEFAULT '0' COMMENT '账号是否被封(0 未封 1 被封)',
  219. PRIMARY KEY (`id`),
  220. UNIQUE KEY `username` (`username`) USING BTREE
  221. ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
  222. """
  223. self._mysqldb.execute(sql)
  224. def create_cookie(self, username, password):
  225. """
  226. 创建cookie
  227. @param username: 用户名
  228. @param password: 密码
  229. @return: return cookie / None
  230. """
  231. raise NotImplementedError
  232. def get_user_info(self):
  233. """
  234. 返回用户信息
  235. @return: yield username, password
  236. """
  237. sql = "select {username_key}, {password_key} from {table_userbase} where {lock_state_key} != 1 and {login_state_key} != 1".format(
  238. username_key=self._username_key,
  239. password_key=self._password_key,
  240. table_userbase=self._table_userbase,
  241. lock_state_key=self._lock_state_key,
  242. login_state_key=self._login_state_key,
  243. )
  244. return self._mysqldb.find(sql)
  245. def handle_login_failed_user(self, username, password):
  246. """
  247. 处理登录失败的user
  248. @param username:
  249. @param password:
  250. @return:
  251. """
  252. pass
  253. def handel_exception(self, e):
  254. """
  255. 处理异常
  256. @param e:
  257. @return:
  258. """
  259. log.exception(e)
  260. def save_cookie(self, username, cookie):
  261. user_cookie = {"username": username, "cookie": cookie}
  262. self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
  263. sql = "update {table_userbase} set {login_state_key} = 1 where {username_key} = '{username}'".format(
  264. table_userbase=self._table_userbase,
  265. login_state_key=self._login_state_key,
  266. username_key=self._username_key,
  267. username=username,
  268. )
  269. self._mysqldb.update(sql)
  270. def get_cookie(self, wait_when_null=True) -> User:
  271. while True:
  272. try:
  273. user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
  274. if not user_cookie and wait_when_null:
  275. log.info("暂无cookie 生产中...")
  276. self.login()
  277. continue
  278. if user_cookie:
  279. user_cookie = eval(user_cookie)
  280. return User(**user_cookie)
  281. return None
  282. except Exception as e:
  283. log.exception(e)
  284. tools.delay_time(1)
  285. def del_cookie(self, user: User):
  286. """
  287. 删除失效的cookie
  288. @param user:
  289. @return:
  290. """
  291. user_info = {"username": user.username, "cookie": user.cookie}
  292. self._redisdb.lrem(self._tab_cookie_pool, user_info)
  293. sql = "update {table_userbase} set {login_state_key} = 0 where {username_key} = '{username}'".format(
  294. table_userbase=self._table_userbase,
  295. login_state_key=self._login_state_key,
  296. username_key=self._username_key,
  297. username=user.username,
  298. )
  299. self._mysqldb.update(sql)
  300. def user_is_locked(self, user: User):
  301. sql = "update {table_userbase} set {lock_state_key} = 1 where {username_key} = '{username}'".format(
  302. table_userbase=self._table_userbase,
  303. lock_state_key=self._lock_state_key,
  304. username_key=self._username_key,
  305. username=user.username,
  306. )
  307. self._mysqldb.update(sql)
  308. def run(self):
  309. with RedisLock(
  310. key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
  311. ) as _lock:
  312. if _lock.locked:
  313. user_infos = self.get_user_info()
  314. if not isinstance(user_infos, Iterable):
  315. raise ValueError("get_user_info 返回值必须可迭代")
  316. if not user_infos:
  317. log.info("无可用用户")
  318. for username, password in user_infos:
  319. for i in range(self._login_retry_times):
  320. try:
  321. cookie = self.create_cookie(username, password)
  322. if cookie:
  323. self.save_cookie(username, cookie)
  324. else:
  325. self.handle_login_failed_user(username, password)
  326. break
  327. except Exception as e:
  328. self.handel_exception(e)
  329. else:
  330. self.handle_login_failed_user(username, password)
  331. login = run
  332. @unique
  333. class LimitTimesUserStatus(Enum):
  334. # 使用状态
  335. USED = "used"
  336. SUCCESS = "success"
  337. OVERDUE = "overdue" # cookie 过期
  338. SLEEP = "sleep"
  339. EXCEPTION = "exception"
  340. # 登陆状态
  341. LOGIN_SUCCESS = "login_success"
  342. LOGIN_FALIED = "login_failed"
  343. class LimitTimesUser:
  344. """
  345. 有次数限制的账户
  346. 基于本地做的缓存,不支持多进程调用
  347. """
  348. ACCOUNT_INFO_KEY = "accounts:h_account_info" # 存储cookie的redis key
  349. SITE_NAME = "" # 网站名
  350. redisdb = None
  351. def __init__(
  352. self,
  353. username,
  354. password,
  355. max_search_times,
  356. proxies=None,
  357. search_interval=0,
  358. **kwargs,
  359. ):
  360. """
  361. @param username:
  362. @param password:
  363. @param max_search_times:
  364. @param proxies:
  365. @param search_interval: 调用时间间隔。 支持元组 指定间隔的时间范围 如(5,10)即5到10秒;或直接传整数
  366. """
  367. self.__dict__.update(kwargs)
  368. self.username = username
  369. self.password = password
  370. self.max_search_times = max_search_times
  371. self.proxies = proxies
  372. self.search_interval = search_interval
  373. self.delay_use = 0 # 延时使用,用于等待解封的用户
  374. if isinstance(search_interval, (tuple, list)):
  375. if len(search_interval) != 2:
  376. raise ValueError("search_interval 需传递两个值的元组或列表。如(5,10)即5到10秒")
  377. self.used_for_time_length = (
  378. search_interval[1] * 5
  379. ) # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
  380. else:
  381. self.used_for_time_length = (
  382. search_interval * 5
  383. ) # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
  384. self.account_info = {
  385. "login_time": 0,
  386. "cookies": {},
  387. "search_times": 0,
  388. "last_search_time": 0,
  389. "used_for_spider_name": None, # 只被某个爬虫使用 其他爬虫不可使用
  390. "init_search_times_time": 0, # 初始化搜索次数的时间
  391. }
  392. if not self.__class__.redisdb:
  393. self.__class__.redisdb = RedisDB()
  394. self.sync_account_info_from_redis()
  395. self.__init_metrics()
  396. def __init_metrics(self):
  397. """
  398. 初始化打点系统
  399. @return:
  400. """
  401. metrics.init(**setting.METRICS_OTHER_ARGS)
  402. def record_user_status(self, status: LimitTimesUserStatus):
  403. metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
  404. def __repr__(self):
  405. return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
  406. def __eq__(self, other):
  407. return self.username == other.username
  408. def sync_account_info_from_redis(self):
  409. account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
  410. if account_info:
  411. account_info = eval(account_info)
  412. self.account_info.update(account_info)
  413. @property
  414. def cookies(self):
  415. cookies = self.account_info.get("cookies")
  416. return cookies
  417. def set_cookies(self, cookies):
  418. self.account_info["cookies"] = cookies
  419. return self.redisdb.hset(
  420. self.ACCOUNT_INFO_KEY, self.username, self.account_info
  421. )
  422. def set_login_time(self, login_time=None):
  423. self.account_info["login_time"] = login_time or time.time()
  424. return self.redisdb.hset(
  425. self.ACCOUNT_INFO_KEY, self.username, self.account_info
  426. )
  427. def get_login_time(self):
  428. return self.account_info.get("login_time")
  429. def is_time_to_login(self):
  430. return time.time() - self.get_login_time() > 40 * 60
  431. def get_last_search_time(self):
  432. return self.account_info.get("last_search_time", 0)
  433. def is_time_to_search(self):
  434. if self.delay_use:
  435. is_time = time.time() - self.get_last_search_time() > self.delay_use
  436. if is_time:
  437. self.delay_use = 0
  438. else:
  439. is_time = time.time() - self.get_last_search_time() > (
  440. random.randint(*self.search_interval)
  441. if isinstance(self.search_interval, (tuple, list))
  442. else self.search_interval
  443. )
  444. return is_time
  445. @property
  446. def used_for_spider_name(self):
  447. return self.account_info.get("used_for_spider_name")
  448. @used_for_spider_name.setter
  449. def used_for_spider_name(self, spider_name):
  450. self.account_info["used_for_spider_name"] = spider_name
  451. def update_status(self):
  452. """
  453. 更新search的一些状态
  454. @return:
  455. """
  456. self.account_info["search_times"] += 1
  457. self.account_info["last_search_time"] = time.time()
  458. return self.redisdb.hset(
  459. self.ACCOUNT_INFO_KEY, self.username, self.account_info
  460. )
  461. @property
  462. def search_times(self):
  463. init_search_times_time = self.account_info.get("init_search_times_time")
  464. current_time = time.time()
  465. if (
  466. current_time - init_search_times_time >= 86400
  467. ): # 如果距离上次初始化搜索次数时间大于1天,则搜索次数清清零
  468. self.account_info["search_times"] = 0
  469. self.account_info["init_search_times_time"] = current_time
  470. self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
  471. return self.account_info["search_times"]
  472. def is_overwork(self):
  473. if self.search_times > self.max_search_times:
  474. log.warning("账号 {} 请求次数超限制".format(self.username))
  475. return True
  476. return False
  477. def is_at_work_time(self):
  478. if datetime.datetime.now().hour in list(range(7, 23)):
  479. return True
  480. log.warning("账号 {} 不再工作时间内".format(self.username))
  481. return False
  482. def del_cookie(self):
  483. self.account_info["cookies"] = {}
  484. return self.redisdb.hset(
  485. self.ACCOUNT_INFO_KEY, self.username, self.account_info
  486. )
  487. def create_cookie(self):
  488. """
  489. 生产cookie 有异常需要抛出
  490. @return: cookie_dict
  491. """
  492. raise NotImplementedError
  493. def login(self):
  494. """
  495. @return: 1 成功 0 失败
  496. """
  497. try:
  498. # 预检查
  499. if not self.is_time_to_login():
  500. log.info("此账号尚未到登陆时间: {}".format(self.username))
  501. time.sleep(5)
  502. return 0
  503. cookies = self.create_cookie()
  504. if not cookies:
  505. raise Exception("登陆失败 未获取到合法cookie")
  506. if not isinstance(cookies, dict):
  507. raise Exception("cookie 必须为字典格式")
  508. # 保存cookie
  509. self.set_login_time()
  510. self.set_cookies(cookies)
  511. log.info("登录成功 {}".format(self.username))
  512. self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
  513. return 1
  514. except Exception as e:
  515. log.exception(e)
  516. send_msg(
  517. msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
  518. level="error",
  519. message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
  520. )
  521. log.info("登录失败 {}".format(self.username))
  522. self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
  523. return 0
  524. class LimitTimesUserPool:
  525. """
  526. 限制查询次数的用户的User pool
  527. 基于本地做的缓存,不支持多进程调用
  528. """
  529. LOAD_USER_INTERVAL = 60
  530. def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
  531. """
  532. @param accounts_dic: 账户信息字典
  533. {
  534. "15011300228": {
  535. "password": "300228",
  536. "proxies": {},
  537. "max_search_times": 500,
  538. "search_interval": 1, # 使用时间间隔
  539. # 其他携带信息
  540. }
  541. }
  542. @param limit_user_class: 用户重写的 limit_user_class
  543. @param support_more_client: 是否支持多客户端 即多线程 多进程模式 (可能在计数上及使用频率上有些误差)
  544. """
  545. self.accounts_dict = accounts_dict
  546. self.limit_user_class = limit_user_class
  547. self.limit_times_users = []
  548. self.current_user_index = -1
  549. self.support_more_client = support_more_client
  550. self.last_load_user_time = 0
  551. def __load_users(self, username=None):
  552. # 装载user
  553. log.info("更新可用用户")
  554. for _username, detail in self.accounts_dict.items():
  555. if username and username != _username:
  556. continue
  557. limit_times_users = self.limit_user_class(username=_username, **detail)
  558. if limit_times_users in self.limit_times_users:
  559. continue
  560. if limit_times_users.is_overwork():
  561. continue
  562. else:
  563. if (
  564. limit_times_users.cookies or limit_times_users.login()
  565. ): # 如果有cookie 或者登陆成功 则添加到可用的user队列
  566. self.limit_times_users.append(limit_times_users)
  567. self.last_load_user_time = time.time()
  568. def get_user(
  569. self,
  570. username=None,
  571. used_for_spider_name=None,
  572. wait_when_null=True,
  573. not_limit_frequence=False,
  574. ) -> LimitTimesUser:
  575. """
  576. @params username: 获取指定的用户
  577. @params used_for_spider_name: 独享式使用,独享爬虫的名字。其他爬虫不可抢占
  578. @params wait_when_null: 无用户时是否等待
  579. @params not_limit_frequence: 不限制使用频率
  580. @return: LimitTimesUser
  581. """
  582. if not self.support_more_client:
  583. warnings.warn(
  584. "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存,不支持多进程或多线程",
  585. category=Warning,
  586. )
  587. self._is_show_warning = True
  588. while True:
  589. if (
  590. not self.limit_times_users
  591. or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
  592. ):
  593. self.__load_users(username)
  594. if not self.limit_times_users:
  595. log.warning("无可用的用户")
  596. if wait_when_null:
  597. time.sleep(1)
  598. continue
  599. else:
  600. return None
  601. self.current_user_index += 1
  602. self.current_user_index = self.current_user_index % len(
  603. self.limit_times_users
  604. )
  605. limit_times_user = self.limit_times_users[self.current_user_index]
  606. if self.support_more_client: # 需要先同步下最新数据
  607. limit_times_user.sync_account_info_from_redis()
  608. if username and limit_times_user.username != username:
  609. log.info(
  610. "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
  611. )
  612. time.sleep(1)
  613. continue
  614. # 独占式使用,若为其他爬虫,检查等待使用时间是否超过独占时间,若超过则可以使用
  615. if (
  616. limit_times_user.used_for_spider_name
  617. and limit_times_user.used_for_spider_name != used_for_spider_name
  618. ):
  619. wait_time = time.time() - limit_times_user.get_last_search_time()
  620. if wait_time < limit_times_user.used_for_time_length:
  621. log.info(
  622. "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
  623. limit_times_user.username,
  624. limit_times_user.used_for_spider_name,
  625. limit_times_user.used_for_time_length - wait_time,
  626. )
  627. )
  628. time.sleep(1)
  629. continue
  630. if (
  631. not limit_times_user.is_overwork()
  632. and limit_times_user.is_at_work_time()
  633. ):
  634. if not limit_times_user.cookies:
  635. self.limit_times_users.remove(limit_times_user)
  636. continue
  637. if not_limit_frequence or limit_times_user.is_time_to_search():
  638. limit_times_user.used_for_spider_name = used_for_spider_name
  639. limit_times_user.update_status()
  640. log.info("使用用户 {}".format(limit_times_user.username))
  641. limit_times_user.record_user_status(LimitTimesUserStatus.USED)
  642. return limit_times_user
  643. else:
  644. log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
  645. time.sleep(1)
  646. continue
  647. else:
  648. self.limit_times_users.remove(limit_times_user)
  649. self.current_user_index -= 1
  650. if not limit_times_user.is_at_work_time():
  651. log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
  652. if wait_when_null:
  653. time.sleep(30)
  654. continue
  655. else:
  656. return None
  657. def del_user(self, username):
  658. for limit_times_user in self.limit_times_users:
  659. if limit_times_user.username == username:
  660. limit_times_user.del_cookie()
  661. self.limit_times_users.remove(limit_times_user)
  662. limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
  663. self.__load_users(username)
  664. break
  665. def update_cookies(self, username, cookies):
  666. for limit_times_user in self.limit_times_users:
  667. if limit_times_user.username == username:
  668. limit_times_user.set_cookies(cookies)
  669. break
  670. def delay_use(self, username, delay_seconds):
  671. for limit_times_user in self.limit_times_users:
  672. if limit_times_user.username == username:
  673. limit_times_user.delay_use = delay_seconds
  674. limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
  675. break
  676. def record_success_user(self, username):
  677. for limit_times_user in self.limit_times_users:
  678. if limit_times_user.username == username:
  679. limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
  680. def record_exception_user(self, username):
  681. for limit_times_user in self.limit_times_users:
  682. if limit_times_user.username == username:
  683. limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)