|
@@ -15,39 +15,7 @@ import requests
|
|
|
|
|
|
from feapder import setting
|
|
|
from feapder.utils import tools
|
|
|
-from feapder.utils.log import log
|
|
|
-
|
|
|
-
|
|
|
def decrypt(input_str: str) -> str:
    """
    Decode a string written in a base64 variant that uses a shuffled alphabet.

    Every symbol of ``input_str`` is looked up in ``key`` to obtain a 6-bit
    value; the bits are concatenated and read back 8 at a time, and each
    resulting byte value is turned into the character with that code point.
    ``=`` characters are treated as padding and ignored, so the result does
    not depend on whether (or how much) padding is present.

    :param input_str: encoded text
    :return: decoded text, one character per decoded byte
    :raises ValueError: if ``input_str`` contains a character that is neither
        ``=`` nor part of the alphabet
    """
    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
    # Each symbol contributes its alphabet index as a zero-padded 6-bit field.
    bits = "".join("{:06b}".format(key.index(ch)) for ch in input_str if ch != "=")
    # Only whole bytes carry data; any trailing bits are encoder filler.
    # Deriving the cut from the bit count (instead of the number of "=")
    # keeps unpadded or mis-padded input from losing or corrupting the tail.
    usable = len(bits) - len(bits) % 8
    return "".join(chr(int(bits[i:i + 8], 2)) for i in range(0, usable, 8))
|
|
|
-
|
|
|
+from feapder.utils.log import log as logger
|
|
|
|
|
|
# 建立本地缓存代理文件夹
|
|
|
proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
|
|
@@ -55,17 +23,32 @@ if not os.path.exists(proxy_path):
|
|
|
os.mkdir(proxy_path)
|
|
|
|
|
|
|
|
|
-# def get_proxies_by_host(host, port):
|
|
|
-# proxy_id = "{}:{}".format(host, port)
|
|
|
-# return get_proxies_by_id(proxy_id)
|
|
|
def get_proxy_from_jyapi(timeout=5, default=None, show_error_log=False):
    """
    Fetch proxies from the Jianyu (JY) proxy API.

    Sends a GET request to ``setting.JY_PROXY_URL`` with
    ``setting.JY_PROXY_AUTHOR`` as the ``Authorization`` header and returns
    the ``data`` field of the JSON reply.

    @param timeout: request timeout, passed through to ``requests.get``
    @param default: value returned when the request fails or the reply
        cannot be used
    @param show_error_log: when true, log the exception with its traceback
    @return: the ``data`` field of the reply, or ``default``
    """
    try:
        response = requests.get(
            setting.JY_PROXY_URL,
            headers={"Authorization": setting.JY_PROXY_AUTHOR},
            timeout=timeout,
        )
    except requests.exceptions.RequestException as why:
        if show_error_log:
            logger.exception(why)
        return default

    try:
        return response.json()["data"]
    except (KeyError, TypeError, ValueError) as why:
        # ValueError: the body is not valid JSON.
        # TypeError: the JSON payload is not an object (e.g. a list or null).
        # KeyError: the object has no "data" field.
        # All three mean "no usable proxies", so fall back to the default
        # instead of letting the caller crash.
        if show_error_log:
            logger.exception(why)
        return default
|
|
|
|
|
|
|
|
|
def get_proxy_from_url(**kwargs):
|
|
@@ -122,17 +105,17 @@ def get_proxy_from_http(proxy_source_url, **kwargs):
|
|
|
response = requests.get(proxy_source_url, timeout=20)
|
|
|
# 改写:获取scocks代理的response处理
|
|
|
for proxy in response.json():
|
|
|
- host = decrypt(proxy['ip'])
|
|
|
- port = proxy['ports'][0]
|
|
|
- endTime = proxy['lifetime']
|
|
|
+ host = tools.decrypt(proxy["ip"])
|
|
|
+ port = proxy["ports"][0]
|
|
|
+ endTime = proxy["lifetime"]
|
|
|
pool.append(f"{host}:{port}&&{endTime}")
|
|
|
|
|
|
with open(os.path.join(proxy_path, filename), "w") as f:
|
|
|
- f.write('\n'.join(pool))
|
|
|
+ f.write("\n".join(pool))
|
|
|
return get_proxy_from_file(filename)
|
|
|
|
|
|
|
|
|
-def get_proxy_from_file(filename, **kwargs):
|
|
|
+def get_proxy_from_file(filename):
|
|
|
"""
|
|
|
从指定本地文件获取代理
|
|
|
文件格式
|
|
@@ -140,7 +123,6 @@ def get_proxy_from_file(filename, **kwargs):
|
|
|
ip:port:http
|
|
|
ip:port
|
|
|
:param filename:
|
|
|
- :param kwargs:
|
|
|
:return:
|
|
|
"""
|
|
|
proxies_list = []
|
|
@@ -208,27 +190,18 @@ def get_proxy_from_redis(proxy_source_url, **kwargs):
|
|
|
return proxies_list
|
|
|
|
|
|
|
|
|
-def check_proxy(
|
|
|
- ip="",
|
|
|
- port="",
|
|
|
- proxies=None,
|
|
|
- type=0,
|
|
|
- timeout=5,
|
|
|
- logger=None,
|
|
|
- show_error_log=True,
|
|
|
- **kwargs,
|
|
|
-):
|
|
|
+def check_proxy(ip="", port="", proxies=None, type=0, timeout=5, show_error_log=True):
|
|
|
"""
|
|
|
代理有效性检查
|
|
|
+
|
|
|
:param ip:
|
|
|
:param port:
|
|
|
+ :param proxies:
|
|
|
:param type: 0:socket 1:requests
|
|
|
:param timeout:
|
|
|
- :param logger:
|
|
|
+ :param show_error_log:
|
|
|
:return:
|
|
|
"""
|
|
|
- if not logger:
|
|
|
- logger = log
|
|
|
ok = 0
|
|
|
if type == 0 and ip and port:
|
|
|
# socket检测成功 不代表代理一定可用 Connection closed by foreign host. 这种情况就不行
|
|
@@ -249,17 +222,16 @@ def check_proxy(
|
|
|
"https": "socks5//{}:{}".format(ip, port),
|
|
|
}
|
|
|
try:
|
|
|
- # 改写:代理检测的url
|
|
|
- r = requests.get(
|
|
|
- "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
|
|
|
- )
|
|
|
+ r = requests.get("https://myip.ipip.net",
|
|
|
+ proxies=proxies,
|
|
|
+ timeout=timeout,
|
|
|
+ stream=True)
|
|
|
ok = 1
|
|
|
r.close()
|
|
|
except Exception as e:
|
|
|
if show_error_log:
|
|
|
- logger.debug(
|
|
|
- "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
|
|
|
- )
|
|
|
+ args = (e, ip, port, proxies)
|
|
|
+ logger.debug("check proxy failed: {} {}:{} {}".format(*args))
|
|
|
return ok
|
|
|
|
|
|
|
|
@@ -286,7 +258,6 @@ class ProxyItem(object):
|
|
|
:param max_proxy_use_num:
|
|
|
:param delay:
|
|
|
:param use_interval: 使用间隔 单位秒 默认不限制
|
|
|
- :param logger: 日志处理器 默认 log.get_logger()
|
|
|
:param kwargs:
|
|
|
"""
|
|
|
# {"http": ..., "https": ...}
|
|
@@ -322,9 +293,6 @@ class ProxyItem(object):
|
|
|
else:
|
|
|
self.proxy_id = self.proxy_ip_port
|
|
|
|
|
|
- # 日志处理器
|
|
|
- self.logger = log
|
|
|
-
|
|
|
def get_proxies(self):
|
|
|
self.use_num += 1
|
|
|
return self.proxies
|
|
@@ -343,18 +311,18 @@ class ProxyItem(object):
|
|
|
:return:
|
|
|
"""
|
|
|
if self.use_num > self.max_proxy_use_num > 0:
|
|
|
- self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
|
|
|
+ logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
|
|
|
return 0
|
|
|
if self.flag == -1:
|
|
|
- self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
|
|
|
+ logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
|
|
|
return 0
|
|
|
if self.delay > 0 and self.flag == 1:
|
|
|
if time.time() - self.flag_ts < self.delay:
|
|
|
- self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
|
|
|
+ logger.debug("代理被标记 1 延迟 %s" % self.proxies)
|
|
|
return 2
|
|
|
else:
|
|
|
self.flag = 0
|
|
|
- self.logger.debug("延迟代理释放: {}".format(self.proxies))
|
|
|
+ logger.debug("延迟代理释放: {}".format(self.proxies))
|
|
|
if self.use_interval:
|
|
|
if time.time() - self.use_ts < self.use_interval:
|
|
|
return 2
|
|
@@ -366,15 +334,14 @@ class ProxyItem(object):
|
|
|
proxies=self.proxies,
|
|
|
type=type,
|
|
|
timeout=self.valid_timeout,
|
|
|
- logger=self.logger,
|
|
|
)
|
|
|
else:
|
|
|
ok = 1
|
|
|
self.update_ts = time.time()
|
|
|
return ok
|
|
|
|
|
|
- @classmethod
|
|
|
- def parse_proxies(self, proxies):
|
|
|
+ @staticmethod
|
|
|
+ def parse_proxies(proxies):
|
|
|
"""
|
|
|
分解代理组成部分
|
|
|
:param proxies:
|
|
@@ -431,7 +398,6 @@ class ProxyPool(ProxyPoolBase):
|
|
|
:param reset_interval_max: 代理池重置间隔 最大间隔 默认2分钟
|
|
|
:param check_valid: 是否在获取代理时进行检测有效性
|
|
|
:param local_proxy_file_cache_timeout: 本地缓存的代理文件超时时间
|
|
|
- :param logger: 日志处理器 默认 log.get_logger()
|
|
|
:param kwargs: 其他的参数
|
|
|
"""
|
|
|
kwargs.setdefault("size", -1)
|
|
@@ -450,11 +416,9 @@ class ProxyPool(ProxyPoolBase):
|
|
|
self.proxy_source_url = [x for x in self.proxy_source_url if x]
|
|
|
self.proxy_source_url = list(set(self.proxy_source_url))
|
|
|
kwargs.update({"proxy_source_url": self.proxy_source_url})
|
|
|
- # 处理日志
|
|
|
- self.logger = kwargs.get("logger") or log
|
|
|
- kwargs["logger"] = self.logger
|
|
|
+
|
|
|
if not self.proxy_source_url:
|
|
|
- self.logger.warn("need set proxy_source_url or proxy_instance")
|
|
|
+ logger.warn("need set proxy_source_url or proxy_instance")
|
|
|
|
|
|
# 代理池重置间隔
|
|
|
self.reset_interval = kwargs.get("reset_interval", 5)
|
|
@@ -541,7 +505,7 @@ class ProxyPool(ProxyPoolBase):
|
|
|
# try:
|
|
|
# self.reset_proxy_pool()
|
|
|
# except Exception as e:
|
|
|
- # self.logger.exception(e)
|
|
|
+ # logger.exception(e)
|
|
|
# 记录获取时间
|
|
|
self.last_get_ts = time.time()
|
|
|
#
|
|
@@ -579,7 +543,7 @@ class ProxyPool(ProxyPoolBase):
|
|
|
time.sleep(3)
|
|
|
self.reset_proxy_pool()
|
|
|
except Exception as e:
|
|
|
- self.logger.exception(e)
|
|
|
+ logger.exception(e)
|
|
|
if self.no_valid_proxy_times >= 5:
|
|
|
# 解决bug: 当爬虫仅剩一个任务时 由于只有一个线程检测代理 而不可用代理又刚好很多(时间越长越多) 可能出现一直获取不到代理的情况
|
|
|
# 导致爬虫烂尾
|
|
@@ -587,7 +551,7 @@ class ProxyPool(ProxyPoolBase):
|
|
|
time.sleep(3)
|
|
|
self.reset_proxy_pool()
|
|
|
except Exception as e:
|
|
|
- self.logger.exception(e)
|
|
|
+ logger.exception(e)
|
|
|
return self.get(retry)
|
|
|
|
|
|
get_proxy = get
|
|
@@ -680,7 +644,7 @@ class ProxyPool(ProxyPoolBase):
|
|
|
if time.time() - self.last_reset_time < self.reset_interval:
|
|
|
self.reset_fast_count += 1
|
|
|
if self.reset_fast_count % 10 == 0:
|
|
|
- self.logger.debug(
|
|
|
+ logger.debug(
|
|
|
"代理池重置的太快了:) {}".format(self.reset_fast_count)
|
|
|
)
|
|
|
time.sleep(1)
|
|
@@ -698,7 +662,7 @@ class ProxyPool(ProxyPoolBase):
|
|
|
_valid_count = self.append_proxies(proxies_list)
|
|
|
self.last_reset_time = time.time()
|
|
|
self.no_valid_proxy_times = 0
|
|
|
- self.logger.debug(
|
|
|
+ logger.debug(
|
|
|
"重置代理池成功: 获取{}, 成功添加{}, 失效{}, 当前代理数{},".format(
|
|
|
len(proxies_list),
|
|
|
_valid_count,
|