dongzhaorui 1 year ago
parent
commit
677614d618

+ 53 - 89
FworkSpider/feapder/network/proxy_pool.py

@@ -15,39 +15,7 @@ import requests
 
 from feapder import setting
 from feapder.utils import tools
-from feapder.utils.log import log
-
-
-def decrypt(input_str: str) -> str:
-    """
-    Rewrite: newly added
-    Defines a base64 decryption function
-
-    :param input_str:
-    :return:
-    """
-    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
-    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
-    output_str = ''
-    # The key index of every non-"=" character was taken above and converted to binary
-    # Count the "=" padding characters
-    equal_num = input_str.count('=')
-    while ascii_list:
-        temp_list = ascii_list[:4]
-        # Join into a binary string
-        temp_str = ''.join(temp_list)
-        # If the chunk is not a whole number of bytes, strip the "=" padding bits
-        if len(temp_str) % 8 != 0:
-            temp_str = temp_str[0:-1 * equal_num * 2]
-        # Convert four 6-bit groups into three 8-bit bytes
-        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
-        # Binary to decimal
-        temp_str_list = [int(x, 2) for x in temp_str_list if x]
-        # Join into the output string
-        output_str += ''.join([chr(x) for x in temp_str_list])
-        ascii_list = ascii_list[4:]
-    return output_str
-
+from feapder.utils.log import log as logger
 
 # Create the local cache directory for proxy files
 proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
@@ -55,17 +23,32 @@ if not os.path.exists(proxy_path):
     os.mkdir(proxy_path)
 
 
-# def get_proxies_by_host(host, port):
-#     proxy_id = "{}:{}".format(host, port)
-#     return get_proxies_by_id(proxy_id)
+def get_proxy_from_jyapi(timeout=5, default=None, show_error_log=False):
+    """
+    Jianyu proxy API
 
+    @param timeout: request timeout in seconds
+    @param default: value returned on failure
+    @param show_error_log: whether to log the error traceback
+    """
+    request_params = dict(
+        headers=dict(Authorization=setting.JY_PROXY_AUTHOR),
+        timeout=timeout
+    )
+    try:
+        response = requests.get(setting.JY_PROXY_URL, **request_params)
+    except requests.exceptions.RequestException as why:
+        if show_error_log:
+            logger.exception(why)
+        return default
+
+    try:
+        proxies = response.json()["data"]
+        return proxies
+    except KeyError:
+        pass
 
-# def get_proxies_by_id(proxy_id):
-#     proxies = {
-#         "http": "http://{}".format(proxy_id),
-#         "https": "https://{}".format(proxy_id),
-#     }
-#     return proxies
+    return default
 
 
 def get_proxy_from_url(**kwargs):
@@ -122,17 +105,17 @@ def get_proxy_from_http(proxy_source_url, **kwargs):
         response = requests.get(proxy_source_url, timeout=20)
         # Rewrite: handle the response returned by the socks proxy API
         for proxy in response.json():
-            host = decrypt(proxy['ip'])
-            port = proxy['ports'][0]
-            endTime = proxy['lifetime']
+            host = tools.decrypt(proxy["ip"])
+            port = proxy["ports"][0]
+            endTime = proxy["lifetime"]
             pool.append(f"{host}:{port}&&{endTime}")
 
         with open(os.path.join(proxy_path, filename), "w") as f:
-            f.write('\n'.join(pool))
+            f.write("\n".join(pool))
     return get_proxy_from_file(filename)
 
 
-def get_proxy_from_file(filename, **kwargs):
+def get_proxy_from_file(filename):
     """
     Fetch proxies from the given local file
         File format
@@ -140,7 +123,6 @@ def get_proxy_from_file(filename, **kwargs):
         ip:port:http
         ip:port
     :param filename:
-    :param kwargs:
     :return:
     """
     proxies_list = []
@@ -208,27 +190,18 @@ def get_proxy_from_redis(proxy_source_url, **kwargs):
     return proxies_list
 
 
-def check_proxy(
-        ip="",
-        port="",
-        proxies=None,
-        type=0,
-        timeout=5,
-        logger=None,
-        show_error_log=True,
-        **kwargs,
-):
+def check_proxy(ip="", port="", proxies=None, type=0, timeout=5, show_error_log=True):
     """
     Check whether a proxy is usable
+
     :param ip:
     :param port:
+    :param proxies:
     :param type: 0:socket  1:requests
     :param timeout:
-    :param logger:
+    :param show_error_log:
     :return:
     """
-    if not logger:
-        logger = log
     ok = 0
     if type == 0 and ip and port:
         # A successful socket check does not guarantee the proxy works; "Connection closed by foreign host." still fails
@@ -249,17 +222,16 @@
                "https": "socks5://{}:{}".format(ip, port),
             }
         try:
-            # Rewrite: URL used for proxy validation
-            r = requests.get(
-                "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
-            )
+            r = requests.get("https://myip.ipip.net",
+                             proxies=proxies,
+                             timeout=timeout,
+                             stream=True)
             ok = 1
             r.close()
         except Exception as e:
             if show_error_log:
-                logger.debug(
-                    "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
-                )
+                args = (e, ip, port, proxies)
+                logger.debug("check proxy failed: {} {}:{} {}".format(*args))
     return ok
 
 
@@ -286,7 +258,6 @@ class ProxyItem(object):
         :param max_proxy_use_num:
         :param delay:
         :param use_interval: minimum interval between uses, in seconds; unlimited by default
-        :param logger: log handler, defaults to log.get_logger()
         :param kwargs:
         """
         # {"http": ..., "https": ...}
@@ -322,9 +293,6 @@ class ProxyItem(object):
         else:
             self.proxy_id = self.proxy_ip_port
 
-        # Log handler
-        self.logger = log
-
     def get_proxies(self):
         self.use_num += 1
         return self.proxies
@@ -343,18 +311,18 @@ class ProxyItem(object):
         :return:
         """
         if self.use_num > self.max_proxy_use_num > 0:
-            self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
+            logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
             return 0
         if self.flag == -1:
-            self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
+            logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
             return 0
         if self.delay > 0 and self.flag == 1:
             if time.time() - self.flag_ts < self.delay:
-                self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
+                logger.debug("代理被标记 1 延迟 %s" % self.proxies)
                 return 2
             else:
                 self.flag = 0
-                self.logger.debug("延迟代理释放: {}".format(self.proxies))
+                logger.debug("延迟代理释放: {}".format(self.proxies))
         if self.use_interval:
             if time.time() - self.use_ts < self.use_interval:
                 return 2
@@ -366,15 +334,14 @@ class ProxyItem(object):
                 proxies=self.proxies,
                 type=type,
                 timeout=self.valid_timeout,
-                logger=self.logger,
             )
         else:
             ok = 1
         self.update_ts = time.time()
         return ok
 
-    @classmethod
-    def parse_proxies(self, proxies):
+    @staticmethod
+    def parse_proxies(proxies):
         """
         Split a proxy into its component parts
         :param proxies:
@@ -431,7 +398,6 @@ class ProxyPool(ProxyPoolBase):
         :param reset_interval_max:  maximum proxy-pool reset interval, defaults to 2 minutes
         :param check_valid: whether to validate proxies when they are fetched
         :param local_proxy_file_cache_timeout: timeout of the locally cached proxy file
-        :param logger: log handler, defaults to log.get_logger()
         :param kwargs: other parameters
         """
         kwargs.setdefault("size", -1)
@@ -450,11 +416,9 @@ class ProxyPool(ProxyPoolBase):
             self.proxy_source_url = [x for x in self.proxy_source_url if x]
             self.proxy_source_url = list(set(self.proxy_source_url))
             kwargs.update({"proxy_source_url": self.proxy_source_url})
-        # Logging setup
-        self.logger = kwargs.get("logger") or log
-        kwargs["logger"] = self.logger
+
         if not self.proxy_source_url:
-            self.logger.warn("need set proxy_source_url or proxy_instance")
+            logger.warn("need set proxy_source_url or proxy_instance")
 
         # Proxy pool reset interval
         self.reset_interval = kwargs.get("reset_interval", 5)
@@ -541,7 +505,7 @@ class ProxyPool(ProxyPoolBase):
         #     try:
         #         self.reset_proxy_pool()
         #     except Exception as e:
-        #         self.logger.exception(e)
+        #         logger.exception(e)
         # Record the fetch time
         self.last_get_ts = time.time()
         #
@@ -579,7 +543,7 @@ class ProxyPool(ProxyPoolBase):
                 time.sleep(3)
                 self.reset_proxy_pool()
             except Exception as e:
-                self.logger.exception(e)
+                logger.exception(e)
         if self.no_valid_proxy_times >= 5:
             # Bug fix: when only one task remains, a single thread checks proxies while many proxies are already unusable (more the longer the run), so a usable proxy may never be obtained
             # and the spider would hang unfinished
@@ -587,7 +551,7 @@ class ProxyPool(ProxyPoolBase):
                 time.sleep(3)
                 self.reset_proxy_pool()
             except Exception as e:
-                self.logger.exception(e)
+                logger.exception(e)
         return self.get(retry)
 
     get_proxy = get
@@ -680,7 +644,7 @@ class ProxyPool(ProxyPoolBase):
                 if time.time() - self.last_reset_time < self.reset_interval:
                     self.reset_fast_count += 1
                     if self.reset_fast_count % 10 == 0:
-                        self.logger.debug(
+                        logger.debug(
                             "代理池重置的太快了:) {}".format(self.reset_fast_count)
                         )
                         time.sleep(1)
@@ -698,7 +662,7 @@ class ProxyPool(ProxyPoolBase):
                     _valid_count = self.append_proxies(proxies_list)
                     self.last_reset_time = time.time()
                     self.no_valid_proxy_times = 0
-                    self.logger.debug(
+                    logger.debug(
                         "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
                             len(proxies_list),
                             _valid_count,
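
A minimal usage sketch of the reworked proxy module, assuming JY_PROXY_URL and JY_PROXY_AUTHOR are already configured in setting.py; the snippet itself is illustrative and not part of the commit:

    from feapder.network import proxy_pool

    # Ask the Jianyu API for a proxy dict ({"http": ..., "https": ...});
    # on a network error or an unexpected payload it returns `default` (None here).
    proxies = proxy_pool.get_proxy_from_jyapi(timeout=5, show_error_log=True)

    # type=1 validates the proxy with a real HTTP request against https://myip.ipip.net
    if proxies and proxy_pool.check_proxy(proxies=proxies, type=1, timeout=5):
        print("usable proxy:", proxies)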

+ 20 - 40
FworkSpider/feapder/network/request.py

@@ -19,7 +19,7 @@ from requests.packages.urllib3.util.ssl_ import create_urllib3_context
 import feapder.setting as setting
 import feapder.utils.tools as tools
 from feapder.db.redisdb import RedisDB
-from feapder.network import user_agent
+from feapder.network import user_agent, proxy_pool
 from feapder.network.response import Response
 from feapder.utils.log import log
 from feapder.utils.webdriver import WebDriverPool
@@ -34,20 +34,20 @@ class DESAdapter(HTTPAdapter):
         """
         A TransportAdapter that re-enables 3DES support in Requests.
         """
-        ciphers = ":".join(setting.JA3_REQUEST_CIPHERS).split(':')
+        ciphers = ":".join(setting.JA3_REQUEST_CIPHERS).split(":")
         tools.random.shuffle(ciphers)
-        ciphers = ':'.join(ciphers)
-        self.ciphers = ciphers + ':!aNULL:!eNULL:!MD5'
+        ciphers = ":".join(ciphers)
+        self.ciphers = ciphers + ":!aNULL:!eNULL:!MD5"
         super().__init__(*args, **kwargs)
 
     def init_poolmanager(self, *args, **kwargs):
         context = create_urllib3_context(ciphers=self.ciphers)
-        kwargs['ssl_context'] = context
+        kwargs["ssl_context"] = context
         return super(DESAdapter, self).init_poolmanager(*args, **kwargs)
 
     def proxy_manager_for(self, *args, **kwargs):
         context = create_urllib3_context(ciphers=self.ciphers)
-        kwargs['ssl_context'] = context
+        kwargs["ssl_context"] = context
         return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)
 
 
@@ -339,9 +339,9 @@ class Request(object):
 
         # Proxy
         proxies = self.requests_kwargs.get("proxies", -1)
-        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
+        if proxies == -1 and setting.PROXY_ENABLE and setting.JY_PROXY_URL:
             while True:
-                proxies = self.get_proxy()
+                proxies = proxy_pool.get_proxy_from_jyapi()
                 if proxies:
                     self.requests_kwargs.update(proxies=proxies)
                     break
@@ -395,13 +395,8 @@ class Request(object):
                 if cookie_str:
                     cookies = tools.get_cookies_from_str(cookie_str)
 
-            proxy = None
-            if proxies and proxies != -1:
-                proxy = proxies.get("http", "").strip("http://") or proxies.get(
-                    "https", ""
-                ).strip("https://")
-
-            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
+            browser_kwargs = dict(user_agent=user_agent, proxy=self.proxy())
+            browser = self._webdriver_pool.get(**browser_kwargs)
 
             try:
                 browser.get(self.url)
@@ -418,9 +413,7 @@ class Request(object):
                     "status_code": 200,
                     "elapsed": 666,
                     "headers": {
-                        "User-Agent": browser.execute_script(
-                            "return navigator.userAgent"
-                        ),
+                        "User-Agent": browser.execute_script("return navigator.userAgent"),
                         "Cookie": tools.cookies2str(browser.cookies),
                     },
                 })
@@ -434,27 +427,19 @@ class Request(object):
             response = Response(response)
 
         elif self.splash:
-            headers = self.requests_kwargs.get('headers')
+            headers = self.requests_kwargs.get("headers")
             if not headers:
-                headers = {'User-Agent': self.user_agent()}
-            headers = [(key, val) for key, val in headers.items()]
-
-            proxy = None
-            if proxies and proxies != -1:
-                proxy = proxies.get("http", "").strip("http://") or proxies.get(
-                    "https", ""
-                ).strip("https://")
+                headers = {"User-Agent": self.user_agent()}
 
             params = {
-                'iframes': self.iframes,
-                'wait': self.render_time,
-                'html': 1,
-                'proxy': proxy,
-                'url': self.url,
+                "iframes": self.iframes,
+                "wait": self.render_time,
+                "html": 1,
+                "proxy": self.proxy(),
+                "url": self.url,
             }
-            data = {'headers': headers}
-            splash_url = setting.SWORDFISH_RENDER_URL
-            resp = requests.get(splash_url, params=params, json=data)
+            data = {"headers": [(key, val) for key, val in headers.items()]}
+            resp = requests.get(setting.SPLASH_API, params=params, json=data)
             response = Response(resp)
 
         else:
@@ -484,11 +469,6 @@ class Request(object):
                 "http.*?//", "", proxies.get("http", "") or proxies.get("https", "")
             )
 
-    def get_proxy(self):
-        headers = {"Authorization": setting.SWORDFISH_PROXY_AUTHOR}
-        proxy = requests.get(setting.SWORDFISH_PROXY_URL, headers=headers).json()
-        return proxy.get("data")
-
     def user_agent(self):
         headers = self.requests_kwargs.get("headers")
         if headers:
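
For context, DESAdapter above is a standard requests transport adapter; a minimal sketch of mounting it on a session (the target URL is a placeholder):

    import requests
    from feapder.network.request import DESAdapter

    # Mounting the adapter makes HTTPS requests use the shuffled cipher list,
    # which varies the TLS (JA3) fingerprint between sessions.
    session = requests.Session()
    session.mount("https://", DESAdapter())
    resp = session.get("https://example.com", timeout=10)  # placeholder URL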

+ 6 - 0
FworkSpider/feapder/setting.py

@@ -87,6 +87,9 @@ WEBDRIVER = dict(
     custom_argument=["--ignore-certificate-errors"],  # custom browser arguments
 )
 
+# Splash rendering
+SPLASH_API = os.getenv("SPLASH_API")
+
 # On spider startup, re-crawl the requests that previously failed
 RETRY_FAILED_REQUESTS = False
 # On spider startup, re-save the items that previously failed to be stored
@@ -111,6 +114,9 @@ DELETE_KEYS = []
 # Proxy settings
 PROXY_EXTRACT_API = None  # proxy extraction API; returned proxies are separated by \r\n
 PROXY_ENABLE = True
+# Jianyu proxy
+JY_PROXY_URL = None
+JY_PROXY_AUTHOR = os.getenv("JY_PROXY_AUTHOR")
 
 # Random headers
 RANDOM_HEADERS = True
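
A small sketch of how the new settings might be supplied; every value below is a placeholder, and the environment variables must exist before feapder.setting is imported because the defaults above read them with os.getenv() at import time:

    import os

    os.environ["SPLASH_API"] = "http://127.0.0.1:8050/render.json"   # placeholder Splash endpoint
    os.environ["JY_PROXY_AUTHOR"] = "Basic <base64-credentials>"      # placeholder credentials

    import feapder.setting as setting
    setting.JY_PROXY_URL = "http://proxy.example.com/fetch"           # placeholder proxy API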

+ 30 - 0
FworkSpider/feapder/utils/tools.py

@@ -1803,6 +1803,36 @@ def get_hash(text):
     return hash(text)
 
 
+def decrypt(input_str: str) -> str:
+    """
+    Rewrite: newly added
+    Defines a base64 decryption function
+
+    :param input_str:
+    :return:
+    """
+    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
+    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
+    output_str = ''
+    # The key index of every non-"=" character was taken above and converted to binary
+    # Count the "=" padding characters
+    equal_num = input_str.count('=')
+    while ascii_list:
+        temp_list = ascii_list[:4]
+        # Join into a binary string
+        temp_str = ''.join(temp_list)
+        # If the chunk is not a whole number of bytes, strip the "=" padding bits
+        if len(temp_str) % 8 != 0:
+            temp_str = temp_str[0:-1 * equal_num * 2]
+        # Convert four 6-bit groups into three 8-bit bytes
+        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
+        # Binary to decimal
+        temp_str_list = [int(x, 2) for x in temp_str_list if x]
+        # Join into the output string
+        output_str += ''.join([chr(x) for x in temp_str_list])
+        ascii_list = ascii_list[4:]
+    return output_str
+
 ##################################################
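
To illustrate what decrypt() undoes, here is a hypothetical encoder built on the same permuted base64 alphabet; it is not part of the library and exists only to show the round trip:

    from feapder.utils import tools

    # Same permuted alphabet as tools.decrypt(); the encoder below is illustrative only.
    KEY = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"

    def encode(plain: str) -> str:
        bits = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in plain)
        bits += "0" * ((-len(bits)) % 6)                          # pad to whole 6-bit groups
        out = "".join(KEY[int(bits[i:i + 6], 2)] for i in range(0, len(bits), 6))
        return out + "=" * ((4 - len(out) % 4) % 4)               # base64-style "=" padding

    assert tools.decrypt(encode("1.2.3.4")) == "1.2.3.4"          # round trip recovers the IP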
 
 

+ 12 - 15
FworkSpider/untils/tools.py

@@ -4,10 +4,11 @@ import hashlib
 import re
 from collections import namedtuple
 from string import whitespace
-from bs4 import BeautifulSoup
+
 import bson
-import requests
+from bs4 import BeautifulSoup
 
+from feapder.network.proxy_pool import get_proxy_from_jyapi
 from untils.clean_html import cleaner
 
 SearchText = namedtuple('SearchText', ['total'])
@@ -290,24 +291,20 @@ def njpc_fields_extract_special(html, data_item):
     return data_item
 
 
-def get_proxy(scheme=None,default=None,socks5h=False):
-    headers = {
-        "Authorization": "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
-    }
-    proxy_res = requests.get("http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch", headers=headers).json()
-
-    proxies = proxy_res.get('data')
-    if proxy_res and proxies:
+def get_proxy(scheme=None, default=None, socks5h=False):
+    proxies = get_proxy_from_jyapi()
+    print(f"切换代理:{proxies}")
+    if proxies is not None:
         if socks5h:
-            proxyh = {}
-            proxyh["http"] = proxies.get("http").replace("socks5", "socks5h")
-            proxyh["https"] = proxies.get("http").replace("socks5", "socks5h")
+            proxyh = {
+                "http": proxies.get("http").replace("socks5", "socks5h"),
+                "https": proxies.get("http").replace("socks5", "socks5h")
+            }
             proxies = proxyh
-        print(f"切换代理:{proxies}")
         if not scheme:
             return proxies
         else:
-            return proxies.get(scheme,default)
+            return proxies.get(scheme, default)
 
 
 def search(pattern, string):
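
A short usage sketch of the reworked get_proxy() wrapper; the check URL is the same one check_proxy() uses and stands in for any target:

    import requests
    from untils.tools import get_proxy

    # socks5h rewrites the scheme so DNS resolution happens on the proxy side.
    proxies = get_proxy(socks5h=True)   # e.g. {"http": "socks5h://host:port", ...}
    if proxies:
        resp = requests.get("https://myip.ipip.net", proxies=proxies, timeout=10)
        print(resp.text)

Note that when the upstream API is unreachable get_proxy_from_jyapi() returns None, so the wrapper falls through and returns None rather than `default`; callers should guard as above.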