dongzhaorui committed 2 months ago
Commit 9a71fab9e2
3 changed files with 87 additions and 60 deletions
  1. +10 -9   FworkSpider/requirements.txt
  2. +52 -39  FworkSpider/setting.py
  3. +25 -12  FworkSpider/setup.py

+ 10 - 9
FworkSpider/requirements.txt

@@ -1,25 +1,26 @@
 better-exceptions==0.3.3
-AMQPStorm==2.10.6
-beautifulsoup4==4.9.3
+AMQPStorm>=2.10.6
 bs4==0.0.1
+beautifulsoup4==4.9.3
 DBUtils==3.0.0
 fire==0.4.0
 influxdb==5.3.1
-ipython==7.30.0
+ipython>=7.14.0
 loguru==0.5.3
 lxml==4.6.2
 oss2==2.15.0
 parsel==1.6.0
+cssselect==1.2.0
 pymongo==3.10.1
 PyMySQL==0.9.3
-python-logstash==0.4.8
 redis==3.3.6
-requests==2.24.0
+requests>=2.24.0
 PySocks==1.7.1
-selenium==3.141.0
-six==1.15.0
+selenium>=3.141.0
 tqdm==4.64.0
 urllib3==1.25.11
-w3lib==1.22.0
 PyExecJS>=1.5.1
-func-timeout==4.3.5
+DrissionPage==4.1.0.18
+redis-py-cluster>=2.1.0
+webdriver-manager>=4.0.0
+playwright
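
Since several pins are loosened here (AMQPStorm, ipython, requests, selenium) and DrissionPage, redis-py-cluster, webdriver-manager and playwright are new, a quick environment sanity check can catch resolution problems early. A minimal sketch, assuming Python 3.8+ and using only the package names listed above:

    # Minimal sketch: report the versions the updated requirements resolved to.
    from importlib.metadata import PackageNotFoundError, version

    for name in ("AMQPStorm", "requests", "selenium", "DrissionPage",
                 "redis-py-cluster", "webdriver-manager", "playwright"):
        try:
            print(f"{name}: {version(name)}")
        except PackageNotFoundError:
            print(f"{name}: not installed")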

+ 52 - 39
FworkSpider/setting.py

@@ -52,58 +52,68 @@ SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 recommended for speed
 SPIDER_MAX_RETRY_TIMES = 3  # max retries per request
 
 # browser rendering
-WEBDRIVER = dict(
-    pool_size=1,  # number of browsers
+DRISSIONPAGE = dict(
+    pool_size=1,  # number of browser tabs
+    browser_path=None,  # path to the browser executable
+    scope=None,  # port range for auto-assigned browser ports
+    port=None,  # browser port
+    user_data_path=None,  # user data directory
+    headless=True,  # run headless
     load_images=False,  # whether to load images
-    user_agent=None,  # string, or a zero-argument function returning the user_agent
-    proxy=None,  # xxx.xxx.xx.xxx:xxxx, or a zero-argument function returning a proxy address
-    headless=False,  # run headless
-    driver_type="CHROME",  # CHROME, FIREFOX
-    timeout=30,  # request timeout
-    window_size=(1280, 800),  # window size
-    executable_path=None,  # browser path; uses the default path if None
-    render_time=0,  # render time: wait this long after opening the page before grabbing the source
-    custom_argument=["--ignore-certificate-errors"],  # custom browser arguments
-    usages_local_driver=False,  # whether to use a local driver
-    server_addr="http://172.17.162.28:6666/wd/hub",  # selenium remote server address
-    version="",  # remote browser version
-    service_log_path=os.devnull  # log path
+    user_agent=None,  # string
+    proxy=None,  # xxx.xxx.xxx.xxx:xxxx
+    window_size=(1024, 800),  # window size
+    driver_type="chromium",
+    load_mode="normal",  # page load strategy: "normal", "eager", or "none"
+    timeout=10,  # request timeout
+    retry=1,  # browser connection retries on failure
+    interval=0.5,  # retry interval on connection failure (seconds)
+    page_load=15,  # page load timeout (seconds)
+    render_time=20,  # render time: how long to wait for the page to load after opening
+    download_path=None,  # download directory
+    custom_argument=[
+        "--no-sandbox",
+        "--ignore-certificate-errors"
+    ]
 )
 
-# splash rendering
-SPLASH_API = "http://splash.spdata.jianyu360.com/render.json"
-
 # requests network timeout
 REQUEST_TIMEOUT = 60
 
-# Proxy settings; proxy extraction API, returned proxies are separated by \r\n
+# Proxy settings
+PROXY_EXTRACT_API = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+# PROXY_EXTRACT_API = "http://172.31.31.204:16001/sam"
 PROXY_ENABLE = True
-JY_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
-PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
-JY_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+PROXY_AUTH = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+PROXY_POOL = "feapder.network.proxy_pool.DirectProxyPool"
+# PROXY_POOL = "feapder.network.proxy_pool.SpringBoardProxyPool"
 
-# Task center
-JY_TASK_URL = "http://pytask.spdata.jianyu360.com"
+# Downloaders
+DOWNLOADER = "feapder.network.downloader.RequestsDownloader"  # requests downloader
+SESSION_DOWNLOADER = "feapder.network.downloader.RequestsSessionDownloader"
+# SESSION_DOWNLOADER = "feapder.network.downloader.RequestsJa3SessionDownloader"
+RENDER_DOWNLOADER = "feapder.network.downloader.DrissionPageDownloader"  # render downloader
+MAKE_ABSOLUTE_LINKS = True  # automatically convert links to absolute URLs
 
 # item deduplication
 ITEM_FILTER_ENABLE = True
 ITEM_FILTER_SETTING = dict(
     filter_type=6,
-    redisdb_conf=[
-        dict(
-            fingerprint_pref="pylist_",
-            ip_port="172.17.162.34:8361",
-            user_pass="k5ZJR5KV4q7DRZ92DQ",
-            db=0
-        ),
-        dict(
-            fingerprint_pref="list_",
-            ip_port="172.17.4.84:4679",
-            user_pass="jytopnet123",
-            db=0
-        )
-    ],
     expire_time=1 * 365 * 24 * 3600,  # expiry time (seconds)
+    config={
+        "py": {
+            "fingerprint_pref": "pylist_",
+            "ip_port": "172.17.162.34:8361",
+            "user_pass": "k5ZJR5KV4q7DRZ92DQ",
+            "db": 0
+        },
+        "lua": {
+            "fingerprint_pref": "list_",
+            "ip_port": "172.17.4.84:4679",
+            "user_pass": "jytopnet123",
+            "db": 0
+        }
+    }
 )
 
 # Logging
@@ -120,7 +130,10 @@ LOG_BACKUP_COUNT = 20  # number of log files to keep
 LOG_ENCODING = "utf8"  # log file encoding
 OTHERS_LOG_LEVAL = "ERROR"  # log level for third-party libraries; rarely needed
 
-# Remote bucket config
+# API for claiming detail-page crawl tasks
+JY_TASK_URL = "http://pytask.spdata.jianyu360.com"
+
+# bucket config
 ALI_BUCKET_CONFIG = {
     "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
     "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",

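With these settings, plain requests go through DOWNLOADER while a request flagged for rendering is handed to RENDER_DOWNLOADER, i.e. the DrissionPage downloader configured above. A minimal sketch, assuming this fork keeps feapder's standard render=True request flag and AirSpider entry point:

    # Minimal sketch (assumption: feapder's render flag is unchanged in this fork).
    # render=True routes the request to RENDER_DOWNLOADER from setting.py;
    # ordinary requests keep using RequestsDownloader.
    import feapder


    class RenderDemo(feapder.AirSpider):
        def start_requests(self):
            yield feapder.Request("https://example.com", render=True)

        def parse(self, request, response):
            # response is built from the rendered page source
            print(response.xpath("//title/text()").extract_first())


    if __name__ == "__main__":
        RenderDemo().start()
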
+ 25 - 12
FworkSpider/setup.py

@@ -36,33 +36,42 @@ packages.extend(
 requires = [
     "better-exceptions>=0.2.2",
     "DBUtils>=2.0",
-    "parsel>=1.5.2",
+    "parsel>=1.5.2,<=1.6.0",
     "PyMySQL>=0.9.3",
+    "pymongo==3.10.1",
     "redis>=2.10.6,<4.0.0",
     "requests>=2.24.0",
-    "PySocks==1.7.1",
     "bs4>=0.0.1",
     "ipython>=7.14.0,<=8.12",
-    "redis-py-cluster>=2.1.0",
     "cryptography>=3.3.2",
-    "pymongo>=3.10.1",
     "urllib3>=1.25.8,<=1.25.11",
     "loguru>=0.5.3",
     "influxdb>=5.3.1",
     "pyperclip>=1.8.2",
     "terminal-layout>=2.1.3",
-    "python-logstash==0.4.8",
-    "AMQPStorm",
+    "redis-py-cluster>=2.1.0",
+    "tqdm>=4.64.0",
+    "jmespath<1.0.0,>=0.9.3"
+]
+
+render_requires = [
+    "webdriver-manager>=4.0.0",
+    "DrissionPage==4.1.0.18",
+    "playwright",
+    "selenium>=3.141.0"
+]
+
+other = [
+    "func-timeout==4.3.5"
 ]
 
-render_requires = ["webdriver-manager>=3.5.3", "selenium==3.141.0"]
-jy_requires = ["tqdm>=4.64.0", "func-timeout==4.3.5", "oss2"]
 all_requires = [
     "bitarray>=1.5.3",
     "PyExecJS>=1.5.1",
-    *jy_requires,
-    *render_requires
-]
+    "PySocks==1.7.1",
+    "oss2",
+    "AMQPStorm"
+] + render_requires
 
 setuptools.setup(
     name="feapder",
@@ -75,7 +84,11 @@ setuptools.setup(
     long_description=long_description,
     long_description_content_type="text/markdown",
     install_requires=requires,
-    extras_require={"all": all_requires, "jy": jy_requires, "render": render_requires},
+    extras_require={
+        "all": all_requires,
+        "render": render_requires,
+        "other": other
+    },
     entry_points={"console_scripts": ["feapder = feapder.commands.cmdline:execute"]},
     url="https://github.com/Boris-code/feapder.git",
     packages=packages,
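
The render-related packages now sit in their own "render" extras group (with "other" holding func-timeout), so a base install pulls in less. A minimal sketch checking which optional groups are importable; the module names are an assumed import-name mapping of the packages listed above (e.g. webdriver-manager imports as webdriver_manager):

    # Minimal sketch: report which extras_require groups are importable.
    # Module names below are assumed mappings of the package names in setup.py.
    import importlib.util

    extras = {
        "render": ["webdriver_manager", "DrissionPage", "playwright", "selenium"],
        "other": ["func_timeout"],
    }

    for group, modules in extras.items():
        missing = [m for m in modules if importlib.util.find_spec(m) is None]
        print(f"[{group}] " + ("ok" if not missing else "missing: " + ", ".join(missing)))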