|
@@ -52,58 +52,68 @@ SPIDER_THREAD_COUNT = 1 # 爬虫并发数,追求速度推荐32
|
|
|
SPIDER_MAX_RETRY_TIMES = 3 # 每个请求最大重试次数
|
|
|
|
|
|
# 浏览器渲染
|
|
|
-WEBDRIVER = dict(
|
|
|
- pool_size=1, # 浏览器的数量
|
|
|
+DRISSIONPAGE = dict(
|
|
|
+ pool_size=1, # 浏览器标签页的数量
|
|
|
+ browser_path=None, # 浏览器可执行文件路径
|
|
|
+ scope=None, # 自动浏览器端口范围
|
|
|
+ port=None, # 浏览器端口
|
|
|
+ user_data_path=None, # 用户数据目录
|
|
|
+ headless=True, # 是否为无头浏览器
|
|
|
load_images=False, # 是否加载图片
|
|
|
- user_agent=None, # 字符串 或 无参函数,返回值为user_agent
|
|
|
- proxy=None, # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
|
|
|
- headless=False, # 是否为无头浏览器
|
|
|
- driver_type="CHROME", # CHROME、FIREFOX
|
|
|
- timeout=30, # 请求超时时间
|
|
|
- window_size=(1280, 800), # 窗口大小
|
|
|
- executable_path=None, # 浏览器路径,默认为默认路径
|
|
|
- render_time=0, # 渲染时长,即打开网页等待指定时间后再获取源码
|
|
|
- custom_argument=["--ignore-certificate-errors"], # 自定义浏览器渲染参数
|
|
|
- usages_local_driver=False, # 是否加载本地驱动
|
|
|
- server_addr="http://172.17.162.28:6666/wd/hub", # selenium 远程服务地址
|
|
|
- version="", # 远程浏览器版本
|
|
|
- service_log_path=os.devnull # 日志路径
|
|
|
+ user_agent=None, # 字符串
|
|
|
+ proxy=None, # xxx.xxx.xxx.xxx:xxxx
|
|
|
+ window_size=(1024, 800), # 窗口大小
|
|
|
+ driver_type="chromium",
|
|
|
+ load_mode="normal", # 网页加载策略, 可选值:"normal", "eager", "none"
|
|
|
+ timeout=10, # 请求超时时间
|
|
|
+ retry=1, # 浏览器连接失败时的重试次数
|
|
|
+ interval=0.5, # 连接失败重试间隔(秒)
|
|
|
+ page_load=15, # 页面加载超时时间(秒)
|
|
|
+ render_time=20, # 渲染时长,即打开网页后等待加载的超时时间
|
|
|
+ download_path=None, # 下载文件的路径
|
|
|
+ custom_argument=[
|
|
|
+ "--no-sandbox",
|
|
|
+ "--ignore-certificate-errors"
|
|
|
+ ]
|
|
|
)
|
|
|
|
|
|
-# splash渲染
|
|
|
-SPLASH_API = "http://splash.spdata.jianyu360.com/render.json"
|
|
|
-
|
|
|
# request网络请求超时时间
|
|
|
REQUEST_TIMEOUT = 60
|
|
|
|
|
|
-# 设置代理,代理提取API ,返回的代理分割符为\r\n
|
|
|
+# 设置代理
|
|
|
+PROXY_EXTRACT_API = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
|
|
|
+# PROXY_EXTRACT_API = "http://172.31.31.204:16001/sam"
|
|
|
PROXY_ENABLE = True
|
|
|
-JY_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
|
|
|
-PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
|
|
|
-JY_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
|
|
|
+PROXY_AUTH = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
|
|
|
+PROXY_POOL = "feapder.network.proxy_pool.DirectProxyPool"
|
|
|
+# PROXY_POOL = "feapder.network.proxy_pool.SpringBoardProxyPool"
|
|
|
|
|
|
-# 任务中心
|
|
|
-JY_TASK_URL = "http://pytask.spdata.jianyu360.com"
|
|
|
+# 下载器
|
|
|
+DOWNLOADER = "feapder.network.downloader.RequestsDownloader" # 请求下载器
|
|
|
+SESSION_DOWNLOADER = "feapder.network.downloader.RequestsSessionDownloader"
|
|
|
+# SESSION_DOWNLOADER = "feapder.network.downloader.RequestsJa3SessionDownloader"
|
|
|
+RENDER_DOWNLOADER = "feapder.network.downloader.DrissionPageDownloader" # 渲染下载器
|
|
|
+MAKE_ABSOLUTE_LINKS = True # 自动转成绝对链接
|
|
|
|
|
|
# item去重
|
|
|
ITEM_FILTER_ENABLE = True
|
|
|
ITEM_FILTER_SETTING = dict(
|
|
|
filter_type=6,
|
|
|
- redisdb_conf=[
|
|
|
- dict(
|
|
|
- fingerprint_pref="pylist_",
|
|
|
- ip_port="172.17.162.34:8361",
|
|
|
- user_pass="k5ZJR5KV4q7DRZ92DQ",
|
|
|
- db=0
|
|
|
- ),
|
|
|
- dict(
|
|
|
- fingerprint_pref="list_",
|
|
|
- ip_port="172.17.4.84:4679",
|
|
|
- user_pass="jytopnet123",
|
|
|
- db=0
|
|
|
- )
|
|
|
- ],
|
|
|
expire_time=1 * 365 * 24 * 3600, # 过期时间
|
|
|
+ config={
|
|
|
+ "py": {
|
|
|
+ "fingerprint_pref": "pylist_",
|
|
|
+ "ip_port": "172.17.162.34:8361",
|
|
|
+ "user_pass": "k5ZJR5KV4q7DRZ92DQ",
|
|
|
+ "db": 0
|
|
|
+ },
|
|
|
+ "lua": {
|
|
|
+ "fingerprint_pref": "list_",
|
|
|
+ "ip_port": "172.17.4.84:4679",
|
|
|
+ "user_pass": "jytopnet123",
|
|
|
+ "db": 0
|
|
|
+ }
|
|
|
+ }
|
|
|
)
|
|
|
|
|
|
# 日志设置
|
|
@@ -120,7 +130,10 @@ LOG_BACKUP_COUNT = 20 # 日志文件保留数量
|
|
|
LOG_ENCODING = "utf8" # 日志文件编码
|
|
|
OTHERS_LOG_LEVAL = "ERROR" # 第三方库的log等级 一般用不到
|
|
|
|
|
|
-# 远程bucket配置
|
|
|
+# 详情采集任务领取接口
|
|
|
+JY_TASK_URL = "http://pytask.spdata.jianyu360.com"
|
|
|
+
|
|
|
+# bucket配置
|
|
|
ALI_BUCKET_CONFIG = {
|
|
|
"key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
|
|
|
"key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
|