@@ -4,22 +4,33 @@ import datetime
import os
import sys

-# MONGODB
-MONGO_IP = "172.17.4.87"
-MONGO_PORT = 27080
+# Table for items that failed to save
+TAB_FAILED_ITEMS = 'pyspider:s_failed_items'
+# Table for failed requests
+TAB_FAILED_REQUESTS = 'pyspider:z_failed_requests'
+# Table recording the state of pending crawl tasks
+TASK_CRAWL_STATE = "pyspider:t_crawl_state"
+# Table recording failed tasks
+TASK_REQUEST_FAILED = "pyspider_listdata_err"
+# Spider collection summary table
+SPIDER_HEARTBEAT_RECORD = "spider_heartbeat"  # table name for spider heartbeat records
+
+# MongoDB
+MONGO_IP = "192.168.3.182"
+MONGO_PORT = 27017
MONGO_DB = "py_spider"
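
For quick verification of the new MongoDB target, a minimal sketch (assuming pymongo, and that the instance accepts unauthenticated connections) that reads back the failure table defined above:

```python
# Minimal sketch, assuming pymongo and no auth on the target instance.
from pymongo import MongoClient

client = MongoClient("192.168.3.182", 27017)   # MONGO_IP, MONGO_PORT
db = client["py_spider"]                       # MONGO_DB

# Peek at items that failed to save (TAB_FAILED_ITEMS).
for doc in db["pyspider:s_failed_items"].find().limit(5):
    print(doc)
```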

# REDIS
-REDISDB_IP_PORTS = "172.17.4.232:7361"
-REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
-REDISDB_DB = 12
+REDISDB_IP_PORTS = "192.168.3.182:6379"
+REDISDB_USER_PASS = "jianyu@python"
+REDISDB_DB = 2
# Key prefix under which spider data is stored
REDIS_KEY = "py_spider"
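
The Redis settings can be smoke-tested the same way; a minimal sketch assuming redis-py and password-only auth (REDISDB_USER_PASS is the password, not a user:pass pair):

```python
# Minimal sketch, assuming redis-py; REDISDB_USER_PASS is the password.
import redis

host, port = "192.168.3.182:6379".split(":")   # REDISDB_IP_PORTS
r = redis.Redis(host=host, port=int(port), password="jianyu@python", db=2)

# Spider state lives under the REDIS_KEY prefix.
print(r.keys("py_spider*"))
```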

# RabbitMQ
-RABBITMQ_IP_PORT = '172.17.4.232:5672'
+RABBITMQ_IP_PORT = '192.168.3.182:5672'
RABBITMQ_USER = 'root'
-RABBITMQ_USER_PASS = 'V0O0049qBI2rV1554jLZPiBZ8H3Bo4'
+RABBITMQ_USER_PASS = '123123'
RABBITMQ_EXCHANGE = 'py_spider'
RABBITMQ_EXCHANGE_TYPE = 'direct'
RABBITMQ_VIRTUAL_HOST = '/'
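
A minimal sketch of what these RabbitMQ settings describe, assuming pika; it declares the direct exchange on the default vhost (RABBITMQ_HEARTBEAT = 600 comes from the unchanged context below):

```python
# Minimal sketch, assuming pika; declares the exchange from the settings.
import pika

host, port = "192.168.3.182:5672".split(":")               # RABBITMQ_IP_PORT
params = pika.ConnectionParameters(
    host=host,
    port=int(port),
    virtual_host="/",                                      # RABBITMQ_VIRTUAL_HOST
    credentials=pika.PlainCredentials("root", "123123"),   # user / pass
    heartbeat=600,                                         # RABBITMQ_HEARTBEAT
)
with pika.BlockingConnection(params) as conn:
    channel = conn.channel()
    channel.exchange_declare(exchange="py_spider", exchange_type="direct")
```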
@@ -28,9 +39,9 @@ RABBITMQ_HEARTBEAT = 600

# Pipelines for persisting scraped data
ITEM_PIPELINES = [
-    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
+    "feapder.pipelines.mongo_pipeline.MongoPipeline",
    # "feapder.pipelines.redis_pipeline.RedisPipeline",
-    "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
+    # "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
]
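
With this change only the Mongo pipeline is active. Additional pipelines are registered by dotted path; a hedged sketch of a custom one, with the BasePipeline interface (save_items returning bool) assumed from feapder's documentation:

```python
# Hedged sketch of a custom pipeline; the BasePipeline interface
# (save_items(table, items) -> bool) is assumed from feapder's docs.
from feapder.pipelines import BasePipeline


class PrintPipeline(BasePipeline):
    def save_items(self, table, items) -> bool:
        # items: list of dicts bound for `table`; True marks them exported.
        print("would save %d item(s) to %s" % (len(items), table))
        return True
```

It would then be enabled by appending its dotted path to ITEM_PIPELINES.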
# Maximum failures (saves and updates) when exporting data; exceeding it triggers an alert
EXPORT_DATA_MAX_FAILED_TIMES = 5
@@ -40,7 +51,6 @@ EXPORT_DATA_MAX_RETRY_TIMES = 5
COLLECTOR_TASK_COUNT = 100  # number of tasks fetched per batch

# Spider
-SPIDER_HEARTBEAT = "spider_heartbeat"  # spider heartbeat
SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 recommended when speed matters
SPIDER_MAX_RETRY_TIMES = 3  # maximum retries per request
@@ -50,23 +60,18 @@ WEBDRIVER = dict(
    load_images=False,  # whether to load images
    user_agent=None,  # a string, or a zero-argument function returning the user_agent
    proxy=None,  # "xxx.xxx.xx.xxx:xxxx", or a zero-argument function returning a proxy address
-    headless=False,  # whether to run headless
-    driver_type="CHROME",  # CHROME or FIREFOX
-    timeout=30,  # request timeout
+    headless=True,  # whether to run headless
+    driver_type="FIREFOX",  # CHROME or FIREFOX
+    timeout=3,  # request timeout
    window_size=(1280, 800),  # window size
-    executable_path=None,  # browser driver path; None uses the default
+    executable_path='/Users/dongzhaorui/Documents/dzr/pyscripts/Spiders/settings/geckodriver',  # browser driver path; None uses the default
    render_time=0,  # render time: wait this long after opening the page before grabbing the source
    custom_argument=["--ignore-certificate-errors"],  # custom browser arguments
    usages_local_driver=False,  # whether to use a local driver
-    server_addr="http://172.17.4.232:6666/wd/hub",  # Selenium remote server address
+    server_addr="http://192.168.3.182:8899/wd/hub",  # Selenium remote server address
    version="",  # remote browser version
    service_log_path=os.devnull  # log path
)
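
Since usages_local_driver=False, rendering goes through the remote Selenium hub at server_addr. A minimal sketch of the equivalent raw connection, in Selenium 3.x style (webdriver.Remote with desired_capabilities):

```python
# Minimal sketch, Selenium 3.x style; the hub address is server_addr above.
from selenium import webdriver

driver = webdriver.Remote(
    command_executor="http://192.168.3.182:8899/wd/hub",
    desired_capabilities={"browserName": "firefox"},   # driver_type=FIREFOX
)
try:
    driver.set_window_size(1280, 800)                  # window_size
    driver.get("https://example.com")
    print(driver.title)
finally:
    driver.quit()
```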
-# On spider startup, re-save items that previously failed to store
-RETRY_FAILED_ITEMS = True
-
-# Save failed requests
-SAVE_FAILED_REQUEST = False

# Network timeout for requests
REQUEST_TIMEOUT = 60
@@ -76,29 +81,18 @@ PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
PROXY_ENABLE = True
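
PROXY_EXTRACT_API serves the proxy pool over HTTP; a hedged sketch for inspecting it (the payload format is not specified here, so only the raw body is printed):

```python
# Hedged sketch: the payload format of PROXY_EXTRACT_API is not specified
# here, so this only inspects the raw response body.
import requests

resp = requests.get("http://proxy.spdata.jianyu360.com/proxy/getallip", timeout=10)
resp.raise_for_status()
print(resp.text[:200])
```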

# Item deduplication
-ITEM_FILTER_ENABLE = True
+ITEM_FILTER_ENABLE = False
ITEM_FILTER_SETTING = dict(
-    filter_type=6,
-    redis_conf=dict(
-        pylist_=dict(
-            redisdb_ip_port="172.17.4.240:8361",
-            redisdb_user_pass="k5ZJR5KV4q7DRZ92DQ",
-            redisdb_db=0
-        ),
-        list_=dict(
-            redisdb_ip_port="172.17.4.84:4679",
-            redisdb_user_pass="jytopnet123",
-            redisdb_db=0
-        )
-    ),
-    expire_time=63072000,  # expires after 2 years
+    filter_type=5,  # Redis-based dedup
+    expire_time=86400,  # expires after 1 day
+    redis_url='redis://default:top@123@192.168.3.165:8165/2'
)
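
One caveat with the new redis_url: the password top@123 itself contains '@'. Python's urllib splits userinfo at the last '@', so URL-based parsing still recovers the right parts, but the RFC 3986-safe form percent-encodes the password. A quick check:

```python
# Check how the dedup redis_url parses; '@' in the password survives
# urllib's last-'@' split, but percent-encoding is the safer form.
from urllib.parse import urlparse, quote

url = "redis://default:top@123@192.168.3.165:8165/2"
p = urlparse(url)
print(p.username, p.password, p.hostname, p.port)  # default top@123 192.168.3.165 8165

safe = "redis://default:%s@192.168.3.165:8165/2" % quote("top@123", safe="")
print(safe)  # redis://default:top%40123@192.168.3.165:8165/2
```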

# Logging
DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # log file path
-LOG_LEVEL = "ERROR"
+LOG_LEVEL = "DEBUG"
LOG_COLOR = True  # colored output
LOG_IS_WRITE_TO_CONSOLE = True  # print to the console
LOG_IS_WRITE_TO_FILE = True  # write to a file
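
For reference, what the log-path settings resolve to at runtime (illustrative values, e.g. running `python my_spider.py`):

```python
# Illustrative: what LOG_PATH evaluates to when running `python my_spider.py`.
import datetime
import os
import sys

dtime = datetime.datetime.now().strftime("%Y-%m-%d")      # e.g. "2024-01-01"
name = os.path.split(sys.argv[0])[-1].split(".")[0]       # e.g. "my_spider"
print("log/%s/%s.log" % (dtime, name))                    # log/2024-01-01/my_spider.log
```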
@@ -115,9 +109,6 @@ SWORDFISH_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
# Splash rendering service
SWORDFISH_RENDER_URL = "http://splash.spdata.jianyu360.com/render.json"
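
render.json is Splash's standard JSON rendering endpoint; a hedged sketch of a direct call (`url` and `wait` are standard Splash parameters; the `title` field is assumed present in the default response):

```python
# Hedged sketch: direct call to Splash's render.json; the 'title' field
# is assumed to be present in the default JSON response.
import requests

resp = requests.get(
    "http://splash.spdata.jianyu360.com/render.json",     # SWORDFISH_RENDER_URL
    params={"url": "https://example.com", "wait": 1},
    timeout=30,
)
print(resp.json().get("title"))
```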

-# Spider heartbeat
-RECORD_SPIDER_HEARTBEAT = "spider_heartbeat"  # table name for spider heartbeat records
-
# Remote bucket configuration
ALI_BUCKET_CONFIG = {
    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",