dongzhaorui 1 year ago
parent commit 4919812161
1 changed file with 33 additions and 23 deletions

+ 33 - 23
FworkSpider/setting.py

@@ -12,25 +12,25 @@ TAB_FAILED_REQUESTS = 'pyspider:z_failed_requests'
 TASK_CRAWL_STATE = "pyspider:t_crawl_state"
 # Failed task record table
 TASK_REQUEST_FAILED = "pyspider_listdata_err"
-# Spider collection summary table
-SPIDER_HEARTBEAT_RECORD = "spider_heartbeat"  # spider heartbeat record table name
+# Collection info summary table
+SPIDER_HEARTBEAT_RECORD = "spider_heartbeat"
 
-#mongo
-MONGO_IP = "192.168.3.182"
-MONGO_PORT = 27017
+# MONGODB
+MONGO_IP = "172.17.4.87"
+MONGO_PORT = 27080
 MONGO_DB = "py_spider"
 
 # REDIS
-REDISDB_IP_PORTS = "192.168.3.182:6379"
-REDISDB_USER_PASS = "jianyu@python"
-REDISDB_DB = 2
+REDISDB_IP_PORTS = "172.17.4.232:7361"
+REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
+REDISDB_DB = 12
 # Storage directory for collected spider data
 REDIS_KEY = "py_spider"
 
 # rabbitMq
-RABBITMQ_IP_PORT = '192.168.3.182:5672'
+RABBITMQ_IP_PORT = '172.17.4.232:5672'
 RABBITMQ_USER = 'root'
-RABBITMQ_USER_PASS = '123123'
+RABBITMQ_USER_PASS = 'V0O0049qBI2rV1554jLZPiBZ8H3Bo4'
 RABBITMQ_EXCHANGE = 'py_spider'
 RABBITMQ_EXCHANGE_TYPE = 'direct'
 RABBITMQ_VIRTUAL_HOST = '/'
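This hunk repoints MongoDB, Redis, and RabbitMQ from the old 192.168.3.182 host to the 172.17.4.x cluster and rotates the credentials. A minimal connectivity check against the new endpoints, a sketch assuming the standard pymongo/redis/pika clients and that REDISDB_USER_PASS is a plain password:

```python
# Sanity-check the new endpoints above. Assumes pymongo, redis and pika
# are installed; host/port/credential values are copied from setting.py.
import pika
import pymongo
import redis

mongo = pymongo.MongoClient("172.17.4.87", 27080, serverSelectionTimeoutMS=3000)
mongo.admin.command("ping")  # raises ServerSelectionTimeoutError if unreachable

rdb = redis.Redis(host="172.17.4.232", port=7361,
                  password="k5ZJR5KV4q7DRZ92DQ", db=12)
rdb.ping()  # raises ConnectionError if unreachable

params = pika.ConnectionParameters(
    host="172.17.4.232",
    port=5672,
    virtual_host="/",
    credentials=pika.PlainCredentials("root", "V0O0049qBI2rV1554jLZPiBZ8H3Bo4"),
)
pika.BlockingConnection(params).close()  # raises AMQPConnectionError if unreachable
```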
@@ -39,9 +39,8 @@ RABBITMQ_HEARTBEAT = 600
 
 # Pipelines for writing data to storage
 ITEM_PIPELINES = [
-    "feapder.pipelines.mongo_pipeline.MongoPipeline",
-    # "feapder.pipelines.redis_pipeline.RedisPipeline",
-    # "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
+    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
+    "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
 ]
 # Max number of failures (saves and updates) when exporting data; alert once exceeded
 EXPORT_DATA_MAX_FAILED_TIMES = 5
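This hunk flips the export path from the Mongo pipeline to the RabbitMQ pipeline: feapder loads each dotted path in ITEM_PIPELINES and calls its save_items() per batch. A sketch of that contract with a hypothetical pipeline class (not part of this repo):

```python
# Minimal sketch of the pipeline contract behind ITEM_PIPELINES.
# LoggingPipeline is a hypothetical example; feapder instantiates each
# listed class and calls save_items() for every exported batch.
from typing import Dict, List

from feapder.pipelines import BasePipeline


class LoggingPipeline(BasePipeline):
    def save_items(self, table: str, items: List[Dict]) -> bool:
        print(f"would export {len(items)} item(s) to table {table}")
        return True  # False counts toward EXPORT_DATA_MAX_FAILED_TIMES
```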
@@ -60,15 +59,15 @@ WEBDRIVER = dict(
     load_images=False,  # whether to load images
     user_agent=None,  # a string, or a zero-argument function returning the user agent
     proxy=None,  # xxx.xxx.xx.xxx:xxxx, or a zero-argument function returning the proxy address
-    headless=True,  # whether to run the browser headless
-    driver_type="FIREFOX",  # CHROME or FIREFOX
-    timeout=3,  # request timeout
+    headless=False,  # whether to run the browser headless
+    driver_type="CHROME",  # CHROME or FIREFOX
+    timeout=30,  # request timeout
     window_size=(1280, 800),  # window size
-    executable_path='/Users/dongzhaorui/Documents/dzr/pyscripts/Spiders/settings/geckodriver',  # browser driver path; defaults to the system default
+    executable_path=None,  # browser driver path; defaults to the system default
     render_time=0,  # render time: wait this long after opening the page before grabbing the page source
     custom_argument=["--ignore-certificate-errors"],  # custom browser rendering arguments
     usages_local_driver=False,  # whether to use a local driver
-    server_addr="http://192.168.3.182:8899/wd/hub",  # selenium remote server address
+    server_addr="http://172.17.4.232:6666/wd/hub",  # selenium remote server address
     version="",  # remote browser version
     service_log_path=os.devnull  # log path
 )
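With usages_local_driver=False and executable_path=None, rendering now goes through the remote Selenium hub at server_addr. Roughly what that means in plain selenium terms (a sketch assuming selenium 4; feapder's own webdriver wrapper handles this internally):

```python
# Rough selenium equivalent of the WEBDRIVER settings above.
from selenium import webdriver

options = webdriver.ChromeOptions()                  # driver_type="CHROME"
options.add_argument("--ignore-certificate-errors")  # custom_argument

driver = webdriver.Remote(
    command_executor="http://172.17.4.232:6666/wd/hub",  # server_addr
    options=options,
)
driver.set_page_load_timeout(30)   # timeout=30
driver.set_window_size(1280, 800)  # window_size=(1280, 800)
driver.quit()
```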
@@ -81,18 +80,29 @@ PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
 PROXY_ENABLE = True
 
 # item deduplication
-ITEM_FILTER_ENABLE = False
+ITEM_FILTER_ENABLE = True
 ITEM_FILTER_SETTING = dict(
-    filter_type=5,  # redis dedup
-    expire_time=86400,  # expiration: 1 day
-    redis_url='redis://default:top@123@192.168.3.165:8165/2'
+    filter_type=6,
+    redis_conf=dict(
+        pylist_=dict(
+            redisdb_ip_port="172.17.4.240:8361",
+            redisdb_user_pass="k5ZJR5KV4q7DRZ92DQ",
+            redisdb_db=0
+        ),
+        list_=dict(
+            redisdb_ip_port="172.17.4.84:4679",
+            redisdb_user_pass="jytopnet123",
+            redisdb_db=0
+        )
+    ),
+    expire_time=63072000,  # expiration: 2 years
 )
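The filter_type changes from 5 to 6 and the single redis_url is replaced by two named Redis pools (pylist_/list_); the backend behind filter_type=6 is specific to this fork. The new expire_time, though, is plain arithmetic:

```python
# Where expire_time=63072000 comes from: two 365-day years in seconds.
SECONDS_PER_DAY = 24 * 60 * 60         # 86400
TWO_YEARS = 2 * 365 * SECONDS_PER_DAY  # 63072000
assert TWO_YEARS == 63072000
```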
 
 # Logging settings
 DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
 LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
 LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # log storage path
-LOG_LEVEL = "DEBUG"
+LOG_LEVEL = "ERROR"
 LOG_COLOR = True  # whether to colorize output
 LOG_IS_WRITE_TO_CONSOLE = True  # whether to print to the console
 LOG_IS_WRITE_TO_FILE = True  # whether to write to a log file