Remove the combined filter query

dongzhaorui 1 year ago
parent
commit
a1315ee6e6

+ 2 - 20
FworkSpider/feapder/dedup/README.md

@@ -95,7 +95,7 @@ def test_filter():
 from feapder.dedup import Dedup
 
 def test_filter():
-    dedup = Dedup(Dedup.RedisFilter, ip_ports=["192.168.3.207:2179", "192.168.3.166:2379"], expire_time=60)
+    dedup = Dedup(Dedup.RedisFilter, to_md5=False, ip_ports=["192.168.3.207:2179", "192.168.3.166:2379"], expire_time=60)
 
     # create pre-existing data
     datas = ["xxx", "bbb"]
@@ -109,25 +109,7 @@ def test_filter():
 ```
 
 ```python
-# redis deduplication
-from feapder.dedup import Dedup
-
-def test_filter():
-    dedup = Dedup(Dedup.RedisFilter, expire_time=60)
-
-    # create pre-existing data
-    datas = ["xxx", "bbb"]
-    dedup.add(datas)
-
-    # filter out the existing data "xxx", "bbb"
-    datas = ["xxx", "bbb", "ccc"]
-    ss = dedup.filter_exist_data(datas)
-    print(ss)
-    assert datas == ["ccc"]
-```
-
-```python
-# redis multi-instance deduplication
+# deduplication across multiple redis instances
 from feapder.dedup import Dedup
 
 def test_filter():
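
The `to_md5=False` argument added above matters because this commit (see item.py below) reduces `BaseItem.fingerprint` to a bare sha256 digest: with md5 hashing disabled, that digest becomes the literal Redis key. A minimal sketch of the resulting single-filter flow, assuming one reachable Redis instance:

```python
from feapder.dedup import Dedup

dedup = Dedup(
    Dedup.RedisFilter,
    to_md5=False,  # store the sha256 fingerprint as-is instead of md5(fingerprint)
    ip_ports=["192.168.3.207:2179"],  # assumed reachable instance
    expire_time=60,
)
dedup.add(["xxx"])  # seed one fingerprint
print(dedup.filter_exist_data(["xxx", "ccc"]))  # expected: ["ccc"]
```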

+ 10 - 38
FworkSpider/feapder/dedup/redisfilter.py

@@ -27,26 +27,13 @@ class RedisFilter(BaseFilter):
             )  # cluster / standalone
 
         self._ex = expire_time or 86400 * 365 * 2  # 2 years = 86400 * 365 * 2
-        self._prefix1 = 'list_'
-        self._prefix2 = 'pylist_'
 
     def __repr__(self):
         return "<RedisFilter: {}>".format(self.redis_db)
 
     def exists(self, key):
-        """Full lookup / Lua incremental lookup / Python incremental lookup"""
-        if '&&' in key:
-            md5, sha256 = key.split("&&")
-            mixture = tools.get_sha256(md5)
-        else:
-            mixture = sha256 = key
-
-        if (
-                self.redis_db.exists(sha256) > 0
-                or self.redis_db.exists(self._prefix1 + sha256) > 0
-                or self.redis_db.exists(self._prefix2 + sha256) > 0
-                or self.redis_db.exists(self._prefix2 + mixture) > 0
-        ):
+        """Full lookup"""
+        if self.redis_db.exists(key) > 0:
             return True
         return False
 
@@ -62,14 +49,7 @@ class RedisFilter(BaseFilter):
         is_added = []
         for key in keys:
             if not self.exists(key):
-                if '&&' in key:
-                    md5, sha256 = key.split("&&")
-                else:
-                    sha256 = key
-
-                is_added.append(
-                    self.redis_db.set(self._prefix2 + sha256, 1, ex=self._ex)
-                )
+                is_added.append(self.redis_db.set(key, 1, ex=self._ex))
             else:
                 is_added.append(False)
 
@@ -104,6 +84,10 @@ class MRedisFilter(RedisFilter):
 
     def __init__(self, redis_conf=None, **kwargs):
         super(MRedisFilter, self).__init__(**kwargs)
+
+        self._prefix1 = 'list_'  # lua prefix
+        self._prefix2 = 'pylist_'  # python prefix
+
         if not redis_conf:
             self.__class__.redis_dbs[self._prefix2] = RedisDB()
         else:
@@ -122,17 +106,8 @@ class MRedisFilter(RedisFilter):
 
     def exists(self, key):
-        """Lua incremental lookup / Python incremental lookup"""
-        if '&&' in key:
-            md5, sha256 = key.split("&&")
-            mixture = tools.get_sha256(md5)
-        else:
-            mixture = sha256 = key
-
         for prefix, redis_db in self.redis_dbs.items():
-            if any([
-                redis_db.exists(prefix + sha256) > 0,
-                redis_db.exists(prefix + mixture) > 0
-            ]):
+            if redis_db.exists(prefix + key) > 0:
                 return True
         return False
 
@@ -150,11 +125,8 @@ class MRedisFilter(RedisFilter):
         is_added = []
         for key in keys:
             if not self.exists(key):
-                if '&&' in key:
-                    md5, sha256 = key.split("&&")
-                else:
-                    sha256 = key
-                is_added.append(redis_db.set(self._prefix2 + sha256, 1, ex=self._ex))
+                key = self._prefix2 + key
+                is_added.append(redis_db.set(key, 1, ex=self._ex))
             else:
                 is_added.append(False)
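
After this change `RedisFilter` reads and writes the key verbatim, while `MRedisFilter` keeps the per-writer prefixes (`list_` for the Lua pipeline, `pylist_` for the Python one) and probes each configured instance. A hedged sketch of the simplified semantics in plain redis-py rather than feapder's RedisDB wrapper (host/port/db are assumptions):

```python
import redis

r = redis.Redis(host="192.168.3.182", port=6379, db=2)  # assumed instance

def exists(key: str) -> bool:
    """RedisFilter.exists after the commit: one full lookup on the raw key."""
    return r.exists(key) > 0

def get_add(keys, expire=86400 * 365 * 2):
    """RedisFilter.get_add analogue: SET <key> 1 EX <ttl> for unseen keys."""
    return [False if exists(k) else bool(r.set(k, 1, ex=expire)) for k in keys]

def m_exists(key: str, redis_dbs: dict) -> bool:
    """MRedisFilter.exists analogue: probe every instance under its prefix."""
    return any(db.exists(prefix + key) > 0 for prefix, db in redis_dbs.items())
```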
 

+ 1 - 1
FworkSpider/feapder/network/item.py

@@ -212,7 +212,7 @@ class BaseItem(Item):
 
         if args:
             args = sorted(args)
-            return tools.get_md5(*args) + "&&" + tools.get_sha256(*args)
+            return tools.get_sha256(*args)
         else:
             return None
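
With this one-line change the item fingerprint shrinks from the composite `md5 + "&&" + sha256` to a single sha256 digest, which is exactly what the simplified filters above store. Assuming `tools.get_sha256` concatenates its (already sorted) arguments before hashing, an equivalent standalone sketch:

```python
import hashlib

def fingerprint(*args):
    """Hypothetical stand-in for the new BaseItem.fingerprint logic."""
    if not args:
        return None
    joined = "".join(str(a) for a in sorted(args))
    return hashlib.sha256(joined.encode("utf-8")).hexdigest()

print(fingerprint("title", "2023-01-01"))  # stable 64-char hex digest
```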
 

+ 31 - 40
FworkSpider/setting.py

@@ -4,22 +4,33 @@ import datetime
 import os
 import sys
 
-# MONGODB
-MONGO_IP = "172.17.4.87"
-MONGO_PORT = 27080
+# table for items that failed to save
+TAB_FAILED_ITEMS = 'pyspider:s_failed_items'
+# table for failed tasks
+TAB_FAILED_REQUESTS = 'pyspider:z_failed_requests'
+# state table for pending tasks
+TASK_CRAWL_STATE = "pyspider:t_crawl_state"
+# record table for failed tasks
+TASK_REQUEST_FAILED = "pyspider_listdata_err"
+# spider crawl summary table
+SPIDER_HEARTBEAT_RECORD = "spider_heartbeat"  # spider heartbeat record table name
+
+# MongoDB
+MONGO_IP = "192.168.3.182"
+MONGO_PORT = 27017
 MONGO_DB = "py_spider"
 
 # REDIS
-REDISDB_IP_PORTS = "172.17.4.232:7361"
-REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
-REDISDB_DB = 12
+REDISDB_IP_PORTS = "192.168.3.182:6379"
+REDISDB_USER_PASS = "jianyu@python"
+REDISDB_DB = 2
 # redis key prefix for spider crawl info
 REDIS_KEY = "py_spider"
 
 # RabbitMQ
-RABBITMQ_IP_PORT = '172.17.4.232:5672'
+RABBITMQ_IP_PORT = '192.168.3.182:5672'
 RABBITMQ_USER = 'root'
-RABBITMQ_USER_PASS = 'V0O0049qBI2rV1554jLZPiBZ8H3Bo4'
+RABBITMQ_USER_PASS = '123123'
 RABBITMQ_EXCHANGE = 'py_spider'
 RABBITMQ_EXCHANGE_TYPE = 'direct'
 RABBITMQ_VIRTUAL_HOST = '/'
@@ -28,9 +39,9 @@ RABBITMQ_HEARTBEAT = 600
 
 # data storage pipelines
 ITEM_PIPELINES = [
-    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
+    "feapder.pipelines.mongo_pipeline.MongoPipeline",
     # "feapder.pipelines.redis_pipeline.RedisPipeline",
-    "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
+    # "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
 ]
 # max failed exports (saves and updates); exceeding this count raises an alert
 EXPORT_DATA_MAX_FAILED_TIMES = 5
@@ -40,7 +51,6 @@ EXPORT_DATA_MAX_RETRY_TIMES = 5
 COLLECTOR_TASK_COUNT = 100  # number of tasks fetched per batch
 
 # spider
-SPIDER_HEARTBEAT = "spider_heartbeat"  # spider heartbeat
 SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 recommended when chasing speed
 SPIDER_MAX_RETRY_TIMES = 3  # max retries per request
 
@@ -50,23 +60,18 @@ WEBDRIVER = dict(
     load_images=False,  # whether to load images
     user_agent=None,  # a string, or a zero-argument function returning the user_agent
     proxy=None,  # xxx.xxx.xx.xxx:xxxx, or a zero-argument function returning the proxy address
-    headless=False,  # headless browser or not
-    driver_type="CHROME",  # CHROME or FIREFOX
-    timeout=30,  # request timeout
+    headless=True,  # headless browser or not
+    driver_type="FIREFOX",  # CHROME or FIREFOX
+    timeout=3,  # request timeout
     window_size=(1280, 800),  # window size
-    executable_path=None,  # browser driver path; defaults to the standard location
+    executable_path='/Users/dongzhaorui/Documents/dzr/pyscripts/Spiders/settings/geckodriver',  # browser driver path; defaults to the standard location
     render_time=0,  # render time: wait this long after opening the page before grabbing the source
     custom_argument=["--ignore-certificate-errors"],  # custom browser rendering arguments
     usages_local_driver=False,  # whether to use a local driver
-    server_addr="http://172.17.4.232:6666/wd/hub",  # selenium remote server address
+    server_addr="http://192.168.3.182:8899/wd/hub",  # selenium remote server address
     version="",  # remote browser version
     service_log_path=os.devnull  # log path
 )
-# on spider startup, re-save items that previously failed to store
-RETRY_FAILED_ITEMS = True
-
-# save failed requests
-SAVE_FAILED_REQUEST = False
 
 # network request timeout
 REQUEST_TIMEOUT = 60
@@ -76,29 +81,18 @@ PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
 PROXY_ENABLE = True
 
 # item deduplication
-ITEM_FILTER_ENABLE = True
+ITEM_FILTER_ENABLE = False
 ITEM_FILTER_SETTING = dict(
-    filter_type=6,
-    redis_conf=dict(
-        pylist_=dict(
-            redisdb_ip_port="172.17.4.240:8361",
-            redisdb_user_pass="k5ZJR5KV4q7DRZ92DQ",
-            redisdb_db=0
-        ),
-        list_=dict(
-            redisdb_ip_port="172.17.4.84:4679",
-            redisdb_user_pass="jytopnet123",
-            redisdb_db=0
-        )
-    ),
-    expire_time=63072000,  # 2-year expiry
+    filter_type=5,  # redis dedup
+    expire_time=86400,  # 1-day expiry
+    redis_url='redis://default:top@123@192.168.3.165:8165/2'
 )
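
A sketch of how this leaner setting would plug into the dedup layer; the `filter_type=5 -> RedisFilter` mapping and `Dedup` accepting `redis_url`/`expire_time` as keywords are inferred from this fork and should be treated as assumptions:

```python
from feapder.dedup import Dedup

ITEM_FILTER_SETTING = dict(
    filter_type=5,  # assumed to select RedisFilter in this fork
    expire_time=86400,
    redis_url="redis://default:top@123@192.168.3.165:8165/2",
)
dedup = Dedup(**ITEM_FILTER_SETTING)  # assumes kwargs are forwarded to the filter
print(dedup.filter_exist_data(["a", "b"]))  # returns only the unseen keys
```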
 
 # logging settings
 DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
 LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
 LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # log storage path
-LOG_LEVEL = "ERROR"
+LOG_LEVEL = "DEBUG"
 LOG_COLOR = True  # colored output or not
 LOG_IS_WRITE_TO_CONSOLE = True  # print to console or not
 LOG_IS_WRITE_TO_FILE = True  # write to file or not
@@ -115,9 +109,6 @@ SWORDFISH_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
 # splash rendering service
 SWORDFISH_RENDER_URL = "http://splash.spdata.jianyu360.com/render.json"
 
-# spider heartbeat
-RECORD_SPIDER_HEARTBEAT = "spider_heartbeat"  # spider heartbeat record table name
-
 # remote bucket config
 ALI_BUCKET_CONFIG = {
     "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",