Add SHA1-based filtering of text URLs

dongzhaorui@topnet.net.cn · 3 years ago · commit 5df3cac914
1 changed file with 10 additions and 4 deletions

zbytb/crawler/spiders/ListPageSpider.py  +10 -4

@@ -6,9 +6,10 @@ from lxml.html import fromstring
 
 from config.load import headers
 from crawler.defaults import http_request_get
-from utils.databases import mongo_table, int2long, es_query
+from utils.databases import mongo_table, int2long, es_query, redis_client
 from utils.log import logger
 from utils.socks5 import Proxy
+from utils.tools import sha1
 
 
 class CrawlListPageSpider:
@@ -23,6 +24,8 @@ class CrawlListPageSpider:
     ):
         self.crawl_tab = mongo_table(db, crawl_tab)
         self.crawl_error_tab = mongo_table(db, error_tab)
+        self.redis_key = 'zbytb_2021'
+        self.r = redis_client()
 
         self.host = 'https://www.zbytb.com/search'
         self.headers = kwargs.get('headers') or headers
@@ -69,9 +72,12 @@ class CrawlListPageSpider:
                 **label_info,
                 "crawl": False,
             }
-            info["count"] = es_query(info["title"], info["l_np_publishtime"])
-            # print('>>> ', info['competehref'])
-            results.append(info)
+            sign = sha1(info['competehref'])
+            if not self.r.hexists(self.redis_key, sign):
+                info["count"] = es_query(info["title"], info["l_np_publishtime"])
+                # print('>>> ', info['competehref'])
+                results.append(info)
+                self.r.hset(self.redis_key, sign, '')
         self.crawl_tab.insert_many(results)
        logger.info(f'[crawl success] {len(results)} items')