|
@@ -6,9 +6,10 @@ from lxml.html import fromstring
|
|
|
|
|
|
from config.load import headers
|
|
|
from crawler.defaults import http_request_get
|
|
|
-from utils.databases import mongo_table, int2long, es_query
|
|
|
+from utils.databases import mongo_table, int2long, es_query, redis_client
|
|
|
from utils.log import logger
|
|
|
from utils.socks5 import Proxy
|
|
|
+from utils.tools import sha1
|
|
|
|
|
|
|
|
|
class CrawlListPageSpider:
|
|
@@ -23,6 +24,8 @@ class CrawlListPageSpider:
|
|
|
):
|
|
|
self.crawl_tab = mongo_table(db, crawl_tab)
|
|
|
self.crawl_error_tab = mongo_table(db, error_tab)
|
|
|
+ self.redis_key = 'zbytb_2021'
|
|
|
+ self.r = redis_client()
|
|
|
|
|
|
self.host = 'https://www.zbytb.com/search'
|
|
|
self.headers = kwargs.get('headers') or headers
|
|
@@ -69,9 +72,12 @@ class CrawlListPageSpider:
|
|
|
**label_info,
|
|
|
"crawl": False,
|
|
|
}
|
|
|
- info["count"] = es_query(info["title"], info["l_np_publishtime"])
|
|
|
- # print('>>> ', info['competehref'])
|
|
|
- results.append(info)
|
|
|
+ sign = sha1(info['competehref'])
|
|
|
+ if not self.r.hexists(self.redis_key, sign):
|
|
|
+ info["count"] = es_query(info["title"], info["l_np_publishtime"])
|
|
|
+ # print('>>> ', info['competehref'])
|
|
|
+ results.append(info)
|
|
|
+ self.r.hset(self.redis_key, sign, '')
|
|
|
self.crawl_tab.insert_many(results)
|
|
|
logger.info(f'[采集成功]{len(results)}条')
|
|
|
|