dzr 5 месяцев назад
Родитель
Коммит
b22b36e3bf
6 измененных файлов с 100 добавлено и 32 удалено
  1. 0 10
      qlm/config/conf.yaml
  2. 0 2
      qlm/config/load.py
  3. 85 0
      qlm/db/RedisDB.py
  4. 8 0
      qlm/db/__init__.py
  5. 6 5
      qlm/source_qianlima.py
  6. 1 15
      qlm/utils/databases.py

+ 0 - 10
qlm/config/conf.yaml

@@ -3,13 +3,3 @@ mongo:
   port: !!int 27080
 #  host: 127.0.0.1
 #  port: !!int 27017
-
-
-redis:
-  host: 172.17.4.232
-  port: !!int 7361
-  pwd: "k5ZJR5KV4q7DRZ92DQ"
-#  host: 127.0.0.1
-#  port: !!int 6379
-#  pwd: ""
-  db: !!int 3

+ 0 - 2
qlm/config/load.py

@@ -4,7 +4,6 @@ import yaml
 
 __all__ = [
     'mongo_conf',
-    'redis_conf',
     'headers',
     'node_module_path'
 ]
@@ -17,7 +16,6 @@ _node_modules = (_base_path.parent / 'node_modules').resolve()
 with open(_yaml_conf, encoding="utf-8") as f:
     conf = yaml.safe_load(f)
     mongo_conf = conf['mongo']
-    redis_conf = conf['redis']
 
 with open(_yaml_constants, encoding="utf-8") as fp:
     constants = yaml.safe_load(fp)

+ 85 - 0
qlm/db/RedisDB.py

@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-27
+---------
+@summary: redis 去重
+---------
+@author: Lzz
+"""
+import hashlib
+
+import redis
+
+
+class RedisFilter:  # Redis-backed de-duplication filter; keys are stored as "pylist_" + SHA-256 fingerprint
+
+    def __init__(self, url, expire_time=None):  # url: Redis connection URL; expire_time: TTL passed as `ex` to SET
+        self.redis_db = redis.StrictRedis.from_url(url)
+        self._ex = expire_time or 86400 * 365 * 1  # default 1 year = 86400 * 365 * 1; NOTE(review): a falsy expire_time (0) also falls back to this
+
+    def __repr__(self):
+        return "<RedisFilter: {}>".format(self.redis_db)
+
+    def exists(self, key):
+        """Return True when `key` is present in Redis, False otherwise (key is used as given, not fingerprinted)."""
+        if self.redis_db.exists(key) > 0:
+            return True
+        return False
+
+    def add(self, keys):
+        """
+        Record values as seen, skipping those whose fingerprint key already exists.
+
+        @param keys: a single value, or a list of values for batch insertion
+        @return: list / single value (False if already present, otherwise the result of the Redis SET call)
+        """
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        is_added = []
+        for key in keys:
+            pkey = "pylist_" + self.fingerprint(key)
+            if not self.exists(pkey):  # NOTE(review): exists-then-set is two round trips, not atomic; SET with nx=True may be safer - confirm
+                is_added.append(self.redis_db.set(pkey, 1, ex=self._ex))
+            else:
+                is_added.append(False)
+
+        return is_added if is_list else is_added[0]
+
+    def get(self, keys):
+        """
+        Check whether values have already been recorded.
+        @param keys: list / single value
+        @return: list / single value (True if present, False if absent)
+        """
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+
+        is_exist = []
+        for key in keys:
+            pkey = "pylist_" + self.fingerprint(key)
+            is_exist.append(self.exists(pkey))
+
+        # Flag duplicates inside the input batch itself: every repeat after the first occurrence counts as existing
+        temp_set = set()
+        for i, key in enumerate(keys):
+            if key in temp_set:
+                is_exist[i] = True
+            else:
+                temp_set.add(key)
+
+        return is_exist if is_list else is_exist[0]
+
+    def fingerprint(self, *args):
+        """
+        @summary: compute a data fingerprint, the 64-character SHA-256 hex digest of the arguments
+        ---------
+        @param args: the values that together identify one record for de-duplication
+        ---------
+        @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
+        """
+        args = sorted(args)  # sort first so the digest does not depend on argument order
+        sha256 = hashlib.sha256()
+        for arg in args:
+            sha256.update(str(arg).encode())  # NOTE(review): no separator between args, so ("ab", "c") and ("a", "bc") hash alike
+        return sha256.hexdigest()

+ 8 - 0
qlm/db/__init__.py

@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2025-03-04 
+---------
+@summary:  
+---------
+@author: Dzr
+"""

+ 6 - 5
qlm/source_qianlima.py

@@ -7,6 +7,7 @@ import time
 
 import requests
 
+from db.RedisDB import RedisFilter
 from utils.config_parms import (
     account_pool,
     area_dict,
@@ -15,7 +16,7 @@ from utils.config_parms import (
     channel_dict,
     REQUEST_DATA_MAP
 )
-from utils.databases import mongo_table, redis_client
+from utils.databases import mongo_table
 from utils.log import logger
 from utils.sessions_521 import http_session_521
 from utils.tools import sha1, get_today_of_day
@@ -30,8 +31,7 @@ https://search.vip.qianlima.com/index.html#?sortType=6&isSearchWord=1&tab_index=
 '''
 
 qlm = mongo_table('qlm', 'data_merge')
-r = redis_client()
-redis_key = 'qianlima_2024'
+dedup = RedisFilter('redis://:k5ZJR5KV4q7DRZ92DQ@172.17.189.142:7361/2')  # NOTE(review): Redis password and host are hard-coded in this URL; consider loading it from config or the environment
 
 session = requests.session()
 
@@ -176,8 +176,9 @@ def downloader(begin_date, end_date, category, address, page, page_size, account
             items = resp_json['data']['data']
             for item in items:
                 cid = sha1(str(item['contentid']))
-                if not r.hexists(redis_key, cid):
-                    r.hset(redis_key, cid, '')
+
+                if not dedup.get(item['contentid']):
+                    dedup.add(item['contentid'])
                     if 'popTitle' in item:
                         item['title'] = item['popTitle']
                     else:

+ 1 - 15
qlm/utils/databases.py

@@ -1,8 +1,7 @@
 import bson
 import pymongo
-import redis
 
-from config.load import mongo_conf, redis_conf
+from config.load import mongo_conf
 
 # ---------------------------------- mongo ----------------------------------
 MONGO_URI_CLIENTS = {}    # a dictionary hold all client with uri as key
@@ -44,16 +43,3 @@ def int2long(param: int):
 
 def object_id(_id: str):
     return bson.objectid.ObjectId(_id)
-
-
-# ---------------------------------- redis ----------------------------------
-def redis_client(cfg=None):
-    if cfg is None:
-        cfg = redis_conf
-    pool = redis.ConnectionPool(
-        host=cfg['host'],
-        port=cfg['port'],
-        password=cfg['pwd'],
-        db=cfg['db']
-    )
-    return redis.Redis(connection_pool=pool, decode_responses=True)