萤火也是火 3 years ago
parent
commit
83efbbe9ea
2 changed files with 25 additions and 2 deletions
  1. 12 2
      jzsc/spider.py
  2. 13 0
      jzsc/utils/tools.py

+ 12 - 2
jzsc/spider.py

@@ -12,12 +12,17 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 
 from chaojiying import Chaojiying_Client
-from utils.databases import mongo_table
+from utils.databases import mongo_table, redis_client
 from utils.log import logger
+from utils.tools import sha1
 
 '''MongoDB'''
 company_tab = mongo_table('national', 'company')
 
+'''redis服务'''
+r = redis_client()
+redis_key = 'jzsc_2022'
+
 '''验证码服务'''
 chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '929622')
 
@@ -261,9 +266,13 @@ def crawl_spider(driver: Chrome, handler):
         if exception_count > 3:
             '''数据异常,停止采集'''
             return False
+        title = td_element.text
+        '''使用公司名称进行去重'''
+        if r.hexists(redis_key, sha1(title)):
+            logger.info(f"[重复数据]{title} - 丢弃")
+            continue
         button = td_element.find_element_by_class_name("link")
         click(driver, button, wait_time=2)
-        title = td_element.text
         for current_handler in driver.window_handles:
             if current_handler == handler:
                 continue
@@ -368,6 +377,7 @@ def crawl_spider(driver: Chrome, handler):
             driver.close()
             '''返回列表页'''
             driver.switch_to.window(handler)
+        r.hset(redis_key, sha1(title), title)
     else:
         return True
 

+ 13 - 0
jzsc/utils/tools.py

@@ -1,4 +1,5 @@
 import socket
+import hashlib
 
 
 def get_host_ip():
@@ -9,3 +10,15 @@ def get_host_ip():
     finally:
         s.close()
     return ip
+
+
+def sha1(text: str):
+    """
+    Return the SHA-1 digest of a string as a hexadecimal string.
+
+    The text is encoded as UTF-8 before it is hashed.
+
+    @param text: string to hash
+    @return: hexadecimal digest string
+    """
+    _sha1 = hashlib.sha1()
+    _sha1.update(text.encode("utf-8"))
+    return _sha1.hexdigest()