|
@@ -12,12 +12,17 @@ from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
from selenium.webdriver.support.wait import WebDriverWait
|
|
|
|
|
|
from chaojiying import Chaojiying_Client
|
|
from chaojiying import Chaojiying_Client
|
|
-from utils.databases import mongo_table
|
|
|
|
|
|
+from utils.databases import mongo_table, redis_client
|
|
from utils.log import logger
|
|
from utils.log import logger
|
|
|
|
+from utils.tools import sha1
|
|
|
|
|
|
'''MongoDB'''
|
|
'''MongoDB'''
|
|
company_tab = mongo_table('national', 'company')
|
|
company_tab = mongo_table('national', 'company')
|
|
|
|
|
|
|
|
+'''redis服务'''  # section label, i.e. "Redis service"
|
|
|
|
+r = redis_client()  # module-level client shared by the crawler; used below via r.hexists / r.hset (presumably a Redis connection — confirm in utils.databases)
|
|
|
|
+redis_key = 'jzsc_2022'  # name of the hash used for de-duplication: field = sha1(title), value = title of each company already processed
|
|
|
|
+
|
|
'''验证码服务'''
|
|
'''验证码服务'''
|
|
chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '929622')  # NOTE(review): these literals look like account credentials committed to source — confirm, and consider loading them from config/environment instead
|
|
chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '929622')  # NOTE(review): these literals look like account credentials committed to source — confirm, and consider loading them from config/environment instead
|
|
|
|
|
|
@@ -261,9 +266,13 @@ def crawl_spider(driver: Chrome, handler):
|
|
if exception_count > 3:
|
|
if exception_count > 3:
|
|
'''数据异常,停止采集'''
|
|
'''数据异常,停止采集'''
|
|
return False
|
|
return False
|
|
|
|
+ title = td_element.text
|
|
|
|
+ '''使用公司名称进行去重'''
|
|
|
|
+ if r.hexists(redis_key, sha1(title)):
|
|
|
|
+ logger.info(f"[重复数据]{title} - 丢弃")
|
|
|
|
+ continue
|
|
button = td_element.find_element_by_class_name("link")
|
|
button = td_element.find_element_by_class_name("link")
|
|
click(driver, button, wait_time=2)
|
|
click(driver, button, wait_time=2)
|
|
- title = td_element.text
|
|
|
|
for current_handler in driver.window_handles:
|
|
for current_handler in driver.window_handles:
|
|
if current_handler == handler:
|
|
if current_handler == handler:
|
|
continue
|
|
continue
|
|
@@ -368,6 +377,7 @@ def crawl_spider(driver: Chrome, handler):
|
|
driver.close()
|
|
driver.close()
|
|
'''返回列表页'''
|
|
'''返回列表页'''
|
|
driver.switch_to.window(handler)
|
|
driver.switch_to.window(handler)
|
|
|
|
+ r.hset(redis_key, sha1(title), title)
|
|
else:
|
|
else:
|
|
return True
|
|
return True
|
|
|
|
|