3 gadi atpakaļ · 5fc262e098
--- a/zgzb/crawler/crawl_spider.py
+++ b/zgzb/crawler/crawl_spider.py
@@ -78,7 +78,6 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
 
				         allow_next_page = False
			
 
				         while True:
			
 
				             if exit_crawl:
			
 
				-                browser.quit()
			
 
				                 proxy.switch()
			
 
				                 break
			
 
				 
			
@@ -161,21 +160,21 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
 
				                 '''访问详情页'''
			
 
				                 goto(browser, node1, wait_time=2)
			
 
				                 '''详情页'''
			
 
				+                item['href'] = '#'
			
 
				                 detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
			
 
				                 if detail_js.startswith('showDetails') is False:
			
 
				+                    item['competehref'] = detail_url
			
 
				                     try:
			
 
				                         item = crawl_psp_frame(browser, main_handler, item)
			
 
				-                        item['href'] = '#'
			
 
				-                        item['competehref'] = detail_url
			
 
				                     except NoSuchElementException:
			
 
				-                        logger.error('[加载超时]frame框架加载失败')
			
 
				-                        continue
			
 
				+                        exit_crawl = True
			
 
				+                        break
			
 
				                 else:
			
 
				+                    item['competehref'] = '{}/{}'.format(detail_url, sign)
			
 
				                     try:
			
 
				                         item = crawl_show_details(browser, main_handler, item)
			
 
				-                        item['href'] = '#'
			
 
				-                        item['competehref'] = '{}/{}'.format(detail_url, sign)
			
 
				                     except (ValueError, WebDriverException) as e:
			
 
				+                        browser.quit()
			
 
				                         exit_crawl = True
			
 
				                         if e.__class__.__name__ == 'ValueError':
			
 
				                             logger.error("[机器人验证]验证失败")
			
--- a/zgzb/crawler/defaults.py
+++ b/zgzb/crawler/defaults.py
@@ -3,7 +3,8 @@ import time
 
				 from selenium.common.exceptions import (
			
 
				     WebDriverException,
			
 
				     TimeoutException,
			
 
				-    InvalidSessionIdException
			
 
				+    InvalidSessionIdException,
			
 
				+    NoSuchElementException
			
 
				 )
			
 
				 from selenium.webdriver import ActionChains
			
 
				 from selenium.webdriver.common.by import By
			
@@ -179,13 +180,6 @@ def goto(driver, web_element, wait_time=None, allow_check_page=False):
 
				 def next_page(driver, category):
			
 
				     """翻页"""
			
 
				     _finished_pages = CRAWL_RECORDS[category]['pages']
			
 
				-    # web_elements = driver.find_elements(by=By.XPATH, value='//div[@id="Pagination"]/div[1]/child::*')
			
 
				-    # for element in web_elements[1:-1]:
			
 
				-    #     val = element.text
			
 
				-    #     if val not in _finished_pages:
			
 
				-    #         goto(driver, element, wait_time=1.2)
			
 
				-    #         return int(val)
			
 
				-    # else:
			
 
				     while True:
			
 
				         next_element = driver.find_element_by_xpath('//div[@id="Pagination"]/div[1]/child::a[last()]')
			
 
				         if next_element.text == '下一页':
			
@@ -332,7 +326,12 @@ def crawl_psp_frame(driver, handler, item):
 
				             check_timeout=15
			
 
				         )
			
 
				         '''切换到frame'''
			
 
				-        driver.switch_to.frame('mini-iframe-6')
			
 
				+        try:
			
 
				+            driver.switch_to.frame('mini-iframe-6')
			
 
				+        except NoSuchElementException:
			
 
				+            driver.quit()
			
 
				+            logger.error(f'[未检测到iframe-{item["channel"]}]{item["title"]} - {item["competehref"]}')
			
 
				+            raise NoSuchElementException()
			
 
				         '''等待加载数据'''
			
 
				         wait_load_detail(driver, check_feature='//div[contains(@role, "accordion")]')
			
 
				         content_html = extract_page_html(driver.page_source, feature='//div[@class="fui-accordions"]')