dongzhaorui 3 gadi atpakaļ
vecāks
revīzija
5fc262e098
2 mainīti faili ar 14 papildinājumiem un 16 dzēšanām
  1. 6 7
      zgzb/crawler/crawl_spider.py
  2. 8 9
      zgzb/crawler/defaults.py

+ 6 - 7
zgzb/crawler/crawl_spider.py

@@ -78,7 +78,6 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
         allow_next_page = False
         while True:
             if exit_crawl:
-                browser.quit()
                 proxy.switch()
                 break
 
@@ -161,21 +160,21 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
                 '''访问详情页'''
                 goto(browser, node1, wait_time=2)
                 '''详情页'''
+                item['href'] = '#'
                 detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
                 if detail_js.startswith('showDetails') is False:
+                    item['competehref'] = detail_url
                     try:
                         item = crawl_psp_frame(browser, main_handler, item)
-                        item['href'] = '#'
-                        item['competehref'] = detail_url
                     except NoSuchElementException:
-                        logger.error('[加载超时]frame框架加载失败')
-                        continue
+                        exit_crawl = True
+                        break
                 else:
+                    item['competehref'] = '{}/{}'.format(detail_url, sign)
                     try:
                         item = crawl_show_details(browser, main_handler, item)
-                        item['href'] = '#'
-                        item['competehref'] = '{}/{}'.format(detail_url, sign)
                     except (ValueError, WebDriverException) as e:
+                        browser.quit()
                         exit_crawl = True
                         if e.__class__.__name__ == 'ValueError':
                             logger.error("[机器人验证]验证失败")

+ 8 - 9
zgzb/crawler/defaults.py

@@ -3,7 +3,8 @@ import time
 from selenium.common.exceptions import (
     WebDriverException,
     TimeoutException,
-    InvalidSessionIdException
+    InvalidSessionIdException,
+    NoSuchElementException
 )
 from selenium.webdriver import ActionChains
 from selenium.webdriver.common.by import By
@@ -179,13 +180,6 @@ def goto(driver, web_element, wait_time=None, allow_check_page=False):
 def next_page(driver, category):
     """翻页"""
     _finished_pages = CRAWL_RECORDS[category]['pages']
-    # web_elements = driver.find_elements(by=By.XPATH, value='//div[@id="Pagination"]/div[1]/child::*')
-    # for element in web_elements[1:-1]:
-    #     val = element.text
-    #     if val not in _finished_pages:
-    #         goto(driver, element, wait_time=1.2)
-    #         return int(val)
-    # else:
     while True:
         next_element = driver.find_element_by_xpath('//div[@id="Pagination"]/div[1]/child::a[last()]')
         if next_element.text == '下一页':
@@ -332,7 +326,12 @@ def crawl_psp_frame(driver, handler, item):
             check_timeout=15
         )
         '''切换到frame'''
-        driver.switch_to.frame('mini-iframe-6')
+        try:
+            driver.switch_to.frame('mini-iframe-6')
+        except NoSuchElementException:
+            driver.quit()
+            logger.error(f'[未检测到iframe-{item["channel"]}]{item["title"]} - {item["competehref"]}')
+            raise NoSuchElementException()
         '''等待加载数据'''
         wait_load_detail(driver, check_feature='//div[contains(@role, "accordion")]')
         content_html = extract_page_html(driver.page_source, feature='//div[@class="fui-accordions"]')