dongzhaorui committed 3 years ago
commit 8462d1084d

+ 14 - 5
zgzb/crawler/crawl_spider.py

@@ -2,7 +2,8 @@ import time
 
 from selenium.common.exceptions import (
     WebDriverException,
-    TimeoutException
+    TimeoutException,
+    NoSuchElementException
 )
 
 from common.databases import mongo_table, int2long, redis_client
@@ -118,7 +119,6 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
                     "spidercode": crawl_menu.spidercode,
                     "T": "bidding",
                     "sendflag": "false",
-                    "iscompete": "true",
                     "_d": "comeintime",
                     "comeintime": '',
                     "area": '',
@@ -138,8 +138,6 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
                 print(f'>>> {sign}')
                 if r.hexists(redis_key, sign):
                     continue
-
-                item['href'] = detail_js
                '''Announcement title'''
                 node1 = element.find_element_by_xpath('./td[1]/a')
                 title = node1.text
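
For context, the dedup pattern these hunks rely on: sign is a row fingerprint stored in a Redis hash, checked before crawling (hexists, above) and recorded after a successful save (hset, at the end of this file's diff). A minimal sketch of the two halves, assuming r is a redis.Redis client and redis_key names the hash; both helper names are hypothetical:

def is_duplicate(r, redis_key, sign):
    # True if this row's fingerprint was already recorded
    return r.hexists(redis_key, sign)

def mark_crawled(r, redis_key, sign):
    # record the fingerprint only after the item has been saved
    r.hset(redis_key, sign, '')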
@@ -163,11 +161,20 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
                '''Visit the detail page'''
                 goto(browser, node1, wait_time=2)
                '''Detail page'''
+                detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
                 if detail_js.startswith('showDetails') is False:
-                    item = crawl_psp_frame(browser, main_handler, item)
+                    try:
+                        item = crawl_psp_frame(browser, main_handler, item)
+                        item['href'] = '#'
+                        item['competehref'] = detail_url
+                    except NoSuchElementException:
+                        logger.error('[Load timeout] frame failed to load')
+                        continue
                 else:
                     try:
                         item = crawl_show_details(browser, main_handler, item)
+                        item['href'] = '#'
+                        item['competehref'] = '{}/{}'.format(detail_url, sign)
                     except (ValueError, WebDriverException) as e:
                         exit_crawl = True
                         if e.__class__.__name__ == 'ValueError':
@@ -184,6 +191,8 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
                     if '_id' in item:
                         del item['_id']
                     logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
+                '''Remark: detail page access parameter'''
+                item['remark'] = detail_js
                '''Add data fingerprint'''
                 r.hset(redis_key, sign, '')
                '''Save list data'''
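
Taken together, the crawl_spider.py hunks reroute the detail-page fields: href becomes a '#' placeholder, the canonical URL moves to competehref, and the raw JS call string is kept in remark. A condensed sketch of that branching, assuming the names from the diff (crawl_psp_frame, crawl_show_details, detail_js, sign); route_detail itself is a hypothetical wrapper, not a function in this repo:

DETAIL_URL = ('http://www.cebpubservice.com/ctpsp_iiss/'
              'searchbusinesstypebeforedooraction/showDetails.do')

def route_detail(browser, main_handler, item, detail_js, sign):
    if not detail_js.startswith('showDetails'):
        # frame-based detail page; may raise NoSuchElementException
        item = crawl_psp_frame(browser, main_handler, item)
        item['competehref'] = DETAIL_URL
    else:
        item = crawl_show_details(browser, main_handler, item)
        item['competehref'] = '{}/{}'.format(DETAIL_URL, sign)
    item['href'] = '#'          # placeholder; the real URL lives in competehref
    item['remark'] = detail_js  # keep the raw JS call for replay/debugging
    return item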

+ 13 - 6
zgzb/crawler/defaults.py

@@ -245,20 +245,23 @@ def select_date(driver, category, setup_time):
         return False
 
 
-def wait_load_detail(driver, check_feature=None):
+def wait_load_detail(driver, check_feature=None, check_timeout=None):
     """等待二次加载页面结果并检测元素变化"""
+    _check_timeout = (check_timeout or 10)
+    sleep_interval = 0.5
+    max_check_count = int(_check_timeout / sleep_interval)
     if check_feature is not None:
         check_count = 0
-        while check_count < 20:
+        while check_count < max_check_count:
             element = html2element(driver.page_source)
             check_node = element.xpath(check_feature)
             if len(check_node) > 0:
                 break
-            time.sleep(0.5)
+            time.sleep(sleep_interval)
             check_count += 1
     else:
         check_count = 0
-        while check_count < 20:
+        while check_count < max_check_count:
             element = html2element(driver.page_source)
             root = element.xpath('//div[@id="xxnrList"]')
             if len(root) > 0:
@@ -268,7 +271,7 @@ def wait_load_detail(driver, check_feature=None):
                     children = element.xpath('//div[@id="xxnrList"]/div[1]/div[2]/child::*')
                     if "".join(text) != '暂无详细数据' and len(children) > 0:
                         break
-            time.sleep(0.5)
+            time.sleep(sleep_interval)
             check_count += 1
     time.sleep(1)
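
Arithmetic check: the old hard-coded limit of 20 iterations at 0.5 s equals the new default, int(10 / 0.5) = 20 polls, so existing callers keep their behavior. A usage sketch, assuming a live Selenium driver object:

wait_load_detail(driver)                        # default: 10 s -> 20 polls of 0.5 s
wait_load_detail(driver,
                 check_feature='//div[@id="xxnrList"]',
                 check_timeout=15)              # 15 s -> int(15 / 0.5) = 30 polls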
 
@@ -323,7 +326,11 @@ def crawl_psp_frame(driver, handler, item):
         if current_handler == handler:
             continue
         driver.switch_to.window(current_handler)
-        wait_load_detail(driver, check_feature='//div[contains(@id, "mini-1")]')
+        wait_load_detail(
+            driver,
+            check_feature='//div[contains(@id, "mini-1")]',
+            check_timeout=15
+        )
         '''Switch to the frame'''
         driver.switch_to.frame('mini-iframe-6')
         '''Wait for data to load'''
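
The longer 15-second wait reduces, but cannot eliminate, the chance that 'mini-iframe-6' is still detached when switch_to.frame runs. A defensive retry sketch using only standard Selenium APIs; switch_to_detail_frame is a hypothetical helper, not part of this commit:

import time
from selenium.common.exceptions import NoSuchFrameException

def switch_to_detail_frame(driver, name='mini-iframe-6', retries=3, delay=1.0):
    # retry the switch: Selenium raises NoSuchFrameException for a missing frame
    for _ in range(retries):
        try:
            driver.switch_to.frame(name)
            return True
        except NoSuchFrameException:
            time.sleep(delay)  # the frame may still be attaching
    return False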