dongzhaorui 3 years ago
parent
commit
916ebc5dd6
2 changed files with 143 additions and 143 deletions
  1. +1 -1
      zgzb/common/webdriver.py
  2. +142 -142
      zgzb/crawler/crawl_spider.py

+ 1 - 1
zgzb/common/webdriver.py

@@ -323,7 +323,7 @@ class WebDriver(RemoteWebDriver):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         if exc_val:
-            logger.error(f'{self.__class__.__name__} >>> {exc_type} <> {exc_val}')
+            logger.error(f'{self.__class__.__name__} <> {exc_type.__name__}: {exc_val}')
 
         self.quit()
         return True
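Context for the one-line change above: in Python's context-manager protocol, a truthy return from __exit__ suppresses the in-flight exception, so this wrapper both logs the failure and lets the caller's loop keep running after cleanup. Using exc_type.__name__ also logs a readable name such as TimeoutException instead of the full "<class '...'>" repr. A minimal, self-contained sketch of the same pattern (SwallowingContext is a hypothetical stand-in, not the project's WebDriver, which additionally calls self.quit() here):

    import logging

    logging.basicConfig(level=logging.ERROR)
    logger = logging.getLogger(__name__)

    class SwallowingContext:
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            if exc_val:
                # __name__ yields 'ValueError', not "<class 'ValueError'>"
                logger.error(f'{self.__class__.__name__} <> {exc_type.__name__}: {exc_val}')
            return True  # truthy: the exception stops here

    with SwallowingContext():
        raise ValueError('boom')
    print('still running')  # reached, because __exit__ swallowed the error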

+ 142 - 142
zgzb/crawler/crawl_spider.py

@@ -44,158 +44,158 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
         proxies = proxy.proxies
         logger.info(f"[采集代理]{proxies}")
         list_page_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2'
-        browser = WebDriver(load_images=False, proxy=proxies, headless=headless)
-        ua = browser.execute_script('return navigator.userAgent')
-        print('>>> ', ua)
-        success_request = crawl_request(browser, list_page_url)
-        if not success_request:
-            proxy.switch()
-            logger.error('[访问超时]请求列表页')
-            continue
-
-        '''Wait for the main page to load'''
-        wait_load_list(browser)
-        '''Get the main page window handle'''
-        main_handler = browser.current_window_handle
-        '''Select a category'''
-        category = select_category(browser, crawl_category)
-        '''Category menu list'''
-        crawl_menu = get_crawl_menu(category)
-        if crawl_menu is None:
-            browser.quit()
-            logger.info("任务结束")
-            break
+        with WebDriver(load_images=False, proxy=proxies, headless=headless) as browser:
+            ua = browser.execute_script('return navigator.userAgent')
+            print('>>> ', ua)
+            success_request = crawl_request(browser, list_page_url)
+            if not success_request:
+                proxy.switch()
+                logger.error('[访问超时]请求列表页')
+                continue
 
-        logger.info(f"[分类栏目]{category}")
+            '''Wait for the main page to load'''
+            wait_load_list(browser)
+            '''Get the main page window handle'''
+            main_handler = browser.current_window_handle
+            '''Select a category'''
+            category = select_category(browser, crawl_category)
+            '''Category menu list'''
+            crawl_menu = get_crawl_menu(category)
+            if crawl_menu is None:
+                browser.quit()
+                logger.info("任务结束")
+                break
 
-        '''Select the creation date'''
-        success_select_date = select_date(browser, category, crawl_date)
-        if not success_select_date:
-            proxy.switch()
-            continue
+            logger.info(f"[分类栏目]{category}")
 
-        exit_crawl = False
-        allow_next_page = False
-        while True:
-            if exit_crawl:
+            '''Select the creation date'''
+            success_select_date = select_date(browser, category, crawl_date)
+            if not success_select_date:
                 proxy.switch()
-                break
+                continue
 
-            if allow_next_page:
-                allow_next_page = True
-                try:
-                    page_num = next_page(browser, category)
-                    if page_num is None or (page_num > crawl_max_page):
-                        browser.quit()
-                        proxy.switch()
-                        update_crawl_records(category, True)
-                        break
-                    elif page_num != prev_num and page_num % 2 == 0:
-                    '''Each proxy IP crawls only 2 pages; rotate through proxies'''
-                        browser.quit()
-                        proxy.switch()
-                        prev_num = page_num
-                        break
-                except TimeoutException:
-                    browser.quit()
+            exit_crawl = False
+            allow_next_page = False
+            while True:
+                if exit_crawl:
                     proxy.switch()
-                    logger.error('[访问超时]请求翻页')
                     break
-            else:
-                allow_next_page = True
-
-            '''Detail pages'''
-            web_elements = parser_list_elements(browser, category)
-            if web_elements is None:
-                proxy.switch()
-                break
 
-            for index, element in enumerate(web_elements):
-                index += 1
-                item = {
-                    "site": "中国招标投标公共服务平台",
-                    "channel": crawl_menu.channel,
-                    "spidercode": crawl_menu.spidercode,
-                    "T": "bidding",
-                    "sendflag": "false",
-                    "_d": "comeintime",
-                    "comeintime": '',
-                    "area": '',
-                    "city": '',
-                    "publishdept": "",
-                    "title": "",
-                    "href": "",
-                    "publishtime": "",
-                    "l_np_publishtime": "",
-                }
-                html = browser.page_source
-                category_id = get_category_id(category)
-                click_detail_js = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@onclick')).strip()
-                href_ = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@href')).strip()
-                detail_js = (click_detail_js or href_)
-                sign = sha1(detail_js)
-                print(f'>>> {sign}')
-                if r.hexists(redis_key, sign):
-                    continue
-                '''Publication title'''
-                node1 = element.find_element_by_xpath('./td[1]/a')
-                title = node1.text
-                item['title'] = title
-                '''Province and city'''
-                node2 = element.find_element_by_xpath('./td[3]/span')
-                region = str(node2.text).replace('【', '').replace('】', '')
-                if region.find(" ") > 0:
-                    province, city = region.split(' ')
-                else:
-                    province = region
-                    city = ''
-                item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
-                item['city'] = city
-                '''Publish time'''
-                node3 = element.find_element_by_xpath('./td[5]')
-                publish_time = node3.text
-                item['publishtime'] = publish_time
-                item['l_np_publishtime'] = int2long(date2ts(publish_time))
-                item['comeintime'] = int2long(int(time.time()))
-                '''Visit the detail page'''
-                goto(browser, node1, wait_time=2)
-                '''Detail page'''
-                item['href'] = '#'
-                detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
-                if detail_js.startswith('showDetails') is False:
-                    item['competehref'] = detail_url
-                    try:
-                        item = crawl_psp_frame(browser, main_handler, item)
-                    except NoSuchElementException:
-                        exit_crawl = True
-                        break
-                else:
-                    item['competehref'] = '{}/{}'.format(detail_url, sign)
+                if allow_next_page:
+                    allow_next_page = True
                     try:
-                        item = crawl_show_details(browser, main_handler, item)
-                    except (ValueError, WebDriverException) as e:
+                        page_num = next_page(browser, category)
+                        if page_num is None or (page_num > crawl_max_page):
+                            browser.quit()
+                            proxy.switch()
+                            update_crawl_records(category, True)
+                            break
+                        elif page_num != prev_num and page_num % 2 == 0:
+                            '''Each proxy IP crawls only 2 pages; rotate through proxies'''
+                            browser.quit()
+                            proxy.switch()
+                            prev_num = page_num
+                            break
+                    except TimeoutException:
                         browser.quit()
-                        exit_crawl = True
-                        if e.__class__.__name__ == 'ValueError':
-                            logger.error("[机器人验证]验证失败")
+                        proxy.switch()
+                        logger.error('[访问超时]请求翻页')
                         break
-                '''Storage handling'''
-                if 'contenthtml' not in item:
-                    item['crawl_status'] = 'detail_err'
                 else:
-                    item['crawl_status'] = 'success'
-                    '''Save the detail'''
-                    save_tab.insert_one(item)
-                    del item['contenthtml'], item['detail']
-                    if '_id' in item:
-                        del item['_id']
-                    logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
-                '''Remark: detail-page access parameters'''
-                item['remark'] = detail_js
-                '''Add the data fingerprint'''
-                r.hset(redis_key, sign, '')
-                '''Save the list entry'''
-                crawl_tab.insert_one(item)
+                    allow_next_page = True
+
+                '''Detail pages'''
+                web_elements = parser_list_elements(browser, category)
+                if web_elements is None:
+                    proxy.switch()
+                    break
+
+                for index, element in enumerate(web_elements):
+                    index += 1
+                    item = {
+                        "site": "中国招标投标公共服务平台",
+                        "channel": crawl_menu.channel,
+                        "spidercode": crawl_menu.spidercode,
+                        "T": "bidding",
+                        "sendflag": "false",
+                        "_d": "comeintime",
+                        "comeintime": '',
+                        "area": '',
+                        "city": '',
+                        "publishdept": "",
+                        "title": "",
+                        "href": "",
+                        "publishtime": "",
+                        "l_np_publishtime": "",
+                    }
+                    html = browser.page_source
+                    category_id = get_category_id(category)
+                    click_detail_js = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@onclick')).strip()
+                    href_ = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@href')).strip()
+                    detail_js = (click_detail_js or href_)
+                    sign = sha1(detail_js)
+                    print(f'>>> {sign}')
+                    if r.hexists(redis_key, sign):
+                        continue
+                    '''Publication title'''
+                    node1 = element.find_element_by_xpath('./td[1]/a')
+                    title = node1.text
+                    item['title'] = title
+                    '''Province and city'''
+                    node2 = element.find_element_by_xpath('./td[3]/span')
+                    region = str(node2.text).replace('【', '').replace('】', '')
+                    if region.find(" ") > 0:
+                        province, city = region.split(' ')
+                    else:
+                        province = region
+                        city = ''
+                    item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
+                    item['city'] = city
+                    '''Publish time'''
+                    node3 = element.find_element_by_xpath('./td[5]')
+                    publish_time = node3.text
+                    item['publishtime'] = publish_time
+                    item['l_np_publishtime'] = int2long(date2ts(publish_time))
+                    item['comeintime'] = int2long(int(time.time()))
+                    '''Visit the detail page'''
+                    goto(browser, node1, wait_time=2)
+                    '''Detail page'''
+                    item['href'] = '#'
+                    detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
+                    if detail_js.startswith('showDetails') is False:
+                        item['competehref'] = detail_url
+                        try:
+                            item = crawl_psp_frame(browser, main_handler, item)
+                        except NoSuchElementException:
+                            exit_crawl = True
+                            break
+                    else:
+                        item['competehref'] = '{}/{}'.format(detail_url, sign)
+                        try:
+                            item = crawl_show_details(browser, main_handler, item)
+                        except (ValueError, WebDriverException) as e:
+                            browser.quit()
+                            exit_crawl = True
+                            if e.__class__.__name__ == 'ValueError':
+                                logger.error("[机器人验证]验证失败")
+                            break
+                    '''Storage handling'''
+                    if 'contenthtml' not in item:
+                        item['crawl_status'] = 'detail_err'
+                    else:
+                        item['crawl_status'] = 'success'
+                        '''Save the detail'''
+                        save_tab.insert_one(item)
+                        del item['contenthtml'], item['detail']
+                        if '_id' in item:
+                            del item['_id']
+                        logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
+                    '''Remark: detail-page access parameters'''
+                    item['remark'] = detail_js
+                    '''Add the data fingerprint'''
+                    r.hset(redis_key, sign, '')
+                    '''Save the list entry'''
+                    crawl_tab.insert_one(item)
 
-            logger.info(f"[{category}-第{page_num}页]采集完成")
-            write_crawl_records(category, page_num)
+                logger.info(f"[{category}-第{page_num}页]采集完成")
+                write_crawl_records(category, page_num)
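Taken with the webdriver.py change, the crawl_spider.py diff is essentially a re-indentation: the whole session now lives inside with WebDriver(...) as browser:, so __exit__ quits the browser and swallows errors on every exit path (continue, break, or an unexpected exception) instead of relying on manually paired browser.quit() calls; the explicit quit() calls that remain before proxy.switch() become harmless early cleanups. A minimal sketch of the refactor's shape (make_browser and crawl_pages are illustrative placeholders, not functions from this repo):

    def crawl_once_manual(make_browser, crawl_pages):
        # Before: each early exit must remember its own quit(),
        # and an unexpected exception leaks the browser process.
        browser = make_browser()
        ok = crawl_pages(browser)
        browser.quit()
        return ok

    def crawl_once_managed(make_browser, crawl_pages):
        # After: __exit__ always runs, so early returns and exceptions
        # both release the browser; with a truthy __exit__ the error is
        # also suppressed, keeping the outer proxy-rotation loop alive.
        with make_browser() as browser:
            return crawl_pages(browser)

One trade-off of the pattern: because __exit__ returns True, failures inside the block never propagate, so the exit_crawl / allow_next_page flags and the outer while loop are the only remaining flow control.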