@@ -44,158 +44,158 @@ def crawl_spider(crawl_max_page=1, enable_proxy=False, **kw):
         proxies = proxy.proxies
         logger.info(f"[采集代理]{proxies}")
         list_page_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do?tabledivIds=searchTabLi2'
-        browser = WebDriver(load_images=False, proxy=proxies, headless=headless)
-        ua = browser.execute_script('return navigator.userAgent')
-        print('>>> ', ua)
-        success_reqeust = crawl_request(browser, list_page_url)
-        if not success_reqeust:
-            proxy.switch()
-            logger.error('[访问超时]请求列表页')
-            continue
-
-        '''等待加载主页'''
-        wait_load_list(browser)
-        '''获取主页句柄'''
-        main_handler = browser.current_window_handle
-        '''选择分类'''
-        category = select_category(browser, crawl_category)
-        '''分类栏目列表'''
-        crawl_menu = get_crawl_menu(category)
-        if crawl_menu is None:
-            browser.quit()
-            logger.info("任务结束")
-            break
+        with WebDriver(load_images=False, proxy=proxies, headless=headless) as browser:
+            ua = browser.execute_script('return navigator.userAgent')
+            print('>>> ', ua)
+            success_reqeust = crawl_request(browser, list_page_url)
+            if not success_reqeust:
+                proxy.switch()
+                logger.error('[访问超时]请求列表页')
+                continue

-        logger.info(f"[分类栏目]{category}")
+            '''等待加载主页'''
+            wait_load_list(browser)
+            '''获取主页句柄'''
+            main_handler = browser.current_window_handle
+            '''选择分类'''
+            category = select_category(browser, crawl_category)
+            '''分类栏目列表'''
+            crawl_menu = get_crawl_menu(category)
+            if crawl_menu is None:
+                browser.quit()
+                logger.info("任务结束")
+                break

-        '''选择建立时间'''
-        success_select_date = select_date(browser, category, crawl_date)
-        if not success_select_date:
-            proxy.switch()
-            continue
+            logger.info(f"[分类栏目]{category}")

-        exit_crawl = False
-        allow_next_page = False
-        while True:
-            if exit_crawl:
+            '''选择建立时间'''
+            success_select_date = select_date(browser, category, crawl_date)
+            if not success_select_date:
                 proxy.switch()
-                break
+                continue

-            if allow_next_page:
-                allow_next_page = True
-                try:
-                    page_num = next_page(browser, category)
-                    if page_num is None or (page_num > crawl_max_page):
-                        browser.quit()
-                        proxy.switch()
-                        update_crawl_records(category, True)
-                        break
-                    elif page_num != prev_num and page_num % 2 == 0:
-                        '''每个代理IP仅采集2页,轮询使用代理'''
-                        browser.quit()
-                        proxy.switch()
-                        prev_num = page_num
-                        break
-                except TimeoutException:
-                    browser.quit()
+            exit_crawl = False
+            allow_next_page = False
+            while True:
+                if exit_crawl:
                     proxy.switch()
-                    logger.error('[访问超时]请求翻页')
                     break
-            else:
-                allow_next_page = True
-
-            '''详情页'''
-            web_elements = parser_list_elements(browser, category)
-            if web_elements is None:
-                proxy.switch()
-                break

-            for index, element in enumerate(web_elements):
-                index += 1
-                item = {
-                    "site": "中国招标投标公共服务平台",
-                    "channel": crawl_menu.channel,
-                    "spidercode": crawl_menu.spidercode,
-                    "T": "bidding",
-                    "sendflag": "false",
-                    "_d": "comeintime",
-                    "comeintime": '',
-                    "area": '',
-                    "city": '',
-                    "publishdept": "",
-                    "title": "",
-                    "href": "",
-                    "publishtime": "",
-                    "l_np_publishtime": "",
-                }
-                html = browser.page_source
-                category_id = get_category_id(category)
-                click_detail_js = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@onclick')).strip()
-                href_ = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@href')).strip()
-                detail_js = (click_detail_js or href_)
-                sign = sha1(detail_js)
-                print(f'>>> {sign}')
-                if r.hexists(redis_key, sign):
-                    continue
-                '''发布标题'''
-                node1 = element.find_element_by_xpath('./td[1]/a')
-                title = node1.text
-                item['title'] = title
-                '''省市'''
-                node2 = element.find_element_by_xpath('./td[3]/span')
-                region = str(node2.text).replace('【', '').replace('】', '')
-                if region.find(" ") > 0:
-                    province, city = region.split(' ')
-                else:
-                    province = region
-                    city = ''
-                item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
-                item['city'] = city
-                '''发布时间'''
-                node3 = element.find_element_by_xpath('./td[5]')
-                publish_time = node3.text
-                item['publishtime'] = publish_time
-                item['l_np_publishtime'] = int2long(date2ts(publish_time))
-                item['comeintime'] = int2long(int(time.time()))
-                '''访问详情页'''
-                goto(browser, node1, wait_time=2)
-                '''详情页'''
-                item['href'] = '#'
-                detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
-                if detail_js.startswith('showDetails') is False:
-                    item['competehref'] = detail_url
-                    try:
-                        item = crawl_psp_frame(browser, main_handler, item)
-                    except NoSuchElementException:
-                        exit_crawl = True
-                        break
-                else:
-                    item['competehref'] = '{}/{}'.format(detail_url, sign)
+                if allow_next_page:
+                    allow_next_page = True
                     try:
-                        item = crawl_show_details(browser, main_handler, item)
-                    except (ValueError, WebDriverException) as e:
+                        page_num = next_page(browser, category)
+                        if page_num is None or (page_num > crawl_max_page):
+                            browser.quit()
+                            proxy.switch()
+                            update_crawl_records(category, True)
+                            break
+                        elif page_num != prev_num and page_num % 2 == 0:
+                            '''每个代理IP仅采集2页,轮询使用代理'''
+                            browser.quit()
+                            proxy.switch()
+                            prev_num = page_num
+                            break
+                    except TimeoutException:
                         browser.quit()
-                        exit_crawl = True
-                        if e.__class__.__name__ == 'ValueError':
-                            logger.error("[机器人验证]验证失败")
+                        proxy.switch()
+                        logger.error('[访问超时]请求翻页')
                         break
-                '''入库处理'''
-                if 'contenthtml' not in item:
-                    item['crawl_status'] = 'detail_err'
                 else:
-                    item['crawl_status'] = 'success'
-                    '''保存详情'''
-                    save_tab.insert_one(item)
-                    del item['contenthtml'], item['detail']
-                    if '_id' in item:
-                        del item['_id']
-                    logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
-                '''备注:详情页访问参数'''
-                item['remark'] = detail_js
-                '''添加数据指纹'''
-                r.hset(redis_key, sign, '')
-                '''保存列表'''
-                crawl_tab.insert_one(item)
+                    allow_next_page = True
+
+                '''详情页'''
+                web_elements = parser_list_elements(browser, category)
+                if web_elements is None:
+                    proxy.switch()
+                    break
+
+                for index, element in enumerate(web_elements):
+                    index += 1
+                    item = {
+                        "site": "中国招标投标公共服务平台",
+                        "channel": crawl_menu.channel,
+                        "spidercode": crawl_menu.spidercode,
+                        "T": "bidding",
+                        "sendflag": "false",
+                        "_d": "comeintime",
+                        "comeintime": '',
+                        "area": '',
+                        "city": '',
+                        "publishdept": "",
+                        "title": "",
+                        "href": "",
+                        "publishtime": "",
+                        "l_np_publishtime": "",
+                    }
+                    html = browser.page_source
+                    category_id = get_category_id(category)
+                    click_detail_js = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@onclick')).strip()
+                    href_ = "".join(extract_text(html, feature=f'//*[@id="{category_id}"]//tr[{index}]/td[1]/a/@href')).strip()
+                    detail_js = (click_detail_js or href_)
+                    sign = sha1(detail_js)
+                    print(f'>>> {sign}')
+                    if r.hexists(redis_key, sign):
+                        continue
+                    '''发布标题'''
+                    node1 = element.find_element_by_xpath('./td[1]/a')
+                    title = node1.text
+                    item['title'] = title
+                    '''省市'''
+                    node2 = element.find_element_by_xpath('./td[3]/span')
+                    region = str(node2.text).replace('【', '').replace('】', '')
+                    if region.find(" ") > 0:
+                        province, city = region.split(' ')
+                    else:
+                        province = region
+                        city = ''
+                    item['area'] = str(province).replace('省', '').replace('市', '').replace('自治区', '')
+                    item['city'] = city
+                    '''发布时间'''
+                    node3 = element.find_element_by_xpath('./td[5]')
+                    publish_time = node3.text
+                    item['publishtime'] = publish_time
+                    item['l_np_publishtime'] = int2long(date2ts(publish_time))
+                    item['comeintime'] = int2long(int(time.time()))
+                    '''访问详情页'''
+                    goto(browser, node1, wait_time=2)
+                    '''详情页'''
+                    item['href'] = '#'
+                    detail_url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/showDetails.do'
+                    if detail_js.startswith('showDetails') is False:
+                        item['competehref'] = detail_url
+                        try:
+                            item = crawl_psp_frame(browser, main_handler, item)
+                        except NoSuchElementException:
+                            exit_crawl = True
+                            break
+                    else:
+                        item['competehref'] = '{}/{}'.format(detail_url, sign)
+                        try:
+                            item = crawl_show_details(browser, main_handler, item)
+                        except (ValueError, WebDriverException) as e:
+                            browser.quit()
+                            exit_crawl = True
+                            if e.__class__.__name__ == 'ValueError':
+                                logger.error("[机器人验证]验证失败")
+                            break
+                    '''入库处理'''
+                    if 'contenthtml' not in item:
+                        item['crawl_status'] = 'detail_err'
+                    else:
+                        item['crawl_status'] = 'success'
+                        '''保存详情'''
+                        save_tab.insert_one(item)
+                        del item['contenthtml'], item['detail']
+                        if '_id' in item:
+                            del item['_id']
+                        logger.info(f'[采集成功-{item["channel"]}]{title} - {publish_time}')
+                    '''备注:详情页访问参数'''
+                    item['remark'] = detail_js
+                    '''添加数据指纹'''
+                    r.hset(redis_key, sign, '')
+                    '''保存列表'''
+                    crawl_tab.insert_one(item)

-            logger.info(f"[{category}-第{page_num}页]采集完成")
-            write_crawl_records(category, page_num)
+                logger.info(f"[{category}-第{page_num}页]采集完成")
+                write_crawl_records(category, page_num)
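
The substantive change in this hunk is that the hand-managed `browser = WebDriver(...)` is replaced by `with WebDriver(...) as browser:`, so the browser is torn down by the context manager rather than only by the scattered `browser.quit()` calls. The wrapper class itself is not part of the hunk; the sketch below is only an assumption of the minimal contract the new call site relies on (a Selenium Chrome subclass whose `__exit__` quits the driver), not the project's actual implementation.

    # Minimal sketch, NOT the project's real WebDriver: it only illustrates the
    # context-manager behaviour that `with WebDriver(...) as browser:` depends on.
    from selenium.webdriver import Chrome, ChromeOptions

    class WebDriver(Chrome):
        def __init__(self, load_images=True, proxy=None, headless=True):
            options = ChromeOptions()
            if headless:
                options.add_argument('--headless')
            if proxy:
                # Assumes `proxy` is a "host:port" string; the shape of the object
                # returned by proxy.proxies is not shown in this hunk.
                options.add_argument(f'--proxy-server={proxy}')
            if not load_images:
                # Skip image downloads to speed up list-page rendering.
                options.add_argument('--blink-settings=imagesEnabled=false')
            super().__init__(options=options)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Quit the browser on normal exit, on break/continue out of the block,
            # or on an exception, which is what the refactored call site relies on.
            self.quit()
            return False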
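Unchanged by this diff but useful when reading it: each list row is de-duplicated by hashing the detail link's onclick/href payload (`detail_js`), checking the Redis hash with `r.hexists(redis_key, sign)` before visiting the detail page, and recording it with `r.hset(redis_key, sign, '')` once the row is stored. The `sha1` helper is defined outside this hunk; a plausible reconstruction, assuming a plain hex digest:

    import hashlib

    def sha1(text: str) -> str:
        # Hypothetical stand-in for the sha1() helper used above to build the
        # Redis dedup fingerprint; the real helper lives outside this hunk.
        return hashlib.sha1(text.encode('utf-8')).hexdigest()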