
Add Aliyun slider captcha verification to the list page

dongzhaorui 1 year ago
parent
commit 120e9a6cbc
4 changed files with 296 additions and 140 deletions
  1. zgztb_cookie/cookie_pool.py (+37 -29)
  2. zgztb_cookie/zgzbtb_spider.py (+107 -60)
  3. zgztb_cookie/zgzbtb_spider_d.py (+77 -31)
  4. zgztb_cookie/zgzbtb_spider_m.py (+75 -20)

zgztb_cookie/cookie_pool.py (+37 -29)

@@ -8,13 +8,20 @@ Created on 2022-07-10
 """
 import time
 
-from selenium.common.exceptions import NoSuchElementException
-from selenium.webdriver import ActionChains
-
 from feapder.network.cookie_pool import PageCookiePool
 from feapder.setting import WEBDRIVER
 from feapder.utils.log import log
 from feapder.utils.webdriver import WebDriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver import ActionChains
+
+
+def get_page_text(driver, split=False):
+    try:
+        feature = driver.find_element_by_xpath('//body').text
+    except NoSuchElementException:
+        feature = '验证通过'  # <body> missing, fall back to the "passed" sentinel
+    return ''.join(feature.split()) if split else feature
 
 
 class WebCookiePool(PageCookiePool):
@@ -33,48 +40,45 @@ class WebCookiePool(PageCookiePool):
                 result = driver.execute_script(js)
                 log.debug(f'window.navigator.webdriver >>> {result}')
                 driver.get(self.page_url)
-                time.sleep(3)
+
+                time.sleep(3)  # wait for the page to load
+
+                xpath_lst = [
+                    '//div[@id="aliyunCaptcha-sliding-slider"]',
+                    '//span[contains(@class, "nc_iconfont btn_slide")]'
+                ]
                 for _ in range(4):
                     log.info(f"【{driver.title}】,处理中...")
                     try:
-                        slider = driver.find_element_by_xpath("//span[contains(@class, 'nc_iconfont btn_slide')]")
+                        slider = driver.find_element_by_xpath('|'.join(xpath_lst))
                         if slider.is_displayed():
-                            # click and hold without releasing the mouse
-                            ActionChains(driver).click_and_hold(on_element=slider).perform()
-                            # move 252 px to the right
-                            ActionChains(driver).move_by_offset(xoffset=252, yoffset=0).perform()
-                            # release the mouse
-                            ActionChains(driver).pause(1).release().perform()
+                            ActionChains(driver).click_and_hold(on_element=slider).perform()  # click and hold without releasing
+                            ActionChains(driver).move_by_offset(xoffset=322, yoffset=0).perform()  # drag x px to the right
+                            ActionChains(driver).pause(1).release().perform()  # release the mouse
                     except NoSuchElementException:
                         if '找不到 输入的jsp页面或者服务器' == driver.title:
-                            # new IP in play: the Aliyun captcha is not triggered, but the browser session lacks acw3, so the request must be reissued
-                            break
+                            break  # new IP in play: the Aliyun captcha is not triggered, but the browser session lacks acw3, so the request must be reissued
+
                     except Exception as e:
                         reason = 'nsIDOMWindowUtils.sendMouseEvent'
                         if len(e.args) > 0 and e.args[0].count(reason) == 0:
                             if driver.title == '出错了':
-                                # the proxy IP hit the site too often and got banned; switch proxy directly
-                                break
+                                break  # the proxy IP hit the site too often and got banned
 
                             log.error(f"异常原因:{e.args}")
 
                     time.sleep(10)  # IP pool is small, so use a longer interval
 
-                    # page signature after sliding
-                    try:
-                        feature = driver.find_element_by_xpath('//body').text
-                    except NoSuchElementException:
-                        feature = "验证通过"
-
-                    if 'IP地址在短时间内频繁访问该页面' in feature:
-                        return
-                    elif '访问被阻断' in feature:
+                    feature = get_page_text(driver)  # robot-check page signature detection
+                    if 'IP地址在短时间内频繁访问该页面' in feature or '访问被阻断' in feature:
                         return
-                    elif 'nc_iconfont btn_slide' in driver.page_source:
-                        # check whether the page still contains a slider element
+                    elif 'nc_iconfont btn_slide' in driver.page_source or 'aliyunCaptcha-sliding-slider' in driver.page_source:
+                        driver.refresh()  # refresh the page
+                        time.sleep(1.5)  # wait for the page to load
                         continue
                     else:
                         break
+
                 log.info(f"【{driver.title}】,处理结束")
                 return driver.cookies
             except Exception as e:
@@ -85,7 +89,11 @@ class WebCookiePool(PageCookiePool):
 #     for i in range(1):
 #         cookies = WebCookiePool(redis_key='gdcookie',page_url="http://www.user-agent.cn/").create_cookies("socks5://36.7.252.15:8860")
 #         print(cookies)
-#     cookie_pool = WebCookiePool(redis_key='zgztbcookie',
-#                                 page_url="http://www.cebpubservice.com/ctpsp_iiss/SecondaryAction/findDetails.do")
-#     cks = cookie_pool.create_cookies()
+#
+#     cookie_pool = WebCookiePool(
+#         redis_key='zgztb.list.cookies',
+#         page_url='http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do'
+#     )
+#     proxy = swordfish_proxy().get('http')
+#     cks = cookie_pool.create_cookies(proxy=proxy)
 #     print(cks)
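
The slider is cleared above with a single click-hold, one 322 px move, and a release. Slider checkers often score the pointer trajectory, so a drag split into randomized sub-steps can pass more reliably. Below is a minimal sketch of that variant, reusing the Selenium 3 API (find_element_by_xpath) and the XPath union from this commit; the drag_slider helper, step sizes, and pauses are illustrative assumptions, not code from the repository.

    import random
    import time

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver import ActionChains

    # XPath union as in the commit: Aliyun slider or the legacy nc slider.
    SLIDER_XPATH = ('//div[@id="aliyunCaptcha-sliding-slider"]'
                    '|//span[contains(@class, "nc_iconfont btn_slide")]')


    def drag_slider(driver, distance=322, step_min=20, step_max=80):
        """Hypothetical helper: drag the captcha slider in randomized sub-steps."""
        try:
            slider = driver.find_element_by_xpath(SLIDER_XPATH)
        except NoSuchElementException:
            return False  # no slider on this page
        if not slider.is_displayed():
            return False

        ActionChains(driver).click_and_hold(on_element=slider).perform()
        moved = 0
        while moved < distance:
            # random step sizes make the trajectory less uniform than one jump
            step = min(distance - moved, random.randint(step_min, step_max))
            ActionChains(driver).move_by_offset(xoffset=step, yoffset=0).perform()
            moved += step
            time.sleep(random.uniform(0.05, 0.2))
        ActionChains(driver).pause(1).release().perform()
        return True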

zgztb_cookie/zgzbtb_spider.py (+107 -60)

@@ -16,38 +16,35 @@ from feapder.network.request import requests
 from feapder.utils.log import log as logger
 from feapder.utils.tools import json
 
+from cookie_pool import WebCookiePool
+
 Menu = namedtuple('Menu', ['channel', 'code', 'type', 'businessKeyWord'])
 
 
+class AliRobotsCaptchaError(Exception):
+    pass
+
+
 class ListPageSpider(feapder.AirSpider):
 
+    cookie_pool = WebCookiePool(
+        redis_key='zgztb.list.cookies',
+        page_url='http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do'
+    )
+
     @property
     def proxy(self):
         return swordfish_proxy()
 
-    @staticmethod
-    def extract_address(region):
-        if region:
-            args = region.split(' ')
-            if len(args) == 2:
-                area, city = args
-            elif len(args) == 1:
-                area, city = args[0], ''
-            else:
-                area, city, *argi = args
-        else:
-            area, city = '全国', ''
-
-        area = area.strip().replace('省', '').replace('市', '')
-        city = city.strip()
-        return area, city
-
     def start_callback(self):
         self._task_coll_name = 'zgzb_list'
         self._proxies = None
+        self._cookies = None  # global cookies
 
     def start_requests(self):
-        self._proxies = self.proxy
+        # self._proxies = self.proxy
+        self.ali_robots_cookies(limit=40)  # obtain the global cookies and the global proxy
+
         task_menus = [
             Menu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg', '招标公告', 'tenderBulletin'),
             Menu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl', '开标记录', 'openBidRecord'),
@@ -89,6 +86,7 @@ class ListPageSpider(feapder.AirSpider):
     def download_midware(self, request):
         request.url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getStringMethod.do'
         request.proxies = self._proxies
+        request.cookies = self._cookies
         request.method = 'POST'
         request.timeout = 5
         request.headers = {
@@ -105,49 +103,37 @@ class ListPageSpider(feapder.AirSpider):
         }
 
     def validate(self, request, response):
-        msg = request.meta['msg']
-        menu = request.menu
-        resp_json = response.json  # check whether the data API request succeeded
-        if not resp_json:
-            logger.info(f'{msg}-无列表数据')
-            return False
-
-        # build the summary message
-        data = resp_json.get("object")
-        return_list = data.get("returnlist")
-
-        page = data.get("page")
-        total_page = page.get("totalPage", 0)
-        total_count = page.get("totalCount", 0)
-        page_no = page.get("pageNo", 0)
-        row = page.get("row", 0)
-
-        msg = f"{menu.channel},超出最大采集页码"
-        if page_no <= total_page:
-            tips = [
-                menu.channel,
-                f'共{total_page}页{total_count}/{len(return_list)}条',
-                f'第{page_no}页{row}条',
-            ]
-            msg = "-".join(tips)
-
-        logger.info(msg)
-        return True
-
-    def exception_request(self, request, response, e):
-        msg = request.meta['msg']
-        proxy_errors = (
-            json.decoder.JSONDecodeError,
-            requests.exceptions.ConnectionError
-        )
-        if isinstance(e, proxy_errors):
-            interval = request.meta['interval']
-            logger.warning(f"{msg}--代理失效,{interval}s后重试...")
-            tools.delay_time(interval)
-            self._proxies = self.proxy
+        if response.is_html:
+            raise AliRobotsCaptchaError
         else:
-            logger.error(f"{msg}--请求失败")
-            logger.exception(f'异常原因:{e}')
+            msg = request.meta['msg']
+            menu = request.menu
+            resp_json = response.json  # check whether the data API request succeeded
+            if not resp_json:
+                logger.info(f'{msg}-无列表数据')
+                return False
+
+            # build the summary message
+            data = resp_json.get("object")
+            return_list = data.get("returnlist")
+
+            page = data.get("page")
+            total_page = page.get("totalPage", 0)
+            total_count = page.get("totalCount", 0)
+            page_no = page.get("pageNo", 0)
+            row = page.get("row", 0)
+
+            msg = f"{menu.channel},超出最大采集页码"
+            if page_no <= total_page:
+                tips = [
+                    menu.channel,
+                    f'共{total_page}页{total_count}/{len(return_list)}条',
+                    f'第{page_no}页{row}条',
+                ]
+                msg = "-".join(tips)
+
+            logger.info(msg)
+            return True
 
     def parse(self, request, response):
         menu = request.menu
@@ -198,9 +184,70 @@ class ListPageSpider(feapder.AirSpider):
             yield item
             logger.info(f"采集成功--{menu.channel}-{item['title']}-{publish_time}")
 
+    def exception_request(self, request, response, e):
+        msg = request.meta['msg']
+        errors = (
+            json.decoder.JSONDecodeError,
+            requests.exceptions.ConnectionError
+        )
+        if isinstance(e, errors):
+            interval = request.meta['interval']
+            logger.warning(f"{msg}--代理失效,{interval}s后重试...")
+            tools.delay_time(interval)
+            self._proxies = self.proxy
+        elif isinstance(e, AliRobotsCaptchaError):
+            logger.info("cookies失效,重新获取...")
+            self.ali_robots_cookies(request)
+        else:
+            logger.error(f"{msg}--请求失败")
+            logger.exception(f'异常原因:{e}')
+
     def end_callback(self):
         logger.info("列表页采集结束")
 
+    def ali_robots_cookies(self, request=None, limit=3):
+        """
+        Pass Aliyun's invisible robot slider verification.
+
+        @param request: request to update with the fresh cookies/proxy (optional)
+        @param limit: maximum number of retries
+        """
+        retries = 0
+        cookies = None
+        proxy = self.proxy
+        while retries < limit:
+            cookies = self.cookie_pool.create_cookies(proxy=proxy.get('http'))
+            if not cookies or len(cookies) <= 4:  # too few cookies: the slider was probably not passed
+                proxy = self.proxy
+                retries += 1
+                continue
+
+            if request is not None:
+                request.cookies = cookies
+                request.proxies = proxy
+
+            break
+
+        self._cookies = cookies
+        self._proxies = proxy
+
+    @staticmethod
+    def extract_address(region):
+        if region:
+            args = region.split(' ')
+            if len(args) == 2:
+                area, city = args
+            elif len(args) == 1:
+                area, city = args[0], ''
+            else:
+                area, city, *argi = args
+        else:
+            area, city = '全国', ''
+
+        area = area.strip().replace('省', '').replace('市', '')
+        city = city.strip()
+        return area, city
+
 
 if __name__ == '__main__':
     ListPageSpider(thread_count=1).start()
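
validate() now treats any HTML body as the Aliyun captcha page and raises AliRobotsCaptchaError, which exception_request() converts into a cookie/proxy rebuild via ali_robots_cookies(). Stripped of the feapder plumbing, the recovery loop reduces to the sketch below; fetch_list and its parameters are illustrative, and only the URL and the HTML-means-captcha rule come from this commit.

    import requests

    LIST_URL = ('http://www.cebpubservice.com/ctpsp_iiss/'
                'searchbusinesstypebeforedooraction/getStringMethod.do')


    def fetch_list(data, cookies, proxies, rebuild, max_retries=3):
        """Hypothetical standalone version of the validate/exception_request flow."""
        for _ in range(max_retries):
            resp = requests.post(LIST_URL, data=data, cookies=cookies,
                                 proxies=proxies, timeout=5)
            if 'html' not in resp.headers.get('Content-Type', ''):
                return resp.json()  # JSON came back, so the cookies are still valid
            # An HTML body means the captcha page was served instead of the
            # API response: rebuild the cookie/proxy pair and retry.
            cookies, proxies = rebuild()
        raise RuntimeError('captcha page persisted after retries')

Note that ali_robots_cookies() patches the fresh cookies onto both the failed request and the spider-wide self._cookies/self._proxies, so later requests reuse the working pair instead of re-triggering the captcha.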

zgztb_cookie/zgzbtb_spider_d.py (+77 -31)

@@ -17,11 +17,22 @@ from feapder.network.request import requests
 from feapder.utils.log import log as logger
 from feapder.utils.tools import json
 
+from cookie_pool import WebCookiePool
+
 Menu = namedtuple('Menu', ['channel', 'code', 'type', 'businessKeyWord'])
 
 
+class AliRobotsCaptchaError(Exception):
+    pass
+
+
 class ListPageSpider(feapder.AirSpider):
 
+    cookie_pool = WebCookiePool(
+        redis_key='zgztb.list.cookies',
+        page_url='http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do'
+    )
+
     @property
     def proxy(self):
         return swordfish_proxy()
@@ -46,6 +57,7 @@ class ListPageSpider(feapder.AirSpider):
     def start_callback(self):
         self._task_coll_name = 'zgzb_list'
         self._proxies = None
+        self._cookies = None  # global cookies
 
     def visit_day_lst(self, start_ts, days):
         day_lst = []
@@ -60,7 +72,9 @@ class ListPageSpider(feapder.AirSpider):
         return day_lst
 
     def start_requests(self):
-        self._proxies = self.proxy
+        # self._proxies = self.proxy
+        self.ali_robots_cookies(limit=40)  # obtain the global cookies and the global proxy
+
         task_menus = [
             Menu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg', '招标公告', 'tenderBulletin'),
             Menu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl', '开标记录', 'openBidRecord'),
@@ -123,36 +137,39 @@ class ListPageSpider(feapder.AirSpider):
         }
 
     def validate(self, request, response):
-        day = request.meta['day']
-        msg = request.meta['msg']
-        menu = request.menu
-        resp_json = response.json  # check whether the data API request succeeded
-        if not resp_json:
-            logger.info(f'{day}-{msg}-无列表数据')
-            return False
-
-        # build the summary message
-        data = resp_json.get("object")
-        return_list = data.get("returnlist")
-
-        page = data.get("page")
-        total_page = page.get("totalPage", 0)
-        total_count = page.get("totalCount", 0)
-        page_no = page.get("pageNo", 0)
-        row = page.get("row", 0)
-
-        msg = f"{day}-{menu.channel},超出最大采集页码"
-        if page_no <= total_page:
-            tips = [
-                day,
-                menu.channel,
-                f'共{total_page}页{total_count}/{len(return_list)}条',
-                f'第{page_no}页{row}条',
-            ]
-            msg = "-".join(tips)
-
-        logger.info(msg)
-        return True
+        if response.is_html:
+            raise AliRobotsCaptchaError
+        else:
+            day = request.meta['day']
+            msg = request.meta['msg']
+            menu = request.menu
+            resp_json = response.json  # check whether the data API request succeeded
+            if not resp_json:
+                logger.info(f'{day}-{msg}-无列表数据')
+                return False
+
+            # build the summary message
+            data = resp_json.get("object")
+            return_list = data.get("returnlist")
+
+            page = data.get("page")
+            total_page = page.get("totalPage", 0)
+            total_count = page.get("totalCount", 0)
+            page_no = page.get("pageNo", 0)
+            row = page.get("row", 0)
+
+            msg = f"{day}-{menu.channel},超出最大采集页码"
+            if page_no <= total_page:
+                tips = [
+                    day,
+                    menu.channel,
+                    f'共{total_page}页{total_count}/{len(return_list)}条',
+                    f'第{page_no}页{row}条',
+                ]
+                msg = "-".join(tips)
+
+            logger.info(msg)
+            return True
 
     def exception_request(self, request, response, e):
         msg = request.meta['msg']
@@ -165,6 +182,9 @@ class ListPageSpider(feapder.AirSpider):
             logger.warning(f"{msg}--代理失效,{interval}s后重试...")
             tools.delay_time(interval)
             self._proxies = self.proxy
+        elif isinstance(e, AliRobotsCaptchaError):
+            logger.info("cookies失效,重新获取...")
+            self.ali_robots_cookies(request)
         else:
             logger.error(f"{msg}--请求失败")
             logger.exception(f'异常原因:{e}')
@@ -221,6 +241,32 @@ class ListPageSpider(feapder.AirSpider):
     def end_callback(self):
         logger.info("列表页采集结束")
 
+    def ali_robots_cookies(self, request=None, limit=3):
+        """
+        Pass Aliyun's invisible robot slider verification.
+
+        @param request: request to update with the fresh cookies/proxy (optional)
+        @param limit: maximum number of retries
+        """
+        retries = 0
+        cookies = None
+        proxy = self.proxy
+        while retries < limit:
+            cookies = self.cookie_pool.create_cookies(proxy=proxy.get('http'))
+            if not cookies or len(cookies) <= 4:  # too few cookies: the slider was probably not passed
+                proxy = self.proxy
+                retries += 1
+                continue
+
+            if request is not None:
+                request.cookies = cookies
+                request.proxies = proxy
+
+            break
+
+        self._cookies = cookies
+        self._proxies = proxy
+
 
 if __name__ == '__main__':
     ListPageSpider(thread_count=5).start()
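
This variant walks the list day by day through visit_day_lst(start_ts, days); the hunks show only the signature and the returned day_lst, so the body below is a guess at the usual shape of such a helper, labeled hypothetical rather than taken from the file.

    import datetime


    def visit_day_lst(start_ts, days):
        """Hypothetical reconstruction: 'YYYY-MM-DD' strings for a day window."""
        start = datetime.datetime.fromtimestamp(start_ts)
        day_lst = []
        for offset in range(days):
            day = start + datetime.timedelta(days=offset)
            day_lst.append(day.strftime('%Y-%m-%d'))
        return day_lst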

zgztb_cookie/zgzbtb_spider_m.py (+75 -20)

@@ -16,9 +16,15 @@ from feapder.network.request import requests
 from feapder.utils.log import log as logger
 from feapder.utils.tools import json
 
+from cookie_pool import WebCookiePool
+
 Menu = namedtuple('Menu', ['channel', 'code', 'type', 'businessKeyWord'])
 
 
+class AliRobotsCaptchaError(Exception):
+    pass
+
+
 def pay_proxy():
     proxies = {
         'http': 'http://pyspider:J2c4CY62tB8R53pW@140.249.73.234:15039',
@@ -29,6 +35,11 @@ def pay_proxy():
 
 class ListPageSpider(feapder.AirSpider):
 
+    cookie_pool = WebCookiePool(
+        redis_key='zgztb.list.cookies',
+        page_url='http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do'
+    )
+
     @property
     def proxy(self):
         return swordfish_proxy()
@@ -53,9 +64,12 @@ class ListPageSpider(feapder.AirSpider):
     def start_callback(self):
         self._task_coll_name = 'zgzb_list'
         self._proxies = None
+        self._cookies = None  # global cookies
 
     def start_requests(self):
-        self._proxies = self.proxy
+        # self._proxies = self.proxy
+        self.ali_robots_cookies(limit=40)  # obtain the global cookies and the global proxy
+
         task_menus = [
             Menu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg', '招标公告', 'tenderBulletin'),
             Menu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl', '开标记录', 'openBidRecord'),
@@ -97,6 +111,7 @@ class ListPageSpider(feapder.AirSpider):
     def download_midware(self, request):
         request.url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getStringMethod.do'
         request.proxies = self._proxies
+        request.cookies = self._cookies
         request.method = 'POST'
         request.timeout = 5
         request.headers = {
@@ -113,26 +128,37 @@ class ListPageSpider(feapder.AirSpider):
         }
 
     def validate(self, request, response):
-        msg = request.meta['msg']
-        menu = request.menu
-        resp_json = response.json  # check whether the data API request succeeded
-        if not resp_json:
-            logger.info(f'{msg}-无列表数据')
-            return False
-
-        # build the summary message
-        data = resp_json.get("object")
-        total_page = data.get("page").get("totalPage", 0)
-        total_count = data.get("page").get("totalCount", 0)
-        page_no = data.get("page").get("pageNo", 0)
-        row = data.get("page").get("row", 0)
-        items = data.get("returnlist")
-        if page_no < total_page:
-            info = f'{menu.channel}-共{total_page}页-{total_count}条-第{page_no}页-返回{row}条-实际{len(items)}条'
+        if response.is_html:
+            raise AliRobotsCaptchaError
         else:
-            info = f'{menu.channel},超出最大采集页码'
-        logger.info(info)
-        return True
+            msg = request.meta['msg']
+            menu = request.menu
+            resp_json = response.json  # check whether the data API request succeeded
+            if not resp_json:
+                logger.info(f'{msg}-无列表数据')
+                return False
+
+            # build the summary message
+            data = resp_json.get("object")
+            return_list = data.get("returnlist")
+
+            page = data.get("page")
+            total_page = page.get("totalPage", 0)
+            total_count = page.get("totalCount", 0)
+            page_no = page.get("pageNo", 0)
+            row = page.get("row", 0)
+
+            msg = f"{menu.channel},超出最大采集页码"
+            if page_no <= total_page:
+                tips = [
+                    menu.channel,
+                    f'共{total_page}页{total_count}/{len(return_list)}条',
+                    f'第{page_no}页{row}条',
+                ]
+                msg = "-".join(tips)
+
+            logger.info(msg)
+            return True
 
     def exception_request(self, request, response, e):
         msg = request.meta['msg']
@@ -145,6 +171,9 @@ class ListPageSpider(feapder.AirSpider):
             logger.warning(f"{msg}--代理失效,{interval}s后重试...")
             tools.delay_time(interval)
             self._proxies = self.proxy
+        elif isinstance(e, AliRobotsCaptchaError):
+            logger.info("cookies失效,重新获取...")
+            self.ali_robots_cookies(request)
         else:
             logger.error(f"{msg}--请求失败")
             logger.exception(f'异常原因:{e}')
@@ -201,6 +230,32 @@ class ListPageSpider(feapder.AirSpider):
     def end_callback(self):
         logger.info("列表页采集结束")
 
+    def ali_robots_cookies(self, request=None, limit=3):
+        """
+        Pass Aliyun's invisible robot slider verification.
+
+        @param request: request to update with the fresh cookies/proxy (optional)
+        @param limit: maximum number of retries
+        """
+        retries = 0
+        cookies = None
+        proxy = self.proxy
+        while retries < limit:
+            cookies = self.cookie_pool.create_cookies(proxy=proxy.get('http'))
+            if not cookies or len(cookies) <= 4:  # too few cookies: the slider was probably not passed
+                proxy = self.proxy
+                retries += 1
+                continue
+
+            if request is not None:
+                request.cookies = cookies
+                request.proxies = proxy
+
+            break
+
+        self._cookies = cookies
+        self._proxies = proxy
+
 
 if __name__ == '__main__':
     ListPageSpider(thread_count=1).start()
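
The captcha plumbing (AliRobotsCaptchaError, the class-level WebCookiePool, the _cookies bookkeeping, and ali_robots_cookies) is now pasted verbatim into all three spiders. A follow-up could hoist it into a shared mixin; the sketch below is a suggested refactoring under that assumption, not part of this commit, and AliCaptchaMixin is a hypothetical name.

    import feapder

    from cookie_pool import WebCookiePool


    class AliRobotsCaptchaError(Exception):
        pass


    class AliCaptchaMixin:
        """Hypothetical mixin collecting the code duplicated across the spiders."""

        cookie_pool = WebCookiePool(
            redis_key='zgztb.list.cookies',
            page_url='http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do'
        )

        def ali_robots_cookies(self, request=None, limit=3):
            retries = 0
            cookies = None
            proxy = self.proxy  # each spider already exposes a proxy property
            while retries < limit:
                cookies = self.cookie_pool.create_cookies(proxy=proxy.get('http'))
                if not cookies or len(cookies) <= 4:
                    proxy = self.proxy  # too few cookies: rotate the proxy and retry
                    retries += 1
                    continue
                if request is not None:
                    request.cookies = cookies
                    request.proxies = proxy
                break
            self._cookies = cookies
            self._proxies = proxy


    class ListPageSpider(AliCaptchaMixin, feapder.AirSpider):
        pass  # validate()/exception_request()/parse() stay per-spider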