Browse Source

update:更新webdriver驱动配置

dongzhaorui 2 years ago
parent
commit
1df2daa01f

+ 29 - 26
zgztb_cookie/Dockerfile

@@ -1,5 +1,5 @@
 # 拉取镜像
-FROM centos:latest
+FROM centos:centos7.9.2009
 
 # 配置容器时间
 RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
@@ -7,50 +7,53 @@ RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shangh
 # 添加快捷命令
 RUN echo "alias ll='ls -hall'" >> ~/.bashrc && source ~/.bashrc
 
-# 更新yum源
-RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*
-RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
-RUN yum update -y
+# 更新yum源, 并生成缓存
+RUN curl -o /etc/yum.repos.d/CentOS7-Aliyun.repo http://mirrors.aliyun.com/repo/Centos-7.repo && curl -o /etc/yum.repos.d/epel-7-Aliyun.repo http://mirrors.aliyun.com/repo/epel-7.repo
+RUN yum clean all && yum makecache && yum -y update
 
-# 解决bash: service: command not found 错误
-RUN yum list | grep initscripts && yum install initscripts -y
 
 # crontab服务安装和启动
-RUN yum install crontabs -y && service crond start
+# Install cron (cronie provides crond on CentOS 7) and enable the unit.
+# systemd is not running during `docker build`, so `systemctl start` cannot
+# succeed here; the enabled unit is started when the container boots an init.
+RUN yum -y install cronie crontabs && systemctl enable crond.service
+
 
 WORKDIR /opt
 # 安装node
-RUN curl -fsSL https://rpm.nodesource.com/setup_14.x | bash -
-RUN yum -y install nodejs
-# 更换npm源
-RUN npm config set registry https://registry.npm.taobao.org
+# Install Node.js 14 from NodeSource and point npm at a mirror registry.
+# pipefail makes a failed curl abort the build instead of piping an empty script into bash.
+# registry.npm.taobao.org has been retired; registry.npmmirror.com is its replacement.
+RUN set -o pipefail && curl -fsSL https://rpm.nodesource.com/setup_14.x | bash && yum -y install nodejs && npm config set registry https://registry.npmmirror.com
 
 # 安装 python3.8.10 gcc相关配置
-RUN yum --exclude=kernel* update -y && yum groupinstall -y 'Development Tools' && yum install -y gcc openssl-devel bzip2-devel libffi-devel
+RUN yum --exclude=kernel* update -y && yum groupinstall -y 'Development Tools' && yum install -y gcc openssl-devel bzip2-devel libffi-devel wget gtk3 libXt kde-l10n-Chinese glibc-common unzip
 
 # python3.8.10下载与解压缩
 RUN curl -o python3.8.10.tgz https://mirrors.huaweicloud.com/python/3.8.10/Python-3.8.10.tgz && tar -zxvf python3.8.10.tgz
 
-# 编译python3.8.10
-WORKDIR /opt/Python-3.8.10
-# 创建编译安装目录
+# 创建编译安装目录, 配置安装位置
 RUN mkdir /usr/local/python38
-# 配置安装位置
-RUN ./configure --prefix=/usr/local/python38
-RUN make && make install
+WORKDIR /opt/Python-3.8.10
+RUN ./configure --prefix=/usr/local/python38 && make && make install
 
 # 添加python3的软连接
-RUN rm -rf /usr/bin/python3 /usr/bin/pip3
-RUN ln -s /usr/local/python38/bin/python3 /usr/bin/python3 && ln -s /usr/local/python38/bin/pip3.8 /usr/bin/pip3
-
+RUN rm -rf /usr/bin/python3 /usr/bin/pip3 && ln -s /usr/local/python38/bin/python3 /usr/bin/python3 && ln -s /usr/local/python38/bin/pip3.8 /usr/bin/pip3
 # 更换pip源&更新pip
-RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && pip3 install --upgrade pip
+RUN pip3 config set global.index-url https://mirrors.bfsu.edu.cn/pypi/web/simple && pip3 install --upgrade pip
+
+# Global node packages used by the project (each package listed once).
+WORKDIR /usr/lib/node_modules/npm
+RUN npm i crypto@1.0.1 crypto-js@4.1.1 js-md5@0.7.3 jsdom@19.0.0 jsencrypt@3.2.1 node-bignumber@1.2.2 xhr2@0.2.1 -g
+# Export NODE_PATH for login shells; the closing double quote is required,
+# otherwise /etc/profile ends with an unterminated string.
+RUN echo 'export NODE_PATH="/usr/lib/node_modules"' >> /etc/profile
+
+# Selenium / Firefox environment.
+# Switch back to /opt so the downloads land where the symlinks below point.
+WORKDIR /opt
+# Download Firefox, unpack it to /opt/firefox and drop the archive in the same layer.
+RUN wget https://download-installer.cdn.mozilla.net/pub/firefox/releases/78.0/linux-x86_64/zh-CN/firefox-78.0.tar.bz2 && tar -jxvf firefox-78.0.tar.bz2 && rm -f firefox-78.0.tar.bz2 && ln -s /opt/firefox/firefox /usr/bin/firefox
+# Download geckodriver to /opt/geckodriver.
+# NOTE(review): this binary comes from a third-party host without a checksum — consider verifying it.
+RUN wget https://baibai.ink:88/selenium/firefox/geckodriver/0.31.0/geckodriver && chmod +x geckodriver && ln -s /opt/geckodriver /usr/bin/geckodriver
 
 # 指定工作目录
-WORKDIR /app
+WORKDIR /mnt
 
 # 当前目录下 源文件 复制到容器路径 /mnt 下
 COPY . .
 
-# 安装python项目依赖 和  node 项目依赖
-RUN pip3 install -r requirements.txt && npm install
+# 安装python项目依赖 和 node 项目依赖
+RUN pip3 install -r requirements.txt
+RUN npm install

+ 14 - 10
zgztb_cookie/FworkSpider/feapder/utils/webdriver.py

@@ -36,12 +36,12 @@ class WebDriver(RemoteWebDriver):
         proxy=None,
         driver_type=CHROME,
         timeout=10,
+        headless=False,
+        usages_local_driver=False,
         window_size=(1024, 800),
         server_addr=None,
-        custom_argument=None,
         version=None,
-        usages_local_driver=False,
-        headless=False,
+        custom_argument=None,
         executable_path=None,
         service_log_path=None,
         **kwargs
@@ -58,7 +58,7 @@ class WebDriver(RemoteWebDriver):
             window_size: # 窗口大小
             executable_path: 浏览器路径,默认为默认路径
             server_addr: 远程服务地址
-            usages_local_driver: 使用本地驱动
+            usages_local_driver: 是否使用本地驱动
             service_log_path: selenium service 日志路径
             version: 浏览器版本
             **kwargs:
@@ -67,13 +67,13 @@ class WebDriver(RemoteWebDriver):
         self._user_agent = user_agent or DEFAULT_USERAGENT
         self._proxy = proxy
         self._headless = headless
+        self._usages_local_driver = usages_local_driver
         self._timeout = timeout
         self._window_size = window_size
-        self._server_addr = server_addr or WEBDRIVER["server_addr"]
+        self._executable_path = executable_path
         self._custom_argument = custom_argument
+        self._server_addr = server_addr or WEBDRIVER["server_addr"]
         self._version = version or WEBDRIVER["version"]
-        self._executable_path = executable_path
-        self._usages_local_driver = usages_local_driver
         self._service_log_path = service_log_path or WEBDRIVER["service_log_path"]
 
         if driver_type == WebDriver.CHROME:
@@ -84,7 +84,7 @@ class WebDriver(RemoteWebDriver):
 
         else:
             raise TypeError(
-                "dirver_type must be one of CHROME or PHANTOMJS or FIREFOX, but received {}".format(
+                "dirver_type must be one of CHROME or FIREFOX, but received {}".format(
                     type(driver_type)
                 )
             )
@@ -224,6 +224,7 @@ class WebDriver(RemoteWebDriver):
         # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
         chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
         chrome_options.add_experimental_option("useAutomationExtension", False)
+        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
         # docker 里运行需要
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-gpu")
@@ -277,7 +278,6 @@ class WebDriver(RemoteWebDriver):
                 'params': {'source': js}
             }
             res = browser.execute("executeCdpCommand", params)['value']
-
         return browser
 
     def local_chrome_driver(self):
@@ -285,6 +285,7 @@ class WebDriver(RemoteWebDriver):
         # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium
         chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
         chrome_options.add_experimental_option("useAutomationExtension", False)
+        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
         # docker 里运行需要
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-gpu")
@@ -341,7 +342,7 @@ class WebDriver(RemoteWebDriver):
         # 隐藏浏览器特征
         with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
             js = f.read()
-        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
+            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
 
         return driver
 
@@ -376,6 +377,9 @@ class WebDriver(RemoteWebDriver):
         else:
             raise AttributeError
 
+    def quit(self):
+        """Quit the driver object returned by ``self.get_driver()``."""
+        self.get_driver().quit()
+
     # def __del__(self):
     #     if self.driver:
     #         self.driver.quit()

+ 2 - 4
zgztb_cookie/FworkSpider/setting.py

@@ -39,11 +39,11 @@ WEBDRIVER = dict(
     load_images=False,  # 是否加载图片
     user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
     headless=False,  # 是否为无头浏览器
+    usages_local_driver=True,  # 是否使用本地驱动,默认启动本地驱动
     proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
-    driver_type="CHROME",  # CHROME、FIREFOX
+    driver_type="FIREFOX",  # CHROME、FIREFOX
     timeout=30,  # 请求超时时间
     executable_path=None,  # 浏览器路径,默认为默认路径
-    usages_local_driver=False,  # 是否使用本地驱动,默认启动本地驱动
     window_size=(1280, 800),  # 窗口大小
     render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
@@ -77,8 +77,6 @@ LOG_BACKUP_COUNT = 20  # 日志文件保留数量
 LOG_ENCODING = "utf8"  # 日志文件编码
 OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
 
-author = {}
-
 JIANYU_PROXY_URL = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
 JIANYU_PROXY_AUTHOR = 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'
 JIANYU_SPLASH_URL = "http://59.110.6.43:8998/render.json"

+ 1 - 0
zgztb_cookie/README.md

@@ -1,6 +1,7 @@
 #### 中国招标投标公共服务平台
 
 #### docker 构建和启动容器
+    $ docker build -t centos7_zgzb:v1.0 .
     $ docker-compose --compatibility up -d
 
 ##### 快照页执行脚本

+ 5 - 7
zgztb_cookie/cookie_pool.py

@@ -1,11 +1,12 @@
 import sys
-sys.path.append('/app/FworkSpider')
+sys.path.append('/mnt/FworkSpider')
 
 from selenium.webdriver import ActionChains
 
 from feapder.utils.webdriver import WebDriver
 from feapder.utils.log import log
 from feapder.network.cookie_pool import PageCookiePool
+from feapder.setting import WEBDRIVER
 
 
 class WebCookiePool(PageCookiePool):
@@ -14,13 +15,10 @@ class WebCookiePool(PageCookiePool):
 
         self.page_url = page_url
         self.cookie_key = cookie_key
-        self._kwargs = kwargs
-        self._kwargs.setdefault("load_images", False)
-        self._kwargs.setdefault("headless", False)
-        self._kwargs.setdefault("driver_type", "CHROME")
+        # Copy the settings so per-instance changes (e.g. the proxy written in
+        # create_cookies) do not mutate the shared module-level WEBDRIVER dict.
+        self._kwargs = dict(WEBDRIVER)
 
     def create_cookies(self, proxy=None):
-        self._kwargs.setdefault("proxy", proxy)
+        self._kwargs["proxy"] = proxy
         with WebDriver(**self._kwargs) as driver_pool:
             import time
             try:
@@ -36,7 +34,7 @@ class WebCookiePool(PageCookiePool):
                             # 点击并且不松开鼠标
                             ActionChains(driver_pool).click_and_hold(on_element=slider).perform()
                             # 往右边移动258个位置
-                            ActionChains(driver_pool).move_by_offset(xoffset=258, yoffset=0).perform()
+                            ActionChains(driver_pool).move_by_offset(xoffset=252, yoffset=0).perform()
                             # 松开鼠标
                             ActionChains(driver_pool).pause(1).release().perform()
                     except Exception as e:

+ 1 - 1
zgztb_cookie/detail_firefox.py

@@ -7,7 +7,7 @@ Created on 2021-12-13 13:25:15
 @author: 马国鹏
 """
 import sys
-sys.path.append('/app/FworkSpider')
+sys.path.append('/mnt/FworkSpider')
 
 import time
 

+ 1 - 1
zgztb_cookie/detail_normol.py

@@ -7,7 +7,7 @@ Created on 2021-12-13 13:25:15
 @author: 马国鹏
 """
 import sys
-sys.path.append('/app/FworkSpider')
+sys.path.append('/mnt/FworkSpider')
 
 import re
 import time

+ 4 - 4
zgztb_cookie/docker-compose.yml

@@ -1,12 +1,12 @@
 version: '3.3'
 services:
-  py:
+  worker:
     container_name: cebpubservice
-    build: .
+    image: centos7_zgzb:v1.0
     restart: always
     shm_size: '2gb'
     volumes:
-      - /mnt/zgztb_cookie:/app
+      - /mnt/zgztb_cookie:/mnt
     command:
       - /sbin/init
     privileged: true
@@ -19,6 +19,6 @@ services:
     deploy:
      resources:
         limits:
-           memory: 3G
+           memory: 4G
         reservations:
            memory: 200M

+ 20 - 20
zgztb_cookie/start.sh

@@ -12,7 +12,7 @@ function check_time() {
     fi
 }
 
-cd /app
+cd /mnt
 
 for pid in $(ps -ef | grep -v grep | grep detail | awk '{print $2}'); do
     echo $pid
@@ -20,24 +20,24 @@ for pid in $(ps -ef | grep -v grep | grep detail | awk '{print $2}'); do
     echo "关闭中国招标投标公共服务平台快照页爬虫"
 done
 
+#
+#echo "开始关闭自动化服务进程"
+#for pid in $(ps -ef | grep -v grep | grep geckodriver | awk '{print $2}'); do
+#    echo $pid
+#    check_time $pid
+#    echo "关闭驱动"
+#done
+#
+#for pid in $(ps -ef | grep -v grep | grep firefox | awk '{print $2}'); do
+#    echo $pid
+#    check_time $pid
+#    echo "关闭浏览器"
+#done
+#
+#echo "rest启动服务"
+#rm -rf /tmp/*
+
 sleep 10
-flock -xn /app/detail_normol.py -c 'nohup python3 detail_normol.py >/dev/null &'
+flock -xn /mnt/detail_normol.py -c 'nohup python3 detail_normol.py >/dev/null &'
 sleep 5
-flock -xn /app/detail_firefox.py -c 'nohup python3 detail_firefox.py >/dev/null &'
-
-echo "开始关闭自动化服务进程"
-for pid in $(ps -ef | grep -v grep | grep geckodriver | awk '{print $2}'); do
-    echo $pid
-    check_time $pid
-    echo "关闭驱动"
-done
-
-for pid in $(ps -ef | grep -v grep | grep firefox | awk '{print $2}'); do
-    echo $pid
-    check_time $pid
-    echo "关闭浏览器"
-done
-
-echo "rest启动服务"
-cd /tmp
-rm -rf *
+flock -xn /mnt/detail_firefox.py -c 'nohup python3 detail_firefox.py >/dev/null &'

+ 1 - 1
zgztb_cookie/zgzbtb_spider.py

@@ -4,7 +4,7 @@
 #  @Author  : 马国鹏
 #  @File    : qgzb_spider.py
 import sys
-sys.path.append('/app/FworkSpider')
+sys.path.append('/mnt/FworkSpider')
 
 import datetime
 import time