浏览代码

Merge branch 'master' of http://192.168.3.207:8080/data_processing/crawlab_feader

lizongze 2 年之前
父节点
当前提交
7d8f721e24
共有 30 个文件被更改,包括 7274 次插入0 次删除
  1. 141 0
      A数据处理/site_monitor/.gitignore
  2. 19 0
      A数据处理/site_monitor/README.md
  3. 9 0
      A数据处理/site_monitor/db/__init__.py
  4. 422 0
      A数据处理/site_monitor/db/mongodb.py
  5. 924 0
      A数据处理/site_monitor/db/redisdb.py
  6. 35 0
      A数据处理/site_monitor/docker/Dockerfile
  7. 17 0
      A数据处理/site_monitor/docker/docker-compose.yml
  8. 218 0
      A数据处理/site_monitor/monitor.py
  9. 8 0
      A数据处理/site_monitor/network/__init__.py
  10. 3 0
      A数据处理/site_monitor/network/downloader/__init__.py
  11. 104 0
      A数据处理/site_monitor/network/downloader/_playwright.py
  12. 46 0
      A数据处理/site_monitor/network/downloader/_requests.py
  13. 41 0
      A数据处理/site_monitor/network/downloader/base.py
  14. 32 0
      A数据处理/site_monitor/network/proxy_file/de9f83d546a39eca6979d2a6dca3407a.txt
  15. 746 0
      A数据处理/site_monitor/network/proxy_pool.py
  16. 524 0
      A数据处理/site_monitor/network/request.py
  17. 396 0
      A数据处理/site_monitor/network/response.py
  18. 389 0
      A数据处理/site_monitor/network/user_agent.py
  19. 14 0
      A数据处理/site_monitor/requirements.txt
  20. 65 0
      A数据处理/site_monitor/setting.py
  21. 8 0
      A数据处理/site_monitor/utils/__init__.py
  22. 147 0
      A数据处理/site_monitor/utils/clean_html.py
  23. 0 0
      A数据处理/site_monitor/utils/js/intercept.js
  24. 6 0
      A数据处理/site_monitor/utils/js/stealth.min.js
  25. 14 0
      A数据处理/site_monitor/utils/log.py
  26. 2438 0
      A数据处理/site_monitor/utils/tools.py
  27. 12 0
      A数据处理/site_monitor/utils/webdriver/__init__.py
  28. 300 0
      A数据处理/site_monitor/utils/webdriver/playwright_driver.py
  29. 81 0
      A数据处理/site_monitor/utils/webdriver/webdirver.py
  30. 115 0
      A数据处理/site_monitor/utils/webdriver/webdriver_pool.py

+ 141 - 0
A数据处理/site_monitor/.gitignore

@@ -0,0 +1,141 @@
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+.idea

+ 19 - 0
A数据处理/site_monitor/README.md

@@ -0,0 +1,19 @@
+# 原网站监控
+
+#### 构建镜像
+```shell
+$ cd site_monitor
+$ docker build -t site_monitor:v1.0 -f docker/Dockerfile .
+```
+
+#### 创建容器
+```shell
+$ cd site_monitor
+$ docker-compose -f docker/docker-compose.yml up -d
+```
+
+#### 关闭容器
+```shell
+$ cd site_monitor
+$ docker-compose -f docker/docker-compose.yml down
+```

+ 9 - 0
A数据处理/site_monitor/db/__init__.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/23 12:09 AM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""

+ 422 - 0
A数据处理/site_monitor/db/mongodb.py

@@ -0,0 +1,422 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-04-18 14:12:21
+---------
+@summary: 操作mongo数据库
+---------
+@author: Mkdir700
+@email:  mkdir700@gmail.com
+"""
+import re
+from typing import List, Dict, Optional
+from urllib import parse
+
+import pymongo
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from pymongo.database import Database
+from pymongo.errors import DuplicateKeyError, BulkWriteError
+
+import setting as setting
+from utils.log import logger as log
+
+
class MongoDB:
    """MongoDB access helper built on pymongo.

    Wraps find/insert/update/delete plus index utilities. On unique-index
    conflicts the insert helpers can replace the old document, update selected
    columns, or ignore the conflict; the conflicting index fields are parsed
    back out of the server's duplicate-key error message.
    """

    def __init__(
        self,
        ip=None,
        port=None,
        db=None,
        user_name=None,
        user_pass=None,
        url=None,
        **kwargs,
    ):
        """Create a client from ``url`` or from discrete connection settings.

        Any unset argument falls back to the corresponding MONGO_* value in
        the project ``setting`` module.
        """
        if url:
            self.client = MongoClient(url, **kwargs)
        else:
            if not ip:
                ip = setting.MONGO_IP
            if not port:
                port = setting.MONGO_PORT
            if not db:
                db = setting.MONGO_DB
            if not user_name:
                user_name = setting.MONGO_USER_NAME
            if not user_pass:
                user_pass = setting.MONGO_USER_PASS
            self.client = MongoClient(
                host=ip, port=port, username=user_name, password=user_pass
            )

        self.db = self.get_database(db)

        # Lazily filled cache of "collection:index_name" -> index key fields;
        # avoids one index_information() round trip per duplicate-key conflict.
        self.__index__cached = {}

    @classmethod
    def from_url(cls, url, **kwargs):
        """Alternate constructor from a full MongoDB connection url.

        Args:
            url: mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
                 See http://mongodb.github.io/mongo-java-driver/3.4/javadoc/com/mongodb/MongoClientURI.html

        Raises:
            Exception: if the url scheme is not ``mongodb``.
        """
        url_parsed = parse.urlparse(url)

        db_type = url_parsed.scheme.strip()
        if db_type != "mongodb":
            raise Exception(
                "url error, expect mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]], but get {}".format(
                    url
                )
            )

        return cls(url=url, **kwargs)

    def get_database(self, database, **kwargs) -> Database:
        """Return the Database object named ``database``."""
        return self.client.get_database(database, **kwargs)

    def get_collection(self, coll_name, **kwargs) -> Collection:
        """Return the Collection object named ``coll_name``."""
        return self.db.get_collection(coll_name, **kwargs)

    def find(
        self, coll_name: str, condition: Optional[Dict] = None, limit: int = 0, **kwargs
    ) -> List[Dict]:
        """Run a ``find`` command and drain its cursor into a list.

        Args:
            coll_name: collection (table) name.
            condition: filter document; None means match all.
            limit: max number of documents; 0 means no limit.
            **kwargs: extra ``find`` command fields, see
                https://docs.mongodb.com/manual/reference/command/find/#command-fields

        Returns:
            [] when nothing matches, else [{'_id': 'xx', ...}, ...].
        """
        condition = {} if condition is None else condition
        command = {"find": coll_name, "filter": condition, "limit": limit}
        command.update(kwargs)
        result = self.run_command(command)
        cursor = result["cursor"]
        cursor_id = cursor["id"]
        dataset = cursor["firstBatch"]
        # A cursor id of 0 means the server has no more batches; otherwise keep
        # issuing getMore until the cursor is exhausted.
        while True:
            if cursor_id == 0:
                break
            result = self.run_command(
                {
                    "getMore": cursor_id,
                    "collection": coll_name,
                    "batchSize": kwargs.get("batchSize", 100),
                }
            )
            cursor = result["cursor"]
            cursor_id = cursor["id"]
            dataset.extend(cursor["nextBatch"])
        return dataset

    def add(
        self,
        coll_name,
        data: Dict,
        replace=False,
        update_columns=(),
        update_columns_value=(),
        insert_ignore=False,
    ):
        """Insert a single document, resolving unique-index conflicts.

        Args:
            coll_name: collection name.
            data: the document to insert.
            replace: on conflict, replace the stored document with ``data``.
            update_columns: on conflict, update only these fields,
                e.g. update_columns = ["name", "title"].
            update_columns_value: values to write into ``update_columns``;
                when empty the values are taken from ``data`` itself.
            insert_ignore: on conflict, silently ignore it.

        Returns:
            1 — NOTE: returned even when the conflict path updated/ignored
            instead of inserting; callers treat any resolution as success.

        Raises:
            DuplicateKeyError: on conflict when no resolution option is set.
        """
        affect_count = 1
        collection = self.get_collection(coll_name)
        try:
            collection.insert_one(data)
        except DuplicateKeyError as e:
            # Document already exists: resolve per the chosen strategy.
            if update_columns:
                if not isinstance(update_columns, (tuple, list)):
                    update_columns = [update_columns]

                condition = self.__get_update_condition(
                    coll_name, data, e.details.get("errmsg")
                )

                if update_columns_value:
                    # Update with the explicitly supplied values.
                    doc = {
                        key: value
                        for key, value in zip(update_columns, update_columns_value)
                    }
                else:
                    # Update with the values carried by ``data``.
                    doc = {key: data[key] for key in update_columns}

                collection.update_one(condition, {"$set": doc})

            elif replace:
                condition = self.__get_update_condition(
                    coll_name, data, e.details.get("errmsg")
                )
                # Overwrite the stored document entirely.
                collection.replace_one(condition, data)

            elif not insert_ignore:
                raise e

        return affect_count

    def add_batch(
        self,
        coll_name: str,
        datas: List[Dict],
        replace=False,
        update_columns=(),
        update_columns_value=(),
        condition_fields: dict = None,
    ):
        """Insert many documents, resolving per-document index conflicts.

        Args:
            coll_name: collection name.
            datas: documents, e.g. [{'_id': 'xx'}, ...].
            replace: on conflict, replace the stored document.
            update_columns: on conflict, update only these fields,
                e.g. update_columns = ["name", "title"].
            update_columns_value: values for ``update_columns``; when empty
                the conflicting document's own values are used.
            condition_fields: fields used to locate the stored document; when
                None the unique-index fields from the error message are used.

        Returns:
            Number of documents actually inserted (updates not counted).
        """
        add_count = 0

        if not datas:
            return add_count

        collection = self.get_collection(coll_name)
        if not isinstance(update_columns, (tuple, list)):
            update_columns = [update_columns]

        try:
            add_count = len(datas)
            # ordered=False keeps inserting past individual failures.
            collection.insert_many(datas, ordered=False)
        except BulkWriteError as e:
            write_errors = e.details.get("writeErrors")
            for error in write_errors:
                if error.get("code") == 11000:
                    # Duplicate key: ``op`` holds the offending document.
                    data = error.get("op")

                    def get_condition():
                        # Build the filter used to locate the stored document.
                        if condition_fields:
                            condition = {
                                condition_field: data[condition_field]
                                for condition_field in condition_fields
                            }
                        else:
                            # Derive it from the duplicate-key error message.
                            condition = self.__get_update_condition(
                                coll_name, data, error.get("errmsg")
                            )

                        return condition

                    if update_columns:
                        if update_columns_value:
                            # Update with the explicitly supplied values.
                            doc = {
                                key: value
                                for key, value in zip(
                                    update_columns, update_columns_value
                                )
                            }
                        else:
                            # Update with the conflicting document's values.
                            doc = {key: data.get(key) for key in update_columns}

                        collection.update_one(get_condition(), {"$set": doc})
                        add_count -= 1

                    elif replace:
                        # Overwrite the stored document entirely.
                        collection.replace_one(get_condition(), data)
                        add_count -= 1

                    else:
                        # Conflict ignored; just don't count it as inserted.
                        add_count -= 1

        return add_count

    def count(self, coll_name, condition: Optional[Dict], limit=0, **kwargs):
        """Count documents matching ``condition``.

        Args:
            coll_name: collection name.
            condition: filter document; None means match all.
            limit: cap on the count; 0 means no cap.
            **kwargs: extra ``count`` command fields (skip, hint, collation,
                readConcern, comment), see
                https://docs.mongodb.com/manual/reference/command/count/#mongodb-dbcommand-dbcmd.count

        Returns:
            The number of matching documents.
        """
        # Normalize None to {} — the count command rejects a null query
        # (find() already does the same normalization).
        condition = {} if condition is None else condition
        command = {"count": coll_name, "query": condition, "limit": limit, **kwargs}
        result = self.run_command(command)
        return result["n"]

    def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
        """Apply ``$set: data`` to the first document matching ``condition``.

        Args:
            coll_name: collection name.
            data: fields to set, e.g. {"xxx": "xxx"}.
            condition: filter, e.g. {"_id": "xxxx"}.
            upsert: insert when no document matches.

        Returns:
            True on success, False on error (the error is logged).
        """
        try:
            collection = self.get_collection(coll_name)
            collection.update_one(condition, {"$set": data}, upsert=upsert)
        except Exception as e:
            log.error(
                """
                error:{}
                condition: {}
            """.format(
                    e, condition
                )
            )
            return False
        else:
            return True

    def delete(self, coll_name, condition: Dict) -> bool:
        """Delete the first document matching ``condition``.

        Args:
            coll_name: collection name.
            condition: filter document.

        Returns:
            True on success, False on error (the error is logged).
        """
        try:
            collection = self.get_collection(coll_name)
            collection.delete_one(condition)
        except Exception as e:
            log.error(
                """
                error:{}
                condition: {}
            """.format(
                    e, condition
                )
            )
            return False
        else:
            return True

    def run_command(self, command: Dict):
        """Run a raw database command.

        Reference: https://www.geek-book.com/src/docs/mongodb/mongodb/docs.mongodb.com/manual/reference/command/index.html
        """
        return self.db.command(command)

    def create_index(self, coll_name, keys, unique=True):
        """Create an ascending (optionally unique) index over ``keys``."""
        collection = self.get_collection(coll_name)
        _keys = [(key, pymongo.ASCENDING) for key in keys]
        collection.create_index(_keys, unique=unique)

    def get_index(self, coll_name):
        """Return index_information() for the collection."""
        return self.get_collection(coll_name).index_information()

    def drop_collection(self, coll_name):
        """Drop the whole collection."""
        return self.db.drop_collection(coll_name)

    def get_index_key(self, coll_name, index_name):
        """Return the field names participating in ``index_name``.

        Results are memoized in ``self.__index__cached``.

        Raises:
            Exception: when the index does not exist on the collection.
        """
        cache_key = f"{coll_name}:{index_name}"

        if cache_key in self.__index__cached:
            return self.__index__cached.get(cache_key)

        index = self.get_index(coll_name)
        index_detail = index.get(index_name)
        if not index_detail:
            errmsg = f"not found index {index_name} in collection {coll_name}"
            raise Exception(errmsg)

        index_keys = [val[0] for val in index_detail.get("key")]
        self.__index__cached[cache_key] = index_keys
        return index_keys

    def __get_update_condition(
        self, coll_name: str, data: dict, duplicate_errmsg: str
    ) -> dict:
        """Build an update filter from a duplicate-key error message.

        Args:
            duplicate_errmsg: e.g. "E11000 duplicate key error collection:
                feapder.test index: a_1_b_1 dup key: { : 1, : \"你好\" }"
            data: the document that conflicted, e.g.
                {"a": 1, "b": "你好", "c": "嘻嘻"}

        Returns:
            The index fields with their values from ``data``,
            e.g. {"a": 1, "b": "你好"}.
        """
        index_name = re.search(r"index: (\w+)", duplicate_errmsg).group(1)
        index_keys = self.get_index_key(coll_name, index_name)

        condition = {key: data.get(key) for key in index_keys}
        return condition

    def __getattr__(self, name):
        # Fall through to the underlying Database for anything not wrapped.
        return getattr(self.db, name)

+ 924 - 0
A数据处理/site_monitor/db/redisdb.py

@@ -0,0 +1,924 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2016-11-16 16:25
+---------
+@summary: 操作redis数据库
+---------
+@author: Boris
+"""
+
+import time
+
+import redis
+from redis._compat import unicode, long, basestring
+from redis.connection import Encoder as _Encoder
+from redis.exceptions import ConnectionError, TimeoutError
+from redis.exceptions import DataError
+from redis.sentinel import Sentinel
+from rediscluster import RedisCluster
+
+import setting as setting
+from utils.log import logger as log
+
+
class Encoder(_Encoder):
    """redis-py Encoder that additionally accepts list/dict/tuple values.

    The stock Encoder raises DataError for container types; this subclass
    renders them via ``unicode()`` (``str`` on py3) so their literal
    representation can be stored.
    """

    def encode(self, value):
        "Return a bytestring or bytes-like representation of the value"
        if isinstance(value, (bytes, memoryview)):
            return value
        # NOTE: unlike upstream, bool is NOT rejected here; being an int
        # subclass it falls into the int branch and is stored as "True"/"False".
        elif isinstance(value, float):
            value = repr(value).encode()
        elif isinstance(value, (int, long)):
            # python 2 repr() on longs is '123L', so use str() instead
            value = str(value).encode()
        elif isinstance(value, (list, dict, tuple)):
            # Extension point: stringify containers instead of raising.
            value = unicode(value)
        elif not isinstance(value, basestring):
            # a value we don't know how to deal with. throw an error
            typename = type(value).__name__
            raise DataError(
                "Invalid input of type: '%s'. Convert to a "
                "bytes, string, int or float first." % typename
            )
        if isinstance(value, unicode):
            value = value.encode(self.encoding, self.encoding_errors)
        return value


# Monkey-patch the module-level Encoder so every connection created by
# redis-py uses the container-friendly encoding above.
redis.connection.Encoder = Encoder
+
+
+class RedisDB:
+    def __init__(
+        self,
+        ip_ports=None,
+        db=None,
+        user_pass=None,
+        url=None,
+        decode_responses=True,
+        service_name=None,
+        max_connections=1000,
+        **kwargs,
+    ):
+        """
+        redis的封装
+        Args:
+            ip_ports: ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
+            db:
+            user_pass:
+            url:
+            decode_responses:
+            service_name: 适用于redis哨兵模式
+            max_connections: 同一个redis对象使用的并发数(连接池的最大连接数),超过这个数量会抛出redis.ConnectionError
+        """
+
+        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
+        if ip_ports is None:
+            ip_ports = setting.REDISDB_IP_PORTS
+        if db is None:
+            db = setting.REDISDB_DB
+        if user_pass is None:
+            user_pass = setting.REDISDB_USER_PASS
+        if service_name is None:
+            service_name = setting.REDISDB_SERVICE_NAME
+
+        self._is_redis_cluster = False
+
+        self.__redis = None
+        self._url = url
+        self._ip_ports = ip_ports
+        self._db = db
+        self._user_pass = user_pass
+        self._decode_responses = decode_responses
+        self._service_name = service_name
+        self._max_connections = max_connections
+        self._kwargs = kwargs
+        self.get_connect()
+
+    def __repr__(self):
+        if self._url:
+            return "<Redisdb url:{}>".format(self._url)
+
+        return "<Redisdb ip_ports: {} db:{} user_pass:{}>".format(
+            self._ip_ports, self._db, self._user_pass
+        )
+
+    @property
+    def _redis(self):
+        try:
+            if not self.__redis.ping():
+                raise ConnectionError("unable to connect to redis")
+        except:
+            self._reconnect()
+
+        return self.__redis
+
+    @_redis.setter
+    def _redis(self, val):
+        self.__redis = val
+
+    def get_connect(self):
+        # 获取数据库连接
+        try:
+            if not self._url:
+                if not self._ip_ports:
+                    raise ConnectionError("未设置 redis 连接信息")
+
+                ip_ports = (
+                    self._ip_ports
+                    if isinstance(self._ip_ports, list)
+                    else self._ip_ports.split(",")
+                )
+                if len(ip_ports) > 1:
+                    startup_nodes = []
+                    for ip_port in ip_ports:
+                        ip, port = ip_port.split(":")
+                        startup_nodes.append({"host": ip, "port": port})
+
+                    if self._service_name:
+                        # log.debug("使用redis哨兵模式")
+                        hosts = [(node["host"], node["port"]) for node in startup_nodes]
+                        sentinel = Sentinel(hosts, socket_timeout=3, **self._kwargs)
+                        self._redis = sentinel.master_for(
+                            self._service_name,
+                            password=self._user_pass,
+                            db=self._db,
+                            redis_class=redis.StrictRedis,
+                            decode_responses=self._decode_responses,
+                            max_connections=self._max_connections,
+                            **self._kwargs,
+                        )
+
+                    else:
+                        # log.debug("使用redis集群模式")
+                        self._redis = RedisCluster(
+                            startup_nodes=startup_nodes,
+                            decode_responses=self._decode_responses,
+                            password=self._user_pass,
+                            max_connections=self._max_connections,
+                            **self._kwargs,
+                        )
+
+                    self._is_redis_cluster = True
+                else:
+                    ip, port = ip_ports[0].split(":")
+                    self._redis = redis.StrictRedis(
+                        host=ip,
+                        port=port,
+                        db=self._db,
+                        password=self._user_pass,
+                        decode_responses=self._decode_responses,
+                        max_connections=self._max_connections,
+                        **self._kwargs,
+                    )
+                    self._is_redis_cluster = False
+            else:
+                self._redis = redis.StrictRedis.from_url(
+                    self._url, decode_responses=self._decode_responses
+                )
+                self._is_redis_cluster = False
+
+        except Exception as e:
+            raise e
+
+        # 不要写成self._redis.ping() 否则循环调用了
+        return self.__redis.ping()
+
+    @classmethod
+    def from_url(cls, url):
+        """
+
+        Args:
+            url: redis://[[username]:[password]]@[host]:[port]/[db]
+
+        Returns:
+
+        """
+        return cls(url=url)
+
+    def sadd(self, table, values):
+        """
+        @summary: 使用无序set集合存储数据, 去重
+        ---------
+        @param table:
+        @param values: 值; 支持list 或 单个值
+        ---------
+        @result: 若库中存在 返回0,否则入库,返回1。 批量添加返回None
+        """
+
+        if isinstance(values, list):
+            pipe = self._redis.pipeline()
+
+            if not self._is_redis_cluster:
+                pipe.multi()
+            for value in values:
+                pipe.sadd(table, value)
+            pipe.execute()
+
+        else:
+            return self._redis.sadd(table, values)
+
+    def sget(self, table, count=1, is_pop=True):
+        """
+        返回 list 如 ['1'] 或 []
+        @param table:
+        @param count:
+        @param is_pop:
+        @return:
+        """
+
+        datas = []
+        if is_pop:
+            count = count if count <= self.sget_count(table) else self.sget_count(table)
+            if count:
+                if count > 1:
+                    pipe = self._redis.pipeline()
+
+                    if not self._is_redis_cluster:
+                        pipe.multi()
+                    while count:
+                        pipe.spop(table)
+                        count -= 1
+                    datas = pipe.execute()
+
+                else:
+                    datas.append(self._redis.spop(table))
+
+        else:
+            datas = self._redis.srandmember(table, count)
+
+        return datas
+
+    def srem(self, table, values):
+        """
+        @summary: 移除集合中的指定元素
+        ---------
+        @param table:
+        @param values: 一个或者列表
+        ---------
+        @result:
+        """
+
+        if isinstance(values, list):
+            pipe = self._redis.pipeline()
+
+            if not self._is_redis_cluster:
+                pipe.multi()
+            for value in values:
+                pipe.srem(table, value)
+            pipe.execute()
+        else:
+            self._redis.srem(table, values)
+
+    def sget_count(self, table):
+        return self._redis.scard(table)
+
+    def sdelete(self, table):
+        """
+        @summary: 删除set集合的大键(数据量大的表)
+        删除大set键,使用sscan命令,每次扫描集合中500个元素,再用srem命令每次删除一个键
+        若直接用delete命令,会导致Redis阻塞,出现故障切换和应用程序崩溃的故障。
+        ---------
+        @param table:
+        ---------
+        @result:
+        """
+
+        # 当 SCAN 命令的游标参数被设置为 0 时, 服务器将开始一次新的迭代, 而当服务器向用户返回值为 0 的游标时, 表示迭代已结束
+        cursor = "0"
+        while cursor != 0:
+            cursor, data = self._redis.sscan(table, cursor=cursor, count=500)
+            for item in data:
+                # pipe.srem(table, item)
+                self._redis.srem(table, item)
+
+            # pipe.execute()
+
+    def sismember(self, table, key):
+        "Return a boolean indicating if ``value`` is a member of set ``name``"
+        return self._redis.sismember(table, key)
+
+    def zadd(self, table, values, prioritys=0):
+        """
+        @summary: 使用有序set集合存储数据, 去重(值存在更新)
+        ---------
+        @param table:
+        @param values: 值; 支持list 或 单个值
+        @param prioritys: 优先级; double类型,支持list 或 单个值。 根据此字段的值来排序, 值越小越优先。 可不传值,默认value的优先级为0
+        ---------
+        @result:若库中存在 返回0,否则入库,返回1。 批量添加返回 [0, 1 ...]
+        """
+        if isinstance(values, list):
+            if not isinstance(prioritys, list):
+                prioritys = [prioritys] * len(values)
+            else:
+                assert len(values) == len(prioritys), "values值要与prioritys值一一对应"
+
+            pipe = self._redis.pipeline()
+
+            if not self._is_redis_cluster:
+                pipe.multi()
+            for value, priority in zip(values, prioritys):
+                pipe.execute_command(
+                    "ZADD", table, priority, value
+                )  # 为了兼容2.x与3.x版本的redis
+            return pipe.execute()
+
+        else:
+            return self._redis.execute_command(
+                "ZADD", table, prioritys, values
+            )  # 为了兼容2.x与3.x版本的redis
+
+    def zget(self, table, count=1, is_pop=True):
+        """
+        @summary: 从有序set集合中获取数据 优先返回分数小的(优先级高的)
+        ---------
+        @param table:
+        @param count: 数量 -1 返回全部数据
+        @param is_pop:获取数据后,是否在原set集合中删除,默认是
+        ---------
+        @result: 列表
+        """
+
+        start_pos = 0  # 包含
+        end_pos = count - 1 if count > 0 else count
+
+        pipe = self._redis.pipeline()
+
+        if not self._is_redis_cluster:
+            pipe.multi()  # 标记事务的开始 参考 http://www.runoob.com/redis/redis-transactions.html
+        pipe.zrange(table, start_pos, end_pos)  # 取值
+        if is_pop:
+            pipe.zremrangebyrank(table, start_pos, end_pos)  # 删除
+        results, *count = pipe.execute()
+        return results
+
+    def zremrangebyscore(self, table, priority_min, priority_max):
+        """
+        根据分数移除成员 闭区间
+        @param table:
+        @param priority_min:
+        @param priority_max:
+        @return: 被移除的成员个数
+        """
+        return self._redis.zremrangebyscore(table, priority_min, priority_max)
+
+    def zrangebyscore(self, table, priority_min, priority_max, count=None, is_pop=True):
+        """
+        @summary: 返回指定分数区间的数据 闭区间
+        ---------
+        @param table:
+        @param priority_min: 优先级越小越优先
+        @param priority_max:
+        @param count: 获取的数量,为空则表示分数区间内的全部数据
+        @param is_pop: 是否删除
+        ---------
+        @result:
+        """
+
+        # 使用lua脚本, 保证操作的原子性
+        lua = """
+            -- local key = KEYS[1]
+            local min_score = ARGV[2]
+            local max_score = ARGV[3]
+            local is_pop = ARGV[4]
+            local count = ARGV[5]
+
+            -- 取值
+            local datas = nil
+            if count then
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
+            else
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
+            end
+
+            -- 删除redis中刚取到的值
+            if (is_pop=='True' or is_pop=='1') then
+                for i=1, #datas do
+                    redis.call('zrem', KEYS[1], datas[i])
+                end
+            end
+
+
+            return datas
+
+        """
+        cmd = self._redis.register_script(lua)
+        if count:
+            res = cmd(
+                keys=[table], args=[table, priority_min, priority_max, is_pop, count]
+            )
+        else:
+            res = cmd(keys=[table], args=[table, priority_min, priority_max, is_pop])
+
+        return res
+
+    def zrangebyscore_increase_score(
+        self, table, priority_min, priority_max, increase_score, count=None
+    ):
+        """
+        @summary: Return members whose score lies in the closed interval
+                  [priority_min, priority_max] and, in the same atomic step,
+                  add increase_score to each returned member's score.
+        ---------
+        @param table: sorted-set key
+        @param priority_min: minimum score (inclusive)
+        @param priority_max: maximum score (inclusive)
+        @param increase_score: score delta; positive raises, negative lowers the score
+        @param count: how many members to fetch; None means every member in range
+        ---------
+        @result: list of members (their scores have already been shifted)
+        """
+
+        # A Lua script keeps the read + score update atomic on the server side.
+        lua = """
+            -- local key = KEYS[1]
+            local min_score = ARGV[1]
+            local max_score = ARGV[2]
+            local increase_score = ARGV[3]
+            local count = ARGV[4]
+
+            -- 取值
+            local datas = nil
+            if count then
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
+            else
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
+            end
+
+            --修改优先级
+            for i=1, #datas do
+                redis.call('zincrby', KEYS[1], increase_score, datas[i])
+            end
+
+            return datas
+
+        """
+        cmd = self._redis.register_script(lua)
+        # When count is falsy it must be omitted entirely: Lua then sees
+        # ARGV[4] as nil and takes the "fetch all" branch above.
+        if count:
+            res = cmd(
+                keys=[table], args=[priority_min, priority_max, increase_score, count]
+            )
+        else:
+            res = cmd(keys=[table], args=[priority_min, priority_max, increase_score])
+
+        return res
+
+    def zrangebyscore_set_score(
+        self, table, priority_min, priority_max, score, count=None
+    ):
+        """
+        @summary: Return members whose score lies in the closed interval
+                  [priority_min, priority_max] and atomically reset every
+                  returned member's score to `score`.
+        ---------
+        @param table: sorted-set key
+        @param priority_min: minimum score (inclusive)
+        @param priority_max: maximum score (inclusive)
+        @param score: the new score assigned to each matched member
+        @param count: how many members to fetch; None means every member in range
+        ---------
+        @result: list of members (without their scores)
+        """
+
+        # A Lua script keeps the read + score rewrite atomic on the server side.
+        lua = """
+            -- local key = KEYS[1]
+            local min_score = ARGV[1]
+            local max_score = ARGV[2]
+            local set_score = ARGV[3]
+            local count = ARGV[4]
+
+            -- 取值
+            local datas = nil
+            if count then
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores','limit', 0, count)
+            else
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores')
+            end
+
+            local real_datas = {} -- 数据
+            --修改优先级
+            for i=1, #datas, 2 do
+               local data = datas[i]
+               local score = datas[i+1]
+
+               table.insert(real_datas, data) -- 添加数据
+
+               redis.call('zincrby', KEYS[1], set_score - score, datas[i])
+            end
+
+            return real_datas
+
+        """
+        cmd = self._redis.register_script(lua)
+        # count omitted when falsy so the Lua "fetch all" branch triggers.
+        if count:
+            res = cmd(keys=[table], args=[priority_min, priority_max, score, count])
+        else:
+            res = cmd(keys=[table], args=[priority_min, priority_max, score])
+
+        return res
+
+    def zincrby(self, table, amount, value):
+        # Add `amount` to the score of member `value` in sorted set `table`.
+        return self._redis.zincrby(table, amount, value)
+
+    def zget_count(self, table, priority_min=None, priority_max=None):
+        """
+        @summary: Count the members of a sorted set.
+        ---------
+        @param table:
+        @param priority_min: lower score bound (inclusive), optional
+        @param priority_max: upper score bound (inclusive), optional
+        ---------
+        @result: zcount within the range when both bounds are given,
+                 otherwise the total cardinality (zcard)
+        """
+
+        if priority_min != None and priority_max != None:
+            return self._redis.zcount(table, priority_min, priority_max)
+        else:
+            return self._redis.zcard(table)
+
+    def zrem(self, table, values):
+        """
+        @summary: Remove the given member(s) from the sorted set.
+        ---------
+        @param table:
+        @param values: a single member or a list of members
+        ---------
+        @result: None (the removal count from Redis is not propagated)
+        """
+
+        if isinstance(values, list):
+            self._redis.zrem(table, *values)
+        else:
+            self._redis.zrem(table, values)
+
+    def zexists(self, table, values):
+        """
+        Use zscore to test membership of one member or a list of members.
+        @param values: single member -> returns 1/0;
+                       list -> returns a list of 1/0 in the same order
+        @return:
+        """
+
+        is_exists = []
+
+        if isinstance(values, list):
+            # Pipeline all zscore calls into a single round trip.
+            pipe = self._redis.pipeline()
+            pipe.multi()
+            for value in values:
+                pipe.zscore(table, value)
+            is_exists_temp = pipe.execute()
+            for is_exist in is_exists_temp:
+                if is_exist != None:
+                    is_exists.append(1)
+                else:
+                    is_exists.append(0)
+
+        else:
+            is_exists = self._redis.zscore(table, values)
+            is_exists = 1 if is_exists != None else 0
+
+        return is_exists
+
+    def lpush(self, table, values):
+        # NOTE(review): despite its name this APPENDS via RPUSH (tail insert).
+        # The list branch returns None while the scalar branch returns the new
+        # list length -- callers should not rely on the return value.
+
+        if isinstance(values, list):
+            pipe = self._redis.pipeline()
+
+            if not self._is_redis_cluster:
+                pipe.multi()
+            for value in values:
+                pipe.rpush(table, value)
+            pipe.execute()
+
+        else:
+            return self._redis.rpush(table, values)
+
+    def lpop(self, table, count=1):
+        """
+        @summary: Pop up to `count` items from the head of the list.
+        ---------
+        @param table:
+        @param count: capped at the current list length
+        ---------
+        @result: a list when count > 1, a single item when count == 1,
+                 None when the list is empty
+        """
+
+        datas = None
+        # Cap count at the list length so the pipeline never issues no-op pops.
+        lcount = self.lget_count(table)
+        count = count if count <= lcount else lcount
+
+        if count:
+            if count > 1:
+                pipe = self._redis.pipeline()
+
+                if not self._is_redis_cluster:
+                    pipe.multi()
+                while count:
+                    pipe.lpop(table)
+                    count -= 1
+                datas = pipe.execute()
+
+            else:
+                datas = self._redis.lpop(table)
+
+        return datas
+
+    def rpoplpush(self, from_table, to_table=None):
+        """
+        Pop the tail element of `from_table` and push it onto the head of
+        `to_table`, returning that element. When both names are equal (or
+        to_table is omitted) this rotates the list: tail moves to head.
+        @param from_table:
+        @param to_table: defaults to from_table (rotation)
+        @return: the moved element
+        """
+
+        if not to_table:
+            to_table = from_table
+
+        return self._redis.rpoplpush(from_table, to_table)
+
+    def lget_count(self, table):
+        # Length of the list stored at `table`.
+        return self._redis.llen(table)
+
+    def lrem(self, table, value, num=0):
+        """
+        @summary:
+        Remove occurrences of `value` from the list.
+        ---------
+        @param table:
+        @param value:
+        @param num: 0 removes all occurrences; >0 from head, <0 from tail
+        ---------
+        @result: number of removed entries
+        """
+        return self._redis.lrem(table, num, value)
+
+    def lrange(self, table, start=0, end=-1):
+        # Inclusive slice of the list; defaults return the whole list.
+        return self._redis.lrange(table, start, end)
+
+    def hset(self, table, key, value):
+        """
+        @summary:
+        A new hash is created by HSET when `table` does not exist yet.
+        An existing field `key` is overwritten.
+        ---------
+        @param table:
+        @param key:
+        @param value:
+        ---------
+        @result: 1 when the field was newly created; 0 when it was overwritten
+        """
+        return self._redis.hset(table, key, value)
+
+    def hset_batch(self, table, datas):
+        """
+        Batch-insert hash fields in one pipeline round trip.
+        Args:
+            datas:
+                [[key, value]]
+        Returns:
+            list of per-field results (1 new / 0 overwritten)
+        """
+        pipe = self._redis.pipeline()
+
+        if not self._is_redis_cluster:
+            pipe.multi()
+        for key, value in datas:
+            pipe.hset(table, key, value)
+        return pipe.execute()
+
+    def hincrby(self, table, key, increment):
+        # Atomically add `increment` to the integer at hash field `key`.
+        return self._redis.hincrby(table, key, increment)
+
+    def hget(self, table, key, is_pop=False):
+        # Read hash field `key`; with is_pop=True the field is deleted in the
+        # same atomic Lua step (a "take" operation).
+        if not is_pop:
+            return self._redis.hget(table, key)
+        else:
+            lua = """
+                -- local key = KEYS[1]
+                local field = ARGV[1]
+
+                -- 取值
+                local datas = redis.call('hget', KEYS[1], field)
+                -- 删除值
+                redis.call('hdel', KEYS[1], field)
+
+                return datas
+
+                    """
+            cmd = self._redis.register_script(lua)
+            res = cmd(keys=[table], args=[key])
+
+            return res
+
+    def hgetall(self, table):
+        # Entire hash as a dict.
+        return self._redis.hgetall(table)
+
+    def hexists(self, table, key):
+        # True when hash field `key` exists.
+        return self._redis.hexists(table, key)
+
+    def hdel(self, table, *keys):
+        """
+        @summary: Delete the given hash field(s); several may be passed.
+        ---------
+        @param table:
+        @param *keys:
+        ---------
+        @result:
+        """
+        self._redis.hdel(table, *keys)
+
+    def hget_count(self, table):
+        # Number of fields in the hash.
+        return self._redis.hlen(table)
+
+    def hkeys(self, table):
+        # All field names of the hash.
+        return self._redis.hkeys(table)
+
+    def setbit(self, table, offsets, values):
+        """
+        Set individual bits of the string value; returns the previous bit(s).
+        @param table:
+        @param offsets: a single offset or a list of offsets
+        @param values: a single bit value or a list matching `offsets`
+        @return: list of previous bits / single previous bit
+        """
+        if isinstance(offsets, list):
+            # A scalar value is broadcast across every offset.
+            if not isinstance(values, list):
+                values = [values] * len(offsets)
+            else:
+                assert len(offsets) == len(values), "offsets值要与values值一一对应"
+
+            pipe = self._redis.pipeline()
+            pipe.multi()
+
+            for offset, value in zip(offsets, values):
+                pipe.setbit(table, offset, value)
+
+            return pipe.execute()
+
+        else:
+            return self._redis.setbit(table, offsets, values)
+
+    def getbit(self, table, offsets):
+        """
+        Read individual bits of the string value.
+        @param table:
+        @param offsets: a single offset or a list of offsets
+        @return: list of bits / single bit
+        """
+        if isinstance(offsets, list):
+            pipe = self._redis.pipeline()
+            pipe.multi()
+            for offset in offsets:
+                pipe.getbit(table, offset)
+
+            return pipe.execute()
+
+        else:
+            return self._redis.getbit(table, offsets)
+
+    def bitcount(self, table):
+        # Number of set bits in the string value.
+        return self._redis.bitcount(table)
+
+    def strset(self, table, value, **kwargs):
+        # SET with pass-through options (ex, px, nx, ...).
+        return self._redis.set(table, value, **kwargs)
+
+    def str_incrby(self, table, value):
+        # Atomically add `value` to the integer stored at `table`.
+        return self._redis.incrby(table, value)
+
+    def strget(self, table):
+        # GET the string value.
+        return self._redis.get(table)
+
+    def strlen(self, table):
+        # Length of the string value.
+        return self._redis.strlen(table)
+
+    def getkeys(self, regex):
+        # KEYS pattern scan -- O(N) over the whole keyspace; avoid on hot paths.
+        return self._redis.keys(regex)
+
+    def exists_key(self, key):
+        # Whether `key` exists.
+        return self._redis.exists(key)
+
+    def set_expire(self, key, seconds):
+        """
+        @summary: Set a time-to-live on `key`.
+        ---------
+        @param key:
+        @param seconds: TTL in seconds
+        ---------
+        @result:
+        """
+        self._redis.expire(key, seconds)
+
+    def get_expire(self, key):
+        """
+        @summary: Remaining time-to-live of `key`, in seconds.
+        ---------
+        @param key:
+        ---------
+        @result: TTL in seconds (per Redis: -1 no expiry, -2 key missing)
+        """
+        return self._redis.ttl(key)
+
+    def clear(self, table):
+        # Best-effort DEL of the whole key; failures are logged, not raised.
+        try:
+            self._redis.delete(table)
+        except Exception as e:
+            log.error(e)
+
+    def get_redis_obj(self):
+        # Expose the underlying redis client.
+        return self._redis
+
+    def _reconnect(self):
+        # Detect a dropped connection (server restart / idle timeout) and
+        # retry get_connect() every 2 seconds until it succeeds.
+        retry_count = 0
+        while True:
+            try:
+                retry_count += 1
+                log.error(f"redis 连接断开, 重新连接 {retry_count}")
+                if self.get_connect():
+                    log.info(f"redis 连接成功")
+                    return True
+            except (ConnectionError, TimeoutError) as e:
+                log.error(f"连接失败 e: {e}")
+
+            time.sleep(2)
+
+    def __getattr__(self, name):
+        # Delegate any un-wrapped method straight to the raw redis client.
+        return getattr(self._redis, name)
+
+    def current_status(self, show_key=True, filter_key_by_used_memory=10 * 1024 * 1024):
+        """
+        Summarize current Redis usage (clients, memory, optionally per-key stats).
+        Args:
+            show_key: also measure every key's memory (slow: KEYS * + MEMORY USAGE)
+            filter_key_by_used_memory: only list keys using more than this many bytes
+
+        Returns:
+            a human-readable status report string
+        """
+        from prettytable import PrettyTable
+        from tqdm import tqdm
+
+        status_msg = ""
+
+        print("正在查询最大连接数...")
+        clients_count = self._redis.execute_command("info clients")
+        max_clients_count = self._redis.execute_command("config get maxclients")
+        status_msg += ": ".join(max_clients_count) + "\n"
+        status_msg += clients_count + "\n"
+
+        print("正在查询整体内存使用情况...")
+        total_status = self._redis.execute_command("info memory")
+        status_msg += total_status + "\n"
+
+        if show_key:
+            print("正在查询每个key占用内存情况等信息...")
+            table = PrettyTable(
+                field_names=[
+                    "type",
+                    "key",
+                    "value_count",
+                    "used_memory_human",
+                    "used_memory",
+                ],
+                sortby="used_memory",
+                reversesort=True,
+                header_style="title",
+            )
+
+            # Element count per key, dispatched on the key's Redis type.
+            keys = self._redis.execute_command("keys *")
+            for key in tqdm(keys):
+                key_type = self._redis.execute_command("type {}".format(key))
+                if key_type == "set":
+                    value_count = self._redis.scard(key)
+                elif key_type == "zset":
+                    value_count = self._redis.zcard(key)
+                elif key_type == "list":
+                    value_count = self._redis.llen(key)
+                elif key_type == "hash":
+                    value_count = self._redis.hlen(key)
+                elif key_type == "string":
+                    value_count = self._redis.strlen(key)
+                elif key_type == "none":
+                    continue
+                else:
+                    raise TypeError("尚不支持 {} 类型的key".format(key_type))
+
+                used_memory = self._redis.execute_command("memory usage {}".format(key))
+                if used_memory >= filter_key_by_used_memory:
+                    used_memory_human = (
+                        "%0.2fMB" % (used_memory / 1024 / 1024) if used_memory else 0
+                    )
+
+                    table.add_row(
+                        [key_type, key, value_count, used_memory_human, used_memory]
+                    )
+
+            status_msg += str(table)
+
+        return status_msg

+ 35 - 0
A数据处理/site_monitor/docker/Dockerfile

@@ -0,0 +1,35 @@
+# 拉取镜像
+FROM ubuntu:22.04
+
+# 配置容器时间
+RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
+
+# 更新源 - 阿里源
+RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
+RUN sed -i s@/security.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
+RUN apt-get clean && apt-get update
+RUN apt-get install -y wget unzip curl vim
+
+# 安装 python3.8.10 gcc相关配置
+WORKDIR /opt
+RUN apt-get install -y gcc build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libbz2-dev liblzma-dev sqlite3 libsqlite3-dev tk-dev uuid-dev libgdbm-compat-dev libncurses-dev libnspr4-dev
+
+# python3.8.10下载与解压缩
+RUN curl -o python3.8.10.tgz https://mirrors.huaweicloud.com/python/3.8.10/Python-3.8.10.tgz && tar -zxvf python3.8.10.tgz
+# 创建编译安装目录, 配置安装位置
+RUN mkdir /usr/local/python38
+WORKDIR /opt/Python-3.8.10
+RUN ./configure --prefix=/usr/local/python38 && make && make install
+# 添加python3的软连接
+RUN rm -rf /usr/bin/python3 /usr/bin/pip3 && ln -s /usr/local/python38/bin/python3 /usr/bin/python3 && ln -s /usr/local/python38/bin/pip3.8 /usr/bin/pip3
+# 更换pip源&更新pip
+RUN pip3 config set global.index-url https://mirrors.bfsu.edu.cn/pypi/web/simple && pip3 install --upgrade pip
+
+# 安装项目依赖
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+# 安装playwright - webkit 驱动和依赖
+RUN python3 -m playwright install --with-deps webkit
+
+# 设置工作目录
+WORKDIR /mnt

+ 17 - 0
A数据处理/site_monitor/docker/docker-compose.yml

@@ -0,0 +1,17 @@
+version: "3"
+services: # 一组容器
+  worker01:
+    container_name: site_monitor
+    image: site_monitor:v1.0
+    volumes: # 映射文件夹
+      - /mnt/site_monitor:/mnt
+    network_mode: "host" # 指定网络名称
+    restart: always
+    privileged: true
+    shm_size: 2GB
+    logging:
+      driver: "json-file"
+      options:
+        max-size: "200k"
+        max-file: "10"
+    command: 'python3 /mnt/monitor.py'

+ 218 - 0
A数据处理/site_monitor/monitor.py

@@ -0,0 +1,218 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-05-10 
+---------
+@summary:  网站监控
+---------
+@author: Dzr
+"""
+import threading
+
+import bson
+import numpy as np
+import requests
+from playwright._impl._api_types import Error
+from requests.exceptions import SSLError
+
+import utils.tools as tools
+from db.mongodb import MongoDB
+from network.request import Request
+from network.response import Response
+from utils.log import logger
+
+
class MonitorParser(threading.Thread):
    """Worker thread: repeatedly fetches one site-monitoring task, probes the
    site's host page and channel page, and writes the results back to MongoDB."""

    def __init__(self, mongo_db, coll_name):
        """
        @param mongo_db: MongoDB helper used to persist monitoring results
        @param coll_name: name of the result collection
        """
        threading.Thread.__init__(self)
        self.mgo_db = mongo_db
        self.coll_name = coll_name

        # Dispatch service: one GET returns a single pending monitoring task.
        self.monitor_api = 'http://cc.spdata.jianyu360.com/crawl/site_monitor/task/fetch'

    def get_task(self):
        """Fetch one task from the dispatch API; returns {} on any failure."""
        items = {}
        try:
            response = requests.get(self.monitor_api, timeout=5)
            items = response.json()['data']
            if '_id' in items:
                items['_id'] = bson.ObjectId(items['_id'])
        except Exception as e:
            # Best effort, as before -- but unlike the original bare
            # `try/finally: return`, KeyboardInterrupt/SystemExit now propagate.
            logger.debug(f"fetch task failed: {e}")
        return items

    @staticmethod
    def get_response(url, render=False, **kwargs):
        """
        Download *url* with up to 3 attempts.

        Certificate failures downgrade https -> http before retrying; a
        successful attempt must return HTTP 200 with non-empty text and at
        least one tag, otherwise the next attempt runs.

        @param url: page address
        @param render: True -> headless-browser download, False -> plain HTTP
        @return: Response; when every attempt fails, a stub Response with an
                 empty body (status_code -1 when nothing was ever received)
        """
        request = Request(url=url, render=render, **kwargs)
        response = None
        for _ in range(3):
            try:
                response = request.get_response()
            except Error as e:  # playwright-level error
                if 'The certificate for this server is invalid.' in e.message:
                    url = url.replace('https', 'http')
                    request = Request(url=url, render=render, **kwargs)
            except SSLError:
                # requests-level certificate error: retry over http, rendered.
                url = url.replace('https', 'http')
                request = Request(url=url, render=True, **kwargs)
            except Exception as e:
                # BUG FIX: the original unconditionally did `response.reason = e.args`,
                # which raised AttributeError when no response existed yet.
                if response is not None:
                    response.reason = e.args
            else:
                if response.status_code != 200:
                    continue

                if response.text is None:
                    continue

                if len(response.plain_text) == 0:
                    continue

                if response.tags()['tags_count'] == 0:
                    continue

                break
        else:
            # All attempts failed: synthesize an empty placeholder response.
            response = Response.from_dict({
                "url": url,
                "_content": b"",
                "cookies": {},
                "status_code": getattr(response, 'status_code', -1),
                "elapsed": 666,
                "headers": {}
            })

        return response

    def __add_items_to_db(self, task, items):
        """Persist the computed monitoring fields onto the task document."""
        result = self.mgo_db.update(
            coll_name=self.coll_name,
            condition={'_id': task['_id']},
            data=items
        )
        # Replaced a leftover debugging print() with proper logging.
        logger.debug("updated %s doc _id=%s", self.coll_name, task['_id'])
        return result

    def deal_task(self, task):
        """Probe host + channel of one task, derive the availability/drift
        statistics and write them back to MongoDB."""
        is_first_monitor = False

        # Site home page (plain HTTP, no proxy).
        host = task['host']
        response = self.get_response(host, render=False, proxies=False)
        host_status_code = response.status_code

        # Channel page (browser-rendered).
        url = task['url']
        response = self.get_response(url, render=True, proxies=False)
        channel_status_code = response.status_code

        # Tag statistics of the channel page.
        tags_count = response.tags()['tags_count']
        tags_count_diff = abs(tags_count - task['tags_count'])
        tags_count_diff_lst = list(task['tags_count_diff_lst'])

        # Channel-redesign detection: flag when today's diff falls outside the
        # mean +/- 2*std band of the history.
        channel_ischange = task['channel_ischange']
        if len(tags_count_diff_lst) >= 3 and not channel_ischange:
            mean = np.mean(tags_count_diff_lst)  # historical mean
            std = np.std(tags_count_diff_lst, ddof=1)  # sample standard deviation
            lower, upper = mean - (2 * std), mean + (2 * std)
            # BUG FIX: the original used `tags_count_diff not in [lower, upper]`,
            # a two-element membership test, instead of an interval check.
            if not (lower <= tags_count_diff <= upper):
                channel_ischange = True

        # Visit counters: reset on the first probe of a new day.
        update_dt = tools.timestamp_to_date(task['update_at'], '%Y-%m-%d')
        if tools.get_current_date('%Y-%m-%d') != update_dt:
            is_first_monitor = True

            channel_visit_count, channel_failure_count = 1, 0
            if channel_status_code != 200:
                channel_failure_count = 1

            host_visit_count, host_failure_count = 1, 0
            if host_status_code != 200:
                host_failure_count = 1

            tags_count_diff_lst = []
            tags_count_diff_lst.insert(0, tags_count_diff)
        else:
            channel_visit_count = task['channel_visit_count'] + 1
            channel_failure_count = task['channel_failure_count']
            if channel_status_code != 200:
                channel_failure_count += 1

            host_visit_count = task['host_visit_count'] + 1
            host_failure_count = task['host_failure_count']
            if host_status_code != 200:
                host_failure_count += 1

            tags_count_diff_lst.insert(0, tags_count_diff)

        if is_first_monitor:
            pass  # placeholder: daily first-probe hook, intentionally empty

        items = {
            'tags_count': tags_count,
            'tags_count_diff': tags_count_diff,
            'tags_count_diff_lst': tags_count_diff_lst,
            'channel_ischange': channel_ischange,
            'channel_status_code': channel_status_code,
            'channel_visit_count': channel_visit_count,
            'channel_failure_count': channel_failure_count,
            'host_status_code': host_status_code,
            'host_visit_count': host_visit_count,
            'host_failure_count': host_failure_count,
            'update_at': tools.ensure_int64(tools.get_current_timestamp())
        }
        self.__add_items_to_db(task, items)

        logger.debug(
            """
                -------------- 处理完成 ----------------
                id  = Object('%s')
                thread = %s
                response = %s
                """
            % (
                str(task['_id']),
                self.getName(),
                response
            )
        )

    def run(self):
        """Main loop: pull tasks forever, sleeping 2s when none are pending."""
        while True:
            task = self.get_task()
            if not task:
                logger.debug(f"[{self.getName()}]暂无监控任务")
                tools.delay_time(2)
                continue

            try:
                self.deal_task(task)
            except Exception as e:
                logger.exception(e)
+
+
class MonitorServer(threading.Thread):
    """Supervisor thread: spins up a pool of MonitorParser worker threads."""

    def __init__(self, thread_nums=1):
        """
        @param thread_nums: how many MonitorParser workers to launch
        """
        threading.Thread.__init__(self)
        self.mongo_db = MongoDB()
        self.coll_name = 'site_monitor'

        self.thread_nums = thread_nums

        self.parser_control_obj = MonitorParser
        self.parser_controls = []

    def run(self):
        """Create, start and register each worker thread."""
        remaining = self.thread_nums
        while remaining > 0:
            worker = self.parser_control_obj(
                mongo_db=self.mongo_db,
                coll_name=self.coll_name
            )
            worker.start()
            self.parser_controls.append(worker)
            remaining -= 1


if __name__ == '__main__':
    MonitorServer(thread_nums=2).start()

+ 8 - 0
A数据处理/site_monitor/network/__init__.py

@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-05-10 
+---------
+@summary:  
+---------
+@author: Dzr
+"""

+ 3 - 0
A数据处理/site_monitor/network/downloader/__init__.py

@@ -0,0 +1,3 @@
+from ._requests import RequestsDownloader
+from ._requests import RequestsSessionDownloader
+from ._playwright import PlaywrightDownloader

+ 104 - 0
A数据处理/site_monitor/network/downloader/_playwright.py

@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/9/7 4:05 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import setting as setting
+import utils.tools as tools
+from network.downloader.base import RenderDownloader
+from network.response import Response
+from utils.webdriver import WebDriverPool, PlaywrightDriver
+
+
+class PlaywrightDownloader(RenderDownloader):
+    """Render downloader backed by a shared pool of Playwright drivers."""
+
+    webdriver_pool: WebDriverPool = None
+
+    @property
+    def _webdriver_pool(self):
+        # Lazily create one class-level pool shared by every instance.
+        if not self.__class__.webdriver_pool:
+            self.__class__.webdriver_pool = WebDriverPool(
+                **setting.PLAYWRIGHT, driver_cls=PlaywrightDriver, thread_safe=True
+            )
+
+        return self.__class__.webdriver_pool
+
+    def download(self, request) -> Response:
+        """Render the request's URL in a pooled browser and wrap the page as a Response."""
+        # Proxy priority: request-specific > settings file > random
+        if request.custom_proxies:
+            proxy = request.get_proxy()
+        elif setting.PLAYWRIGHT.get("proxy"):
+            proxy = setting.PLAYWRIGHT.get("proxy")
+        else:
+            proxy = request.get_proxy()
+
+        # user_agent priority: request-specific > settings file > random
+        if request.custom_ua:
+            user_agent = request.get_user_agent()
+        elif setting.PLAYWRIGHT.get("user_agent"):
+            user_agent = setting.PLAYWRIGHT.get("user_agent")
+        else:
+            user_agent = request.get_user_agent()
+
+        cookies = request.get_cookies()
+        url = request.url
+        render_time = request.render_time or setting.PLAYWRIGHT.get("render_time")
+        wait_until = setting.PLAYWRIGHT.get("wait_until") or "domcontentloaded"
+        if request.get_params():
+            url = tools.joint_url(url, request.get_params())
+
+        driver: PlaywrightDriver = self._webdriver_pool.get(
+            user_agent=user_agent, proxy=proxy
+        )
+        try:
+            # driver.url must be set before cookies so they attach to this URL.
+            if cookies:
+                driver.url = url
+                driver.cookies = cookies
+            driver.page.goto(url, wait_until=wait_until)
+
+            # Optional extra wait so late-running JS can finish rendering.
+            if render_time:
+                tools.delay_time(render_time)
+
+            html = driver.page.content()
+            response = Response.from_dict(
+                {
+                    "url": driver.page.url,
+                    "cookies": driver.cookies,
+                    "_content": html.encode(),
+                    "status_code": 200,
+                    "elapsed": 666,
+                    "headers": {
+                        "User-Agent": driver.user_agent,
+                        "Cookie": tools.cookies2str(driver.cookies),
+                    },
+                }
+            )
+
+            # Hand the driver to the response so the caller can put it back / close it.
+            response.driver = driver
+            response.browser = driver
+            return response
+        except Exception as e:
+            # A broken driver is discarded instead of being returned to the pool.
+            self._webdriver_pool.remove(driver)
+            raise e
+
+    def close(self, driver):
+        # Discard this driver from the pool.
+        if driver:
+            self._webdriver_pool.remove(driver)
+
+    def put_back(self, driver):
+        """
+        Return the browser object to the pool.
+        """
+        self._webdriver_pool.put(driver)
+
+    def close_all(self):
+        """
+        Close every browser.
+        """
+        # not supported
+        # self._webdriver_pool.close()
+        pass

+ 46 - 0
A数据处理/site_monitor/network/downloader/_requests.py

@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/4/10 5:57 下午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import requests
+from requests.adapters import HTTPAdapter
+
+from network.downloader.base import Downloader
+from network.response import Response
+
+
class RequestsDownloader(Downloader):
    """Stateless downloader: every call opens a fresh requests connection."""

    def download(self, request) -> Response:
        """Issue the HTTP request described by *request* and wrap the reply."""
        raw = requests.request(
            request.method, request.url, **request.requests_kwargs
        )
        return Response(raw)
+
+
class RequestsSessionDownloader(Downloader):
    """Downloader that funnels every request through one shared Session."""

    session = None  # lazily created, shared across all instances

    @property
    def _session(self):
        """Create the shared Session (with enlarged connection pools) on first use."""
        cls = self.__class__
        if not cls.session:
            cls.session = requests.Session()
            # pool_connections: number of cached urllib3 pools;
            # pool_maxsize: max connections kept per pool.
            adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
            # Mounted for every URL whose prefix is "http" (covers https too).
            cls.session.mount("http", adapter)

        return cls.session

    def download(self, request) -> Response:
        """Send *request* over the shared session and wrap the reply."""
        raw = self._session.request(
            request.method, request.url, **request.requests_kwargs
        )
        return Response(raw)

+ 41 - 0
A数据处理/site_monitor/network/downloader/base.py

@@ -0,0 +1,41 @@
+import abc
+from abc import ABC
+
+from network.response import Response
+
+
+class Downloader:
+    """Abstract downloader interface: turn a Request into a Response."""
+
+    # NOTE(review): @abc.abstractmethod only blocks instantiation when the
+    # class uses ABCMeta (e.g. derives from abc.ABC); on this plain class it
+    # is purely documentary.
+    @abc.abstractmethod
+    def download(self, request) -> Response:
+        """
+
+        Args:
+            request: Request
+
+        Returns: Response
+
+        """
+        raise NotImplementedError
+
+    def close(self, response: Response):
+        # Hook for releasing per-response resources; default is a no-op.
+        pass
+
+
+class RenderDownloader(Downloader, ABC):
+    """Downloader variant that drives a real browser; adds driver lifecycle hooks."""
+
+    def put_back(self, driver):
+        """
+        Return the browser object to the pool.
+        """
+        pass
+
+    def close(self, driver):
+        """
+        Close this browser.
+        """
+        pass
+
+    def close_all(self):
+        """
+        Close every browser.
+        """
+        pass

+ 32 - 0
A数据处理/site_monitor/network/proxy_file/de9f83d546a39eca6979d2a6dca3407a.txt

@@ -0,0 +1,32 @@
+180.105.104.247:8860&&1684743244
+115.208.199.134:8860&&1684742848
+42.84.93.124:8861&&1684742999
+180.127.72.88:8860&&1684743979
+144.255.48.89:8860&&1684744166
+180.106.242.48:8860&&1684743307
+121.207.84.107:8860&&1684742787
+180.127.72.79:8860&&1684743262
+182.107.181.130:8860&&1684742689
+218.67.90.253:8860&&1684743824
+59.61.165.88:8860&&1684742786
+114.233.0.176:8860&&1684742924
+113.93.224.26:8860&&1684743064
+123.169.34.24:8860&&1684743176
+182.34.27.242:8860&&1684744210
+125.69.91.209:8860&&1684743202
+36.27.184.4:8860&&1684743545
+49.69.209.246:8860&&1684742763
+123.146.150.68:8860&&1684742715
+114.235.254.245:8860&&1684742840
+106.32.10.20:8860&&1684743120
+140.250.148.156:8860&&1684742873
+180.111.177.16:8860&&1684743024
+180.108.151.90:8860&&1684743675
+121.238.107.47:8860&&1684742780
+123.160.96.180:8860&&1684742820
+223.215.119.152:8860&&1684742729
+182.34.102.138:8860&&1684743505
+59.58.211.240:8860&&1684744113
+180.140.47.156:8860&&1684743073
+125.123.136.247:8861&&1684743189
+49.86.182.103:8860&&1684742719

+ 746 - 0
A数据处理/site_monitor/network/proxy_pool.py

@@ -0,0 +1,746 @@
+# -*- coding: utf-8 -*-
+"""
+代理池
+"""
+import datetime
+import json
+import os
+import random
+import socket
+import time
+from urllib import parse
+
+import redis
+import requests
+
+import setting
+from utils import tools
+from utils.log import logger as log
+
+
def decrypt(input_str: str) -> str:
    """Decode ``input_str`` with a base64 variant using a shuffled alphabet.

    The proxy vendor obfuscates IPs with standard base64 mechanics but a
    custom 64-character alphabet; trailing ``=`` padding is honoured.

    :param input_str: obfuscated text to decode
    :return: decoded text (one chr() per decoded byte)
    """
    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
    # Number of padding chars decides how many trailing bits to discard.
    pad_count = input_str.count('=')
    # Map every non-padding char to its 6-bit index in the custom alphabet.
    bit_chunks = [
        '{:0>6}'.format(str(bin(key.index(ch))).replace('0b', ''))
        for ch in input_str
        if ch != '='
    ]
    decoded = ''
    while bit_chunks:
        # Consume groups of four 6-bit chunks -> up to three 8-bit bytes.
        group = ''.join(bit_chunks[:4])
        if len(group) % 8 != 0:
            # Last group: strip 2 bits per '=' so only whole bytes remain.
            group = group[:-2 * pad_count]
        byte_bits = [group[pos:pos + 8] for pos in (0, 8, 16)]
        decoded += ''.join(chr(int(bits, 2)) for bits in byte_bits if bits)
        bit_chunks = bit_chunks[4:]
    return decoded
+
+
# Local cache directory for downloaded proxy lists.
proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
# makedirs+exist_ok avoids the check-then-create race of exists()/mkdir()
# when several workers import this module concurrently.
os.makedirs(proxy_path, exist_ok=True)
+
+
def get_proxy_from_url(**kwargs):
    """Fetch proxies from every configured source url.

    :param kwargs: must contain ``proxy_source_url`` (str or list of str,
        ``http...`` and ``redis...`` schemes supported); remaining kwargs are
        forwarded to the per-scheme fetchers.
    :return: shuffled list of proxies dicts
    :raises ValueError: when no usable source url is configured.
    """
    proxy_source_url = kwargs.get("proxy_source_url", [])
    # proxy_source_url = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"

    if not isinstance(proxy_source_url, list):
        proxy_source_url = [proxy_source_url]
    # Bug fix: drop falsy entries no matter how the urls were supplied — the
    # original only filtered when a single (non-list) url was given, so a
    # list containing None/"" crashed later on url.startswith().
    proxy_source_url = [x for x in proxy_source_url if x]
    if not proxy_source_url:
        raise ValueError("no specify proxy_source_url: {}".format(proxy_source_url))
    kwargs = kwargs.copy()
    kwargs.pop("proxy_source_url")
    proxies_list = []
    for url in proxy_source_url:
        if url.startswith("http"):
            proxies_list.extend(get_proxy_from_http(url, **kwargs))
        elif url.startswith("redis"):
            proxies_list.extend(get_proxy_from_redis(url, **kwargs))

    if proxies_list:
        # Shuffle so callers don't hammer the sources in file order.
        random.shuffle(proxies_list)
    return proxies_list
+
+
def get_proxy_from_http(proxy_source_url, **kwargs):
    """Fetch proxies from an http api, caching the result in a local file.

    The api is expected to return a json list of
    ``{"ip": <obfuscated>, "ports": [...], "lifetime": <expiry ts>}``
    entries; they are persisted as ``host:port&&end_ts`` lines.

    :param proxy_source_url: api endpoint to query
    :param kwargs: ``local_proxy_file_cache_timeout`` — cache ttl in seconds
        (default 30; a falsy value forces a refresh on every call)
    :return: list of proxies dicts parsed from the cache file
    """
    filename = tools.get_md5(proxy_source_url) + ".txt"
    abs_filename = os.path.join(proxy_path, filename)
    update_interval = kwargs.get("local_proxy_file_cache_timeout", 30)

    # Decide whether the on-disk cache must be refreshed.
    need_update = (
        not update_interval  # ttl disabled -> always refresh
        or not os.path.exists(abs_filename)  # no cache yet
        or time.time() - os.stat(abs_filename).st_mtime > update_interval  # stale
    )
    if need_update:
        response = requests.get(proxy_source_url, timeout=20)
        # Fail loudly on http errors instead of caching an error payload.
        response.raise_for_status()
        pool = []
        # Vendor-specific payload: ip is base64-obfuscated, see decrypt().
        for proxy in response.json():
            host = decrypt(proxy['ip'])
            port = proxy['ports'][0]
            end_time = proxy['lifetime']
            pool.append(f"{host}:{port}&&{end_time}")

        with open(abs_filename, "w") as f:
            f.write('\n'.join(pool))
    return get_proxy_from_file(filename)
+
+
def get_proxy_from_file(filename, **kwargs):
    """Parse proxies from a cached file under ``proxy_path``.

    Supported line formats::

        [auth@]ip:port[:protocol][&&end_ts]

    Lines carrying the ``&&end_ts`` suffix (the socks feed cache format) are
    kept only while the expiry timestamp lies in the future and are emitted
    as socks5 proxies; lines with an explicit protocol keep that protocol.

    :param filename: file name inside ``proxy_path``
    :param kwargs: unused, accepted for interface symmetry
    :return: list of proxies dicts
    """
    proxies_list = []
    with open(os.path.join(proxy_path, filename), "r") as f:
        lines = f.readlines()

    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Optional auth prefix: user:pass@ip:port...
        auth = ""
        if "@" in line:
            auth, line = line.split("@")
        # Bug fix: the expiry suffix is optional — the original unconditional
        # ``line.split("&&")`` crashed on the documented ip:port[:protocol]
        # formats that carry no "&&".
        end = None
        if "&&" in line:
            line, end = line.split("&&", 1)

        items = line.split(":")
        if len(items) < 2:
            continue

        ip, port, *protocol = items
        if not all([port, ip]):
            continue
        if auth:
            ip = "{}@{}".format(auth, ip)
        if not protocol:
            # No explicit protocol: treat as socks5, honouring the expiry
            # timestamp when one was provided.
            if end is not None and time.time() >= int(end):
                continue  # proxy already expired
            proxies = {
                "https": "socks5://%s:%s" % (ip, port),
                "http": "socks5://%s:%s" % (ip, port),
            }
        else:
            proxies = {protocol[0]: "%s://%s:%s" % (protocol[0], ip, port)}
        proxies_list.append(proxies)

    return proxies_list
+
+
def get_proxy_from_redis(proxy_source_url, **kwargs):
    """Fetch proxies stored in a redis sorted set.

    @param proxy_source_url: redis://:passwd@host:ip/db
        storage layout: zset of "ip:port" members scored by timestamp
    @param kwargs:
        {"redis_proxies_key": "xxx"}
    @return: [{'http':'http://xxx.xxx.xxx:xxx', 'https':'https://xxx.xxx.xxx.xxx:xxx'}]
    """

    redis_conn = redis.StrictRedis.from_url(proxy_source_url)
    key = kwargs.get("redis_proxies_key")
    assert key, "从redis中获取代理 需要指定 redis_proxies_key"
    # zrange(0, -1) pulls every member of the sorted set.
    members = redis_conn.zrange(key, 0, -1)
    return [
        {
            "https": "https://%s" % member.decode(),
            "http": "http://%s" % member.decode(),
        }
        for member in members
    ]
+
+
def check_proxy(
        ip="",
        port="",
        proxies=None,
        type=0,
        timeout=5,
        logger=None,
        show_error_log=True,
        **kwargs,
):
    """Check whether a proxy is usable.

    :param ip: proxy host
    :param port: proxy port
    :param proxies: requests-style proxies dict (used when type != 0)
    :param type: 0: bare tcp connect check  1: full http request via requests
    :param timeout: seconds per attempt
    :param logger: logger to use; defaults to the module logger
    :param show_error_log: log failures at debug level when True
    :return: 1 usable, 0 not usable
    """
    if not logger:
        logger = log
    ok = 0
    if type == 0 and ip and port:
        # A successful tcp connect does not guarantee the proxy works
        # ("Connection closed by foreign host" slips through), but it is
        # cheap and keeps the pool refreshing.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
            sk.settimeout(timeout)
            try:
                sk.connect((ip, int(port)))
                ok = 1
            except Exception as e:
                if show_error_log:
                    logger.debug("check proxy failed: {} {}:{}".format(e, ip, port))
            # no explicit close: the with-block closes the socket
    else:
        if not proxies:
            proxies = {
                "http": "socks5://{}:{}".format(ip, port),
                # Bug fix: was "socks5//..." (missing colon) — requests then
                # saw no scheme and bypassed the proxy entirely.
                "https": "socks5://{}:{}".format(ip, port),
            }
        try:
            # Probe url used for proxy validation.
            r = requests.get(
                "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
            )
            ok = 1
            r.close()
        except Exception as e:
            if show_error_log:
                logger.debug(
                    "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
                )
    return ok
+
+
class ProxyItem(object):
    """A single proxy entry tracked by the pool."""

    # Valid values for ``flag``: -1 discard, 0 normal, 1 delayed
    proxy_tag_list = (-1, 0, 1)

    def __init__(
            self,
            proxies=None,
            valid_timeout=20,
            check_interval=180,
            max_proxy_use_num=10000,
            delay=30,
            use_interval=None,
            **kwargs,
    ):
        """
        :param proxies: requests-style dict {"http": ..., "https": ...}
        :param valid_timeout: validity-check timeout in seconds
            (historically -1, i.e. no checking, since 20181008)
        :param check_interval: minimum seconds between validity checks
        :param max_proxy_use_num: maximum times this proxy may be handed out
        :param delay: seconds a flag==1 proxy stays parked
        :param use_interval: minimum seconds between two uses; None = unlimited
        :param kwargs: ignored extras (forwarded pool config)
        """
        # {"http": ..., "https": ...}
        self.proxies = proxies
        # validity-check timeout, seconds
        self.valid_timeout = valid_timeout
        # minimum interval between checks, seconds
        self.check_interval = check_interval

        # flag: 0 normal, -1 discarded, 1 parked for later reuse
        self.flag = 0
        # timestamp of the last flag change
        self.flag_ts = 0
        # timestamp of the last validity check
        self.update_ts = 0
        # maximum hand-out count
        self.max_proxy_use_num = max_proxy_use_num
        # times handed out so far
        self.use_num = 0
        # park duration for flag==1, seconds
        self.delay = delay
        # minimum seconds between uses
        self.use_interval = use_interval
        # timestamp of the last use
        self.use_ts = 0

        self.proxy_args = self.parse_proxies(self.proxies)
        self.proxy_ip = self.proxy_args["ip"]
        self.proxy_port = self.proxy_args["port"]
        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
        # proxy_id includes credentials when present so authed proxies
        # don't collide with unauthed ones on the same host:port
        if self.proxy_args["user"]:
            self.proxy_id = "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
        else:
            self.proxy_id = self.proxy_ip_port

        # log handler
        self.logger = log

    def get_proxies(self):
        """Hand out the proxies dict, counting the use."""
        self.use_num += 1
        return self.proxies

    def is_delay(self):
        """True while the proxy is parked (flag == 1)."""
        return self.flag == 1

    def is_valid(self, force=0, type=0):
        """Check whether this proxy can still be used.

        :param force: skip the check_interval throttle when truthy
        :param type: check type forwarded to check_proxy (0 socket, 1 requests)
        :return: 1 valid, 2 delayed (retry later), 0 invalid — drop from pool
        """
        if self.use_num > self.max_proxy_use_num > 0:
            self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
            return 0
        if self.flag == -1:
            self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
            return 0
        if self.delay > 0 and self.flag == 1:
            if time.time() - self.flag_ts < self.delay:
                self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
                return 2
            else:
                # park period elapsed — release the proxy
                self.flag = 0
                self.logger.debug("延迟代理释放: {}".format(self.proxies))
        if self.use_interval:
            if time.time() - self.use_ts < self.use_interval:
                return 2
        if not force:
            # within the throttle window: trust the previous check
            if time.time() - self.update_ts < self.check_interval:
                return 1
        if self.valid_timeout > 0:
            ok = check_proxy(
                proxies=self.proxies,
                type=type,
                timeout=self.valid_timeout,
                logger=self.logger,
            )
        else:
            # checking disabled — assume valid
            ok = 1
        self.update_ts = time.time()
        return ok

    @classmethod
    def parse_proxies(cls, proxies):
        """Split a proxies dict (or its json string form) into components.

        Fix: first parameter renamed ``self`` -> ``cls`` to match the
        ``@classmethod`` decorator.

        :param proxies: proxies dict, or a json string encoding one
        :return: dict with protocol/ip/port/user/password/ip_port,
            or {} when nothing can be parsed
        """
        if not proxies:
            return {}
        if isinstance(proxies, (str, bytes)):
            proxies = json.loads(proxies)
        protocol = list(proxies.keys())
        if not protocol:
            return {}
        _url = proxies.get(protocol[0])
        # NOTE: the historical http:// prefixing is disabled on purpose so
        # socks5:// urls survive urlparse intact.
        # if not _url.startswith("http"):
        #     _url = "http://" + _url
        _url_parse = parse.urlparse(_url)
        netloc = _url_parse.netloc
        if "@" in netloc:
            netloc_auth, netloc_host = netloc.split("@")
        else:
            netloc_auth, netloc_host = "", netloc
        ip, *port = netloc_host.split(":")
        port = port[0] if port else "80"
        user, *password = netloc_auth.split(":")
        password = password[0] if password else ""
        return {
            "protocol": protocol,
            "ip": ip,
            "port": port,
            "user": user,
            "password": password,
            "ip_port": "{}:{}".format(ip, port),
        }
+
+
class ProxyPoolBase(object):
    """Minimal interface every proxy pool implementation must honour."""

    def __init__(self, *args, **kwargs):
        # Accepts and ignores arbitrary configuration for subclass symmetry.
        pass

    def get(self, *args, **kwargs):
        """Return a proxy; concrete pools must override this."""
        raise NotImplementedError
+
+
class ProxyPool(ProxyPoolBase):
    """Proxy pool: queues ProxyItem objects and refreshes them from the
    configured sources whenever the pool runs low or grows stale."""

    def __init__(self, **kwargs):
        """
        :param size: pool size, -1 = unlimited
        :param proxy_source_url: proxy source address; a list is supported
        :param proxy_instance: instance that supplies proxies
        :param reset_interval: minimum interval between pool resets
        :param reset_interval_max: maximum interval between resets, default 2 min
        :param check_valid: whether to validity-check a proxy when handing it out
        :param local_proxy_file_cache_timeout: ttl of the locally cached proxy file
        :param logger: log handler, defaults to the module logger
        :param kwargs: other options (forwarded to ProxyItem)
        """
        kwargs.setdefault("size", -1)
        kwargs.setdefault("proxy_source_url", setting.PROXY_EXTRACT_API)

        super(ProxyPool, self).__init__(**kwargs)
        # maximum queue length
        self.max_queue_size = kwargs.get("size", -1)
        # number of proxies actually obtained from the source
        self.real_max_proxy_count = 1000
        # maximum hand-out count lives on ProxyItem
        # proxy source, e.g. http://localhost/proxy.txt
        self.proxy_source_url = kwargs.get("proxy_source_url", [])
        # NOTE(review): falsy filtering / dedup below only runs when a single
        # (non-list) url was supplied — same quirk as get_proxy_from_url.
        if not isinstance(self.proxy_source_url, list):
            self.proxy_source_url = [self.proxy_source_url]
            self.proxy_source_url = [x for x in self.proxy_source_url if x]
            self.proxy_source_url = list(set(self.proxy_source_url))
            kwargs.update({"proxy_source_url": self.proxy_source_url})
        # log handler
        self.logger = kwargs.get("logger") or log
        kwargs["logger"] = self.logger
        if not self.proxy_source_url:
            self.logger.warn("need set proxy_source_url or proxy_instance")

        # minimum interval between pool resets
        self.reset_interval = kwargs.get("reset_interval", 5)
        # force a reset after this long so fresh proxies replace banned ones
        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
        # whether to validity-check proxies on the way out
        self.check_valid = kwargs.get("check_valid", True)

        # proxy queue
        self.proxy_queue = None
        # {proxy_id: ProxyItem, ...}
        self.proxy_dict = {}
        # invalidated proxies: {proxy_id: datetime invalidated}
        self.invalid_proxy_dict = {}

        self.kwargs = kwargs

        # lock guarding pool resets (created lazily, see reset_proxy_pool)
        self.reset_lock = None
        # time of the last reset
        self.last_reset_time = 0
        # counter for "reset requested too soon" events
        self.reset_fast_count = 0
        # number of times get() retried 3x and still found no valid proxy
        self.no_valid_proxy_times = 0

        # time a proxy was last handed out
        self.last_get_ts = time.time()

        # remembers ProxyItem.update_ts so rapid resets don't re-check validity
        self.proxy_item_update_ts_dict = {}

        # warning flag
        self.warn_flag = False

    def warn(self):
        # NOTE(review): this loop has no effect — the only statement is a
        # ``continue`` — so warn() merely flips warn_flag once; looks like
        # leftover vendor-specific ("zhima") warning logic. Confirm intent.
        if not self.warn_flag:
            for url in self.proxy_source_url:
                if "zhima" in url:
                    continue
            self.warn_flag = True
        return

    @property
    def queue_size(self):
        """
        Number of proxies currently queued in the pool.
        :return:
        """
        return self.proxy_queue.qsize() if self.proxy_queue is not None else 0

    def clear(self):
        """
        Reset internal state and prune stale bookkeeping records.
        :return:
        """
        self.proxy_queue = None
        # {proxy ip: ProxyItem, ...}
        self.proxy_dict = {}
        # drop invalid-proxy records older than 10 minutes
        _limit = datetime.datetime.now() - datetime.timedelta(minutes=10)
        self.invalid_proxy_dict = {
            k: v for k, v in self.invalid_proxy_dict.items() if v > _limit
        }
        # drop update_ts records older than 10 minutes
        _limit = time.time() - 600
        self.proxy_item_update_ts_dict = {
            k: v for k, v in self.proxy_item_update_ts_dict.items() if v > _limit
        }
        return

    def get(self, retry: int = 0) -> dict:
        """
        Take a proxy from the pool.
        :param retry: internal recursion counter; gives up after 3 attempts
        :return: proxies dict, or None when no valid proxy could be found
        """
        retry += 1
        if retry > 3:
            self.no_valid_proxy_times += 1
            return None
        # if time.time() - self.last_get_ts > 3 * 60:
        #     # nothing fetched for 3 minutes - reset once
        #     try:
        #         self.reset_proxy_pool()
        #     except Exception as e:
        #         self.logger.exception(e)
        # record the fetch time
        self.last_get_ts = time.time()
        #
        self.warn()
        proxy_item = self.get_random_proxy()
        if proxy_item:
            # validity checking disabled
            if not self.check_valid:  #
                # recycle the item back into the queue
                proxies = proxy_item.get_proxies()
                self.put_proxy_item(proxy_item)
                return proxies
            else:
                is_valid = proxy_item.is_valid()
                if is_valid:
                    # remember update_ts so resets don't re-check too soon
                    self.proxy_item_update_ts_dict[
                        proxy_item.proxy_id
                    ] = proxy_item.update_ts
                    # recycle the item back into the queue
                    proxies = proxy_item.get_proxies()
                    self.put_proxy_item(proxy_item)
                    if is_valid == 1:
                        if proxy_item.use_interval:
                            proxy_item.use_ts = time.time()
                        return proxies
                else:
                    # drop the invalid proxy and remember when it went bad
                    self.proxy_dict.pop(proxy_item.proxy_id, "")
                    self.invalid_proxy_dict[
                        proxy_item.proxy_id
                    ] = datetime.datetime.now()
        else:
            try:
                time.sleep(3)
                self.reset_proxy_pool()
            except Exception as e:
                self.logger.exception(e)
        if self.no_valid_proxy_times >= 5:
            # Workaround: when only one task remains, a single thread checks
            # proxies while the invalid set keeps growing over time, so get()
            # could stall forever and the crawler would hang — force a reset.
            try:
                time.sleep(3)
                self.reset_proxy_pool()
            except Exception as e:
                self.logger.exception(e)
        return self.get(retry)

    get_proxy = get

    def get_random_proxy(self) -> ProxyItem:
        """
        Pop a proxy from the queue, opportunistically triggering a refresh.
        :return: ProxyItem or None when the queue is empty
        """
        if self.proxy_queue is not None:
            if random.random() < 0.5:
                # only check half the time — this is a hot path
                if time.time() - self.last_reset_time > self.reset_interval_max:
                    time.sleep(3)
                    self.reset_proxy_pool(force=True)
                else:
                    min_q_size = (
                        min(self.max_queue_size / 2, self.real_max_proxy_count / 2)
                        if self.max_queue_size > 0
                        else self.real_max_proxy_count / 2
                    )
                    if self.proxy_queue.qsize() < min_q_size:
                        time.sleep(3)
                        self.reset_proxy_pool()
            try:
                return self.proxy_queue.get_nowait()
            except Exception:
                # queue.Empty — caller treats None as "no proxy available"
                pass
        return None

    def append_proxies(self, proxies_list: list) -> int:
        """
        Add proxies to the pool, skipping known-invalid and duplicate ones.
        :param proxies_list: proxies dict or list of proxies dicts
        :return: number of proxies actually added
        """
        count = 0
        if not isinstance(proxies_list, list):
            proxies_list = [proxies_list]
        for proxies in proxies_list:
            if proxies:
                proxy_item = ProxyItem(proxies=proxies, **self.kwargs)
                # skip proxies already marked invalid (added 2018/12/18)
                if proxy_item.proxy_id in self.invalid_proxy_dict:
                    continue
                if proxy_item.proxy_id not in self.proxy_dict:
                    # restore the remembered update_ts to avoid re-checking
                    if not proxy_item.update_ts:
                        proxy_item.update_ts = self.proxy_item_update_ts_dict.get(
                            proxy_item.proxy_id, 0
                        )
                    self.put_proxy_item(proxy_item)
                    self.proxy_dict[proxy_item.proxy_id] = proxy_item
                    count += 1
        return count

    def put_proxy_item(self, proxy_item: ProxyItem):
        """
        Put a ProxyItem into the pool queue.
        :param proxy_item:
        :return:
        """
        return self.proxy_queue.put_nowait(proxy_item)

    def reset_proxy_pool(self, force: bool = False):
        """
        Rebuild the proxy pool from the configured sources.
        :param force: reset even when the queue still looks healthy
        :return:
        """
        if not self.reset_lock:
            # Created lazily: importing threading before a gevent monkey-patch
            # would leave us with an unpatched RLock.
            import threading

            self.reset_lock = threading.RLock()
        with self.reset_lock:
            if (
                    force
                    or self.proxy_queue is None
                    or (
                    self.max_queue_size > 0
                    and self.proxy_queue.qsize() < self.max_queue_size / 2
            )
                    or (
                    self.max_queue_size < 0
                    and self.proxy_queue.qsize() < self.real_max_proxy_count / 2
            )
                    or self.no_valid_proxy_times >= 5
            ):
                if time.time() - self.last_reset_time < self.reset_interval:
                    # resetting too fast — back off and count it
                    self.reset_fast_count += 1
                    if self.reset_fast_count % 10 == 0:
                        self.logger.debug(
                            "代理池重置的太快了:) {}".format(self.reset_fast_count)
                        )
                        time.sleep(1)
                else:
                    self.clear()
                    if self.proxy_queue is None:
                        import queue

                        self.proxy_queue = queue.Queue()
                    # TODO the fetched proxies may contain duplicates
                    proxies_list = get_proxy_from_url(**self.kwargs)
                    self.real_max_proxy_count = len(proxies_list)
                    if 0 < self.max_queue_size < self.real_max_proxy_count:
                        proxies_list = random.sample(proxies_list, self.max_queue_size)
                    _valid_count = self.append_proxies(proxies_list)
                    self.last_reset_time = time.time()
                    self.no_valid_proxy_times = 0
                    self.logger.debug(
                        "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
                            len(proxies_list),
                            _valid_count,
                            len(self.invalid_proxy_dict),
                            len(self.proxy_dict),
                        )
                    )
        return

    def tag_proxy(self, proxies_list: list, flag: int, *, delay=30) -> bool:
        """
        Tag proxies in the pool.
        :param proxies_list: proxies dict or list of proxies dicts
        :param flag:
                    -1  discard
                    1 delay before reuse
        :param delay: park duration in seconds
        :return: True when the flag was applicable, else False
        """
        if int(flag) not in ProxyItem.proxy_tag_list or not proxies_list:
            return False
        if not isinstance(proxies_list, list):
            proxies_list = [proxies_list]
        for proxies in proxies_list:
            if not proxies:
                continue
            proxy_id = ProxyItem(proxies).proxy_id
            if proxy_id not in self.proxy_dict:
                continue
            self.proxy_dict[proxy_id].flag = flag
            self.proxy_dict[proxy_id].flag_ts = time.time()
            self.proxy_dict[proxy_id].delay = delay

        return True

    def get_proxy_item(self, proxy_id="", proxies=None):
        """
        Look up the ProxyItem for an id or a proxies dict.
        :param proxy_id:
        :param proxies:
        :return: ProxyItem or None
        """
        if proxy_id:
            return self.proxy_dict.get(proxy_id)
        if proxies:
            proxy_id = ProxyItem(proxies).proxy_id
            return self.proxy_dict.get(proxy_id)
        return

    def copy(self):
        # Fresh pool sharing this pool's configuration (not its state).
        return ProxyPool(**self.kwargs)

    def all(self) -> list:
        """
        Fetch every proxy currently available from the sources
        (bypasses the queue).
        :return:
        """
        return get_proxy_from_url(**self.kwargs)

+ 524 - 0
A数据处理/site_monitor/network/request.py

@@ -0,0 +1,524 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-07-25 11:49:08
+---------
+@summary: 请求结构体
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import copy
+import re
+
+import requests
+from requests.cookies import RequestsCookieJar
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+
+import setting as setting
+import utils.tools as tools
+from db.redisdb import RedisDB
+from network import user_agent
+from network.downloader.base import Downloader, RenderDownloader
+from network.proxy_pool import ProxyPool
+from network.response import Response
+from utils.log import logger as log
+
+# 屏蔽warning信息
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+
+
+class Request:
+    user_agent_pool = user_agent
+    proxies_pool: ProxyPool = None
+
+    cache_db = None  # redis / pika
+    cached_redis_key = None  # 缓存response的文件文件夹 response_cached:cached_redis_key:md5
+    cached_expire_time = 1200  # 缓存过期时间
+
+    # 下载器
+    downloader: Downloader = None
+    session_downloader: Downloader = None
+    render_downloader: RenderDownloader = None
+
+    __REQUEST_ATTRS__ = {
+        # "method",
+        # "url",
+        "params",
+        "data",
+        "headers",
+        "cookies",
+        "files",
+        "auth",
+        "timeout",
+        "allow_redirects",
+        "proxies",
+        "hooks",
+        "stream",
+        "verify",
+        "cert",
+        "json",
+    }
+
+    _DEFAULT_KEY_VALUE_ = dict(
+        url="",
+        method=None,
+        retry_times=0,
+        priority=300,
+        parser_name=None,
+        callback=None,
+        filter_repeat=True,
+        auto_request=True,
+        request_sync=False,
+        use_session=None,
+        random_user_agent=True,
+        download_midware=None,
+        is_abandoned=False,
+        render=False,
+        render_time=0,
+        make_absolute_links=None,
+    )
+
+    _CUSTOM_PROPERTIES_ = {
+        "requests_kwargs",
+        "custom_ua",
+        "custom_proxies",
+    }
+
+    def __init__(
+        self,
+        url="",
+        retry_times=0,
+        priority=300,
+        parser_name=None,
+        callback=None,
+        filter_repeat=True,
+        auto_request=True,
+        request_sync=False,
+        use_session=None,
+        random_user_agent=True,
+        download_midware=None,
+        is_abandoned=False,
+        render=False,
+        render_time=0,
+        make_absolute_links=None,
+        **kwargs,
+    ):
+        """
+        @summary: Request参数
+        ---------
+        框架参数
+        @param url: 待抓取url
+        @param retry_times: 当前重试次数
+        @param priority: 优先级 越小越优先 默认300
+        @param parser_name: 回调函数所在的类名 默认为当前类
+        @param callback: 回调函数 可以是函数 也可是函数名(如想跨类回调时,parser_name指定那个类名,callback指定那个类想回调的方法名即可)
+        @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True
+        @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空,需要自己去请求网页
+        @param request_sync: 是否同步请求下载网页,默认异步。如果该请求url过期时间快,可设置为True,相当于yield的reqeust会立即响应,而不是去排队
+        @param use_session: 是否使用session方式
+        @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
+        @param download_midware: 下载中间件。默认为parser中的download_midware
+        @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
+        @param render: 是否用浏览器渲染
+        @param render_time: 渲染时长,即打开网页等待指定时间后再获取源码
+        @param make_absolute_links: 是否转成绝对连接,默认是
+        --
+        以下参数与requests参数使用方式一致
+        @param method: 请求方式,如POST或GET,默认根据data值是否为空来判断
+        @param params: 请求参数
+        @param data: 请求body
+        @param json: 请求json字符串,同 json.dumps(data)
+        @param headers:
+        @param cookies: 字典 或 CookieJar 对象
+        @param files:
+        @param auth:
+        @param timeout: (浮点或元组)等待服务器数据的超时限制,是一个浮点数,或是一个(connect timeout, read timeout) 元组
+        @param allow_redirects : Boolean. True 表示允许跟踪 POST/PUT/DELETE 方法的重定向
+        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
+        @param verify: 为 True 时将会验证 SSL 证书
+        @param stream: 如果为 False,将会立即下载响应内容
+        @param cert:
+        --
+        @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出
+        ---------
+        @result:
+        """
+
+        self.url = url
+        self.method = None
+        self.retry_times = retry_times
+        self.priority = priority
+        self.parser_name = parser_name
+        self.callback = callback
+        self.filter_repeat = filter_repeat
+        self.auto_request = auto_request
+        self.request_sync = request_sync
+        self.use_session = use_session
+        self.random_user_agent = random_user_agent
+        self.download_midware = download_midware
+        self.is_abandoned = is_abandoned
+        self.render = render
+        self.render_time = render_time
+        self.make_absolute_links = (
+            make_absolute_links
+            if make_absolute_links is not None
+            else setting.MAKE_ABSOLUTE_LINKS
+        )
+
+        # 自定义属性,不参与序列化
+        self.requests_kwargs = {}
+        for key, value in kwargs.items():
+            if key in self.__class__.__REQUEST_ATTRS__:  # 取requests参数
+                self.requests_kwargs[key] = value
+
+            self.__dict__[key] = value
+
+        self.custom_ua = False
+        self.custom_proxies = False
+
+    def __repr__(self):
+        try:
+            return "<Request {}>".format(self.url)
+        except:
+            return "<Request {}>".format(str(self.to_dict)[:40])
+
+    def __setattr__(self, key, value):
+        """
+        针对 request.xxx = xxx 的形式,更新reqeust及内部参数值
+        @param key:
+        @param value:
+        @return:
+        """
+        self.__dict__[key] = value
+
+        if key in self.__class__.__REQUEST_ATTRS__:
+            self.requests_kwargs[key] = value
+
+    def __lt__(self, other):
+        return self.priority < other.priority
+
+    @property
+    def _proxies_pool(self):
+        if not self.__class__.proxies_pool:
+            self.__class__.proxies_pool = ProxyPool()
+
+        return self.__class__.proxies_pool
+
+    @property
+    def _downloader(self):
+        if not self.__class__.downloader:
+            self.__class__.downloader = tools.import_cls(setting.DOWNLOADER)()
+
+        return self.__class__.downloader
+
+    @property
+    def _session_downloader(self):
+        if not self.__class__.session_downloader:
+            self.__class__.session_downloader = tools.import_cls(
+                setting.SESSION_DOWNLOADER
+            )()
+
+        return self.__class__.session_downloader
+
+    @property
+    def _render_downloader(self):
+        if not self.__class__.render_downloader:
+            self.__class__.render_downloader = tools.import_cls(
+                setting.RENDER_DOWNLOADER
+            )()
+
+        return self.__class__.render_downloader
+
    @property
    def to_dict(self):
        """
        Serialize this request to a plain dict for queueing/persistence.

        NOTE: accessing this property mutates self -- ``callback`` and
        ``download_midware`` are replaced by their ``__name__`` strings so
        they can be serialized. Non-primitive values are pickled via
        ``tools.dumps_obj``.
        """
        request_dict = {}

        # Replace a callable callback with its name string.
        self.callback = (
            getattr(self.callback, "__name__")
            if callable(self.callback)
            else self.callback
        )

        # Same normalization for download middlewares (single or list).
        if isinstance(self.download_midware, (tuple, list)):
            self.download_midware = [
                getattr(download_midware, "__name__")
                if callable(download_midware)
                else download_midware
                for download_midware in self.download_midware
            ]
        else:
            self.download_midware = (
                getattr(self.download_midware, "__name__")
                if callable(self.download_midware)
                else self.download_midware
            )

        for key, value in self.__dict__.items():
            # Skip attributes still holding their default value, and custom
            # properties that must not be serialized.
            if (
                key in self.__class__._DEFAULT_KEY_VALUE_
                and self.__class__._DEFAULT_KEY_VALUE_.get(key) == value
                or key in self.__class__._CUSTOM_PROPERTIES_
            ):
                continue

            if value is not None:
                if key in self.__class__.__REQUEST_ATTRS__:
                    # requests-native kwargs may stay as basic containers.
                    if not isinstance(
                        value, (bytes, bool, float, int, str, tuple, list, dict)
                    ):
                        value = tools.dumps_obj(value)
                else:
                    # Other attributes: only scalars pass through unpickled.
                    if not isinstance(value, (bytes, bool, float, int, str)):
                        value = tools.dumps_obj(value)

            request_dict[key] = value

        return request_dict
+
+    @property
+    def callback_name(self):
+        return (
+            getattr(self.callback, "__name__")
+            if callable(self.callback)
+            else self.callback
+        )
+
    def make_requests_kwargs(self):
        """
        Normalize ``requests_kwargs`` before downloading: fill in default
        timeout, stream, verify, HTTP method, User-Agent and proxies.
        """
        # Default timeout
        self.requests_kwargs.setdefault(
            "timeout", setting.REQUEST_TIMEOUT
        )  # connect=22 read=22

        # stream:
        # By default the response body is downloaded immediately.
        # With stream=True only the headers are fetched; the body is only
        # downloaded when Response.content is accessed.
        # Drawback: with stream=True requests cannot release the connection
        # back to the pool unless all data is consumed or Response.close is
        # called, which hurts connection reuse.
        self.requests_kwargs.setdefault("stream", True)

        # Disable TLS certificate verification
        self.requests_kwargs.setdefault("verify", False)

        # Pick the HTTP method: POST when a body is present, GET otherwise
        method = self.__dict__.get("method")
        if not method:
            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
                method = "POST"
            else:
                method = "GET"
        self.method = method

        # User-Agent handling
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.random_user_agent and setting.RANDOM_HEADERS:
                # Pick a random user-agent from the pool
                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
                headers.update({"User-Agent": ua})
                self.requests_kwargs.update(headers=headers)
            else:
                # Fall back to the configured default user-agent.
                # NOTE(review): if a headers dict was passed without a UA,
                # this setdefault is a no-op, so DEFAULT_USERAGENT only
                # applies when no headers were given at all -- confirm intended.
                self.requests_kwargs.setdefault(
                    "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
                )
        else:
            self.custom_ua = True

        # Proxies: -1 marks "not provided by the caller"
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
            while True:
                # NOTE(review): busy-waits with no sleep while the pool is
                # empty -- consider delaying between attempts.
                proxies = self._proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")
        else:
            self.custom_proxies = True
+
    def get_response(self, save_cached=False):
        """
        Download and return a Response with selector support.

        @param save_cached: cache the response in redis (debug aid, avoids
                            re-downloading on every run)
        @return: Response
        """
        self.make_requests_kwargs()

        log.debug(
            """
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                args = %s
                """
            % (
                ""
                if not self.parser_name
                else "%s.%s "
                % (
                    self.parser_name,
                    (
                        self.callback
                        and callable(self.callback)
                        and getattr(self.callback, "__name__")
                        or self.callback
                    )
                    or "parse",
                ),
                self.url,
                self.method,
                self.requests_kwargs,
            )
        )

        # def hooks(response, *args, **kwargs):
        #     print(response.url)
        #
        # self.requests_kwargs.update(hooks={'response': hooks})

        # self.use_session takes precedence over the global setting
        use_session = (
            setting.USE_SESSION if self.use_session is None else self.use_session
        )

        # Route to the rendering / session / plain downloader.
        if self.render:
            response = self._render_downloader.download(self)
        elif use_session:
            response = self._session_downloader.download(self)
        else:
            response = self._downloader.download(self)

        # Propagate the link-absolutization flag onto the response.
        response.make_absolute_links = self.make_absolute_links

        if save_cached:
            self.save_cached(response, expire_time=self.__class__.cached_expire_time)

        return response
+
+    def get_params(self):
+        return self.requests_kwargs.get("params")
+
+    def get_proxies(self) -> dict:
+        """
+
+        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
+
+        """
+        return self.requests_kwargs.get("proxies")
+
+    def get_proxy(self) -> str:
+        """
+
+        Returns: ip:port
+
+        """
+        proxies = self.get_proxies()
+        if proxies:
+            return re.sub(
+                "http.*?//", "", proxies.get("http", "") or proxies.get("https", "")
+            )
+
+    def get_headers(self) -> dict:
+        return self.requests_kwargs.get("headers", {})
+
+    def get_user_agent(self) -> str:
+        return self.get_headers().get("user_agent") or self.get_headers().get(
+            "User-Agent"
+        )
+
+    def get_cookies(self) -> dict:
+        cookies = self.requests_kwargs.get("cookies")
+        if cookies and isinstance(cookies, RequestsCookieJar):
+            cookies = cookies.get_dict()
+
+        if not cookies:
+            cookie_str = self.get_headers().get("Cookie") or self.get_headers().get(
+                "cookie"
+            )
+            if cookie_str:
+                cookies = tools.get_cookies_from_str(cookie_str)
+        return cookies
+
+    @property
+    def fingerprint(self):
+        """
+        request唯一表识
+        @return:
+        """
+        url = self.__dict__.get("url", "")
+        # url 归一化
+        url = tools.canonicalize_url(url)
+        args = [url]
+
+        for arg in ["params", "data", "files", "auth", "cert", "json"]:
+            if self.requests_kwargs.get(arg):
+                args.append(self.requests_kwargs.get(arg))
+
+        return tools.get_md5(*args)
+
+    @property
+    def _cache_db(self):
+        if not self.__class__.cache_db:
+            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
+
+        return self.__class__.cache_db
+
+    @property
+    def _cached_redis_key(self):
+        if self.__class__.cached_redis_key:
+            return (
+                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
+            )
+        else:
+            return f"response_cached:test:{self.fingerprint}"
+
+    def save_cached(self, response, expire_time=1200):
+        """
+        使用redis保存response 用于调试 不用每回都下载
+        @param response:
+        @param expire_time: 过期时间
+        @return:
+        """
+
+        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
+
    def get_response_from_cached(self, save_cached=True):
        """
        Fetch the response from the redis cache, downloading on a miss.

        Caveats versus a freshly downloaded response:
            attributes that come back empty:
                -raw : urllib3.response.HTTPResponse
                -connection:requests.adapters.HTTPAdapter
                -history

            attributes whose meaning changes:
                - request: becomes this Request instead of requests' request
        @param: save_cached on a cache miss, whether to cache the fresh download
        @return: Response
        """
        response_dict = self._cache_db.strget(self._cached_redis_key)
        if not response_dict:
            log.info("无response缓存  重新下载")
            response_obj = self.get_response(save_cached=save_cached)
        else:
            # SECURITY NOTE(review): eval() on redis content executes arbitrary
            # expressions; safe only if the redis instance is fully trusted.
            # ast.literal_eval is not a drop-in replacement because the stored
            # dict may contain non-literal reprs (e.g. the headers object).
            response_dict = eval(response_dict)
            response_obj = Response.from_dict(response_dict)
        return response_obj
+
+    def del_response_cached(self):
+        self._cache_db.clear(self._cached_redis_key)
+
+    @classmethod
+    def from_dict(cls, request_dict):
+        for key, value in request_dict.items():
+            if isinstance(value, bytes):  # 反序列化 如item
+                request_dict[key] = tools.loads_obj(value)
+
+        return cls(**request_dict)
+
+    def copy(self):
+        return self.__class__.from_dict(copy.deepcopy(self.to_dict))

+ 396 - 0
A数据处理/site_monitor/network/response.py

@@ -0,0 +1,396 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-05-10 
+---------
+@summary:  
+---------
+@author: Dzr
+"""
+import copy
+import datetime
+import os
+import re
+import time
+from urllib.parse import urlparse, urlunparse, urljoin
+
+from bs4 import BeautifulSoup
+from bs4.dammit import UnicodeDammit
+from lxml.html import fromstring, HtmlElement
+from lxml.html.clean import Cleaner
+from parsel import Selector
+from requests.cookies import RequestsCookieJar
+from requests.models import Response as res
+from w3lib.encoding import (
+    http_content_type_encoding,
+    html_body_declared_encoding
+)
+
+from utils.log import logger as log
+
# Encoding requests falls back to when detection fails.
FAIL_ENCODING = "ISO-8859-1"

# Special characters in html source that must be removed, otherwise they
# break building the etree.
SPECIAL_CHARACTERS = [
    # Control characters; full list: https://zh.wikipedia.org/wiki/%E6%8E%A7%E5%88%B6%E5%AD%97%E7%AC%A6
    "[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]"
]

# Pre-compiled regex patterns for the character classes above.
SPECIAL_CHARACTER_PATTERNS = [
    re.compile(special_character) for special_character in SPECIAL_CHARACTERS
]
+
+
def iter_node(element: HtmlElement):
    """Pre-order traversal: yield *element*, then every HtmlElement beneath it."""
    yield element
    for child in element:
        if not isinstance(child, HtmlElement):
            continue
        yield from iter_node(child)
+
+
class Response(res):
    """requests.Response wrapper adding lazy text decoding with layered
    encoding detection, absolute-link rewriting, control-character stripping,
    parsel/bs4 selector helpers, and dict (de)serialization for caching.
    """

    def __init__(self, response):
        super(Response, self).__init__()
        # Adopt the full state of the wrapped requests.Response instance.
        self.__dict__.update(response.__dict__)

        # Lazily computed caches, reset when the encoding is reassigned.
        self._cached_selector = None
        self._cached_text = None
        self._cached_json = None

        # Explicitly set encoding; overrides detection when not None.
        self._encoding = None

        self.encoding_errors = "strict"  # strict / replace / ignore

    @classmethod
    def from_dict(cls, response_dict):
        """
        Build a Response from a dict snapshot (see ``to_dict``).
        @param response_dict: the raw response.__dict__
        @return: Response
        """
        cookie_jar = RequestsCookieJar()
        cookie_jar.update(other=response_dict["cookies"])
        response_dict["cookies"] = cookie_jar

        response_dict["elapsed"] = datetime.timedelta(
            0, 0, response_dict["elapsed"]
        )  # elapsed time (stored as microseconds)
        response_dict["connection"] = None
        response_dict["_content_consumed"] = True

        response = res()
        response.__dict__.update(response_dict)
        return cls(response)

    @property
    def to_dict(self):
        # Minimal serializable snapshot of this response (inverse: from_dict).
        response_dict = {
            "_content": self.content,
            "cookies": self.cookies.get_dict(),
            "encoding": self.encoding,
            "headers": self.headers,
            "status_code": self.status_code,
            "elapsed": self.elapsed.microseconds,  # elapsed time
            "url": self.url,
        }

        return response_dict

    def __clear_cache(self):
        # Invalidate the decoded caches (called when encoding is reassigned).
        self.__dict__["_cached_selector"] = None
        self.__dict__["_cached_text"] = None
        self.__dict__["_cached_json"] = None

    @property
    def encoding(self):
        """
        Encoding priority: explicitly set encoding > headers encoding >
        encoding declared in the page > encoding guessed from content.
        """
        self._encoding = (
            self._encoding
            or self._headers_encoding()
            or self._body_declared_encoding()
            or self.apparent_encoding
        )
        return self._encoding

    @encoding.setter
    def encoding(self, val):
        self.__clear_cache()
        self._encoding = val

    code = encoding  # alias: response.code behaves like response.encoding

    def _headers_encoding(self):
        """
        Charset taken from the Content-Type header.

        NOTE(review): the conditional binds the whole ``or`` expression, so
        this returns None for any non-JSON content type even when the header
        declares a charset (e.g. "text/html; charset=gbk") -- confirm whether
        the header charset should also win for non-JSON responses.
        """
        content_type = self.headers.get("Content-Type") or self.headers.get(
            "content-type"
        )
        if content_type:
            return (
                http_content_type_encoding(content_type) or "utf-8"
                if "application/json" in content_type
                else None
            )

    def _body_declared_encoding(self):
        """
        Encoding declared in the document itself, e.g. <meta charset="...">.
        """

        return html_body_declared_encoding(self.content)

    def _get_unicode_html(self, html):
        # Last-resort decoding via BeautifulSoup's UnicodeDammit sniffer.
        if not html or not isinstance(html, bytes):
            return html

        converted = UnicodeDammit(html, is_html=True)
        if not converted.unicode_markup:
            raise Exception(
                "Failed to detect encoding of article HTML, tried: %s"
                % ", ".join(converted.tried_encodings)
            )

        html = converted.unicode_markup
        return html

    def _make_absolute(self, link):
        """Makes a given link absolute (resolved against self.url)."""
        try:

            link = link.strip()

            # Parse the link with stdlib.
            parsed = urlparse(link)._asdict()

            # If link is relative, then join it with base_url.
            if not parsed["netloc"]:
                return urljoin(self.url, link)

            # Link is absolute; if it lacks a scheme, add one from base_url.
            if not parsed["scheme"]:
                parsed["scheme"] = urlparse(self.url).scheme

                # Reconstruct the URL to incorporate the new scheme.
                parsed = (v for v in parsed.values())
                return urlunparse(parsed)

        except Exception as e:
            log.error(
                "Invalid URL <{}> can't make absolute_link. exception: {}".format(
                    link, e
                )
            )

        # Link is absolute and complete with scheme; nothing to be done here.
        return link

    def _absolute_links(self, text):
        # Rewrite relative a/img/link/script urls in the html to absolute ones.
        regexs = [
            r'(<(?i)a.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # a
            r'(<(?i)img.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # img
            r'(<(?i)link.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # css
            r'(<(?i)script.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # js
        ]

        for regex in regexs:

            def replace_href(text):
                # html = text.group(0)
                link = text.group(2)
                absolute_link = self._make_absolute(link)

                # Group concatenation instead of re.sub with a template:
                # backreference templates break on some characters, e.g. the
                # source of http://permit.mep.gov.cn/permitExt/syssb/xxgk/xxgk!showImage.action?dataid=0b092f8115ff45c5a50947cdea537726
                return text.group(1) + absolute_link + text.group(3)

            text = re.sub(regex, replace_href, text, flags=re.S)

        return text

    def _del_special_character(self, text):
        """
        Strip special (control) characters that break etree construction.
        """
        for special_character_pattern in SPECIAL_CHARACTER_PATTERNS:
            text = special_character_pattern.sub("", text)

        return text

    @property
    def __text(self):
        """Content of the response, in unicode.

        If Response.encoding is None, encoding will be guessed using
        ``chardet``.

        The encoding of the response content is determined based solely on HTTP
        headers, following RFC 2616 to the letter. If you can take advantage of
        non-HTTP knowledge to make a better guess at the encoding, you should
        set ``r.encoding`` appropriately before accessing this property.
        """

        if not self.content:
            return ""

        # Decode unicode from given encoding.
        try:
            content = str(self.content, self.encoding, errors=self.encoding_errors)
        except (LookupError, TypeError):
            # A LookupError is raised if the encoding was not found which could
            # indicate a misspelling or similar mistake.
            #
            # A TypeError can be raised if encoding is None
            #
            # So we try blindly encoding.
            content = str(self.content, errors=self.encoding_errors)

        return content

    @property
    def text(self):
        # Decoded page text: decoded once, then links absolutized and control
        # characters removed; the result is cached.
        if self._cached_text is None:
            if self.encoding and self.encoding.upper() != FAIL_ENCODING:
                try:
                    self._cached_text = self.__text
                except UnicodeDecodeError:
                    self._cached_text = self._get_unicode_html(self.content)
            else:
                self._cached_text = self._get_unicode_html(self.content)

            if self._cached_text:
                self._cached_text = self._absolute_links(self._cached_text)
                self._cached_text = self._del_special_character(self._cached_text)

        return self._cached_text

    @text.setter
    def text(self, html):
        # Replacing the text re-applies link/character normalization and
        # rebuilds the selector cache.
        self._cached_text = html
        self._cached_text = self._absolute_links(self._cached_text)
        self._cached_text = self._del_special_character(self._cached_text)
        self._cached_selector = Selector(self.text)

    @property
    def json(self, **kwargs):
        # NOTE(review): as a property, **kwargs can never be supplied by
        # callers; they only forward to requests' json() when empty.
        if self._cached_json is None:
            self.encoding = self.encoding or "utf-8"
            self._cached_json = super(Response, self).json(**kwargs)

        return self._cached_json

    @property
    def content(self):
        # Raw body bytes, delegated to requests.Response.
        content = super(Response, self).content
        return content

    @property
    def is_html(self):
        # True when the Content-Type header marks this as html.
        content_type = self.headers.get("Content-Type", "")
        if "text/html" in content_type:
            return True
        else:
            return False

    @property
    def selector(self):
        # Cached parsel Selector over the decoded text.
        if self._cached_selector is None:
            self._cached_selector = Selector(self.text)
        return self._cached_selector

    def bs4(self, features="html.parser"):
        # BeautifulSoup view over the decoded text.
        soup = BeautifulSoup(self.text, features)
        return soup

    def extract(self):
        # Full (normalized) html as parsel sees it.
        return self.selector.get()

    def xpath(self, query, **kwargs):
        return self.selector.xpath(query, **kwargs)

    def css(self, query):
        return self.selector.css(query)

    def re(self, regex, replace_entities=False):
        """
        @summary: regex search over the decoded text
        Note: page source like <a class='page-numbers'... is normalized to
        <a class="page-numbers", so write patterns with double quotes, e.g.
        <a class="(.*?)". Non-html text keeps its original quoting.
        For convenience, single and double quotes in the pattern are treated
        as interchangeable.
        ---------
        @param regex: pattern string or re.compile object
        @param replace_entities: when True, strips &nbsp; etc. and unescapes
        &quot; to ", which changes the page structure. Keep it False when
        extracting json from page source.
        ---------
        @result: list of matches
        """

        # Make single and double quotes interchangeable
        if isinstance(regex, str):
            regex = re.sub("['\"]", "['\"]", regex)

        return self.selector.re(regex, replace_entities)

    def re_first(self, regex, default=None, replace_entities=False):
        """
        @summary: regex search over the decoded text, first match only
        Note: page source like <a class='page-numbers'... is normalized to
        <a class="page-numbers", so write patterns with double quotes, e.g.
        <a class="(.*?)". Non-html text keeps its original quoting.
        For convenience, single and double quotes in the pattern are treated
        as interchangeable.
        ---------
        @param regex: pattern string or re.compile object
        @param default: value returned when nothing matches
        @param replace_entities: when True, strips &nbsp; etc. and unescapes
        &quot; to ", which changes the page structure. Keep it False when
        extracting json from page source.
        ---------
        @result: first match or the default
        """

        # Make single and double quotes interchangeable
        if isinstance(regex, str):
            regex = re.sub("['\"]", "['\"]", regex)

        return self.selector.re_first(regex, default, replace_entities)

    def close_browser(self, request):
        # Return the rendering browser to the driver pool, if one was attached.
        if hasattr(self, "browser"):
            request._webdriver_pool.remove(self.browser)
            del self.browser

    def __del__(self):
        # Release the underlying connection when garbage collected.
        self.close()

    def open(self, delete_temp_file=False):
        # Dump the decoded page to temp.html and open it in the default
        # application. NOTE(review): `open` is the macOS launcher -- confirm
        # this debug helper is only used on macOS.
        with open("temp.html", "w", encoding=self.encoding, errors="replace") as html:
            self.encoding_errors = "replace"
            html.write(self.text)

        os.system("open temp.html")

        if delete_temp_file:
            time.sleep(1)
            os.remove("temp.html")

    @property
    def plain_text(self):
        # List of individual CJK characters found in the decoded text
        # (re.findall with a single-character class yields one char per match).
        return re.findall('[\u4e00-\u9fa5]', self.text, re.S)

    def tags(self):
        """Histogram of element tag names in the cleaned document, plus a
        'tags_count' total. Returns {'tags_count': 0} for an empty page.
        """
        tags_dict = {}

        # deepcopy of a str is effectively a copy; kept for safety.
        html = copy.deepcopy(self.text)
        if len(html) == 0:
            tags_dict['tags_count'] = 0
            return tags_dict

        # Strip scripts/styles etc. before counting structure.
        cleaner = Cleaner()
        html = cleaner.clean_html(html)

        count = 0
        node = fromstring(html)
        for elem in iter_node(node.xpath('/html')[0]):
            count += 1
            tag = elem.tag
            if not tags_dict.get(tag):
                tags_dict[tag] = 1
            else:
                tags_dict[tag] += 1

        tags_dict['tags_count'] = count
        return tags_dict

+ 389 - 0
A数据处理/site_monitor/network/user_agent.py

@@ -0,0 +1,389 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2016-12-28 17:55
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import random
+
+USER_AGENTS = {
+    "chrome": [
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
+        "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
+        "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
+        "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
+        "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
+    ],
+    "opera": [
+        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
+        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
+        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
+        "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
+        "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
+        "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
+        "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
+        "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
+        "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
+        "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
+        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
+        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
+        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
+        "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
+        "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
+        "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
+        "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
+        "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
+        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
+        "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
+        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
+        "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
+        "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
+        "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
+        "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
+        "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
+        "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
+    ],
+    "firefox": [
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
+        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
+        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
+        "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
+        "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
+        "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101  Firefox/28.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
+        "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
+        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
+        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
+        "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
+        "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0)  Gecko/20100101 Firefox/18.0",
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
+    ],
+    "internetexplorer": [
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
+        "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0;  rv:11.0) like Gecko",
+        "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
+        "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
+        "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
+        "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
+        "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
+        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)",
+        "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)",
+    ],
+    "safari": [
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+    ],
+    "mobile": [
+        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
+        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
+        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
+        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
+        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
+        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
+        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
+        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
+        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
+        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
+        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
+        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Safari/605.1.15",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
+    ],
+}
+
+
def get(ua_type: str = None):
    """Return a random User-Agent string.

    :param ua_type: optional key of USER_AGENTS; any falsy value picks a random type.
    :raises ValueError: when ua_type is given but is not a known type.
    """
    known_types = list(USER_AGENTS.keys())
    if not ua_type:
        ua_type = random.choice(known_types)
    elif ua_type not in USER_AGENTS:
        raise ValueError("ua_type error, expect one of {}".format(known_types))

    return random.choice(USER_AGENTS[ua_type])

+ 14 - 0
A数据处理/site_monitor/requirements.txt

@@ -0,0 +1,14 @@
+beautifulsoup4==4.9.3
+bs4==0.0.1
+loguru==0.5.3
+lxml==4.9.1
+numpy==1.24.1
+parsel==1.7.0
+playwright==1.24.1
+pymongo==3.12.0
+redis==3.5.3
+requests==2.30.0
+six==1.16.0
+w3lib==2.1.1
+PyExecJS>=1.5.1
+redis-py-cluster>=2.1.0

+ 65 - 0
A数据处理/site_monitor/setting.py

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
"""Crawler configuration file."""
import os

# MONGODB
MONGO_IP = "172.17.4.87"
MONGO_PORT = 27080
MONGO_DB = "py_spider"
MONGO_USER_NAME = os.getenv("MONGO_USER_NAME")
MONGO_USER_PASS = os.getenv("MONGO_USER_PASS")

# REDIS
# ip:port — multiple nodes may be a list or comma separated,
# e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
REDISDB_IP_PORTS = "172.17.4.232:7361"
# NOTE(review): credential hard-coded in source — consider os.getenv, as done for Mongo above
REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
REDISDB_DB = 4
# Service name, used for Redis sentinel mode
REDISDB_SERVICE_NAME = os.getenv("REDISDB_SERVICE_NAME")

# Browser rendering
PLAYWRIGHT = dict(
    user_agent=None,  # a string, or a zero-arg callable returning a user_agent
    proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a zero-arg callable returning a proxy address
    headless=True,  # run the browser headless
    driver_type="webkit",  # chromium, firefox or webkit
    timeout=60,  # request timeout (seconds)
    window_size=(1024, 800),  # window size
    executable_path=None,  # browser binary path; None uses the default install
    download_path=None,  # directory for downloaded files
    render_time=0,  # seconds to wait after opening the page before grabbing the source
    wait_until="networkidle",  # page-load event to wait for: "commit", "domcontentloaded", "load", "networkidle"
    use_stealth_js=False,  # inject stealth.min.js to hide browser automation fingerprints
    page_on_event_callback=None,  # page.on() event callbacks, e.g. page_on_event_callback={"dialog": lambda dialog: dialog.accept()}
    storage_state_path=None,  # path for persisted browser state
    url_regexes=None,  # request-interception patterns; list of regexes
    save_all=False,  # save every intercepted response (with url_regexes); False keeps only the last one
)

# Network timeout for plain `requests` downloads
REQUEST_TIMEOUT = 30  # float seconds, or a (connect timeout, read timeout) tuple

# Proxy settings
PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"  # proxy extraction API; proxies separated by \r\n
PROXY_ENABLE = True

# Randomized request headers
RANDOM_HEADERS = True
# UserAgent type: one of 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari', 'mobile'; random when unset
USER_AGENT_TYPE = "chrome"
# Default User-Agent header
DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
# Use a requests Session for downloads
USE_SESSION = False

# Downloader implementations (dotted import paths)
DOWNLOADER = "network.downloader.RequestsDownloader"
SESSION_DOWNLOADER = "network.downloader.RequestsSessionDownloader"
RENDER_DOWNLOADER = "network.downloader.PlaywrightDownloader"
MAKE_ABSOLUTE_LINKS = True  # rewrite relative links to absolute ones

# WeChat Work alerting
WECHAT_WARNING_URL = ""  # WeChat Work robot api
WECHAT_WARNING_PHONE = ""  # who to @ in the group; a list may be given to mention several people
WECHAT_WARNING_ALL = False  # @everyone; default False
WARNING_INTERVAL = 3600  # seconds between identical warnings (anti-spam); 0 disables de-duplication

+ 8 - 0
A数据处理/site_monitor/utils/__init__.py

@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2023-05-10 
+---------
+@summary:  
+---------
+@author: Dzr
+"""

+ 147 - 0
A数据处理/site_monitor/utils/clean_html.py

@@ -0,0 +1,147 @@
+import re
+__all__ = ['cleaner']
+
# Standalone elements (keys are regexes applied to the page; values are replacements)
INDEPENDENT_TAGS = {
    '<head>[\s\S]*?</head>': '',
    '<html>|<html [^>]*>|</html>': '',
    '<body>|<body [^>]*>|</body>': '',
    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    '\\xa0|\\u3000': '',  # non-breaking / ideographic spaces
    '<!--[\s\S]*?-->': '',  # HTML comments
    '<style[^<>]*>[\s\S]*?</style>': '',  # style sheets
    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    '<input>': '',  # input boxes
    '<img[^>]*>': '<br>',  # images
}
# Inline elements
INLINE_TAGS = {
    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
    '<link>|<link [^>]*>|</link>': '',  # link tags
    '<span>|<span [^>]*>|</span>': '',  # span
    '<label>|<label [^>]*>|</label>': '<br>',  # label
    '<font>|<font [^>]*>|</font>': '',  # font
    'data:image(.*?) ': '',            # base64-embedded images
}
# Block-level elements
BLOCK_TAGS = {
    '<div>\s*?</div>':'',
    '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    '<p>|<p [^>]*>': '<br>',  # paragraph (opening tag)
    '</p>': '',  # paragraph (closing tag)
    '<div>|<div [^>]*>': '<br>',  # division (opening tag)
    '</div>': '',  # division (closing tag)
    '<o:p>|<o:p [^>]*>|</o:p>': ''  # MS Office Word paragraphs
}
# Miscellaneous boilerplate (Chinese UI strings like "close"/"print"/"source" links)
OTHER = {
    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
    '<epointform>': '',
    '<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    '【字体:[\s\S]*】': '',
    '文章来源:[\u4e00-\u9fa5]+': '',
    '浏览次数:.*[<]+': '',
    '(责任编辑:.*?)': '',
    '分享到[:]': '',

}
# Presentation attributes stripped from remaining tags
CSS_STYLE = {
    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',

}
# Whitespace normalization
BLANKS = {
    '\n\s*\n': '\n',
    '\s*\n\s*': '\n',
    '[^\S\n]': ' ',
    '\s+': ' ',
}
# Tag names used by _repair_tag to fix fused "tagattr" tokens
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute names used by _repair_tag to fix fused "tagattr" tokens
ATTRS = {'id', 'class', 'style', 'width'}
+
+
+def _repair_tag():
+    """异常的标签组合,用来替换非标准页面的标签"""
+    _repairs = {}
+    for tag in TAGS:
+        for attr in ATTRS:
+            key = '{}{}'.format(tag, attr)
+            val = '{} {}'.format(tag, attr)
+            _repairs[key] = val
+    return _repairs
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    # 不显示输入框边框
+    html = html.replace('<input', '<input style="border-color: transparent;"')
+    return html
+
+
def _lowercase_tag(html):
    """Normalize all tags to lowercase and repair fused tag/attribute tokens."""
    unique_tags = set(re.findall("<[^>]+>", html))

    if len(unique_tags) > 10000:
        # Too many distinct tags for per-tag string replacement;
        # let BeautifulSoup re-serialize the document instead.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "lxml")
        html = str(soup.body.next_element)
    else:
        for tag in unique_tags:
            html = html.replace(tag, tag.lower())

    for broken, fixed in _repair_tag().items():
        html = html.replace(broken, fixed)

    return html
+
+
def cleaner(html, special=None, completely=False):
    """
    Clean a page's HTML.

    :param html: page source to clean
    :param special: extra per-call cleaning rules ({regex: replacement})
    :param completely: also strip canvas/iframe and garbage pseudo-tags
    :return: cleaned page source
    """
    if special is None:
        special = {}

    # Bug fix: the original did ``OTHER.update(special)``, permanently mutating
    # the module-level dict so one call's special rules leaked into every later
    # call. Merge into a fresh dict instead (special still overrides OTHER).
    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **OTHER,
        **special,
        **CSS_STYLE,
        **BLANKS,
    }
    html = _lowercase_tag(html)
    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)

    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frames
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    return html

文件差异内容过多而无法显示
+ 0 - 0
A数据处理/site_monitor/utils/js/intercept.js


文件差异内容过多而无法显示
+ 6 - 0
A数据处理/site_monitor/utils/js/stealth.min.js


+ 14 - 0
A数据处理/site_monitor/utils/log.py

@@ -0,0 +1,14 @@
from pathlib import Path

from loguru import logger

# Log files live in <project root>/logs, one file per day (loguru fills {time}).
_absolute = Path(__file__).absolute().parent.parent
_log_path = (_absolute / 'logs/log_{time:YYYY-MM-DD}.log').resolve()
logger.add(
    _log_path,
    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
    level='INFO',
    rotation='00:00',  # rotate at midnight
    retention='1 week',  # keep one week of files
    encoding='utf-8',
)

+ 2438 - 0
A数据处理/site_monitor/utils/tools.py

@@ -0,0 +1,2438 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-09-06 14:21
+---------
+@summary: 工具
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import asyncio
+import calendar
+import codecs
+import configparser  # 读配置文件的
+import datetime
+import functools
+import hashlib
+import html
+import importlib
+import json
+import os
+import pickle
+import random
+import re
+import socket
+import ssl
+import string
+import sys
+import time
+import traceback
+import urllib
+import urllib.parse
+import uuid
+import weakref
+from functools import partial, wraps
+from hashlib import md5
+from pprint import pformat
+from pprint import pprint
+from urllib import request
+from urllib.parse import urljoin
+
+import bson
+import execjs  # pip install PyExecJS
+import redis
+import requests
+import six
+from requests.cookies import RequestsCookieJar
+from w3lib.url import canonicalize_url as _canonicalize_url
+
+import setting as setting
+from db.redisdb import RedisDB
+from utils.log import logger as log
+
os.environ["EXECJS_RUNTIME"] = "Node"  # make PyExecJS run JavaScript with Node

# Globally disable SSL certificate verification
ssl._create_default_https_context = ssl._create_unverified_context

TIME_OUT = 30  # default network timeout (seconds) for the request helpers below
TIMER_TIME = 5

redisdb = None  # lazily-created RedisDB singleton; see get_redisdb()
+
+
def get_redisdb():
    """Return the module-level RedisDB connection, creating it on first use."""
    global redisdb
    if not redisdb:
        redisdb = RedisDB()
    return redisdb
+
+
+# 装饰器
class Singleton(object):
    """Class decorator: cache and reuse a single instance of the wrapped class.

    Note: the decorated name is rebound to this Singleton object; calling it
    with arguments only matters on the first call.
    """

    def __init__(self, cls):
        self._cls = cls
        self._instance = {}

    def __call__(self, *args, **kwargs):
        instance = self._instance.get(self._cls)
        if instance is None:
            instance = self._cls(*args, **kwargs)
            self._instance[self._cls] = instance
        return instance
+
+
def log_function_time(func):
    """Decorator: log the wrapped function's wall-clock run time at DEBUG level."""
    try:

        @functools.wraps(func)  # copy the wrapped function's metadata onto the wrapper
        def calculate_time(*args, **kw):
            began_time = time.time()
            callfunc = func(*args, **kw)
            end_time = time.time()
            log.debug(func.__name__ + " run time  = " + str(end_time - began_time))
            return callfunc

        return calculate_time
    except:
        # defensive fallback: if wrapping fails, return the function unchanged
        log.debug("求取时间无效 因为函数参数不符")
        return func
+
+
def run_safe_model(module_name):
    """Decorator factory: wrap a function so any exception is logged (tagged
    with ``module_name``) and swallowed; the wrapper then returns None."""

    def inner_run_safe_model(func):
        try:

            @functools.wraps(func)  # copy the wrapped function's metadata onto the wrapper
            def run_func(*args, **kw):
                callfunc = None
                try:
                    callfunc = func(*args, **kw)
                except Exception as e:
                    log.error(module_name + ": " + func.__name__ + " - " + str(e))
                    traceback.print_exc()
                return callfunc

            return run_func
        except Exception as e:
            # defensive: if building the wrapper itself fails, log and fall
            # back to the undecorated function
            log.error(module_name + ": " + func.__name__ + " - " + str(e))
            traceback.print_exc()
            return func

    return inner_run_safe_model
+
+
def memoizemethod_noargs(method):
    """Decorator to cache the result of a method (without arguments) using a
    weak reference to its object, so cached entries die with the instance.
    """
    cache = weakref.WeakKeyDictionary()

    @functools.wraps(method)
    def new_method(self, *args, **kwargs):
        try:
            return cache[self]
        except KeyError:
            value = method(self, *args, **kwargs)
            cache[self] = value
            return value

    return new_method
+
+
+########################【网页解析相关】###############################
+
+
+# @log_function_time
def get_html_by_requests(
    url, headers=None, code="utf-8", data=None, proxies=None, with_response=False
):
    """
    Fetch a page and return its text (POST when ``data`` is given, else GET).

    :param url: target url
    :param headers: optional request headers
    :param code: encoding to force on the response; falsy keeps requests' detection
    :param data: POST body; its presence switches the request to POST
    :param proxies: requests-style proxies mapping (default: no proxy)
    :param with_response: also return the raw Response (may be None on failure)
    :return: html string, or (html, response)
    """
    # Bug fix: the original used ``proxies={}`` — a mutable default argument
    # shared across all calls.
    if proxies is None:
        proxies = {}
    html = ""
    r = None
    try:
        if data:
            r = requests.post(
                url, headers=headers, timeout=TIME_OUT, data=data, proxies=proxies
            )
        else:
            r = requests.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies)

        if code:
            r.encoding = code
        html = r.text

    except Exception as e:
        log.error(e)
    finally:
        if r is not None:
            r.close()

    if with_response:
        return html, r
    else:
        return html
+
+
def get_json_by_requests(
    url,
    params=None,
    headers=None,
    data=None,
    proxies=None,
    with_response=False,
    cookies=None,
):
    """
    Fetch a url and parse the body as JSON (POST when ``data`` is given).

    Returns {} when the request or the JSON decode fails.

    :param url: target url
    :param params: query parameters
    :param headers: optional request headers
    :param data: POST body; its presence switches the request to POST
    :param proxies: requests-style proxies mapping (default: no proxy)
    :param with_response: also return the raw Response (may be None on failure)
    :param cookies: cookies to send
    :return: parsed json, or (json, response)
    """
    # Bug fixes: ``proxies={}`` was a shared mutable default argument, and the
    # local result was named ``json``, shadowing the imported json module.
    if proxies is None:
        proxies = {}
    result = {}
    response = None
    try:
        if data:
            response = requests.post(
                url,
                headers=headers,
                data=data,
                params=params,
                timeout=TIME_OUT,
                proxies=proxies,
                cookies=cookies,
            )
        else:
            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=TIME_OUT,
                proxies=proxies,
                cookies=cookies,
            )
        response.encoding = "utf-8"
        result = response.json()
    except Exception as e:
        log.error(e)
    finally:
        if response is not None:
            response.close()

    if with_response:
        return result, response
    else:
        return result
+
+
def get_cookies(response):
    """Return the response's cookies as a plain ``{name: value}`` dict."""
    cookies = requests.utils.dict_from_cookiejar(response.cookies)
    return cookies
+
+
def get_cookies_from_str(cookie_str):
    """
    Parse a 'k=v; k2=v2' cookie header string into a dict.

    >>> get_cookies_from_str("key=value; key2=value2; key3=; key4=; ")
    {'key': 'value', 'key2': 'value2', 'key3': '', 'key4': ''}

    Args:
        cookie_str: key=value; key2=value2; key3=; key4=

    Returns:
        dict of cookie name -> value
    """
    jar = {}
    for piece in cookie_str.split(";"):
        piece = piece.strip()
        if piece:
            name, value = piece.split("=", 1)
            jar[name.strip()] = value.strip()
    return jar
+
+
def get_cookies_jar(cookies):
    """
    @summary: convert selenium-style cookies into a requests CookieJar,
    usable as requests.get(xxx, cookies=jar)
    reference: https://www.cnblogs.com/small-bud/p/9064674.html

    ---------
    @param cookies: [{},{}] — list of dicts with "name" and "value" keys
    ---------
    @result: cookie jar
    """

    cookie_jar = RequestsCookieJar()
    for cookie in cookies:
        cookie_jar.set(cookie["name"], cookie["value"])

    return cookie_jar
+
+
def get_cookies_from_selenium_cookie(cookies):
    """
    Convert selenium-style cookies ([{"name": ..., "value": ...}, ...]) into a
    plain ``{name: value}`` dict, usable as requests.get(xxx, cookies=...).

    Entries without a truthy "name" are skipped.

    (Doc fix: the original docstring was copy-pasted from get_cookies_jar and
    wrongly claimed a CookieJar result.)

    ---------
    @param cookies: [{},{}]
    ---------
    @result: {name: value, ...}
    """

    cookie_dict = {}
    for cookie in cookies:
        if cookie.get("name"):
            cookie_dict[cookie["name"]] = cookie["value"]

    return cookie_dict
+
+
def cookiesjar2str(cookies):
    """Render a CookieJar as a 'k=v; ' header-style string (trailing '; ' kept)."""
    as_dict = requests.utils.dict_from_cookiejar(cookies)
    return "".join("{}={}; ".format(k, v) for k, v in as_dict.items())
+
+
def cookies2str(cookies):
    """Render a cookie dict as a 'k=v; ' string (trailing '; '; '' when empty)."""
    return "".join("{}={}; ".format(k, v) for k, v in cookies.items())
+
+
def get_urls(
    html,
    stop_urls=(
        "javascript",
        "+",
        ".css",
        ".js",
        ".rar",
        ".xls",
        ".exe",
        ".apk",
        ".doc",
        ".jpg",
        ".png",
        ".flv",
        ".mp4",
    ),
):
    """Extract <a> href targets from ``html``, de-duplicated in document order,
    dropping any url containing one of ``stop_urls`` (static assets etc.)."""
    # do not match urls like javascript:, '+', '#'
    regex = r'<a.*?href.*?=.*?["|\'](.*?)["|\']'

    urls = get_info(html, regex)
    urls = sorted(set(urls), key=urls.index)  # de-duplicate, keep first-seen order
    if stop_urls:
        stop_urls = isinstance(stop_urls, str) and [stop_urls] or stop_urls
        use_urls = []
        for url in urls:
            for stop_url in stop_urls:
                if stop_url in url:
                    break
            else:
                # no stop marker matched -> keep the url
                use_urls.append(url)

        urls = use_urls
    return urls
+
+
def get_full_url(root_url, sub_url):
    """
    Resolve a possibly-relative link against the page url.

    :param root_url: the page url used as base
    :param sub_url: relative (or absolute) link found on the page
    :return: the absolute url
    """
    return urljoin(root_url, sub_url)
+
+
def joint_url(url, params):
    """
    Append ``params`` to ``url`` as a query string, using '&' when the url
    already carries a query and '?' otherwise.

    :param url: base url
    :param params: dict of query parameters; empty/None returns url unchanged
    :return: url with the encoded query string appended
    """
    # Cleanup: removed the commented-out legacy implementation and call the
    # stdlib encoder directly instead of going through the local wrapper.
    if not params:
        return url

    query = urllib.parse.urlencode(params)
    separator = "?" if "?" not in url else "&"
    return url + separator + query
+
+
def canonicalize_url(url):
    """
    Normalize a url: sorts the query parameters and strips the fragment
    (delegates to w3lib's canonicalize_url).
    """
    return _canonicalize_url(url)
+
+
def get_url_md5(url):
    """Return the md5 of the normalized url, with an http:// scheme coerced to
    https:// so both variants hash identically (get_md5 is defined elsewhere
    in this module)."""
    url = canonicalize_url(url)
    url = re.sub("^http://", "https://", url)
    return get_md5(url)
+
+
def fit_url(urls, identis):
    """
    Return the de-duplicated urls that contain any of the given identifiers.

    Note: result order is unspecified (set-based de-duplication).
    """
    if isinstance(identis, str):
        identis = [identis]
    matched = {link for link in urls for identi in identis if identi in link}
    return list(matched)
+
+
def get_param(url, key):
    """Return the raw (undecoded) value of ``key`` in the url's query string,
    or None when the key is absent."""
    query = url.split("?")[-1]
    for pair in query.split("&"):
        parts = pair.split("=", 1)
        if parts[0] == key:
            return parts[1]
    return None
+
+
def urlencode(params):
    """
    Serialize a dict of parameters into a query string.

    @param params: e.g. {'a': 1, 'b': 2}
    @return: 'a=1&b=2'
    """
    return urllib.parse.urlencode(params)
+
+
def urldecode(url):
    """
    Parse a url's query string into a dict of decoded values.

    @param url: xxx?a=1&b=2
    @return: {'a': '1', 'b': '2'}
    """
    params_json = {}
    params = url.split("?")[-1].split("&")
    for param in params:
        # Bug fix: the original ``param.split("=")`` raised ValueError whenever
        # a value itself contained '=' (or the pair had none); partition is
        # robust to both and matches get_param's split("=", 1) behavior.
        key, _, value = param.partition("=")
        params_json[key] = urllib.parse.unquote(value, encoding="utf-8")

    return params_json
+
+
def unquote_url(url, encoding="utf-8"):
    """
    @summary: percent-decode a url (or url component)
    ---------
    @param url: text to decode
    @param encoding: charset used for the decoded bytes
    ---------
    @result: decoded string
    """
    return urllib.parse.unquote(url, encoding=encoding)
+
+
def quote_url(url, encoding="utf-8"):
    """
    @summary: percent-encode a url while keeping the characters that are
    structurally significant in urls (see
    http://www.w3school.com.cn/tags/html_ref_urlencode.html)
    ---------
    @param url: url to encode
    @param encoding: charset used to encode non-ASCII characters
    ---------
    @result: encoded url
    """
    return urllib.parse.quote(url, safe="%;/?:@&=+$,", encoding=encoding)
+
+
def quote_chinese_word(text, encoding="utf-8"):
    """Percent-encode only the CJK runs in ``text``, leaving everything else as is."""
    return re.sub(
        "([\u4e00-\u9fa5]+)",
        lambda match: urllib.parse.quote(match.group(0), encoding=encoding),
        text,
        flags=re.S,
    )
+
+
def unescape(str):
    """Unescape HTML entities, e.g. '&amp;' -> '&'.

    (Parameter name ``str`` kept for keyword-call compatibility, although it
    shadows the builtin.)
    """
    return html.unescape(str)
+
+
def excape(str):
    """Escape special characters into HTML-safe entities.

    (Function name spelling and parameter name kept for caller compatibility.)
    """
    return html.escape(str)
+
+
_regexs = {}  # cache of compiled patterns, keyed by the raw pattern text


# @log_function_time
def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None):
    """
    Try each regex in order against ``html`` and return the matches from the
    first pattern that hits.

    :param html: text to search (coerced to str for findall)
    :param regexs: one pattern or a list of patterns, tried in order
    :param allow_repeat: when False, de-duplicate matches preserving order
    :param fetch_one: use search() and return the match groups — a single
                      value when there is exactly one group, else the tuple;
                      "" / ("",) when nothing matched
    :param split: when given (and fetch_one is False), join the matches with it
    """
    regexs = isinstance(regexs, str) and [regexs] or regexs

    infos = []
    for regex in regexs:
        if regex == "":
            continue

        if regex not in _regexs.keys():
            _regexs[regex] = re.compile(regex, re.S)

        if fetch_one:
            # NOTE: ``infos`` temporarily holds a Match object here, then its groups
            infos = _regexs[regex].search(html)
            if infos:
                infos = infos.groups()
            else:
                continue
        else:
            infos = _regexs[regex].findall(str(html))

        if len(infos) > 0:
            # print(regex)
            break

    if fetch_one:
        infos = infos if infos else ("",)
        return infos if len(infos) > 1 else infos[0]
    else:
        infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
        infos = split.join(infos) if split else infos
        return infos
+
+
def table_json(table, save_one_blank=True):
    """
    Convert a table into json; suited to tables whose rows hold key:value cells.

    @param table: an xpath-capable selector wrapping the <table>
    @param save_one_blank: collapse runs of spaces to one blank (else remove them)
    @return: {key-cell-text: value-cell-text, ...}
    """
    data = {}

    trs = table.xpath(".//tr")
    for tr in trs:
        tds = tr.xpath("./td|./th")

        # cells are consumed in (key, value) pairs, left to right
        for i in range(0, len(tds), 2):
            if i + 1 > len(tds) - 1:
                break

            key = tds[i].xpath("string(.)").extract_first(default="").strip()
            value = tds[i + 1].xpath("string(.)").extract_first(default="").strip()
            # replace_str is a helper defined elsewhere in this module
            value = replace_str(value, "[\f\n\r\t\v]", "")
            value = replace_str(value, " +", " " if save_one_blank else "")

            if key:
                data[key] = value

    return data
+
+
def get_table_row_data(table):
    """
    Collect the cell texts of every row in a table.

    @param table: an xpath-capable selector wrapping the <table>
    @return: [[cell, cell, ...], ...] one inner list per <tr>
    """

    datas = []
    rows = table.xpath(".//tr")
    for row in rows:
        cols = row.xpath("./td|./th")
        row_datas = []
        for col in cols:
            data = col.xpath("string(.)").extract_first(default="").strip()
            row_datas.append(data)
        datas.append(row_datas)

    return datas
+
+
def rows2json(rows, keys=None):
    """
    Turn row lists into a list of dicts.

    @param rows: list of rows, each a list of cell values
    @param keys: json keys; when falsy the first row supplies them
    @return: [{key: value, ...}, ...]
    """
    if keys:
        data_rows = rows
    else:
        keys, data_rows = rows[0], rows[1:]

    return [dict(zip(keys, values)) for values in data_rows]
+
+
def get_form_data(form):
    """
    Collect the name/value pairs of a form's <input> elements.

    :param form: an xpath-capable selector wrapping the <form> node
    :return: {input-name: input-value} (inputs without a name are skipped)
    """
    data = {}
    # ``input_el`` instead of ``input``: the original shadowed the builtin
    for input_el in form.xpath(".//input"):
        name = input_el.xpath("./@name").extract_first()
        value = input_el.xpath("./@value").extract_first()
        if name:
            data[name] = value

    return data
+
+
def get_domain(url):
    """Return the netloc (host[:port]) part of *url*."""
    parsed = urllib.parse.urlparse(url)
    return parsed.netloc
+
+
def get_index_url(url):
    """Return the site root, e.g. "https://host" for "https://host/path"."""
    scheme_and_host = url.split("/")[:3]
    return "/".join(scheme_and_host)
+
+
def get_ip(domain):
    """Resolve *domain* via DNS and return its first IP address (network I/O)."""
    # getaddrinfo returns a list of (family, type, proto, canonname, sockaddr)
    # tuples; sockaddr[0] is the IP string of the first resolution result.
    ip = socket.getaddrinfo(domain, "http")[0][4][0]
    return ip
+
+
def get_localhost_ip():
    """
    Determine this machine's outbound IP via the UDP trick: connecting a UDP
    socket merely selects a route and records the local address in the socket
    header - no packet is actually sent, so nothing shows up in a capture.
    :return: local IP string
    """
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sk:
        # connect() on UDP only picks the route; nothing is transmitted
        sk.connect(("8.8.8.8", 80))
        ip = sk.getsockname()[0]

    return ip
+
+
def ip_to_num(ip):
    """Convert a dotted-quad IPv4 string to its integer value."""
    import struct

    packed = socket.inet_aton(str(ip))
    host_order = struct.unpack("I", packed)[0]
    return socket.ntohl(host_order)
+
+
def is_valid_proxy(proxy, check_url=None):
    """
    Check whether a proxy works.
    @param proxy: "ip:port"
    @param check_url: when given, validate by fetching this URL through the
        proxy; when None, only test that the proxy port accepts a TCP
        connection (cannot rule out "Connection closed by foreign host")
    @return: True / False
    """
    is_valid = False

    if check_url:
        proxies = {"http": f"http://{proxy}", "https": f"https://{proxy}"}
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
        }
        response = None
        try:
            # stream=True avoids downloading the body; reaching the server
            # through the proxy is enough
            response = requests.get(
                check_url, headers=headers, proxies=proxies, stream=True, timeout=20
            )
            is_valid = True

        except Exception as e:
            log.error("check proxy failed: {} {}".format(e, proxy))

        finally:
            if response:
                response.close()

    else:
        ip, port = proxy.split(":")
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
            sk.settimeout(7)
            try:
                sk.connect((ip, int(port)))  # check that the proxy port is open
                is_valid = True

            except Exception as e:
                log.error("check proxy failed: {} {}:{}".format(e, ip, port))

    return is_valid
+
+
def is_valid_url(url):
    """
    Check whether *url* looks like a legal http(s)/ftp URL.
    :param url:
    :return: True / False
    """
    return bool(re.match(r"(^https?:/{2}\w.+$)|(ftp://)", url))
+
+
def get_text(soup, *args):
    """Return soup.get_text(); on any failure log the error and return ""."""
    try:
        text = soup.get_text()
    except Exception as e:
        log.error(e)
        return ""
    return text
+
+
def del_html_tag(content, except_line_break=False, save_img=False, white_replaced=""):
    """
    Strip html tags from *content*.
    @param content: html text
    @param except_line_break: keep </p> tags (paragraph structure)
    @param save_img: keep <img> tags
    @param white_replaced: replacement for whitespace in the default branch
    @return: cleaned text
    """
    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?i) = ignore case
    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
    content = replace_str(content, "<!--(.|\n)*?-->")
    content = replace_str(
        content, "(?!&[a-z]+=)&[a-z]+;?"
    )  # drop entities like &nbsp; but keep "&xxx=" style query parameters
    if except_line_break:
        # temporarily mask </p> so the tag-stripping regex spares it
        content = content.replace("</p>", "/p")
        content = replace_str(content, "<[^p].*?>")
        content = content.replace("/p", "</p>")
        content = replace_str(content, "[ \f\r\t\v]")

    elif save_img:
        content = replace_str(content, "(?!<img.+?>)<.+?>")  # strip every tag except <img>
        content = replace_str(content, "(?! +)\s+", "\n")  # keep plain spaces, fold the rest to newlines
        content = content.strip()

    else:
        content = replace_str(content, "<(.|\n)*?>")
        content = replace_str(content, "\s", white_replaced)
        content = content.strip()

    return content
+
+
def del_html_js_css(content):
    """Remove <script>, <style> and html comment blocks from *content*."""
    for pattern in (
        "(?i)<script(.|\n)*?</script>",  # (?i) = ignore case
        "(?i)<style(.|\n)*?</style>",
        "<!--(.|\n)*?-->",
    ):
        content = re.sub(pattern, "", content)

    return content
+
+
def is_have_chinese(content):
    """Return True when *content* contains at least one Chinese character."""
    return bool(get_info(content, "[\u4e00-\u9fa5]+"))
+
+
def is_have_english(content):
    """Return True when *content* contains at least one ASCII letter."""
    return bool(get_info(content, "[a-zA-Z]+"))
+
+
def get_chinese_word(content):
    """Extract the Chinese-character runs from *content* via get_info."""
    return get_info(content, "[\u4e00-\u9fa5]+")
+
+
def get_english_words(content):
    """Extract ASCII-letter runs from *content*; "" when none are found."""
    return get_info(content, "[a-zA-Z]+") or ""
+
+
+##################################################
def get_json(json_str):
    """
    @summary: parse a json string, tolerating sloppy input
    ---------
    @param json_str: json-formatted string
    ---------
    @result: parsed object; {} for empty input or on failure
    """

    try:
        return json.loads(json_str) if json_str else {}
    except Exception as e1:
        # repair attempt: single quotes -> double quotes, then quote bare keys
        # like {key: 1}; NOTE the blanket str.replace may also touch values
        # that happen to contain the key text (best effort only)
        try:
            json_str = json_str.strip()
            json_str = json_str.replace("'", '"')
            keys = get_info(json_str, "(\w+):")
            for key in keys:
                json_str = json_str.replace(key, '"%s"' % key)

            return json.loads(json_str) if json_str else {}

        except Exception as e2:
            pass

        return {}
+
+
def jsonp2json(jsonp):
    """
    Unwrap a jsonp payload into a json object.
    @param jsonp: e.g. jQuery172013600082560040794_1553230569815({})
    @return: parsed object
    @raise ValueError: when no json body can be extracted
    """
    match = re.match(".*?({.*}).*", jsonp, re.S)
    try:
        body = match.group(1)
        return json.loads(body)
    except:
        raise ValueError("Invalid Input")
+
+
def dumps_json(data, indent=4, sort_keys=False):
    """
    @summary: pretty-format json for printing
    ---------
    @param data: json string or json-serializable object
    ---------
    @result: formatted string; falls back to pprint.pformat on failure
    """
    try:
        if isinstance(data, str):
            # accept a raw json string as input
            data = get_json(data)

        return json.dumps(
            data,
            ensure_ascii=False,
            indent=indent,
            skipkeys=True,
            sort_keys=sort_keys,
            default=str,
        )
    except Exception as e:
        return pformat(data)
+
+
def get_json_value(json_object, key):
    """
    @summary: read a (possibly nested) value out of a json object
    ---------
    @param json_object: json object or json-formatted string
    @param key: key path; nested levels joined by ".", e.g. "key1.key2"
        for {'key1': {'key2': 3}}
    ---------
    @result: the value, or '' when the path does not exist
    """
    current_key = ""
    value = ""
    try:
        # accept a raw json string as well
        json_object = (
            isinstance(json_object, str) and get_json(json_object) or json_object
        )

        current_key = key.split(".")[0]
        value = json_object[current_key]

        # strip the consumed level; with no "." left, key stays unchanged
        key = key[key.find(".") + 1 :]
    except Exception as e:
        return value

    if key == current_key:
        # last level reached
        return value
    else:
        return get_json_value(value, key)
+
+
def get_all_keys(datas, depth=None, current_depth=0):
    """
    @summary: collect every key in a json structure
    ---------
    @param datas: dict / list
    @param depth: max key depth (1-based); None means unlimited
    @param current_depth: internal recursion depth, do not pass
    ---------
    @result: list of keys
    """
    if depth and current_depth >= depth:
        return []

    collected = []
    if isinstance(datas, dict):
        for key, value in datas.items():
            collected.append(key)
            if isinstance(value, dict):
                collected += get_all_keys(value, depth, current_depth=current_depth + 1)
    elif isinstance(datas, list):
        # a list does not contribute keys itself but counts as one level
        for item in datas:
            collected += get_all_keys(item, depth, current_depth=current_depth + 1)

    return collected
+
+
def to_chinese(unicode_str):
    """Decode \\uXXXX escape sequences in *unicode_str* into real characters."""
    wrapped = '{"chinese":"%s"}' % unicode_str
    return json.loads(wrapped)["chinese"]
+
+
+##################################################
def replace_str(source_str, regex, replace_str=""):
    """
    @summary: regex-based replacement inside a string
    ---------
    @param source_str: original string
    @param regex: pattern
    @param replace_str: replacement text, defaults to ''
    ---------
    @result: the string with every match replaced
    """
    return re.sub(regex, replace_str, source_str)
+
+
def del_redundant_blank_character(text):
    """
    Collapse every run of whitespace in *text* into a single space.
    :param text:
    :return: normalized string
    """
    return re.sub("\s+", " ", text)
+
+
+##################################################
def get_conf_value(config_file, section, key):
    """Read one value from an ini-style config file (utf-8 encoded)."""
    parser = configparser.ConfigParser(allow_no_value=True)
    with codecs.open(config_file, "r", encoding="utf-8") as f:
        parser.read_file(f)
    return parser.get(section, key)
+
+
def mkdir(path):
    """
    Create *path* (including parents) if it is missing. Errors are swallowed,
    matching the original contract: the caller simply proceeds and fails
    later on the actual file operation if the directory is truly unusable.
    """
    try:
        # exist_ok avoids the check-then-create race of the original
        # `if not os.path.exists(path): os.makedirs(path)` pattern
        os.makedirs(path, exist_ok=True)
    except OSError:
        pass
+
+
def write_file(filename, content, mode="w", encoding="utf-8"):
    """
    @summary: write content to a file, creating parent directories as needed
    ---------
    @param filename: file name (with path)
    @param content: a string, or an iterable of strings (passed to writelines)
    @param mode: open mode, e.g. "w" to overwrite / "a" to append
    ---------
    @result:
    """

    directory = os.path.dirname(filename)
    mkdir(directory)
    with open(filename, mode, encoding=encoding) as file:
        file.writelines(content)
+
+
def read_file(filename, readlines=False, encoding="utf-8"):
    """
    @summary: read a file
    ---------
    @param filename: file name (with path)
    @param readlines: read line by line (default False)
    ---------
    @result: list of lines when readlines is True, otherwise the whole string;
        None when the file cannot be read (the error is logged, not raised)
    """

    content = None
    try:
        with open(filename, "r", encoding=encoding) as file:
            content = file.readlines() if readlines else file.read()
    except Exception as e:
        log.error(e)

    return content
+
+
def get_oss_file_list(oss_handler, prefix, date_range_min, date_range_max=None):
    """
    Yield object keys under date-partitioned OSS folders.
    @param oss_handler: OSS client exposing .list(prefix=...)
    @param prefix: path prefix, e.g. data/car_service_line/yiche/yiche_serial_zongshu_info
    @param date_range_min: range start, "/"-separated, e.g. 2019/03/01 or 2019/03/01/00/00/00
    @param date_range_max: range end, same format; defaults to the start value
    @return: generator of object keys, e.g. html/.../2019/03/22/15/53/15/8ca8b9e4-....json.snappy
    """

    # Derive both the strftime format and the iteration step from how many
    # "/"-separated fields the caller supplied (year ... down to seconds).
    date_range_max = date_range_max or date_range_min
    date_format = "/".join(
        ["%Y", "%m", "%d", "%H", "%M", "%S"][: date_range_min.count("/") + 1]
    )
    time_interval = [
        {"days": 365},
        {"days": 31},
        {"days": 1},
        {"hours": 1},
        {"minutes": 1},
        {"seconds": 1},
    ][date_range_min.count("/")]
    date_range = get_between_date(
        date_range_min, date_range_max, date_format=date_format, **time_interval
    )

    # One OSS list call per date partition folder.
    for date in date_range:
        file_folder_path = os.path.join(prefix, date)
        objs = oss_handler.list(prefix=file_folder_path)
        for obj in objs:
            filename = obj.key
            yield filename
+
+
def is_html(url):
    """
    Fetch *url* and report whether its Content-Type says text/html.
    Network I/O; any failure is logged and yields False.
    """
    if not url:
        return False

    try:
        # NOTE: urlopen performs a full GET just to read the headers
        content_type = request.urlopen(url).info().get("Content-Type", "")

        if "text/html" in content_type:
            return True
        else:
            return False
    except Exception as e:
        log.error(e)
        return False
+
+
def is_exist(file_path):
    """
    @summary: whether the given path exists on disk
    ---------
    @param file_path:
    ---------
    @result: True / False
    """
    return os.path.exists(file_path)
+
+
def download_file(url, file_path, *, call_func=None, proxies=None, data=None):
    """
    Download a file, creating the target directory automatically.
    Args:
        url: source url
        file_path: destination path
        call_func: callback invoked after a successful download
        proxies: proxy mapping; NOTE it is installed globally via
            request.install_opener and thus affects every later urllib
            request in this process
        data: request body forwarded to urlretrieve

    Returns:
        1 on success, 0 on failure or empty url
    """
    directory = os.path.dirname(file_path)
    mkdir(directory)

    # progress bar
    def progress_callfunc(blocknum, blocksize, totalsize):
        """urlretrieve report hook
        @blocknum : number of blocks downloaded so far
        @blocksize : size of one block
        @totalsize: total size of the remote file
        """
        # NOTE: raises ZeroDivisionError if the server reports size 0 - TODO confirm
        percent = 100.0 * blocknum * blocksize / totalsize
        if percent > 100:
            percent = 100
        # print ('进度条 %.2f%%' % percent, end = '\r')
        sys.stdout.write("进度条 %.2f%%" % percent + "\r")
        sys.stdout.flush()

    if url:
        try:
            if proxies:
                # create the object, assign it to a variable
                proxy = request.ProxyHandler(proxies)
                # construct a new opener using your proxy settings
                opener = request.build_opener(proxy)
                # install the opener at module level (global side effect)
                request.install_opener(opener)

            request.urlretrieve(url, file_path, progress_callfunc, data)

            if callable(call_func):
                call_func()
            return 1
        except Exception as e:
            log.error(e)
            return 0
    else:
        return 0
+
+
def get_file_list(path, ignore=()):
    """
    Recursively list files under *path*. A "*suffix" filter may be appended,
    e.g. "/data/*.json" returns only .json files.
    @param path: directory (optionally with a *suffix filter) or a single file
    @param ignore: file/directory names to skip
    @return: list of file paths; [path] itself when path is not a directory
    """
    parts = path.split("*")
    path = parts[0]
    file_type = parts[1] if len(parts) >= 2 else ""

    # recursive walk; `collected` is threaded explicitly instead of being a
    # mutable default argument as in the original
    def _walk(current, collected):
        for file_name in os.listdir(current):
            if file_name in ignore:
                continue

            file_path = os.path.join(current, file_name)
            if os.path.isdir(file_path):
                _walk(file_path, collected)
            elif not file_type or file_name.endswith(file_type):
                collected.append(file_path)

        return collected

    return _walk(path, []) if os.path.isdir(path) else [path]
+
+
def rename_file(old_name, new_name):
    """Rename (move) a file from *old_name* to *new_name*."""
    os.rename(old_name, new_name)
+
+
def del_file(path, ignore=()):
    """
    Delete every file under *path* (resolved via get_file_list, so the
    "*suffix" filter syntax works here too); failures are logged, not raised.
    """
    files = get_file_list(path, ignore)
    for file in files:
        try:
            os.remove(file)
        except Exception as e:
            log.error(
                """
                删除出错: %s
                Exception : %s
                """
                % (file, str(e))
            )
        finally:
            pass
+
+
def get_file_type(file_name):
    """
    @summary: return the file extension, including the dot
    ---------
    @param file_name:
    ---------
    @result: e.g. ".txt"; "" when there is no extension; None when splitext
        fails (the error is logged)
    """
    try:
        _, extension = os.path.splitext(file_name)
    except Exception as e:
        log.exception(e)
    else:
        return extension
+
+
def get_file_path(file_path):
    """
    @summary: return the directory part of a path
    ---------
    @param file_path: /root/a.py
    ---------
    @result: /root (None when os.path.split fails; the error is logged)
    """
    try:
        directory, _ = os.path.split(file_path)
    except Exception as e:
        log.exception(e)
    else:
        return directory
+
+
+#############################################
+
+
def exec_js(js_code):
    """
    @summary: evaluate a js expression via the execjs runtime
    ---------
    @param js_code: js source
    ---------
    @result: evaluation result
    """

    return execjs.eval(js_code)
+
+
def compile_js(js_func):
    """
    @summary: compile a js function with execjs
    ---------
    @param js_func: js function source
    ---------
    @result: a callable; invoke as fun('js_funName', param1, param2)
    """

    ctx = execjs.compile(js_func)
    return ctx.call
+
+
+###############################################
+
+#############################################
+
+
def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
    """
    @summary: convert a date string like "2011-09-28 10:00:00" into a unix
        timestamp (interpreted in local time)
    ---------
    @param date: date string
    @param time_format: format of *date*
    ---------
    @result: int timestamp
    """
    struct = time.strptime(date, time_format)
    return int(time.mktime(struct))
+
+
def timestamp_to_date(timestamp, time_format="%Y-%m-%d %H:%M:%S"):
    """
    @summary: convert a unix timestamp into a local-time date string
    ---------
    @param timestamp: seconds since the epoch
    @param time_format: output format
    ---------
    @result: date string
    @raise ValueError: when timestamp is None
    """
    if timestamp is None:
        raise ValueError("timestamp is null")

    return time.strftime(time_format, time.localtime(timestamp))
+
+
def get_current_timestamp():
    """Current unix timestamp, truncated to whole seconds."""
    return int(time.time())
+
+
def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
    """Current local date/time formatted with *date_format*."""
    now = datetime.datetime.now()
    return now.strftime(date_format)
+
+
def get_date_number(year=None, month=None, day=None):
    """
    @summary: ISO calendar numbers for a date; defaults to today
    ---------
    @param year: 2010
    @param month: 6
    @param day: 16
    ---------
    @result: (iso year, iso week, iso weekday), e.g. (2010, 24, 3)
    """
    if year and month and day:
        return datetime.date(year, month, day).isocalendar()
    if not any([year, month, day]):
        return datetime.datetime.now().isocalendar()

    # partial argument sets are not supported
    assert year, "year 不能为空"
    assert month, "month 不能为空"
    assert day, "day 不能为空"
+
+
def get_between_date(
    begin_date, end_date=None, date_format="%Y-%m-%d", **time_interval
):
    """
    @summary: list the dates between two dates, stepping by *time_interval*
        (one day by default); the end date is always included
    ---------
    @param begin_date: start date string, e.g. 2018-10-01
    @param end_date: end date string, defaults to today
    @param date_format: format matching begin_date / end_date
    @param time_interval: step, any timedelta kwargs: days / seconds /
        microseconds / milliseconds / minutes / hours / weeks
    ---------
    @result: list of date strings
    """

    date_list = []

    begin_date = datetime.datetime.strptime(begin_date, date_format)
    end_date = (
        datetime.datetime.strptime(end_date, date_format)
        if end_date
        else datetime.datetime.strptime(
            time.strftime(date_format, time.localtime(time.time())), date_format
        )
    )
    time_interval = time_interval or dict(days=1)

    while begin_date <= end_date:
        date_str = begin_date.strftime(date_format)
        date_list.append(date_str)

        begin_date += datetime.timedelta(**time_interval)

    # make sure the right edge shows up even when the step overshoots it
    if end_date.strftime(date_format) not in date_list:
        date_list.append(end_date.strftime(date_format))

    return date_list
+
+
def get_between_months(begin_date, end_date=None):
    """
    @summary: list the months between two dates, stepping one calendar month
    ---------
    @param begin_date: start date, e.g. 2018-01-01
    @param end_date: end date, defaults to today
    ---------
    @result: list like ['2018-01', '2018-02']
    """

    def add_months(dt, months):
        # calendar-aware month addition; the day is clamped to the target
        # month's length (e.g. Jan 31 + 1 month -> Feb 28/29)
        month = dt.month - 1 + months
        year = dt.year + month // 12
        month = month % 12 + 1
        day = min(dt.day, calendar.monthrange(year, month)[1])
        return dt.replace(year=year, month=month, day=day)

    date_list = []
    begin_date = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
    end_date = (
        datetime.datetime.strptime(end_date, "%Y-%m-%d")
        if end_date
        else datetime.datetime.strptime(
            time.strftime("%Y-%m-%d", time.localtime(time.time())), "%Y-%m-%d"
        )
    )
    while begin_date <= end_date:
        date_str = begin_date.strftime("%Y-%m")
        date_list.append(date_str)
        begin_date = add_months(begin_date, 1)
    return date_list
+
+
def get_today_of_day(day_offset=0):
    """Today shifted by *day_offset* days, as an ISO date string."""
    target = datetime.date.today() + datetime.timedelta(days=day_offset)
    return str(target)
+
+
def get_days_of_month(year, month):
    """Return the number of days in the given month."""
    _, day_count = calendar.monthrange(year, month)
    return day_count
+
+
def get_firstday_of_month(date):
    """
    First day of the month of *date* ("YYYY-MM-DD"), with the month
    zero-padded to two digits.
    """
    year, month, _ = date.split("-")
    return "%d-%02d-01" % (int(year), int(month))
+
+
def get_lastday_of_month(date):
    """
    Last day of the month of *date* ("YYYY-MM-DD"), with the month
    zero-padded to two digits.
    """
    year, month, _ = date.split("-")
    year, month = int(year), int(month)

    last_day = calendar.monthrange(year, month)[1]
    return "%s-%s-%s" % (year, add_zero(month), last_day)
+
+
def get_firstday_month(month_offset=0):
    """
    First day of the month *month_offset* months from today, "YYYY-MM-01".
    """
    year, month, _ = get_year_month_and_days(month_offset)
    return "%s-%s-01" % (year, month)
+
+
def get_lastday_month(month_offset=0):
    """
    Last day of the month *month_offset* months from today, "YYYY-MM-DD".
    """
    year, month, days = get_year_month_and_days(month_offset)
    return "%s-%s-%s" % (year, month, days)
+
+
def get_last_month(month_offset=0):
    """
    The month *month_offset* months from today, "YYYY-MM".
    """
    year, month, _ = get_year_month_and_days(month_offset)
    return "%s-%s" % (year, month)
+
+
def get_year_month_and_days(month_offset=0):
    """
    @summary: year, month and day-count of the month *month_offset* months
        away from the current month
    ---------
    @param month_offset: month offset (positive = future, negative = past)
    ---------
    @result: e.g. ('2019', '04', '30'); NOTE on the branches that stay in the
        current year the year element is returned as an int, not a str
    """

    today = datetime.datetime.now()
    year, month = today.year, today.month

    this_year = int(year)
    this_month = int(month)
    total_month = this_month + month_offset
    if month_offset >= 0:
        if total_month <= 12:
            # still inside the current year
            days = str(get_days_of_month(this_year, total_month))
            total_month = add_zero(total_month)
            return (year, total_month, days)
        else:
            # spilled over into a following year; j == 0 means December
            i = total_month // 12
            j = total_month % 12
            if j == 0:
                i -= 1
                j = 12
            this_year += i
            days = str(get_days_of_month(this_year, j))
            j = add_zero(j)
            return (str(this_year), str(j), days)
    else:
        if (total_month > 0) and (total_month < 12):
            # still inside the current year
            days = str(get_days_of_month(this_year, total_month))
            total_month = add_zero(total_month)
            return (year, total_month, days)
        else:
            # went back into a previous year; floor division handles the
            # negative total_month (e.g. 0 -> December of last year)
            i = total_month // 12
            j = total_month % 12
            if j == 0:
                i -= 1
                j = 12
            this_year += i
            days = str(get_days_of_month(this_year, j))
            j = add_zero(j)
            return (str(this_year), str(j), days)
+
+
def add_zero(n):
    """Zero-pad a number to two digits, e.g. 3 -> "03"."""
    return "{:02d}".format(n)
+
+
def get_month(month_offset=0):
    """
    Date *month_offset* months from today, "YYYY-MM-DD".
    When today's day number does not exist in the target month, the target
    month's last day is used instead (clamping).
    """
    today = datetime.datetime.now()
    day = add_zero(today.day)

    # d is the last day of the target month
    (y, m, d) = get_year_month_and_days(month_offset)
    arr = (y, m, d)
    if int(day) < int(d):
        # today's day fits in the target month -> keep it
        arr = (y, m, day)
    return "-".join("%s" % i for i in arr)
+
+
@run_safe_model("format_date")
def format_date(date, old_format="", new_format="%Y-%m-%d %H:%M:%S"):
    """
    @summary: normalize a date string into another format
    ---------
    @param date: date string, e.g. 2017年4月17日 3时27分12秒
    @param old_format: its format, e.g. '%Y年%m月%d日 %H时%M分%S秒';
        guessed from the digit runs when empty
        %y two-digit year (00-99)
        %Y four-digit year (0000-9999)
        %m month (01-12)
        %d day of month (0-31)
        %H hour, 24h clock (0-23)
        %I hour, 12h clock (01-12)
        %M minute (00-59)
        %S second (00-59)
    @param new_format: output format
    ---------
    @result: reformatted date string, e.g. 2017-4-17 03:27:12; the input is
        returned unchanged when parsing fails (the error is logged)
    """
    if not date:
        return ""

    if not old_format:
        # guess the format: substitute each digit run with %Y %m %d %H %M %S
        # in order, replacing only the first occurrence each time so e.g. the
        # "11" month in '2017年11月30日 11:49' does not clobber the hour
        regex = "(\d+)"
        numbers = get_info(date, regex, allow_repeat=True)
        formats = ["%Y", "%m", "%d", "%H", "%M", "%S"]
        old_format = date
        for i, number in enumerate(numbers[:6]):
            if i == 0 and len(number) == 2:  # a two-digit year needs %y
                old_format = old_format.replace(
                    number, formats[i].lower(), 1
                )  # replace the first occurrence only
            else:
                old_format = old_format.replace(number, formats[i], 1)  # first occurrence only

    try:
        date_obj = datetime.datetime.strptime(date, old_format)
        if "T" in date and "Z" in date:
            # ISO-8601 UTC input: shift to UTC+8 (assumes a +8 local zone -
            # TODO confirm) and ignore new_format on this branch
            date_obj += datetime.timedelta(hours=8)
            date_str = date_obj.strftime("%Y-%m-%d %H:%M:%S")
        else:
            date_str = datetime.datetime.strftime(date_obj, new_format)

    except Exception as e:
        log.error("日期格式化出错,old_format = %s 不符合 %s 格式" % (old_format, date))
        date_str = date

    return date_str
+
+
def transform_lower_num(data_str: str):
    """
    Replace Chinese numerals (一..九, 十, 零) in *data_str* with arabic digits.
    Strings containing no Chinese numeral are returned untouched.
    """
    num_map = {
        "一": "1",
        "二": "2",
        "三": "3",
        "四": "4",
        "五": "5",
        "六": "6",
        "七": "7",
        "八": "8",
        "九": "9",
        "十": "0",  # 十 maps to "0" first; positional fix-ups happen below
    }
    pattern = f'[{"|".join(num_map.keys())}|零]'
    res = re.search(pattern, data_str)
    if not res:
        # no Chinese numeral present - return the input unchanged
        return data_str

    # protect pre-existing arabic zeros, then substitute the numerals
    data_str = data_str.replace("0", "零")
    for n in num_map:
        data_str = data_str.replace(n, num_map[n])

    # repair the positional digits produced by 十 (ten), e.g.
    # 二十一 -> "201" -> "21", 十五 -> "05" -> "15", 十 -> "0" -> "10"
    re_data_str = re.findall("\d+", data_str)
    for i in re_data_str:
        if len(i) == 3:
            new_i = i.replace("0", "")
            data_str = data_str.replace(i, new_i, 1)
        elif len(i) == 4:
            new_i = i.replace("10", "")
            data_str = data_str.replace(i, new_i, 1)
        elif len(i) == 2 and int(i) < 10:
            new_i = int(i) + 10
            data_str = data_str.replace(i, str(new_i), 1)
        elif len(i) == 1 and int(i) == 0:
            new_i = int(i) + 10
            data_str = data_str.replace(i, str(new_i), 1)

    # restore the protected zeros
    return data_str.replace("零", "0")
+
+
@run_safe_model("format_time")
def format_time(release_time, date_format="%Y-%m-%d %H:%M:%S"):
    """
    Normalize a relative/colloquial Chinese release time ("2个月前",
    "昨天 12:00", "刚刚", "03-05" ...) into an absolute date string.
    NOTE: the doctest outputs below depend on the current clock.

    >>> format_time("2个月前")
    '2021-08-15 16:24:21'
    >>> format_time("2月前")
    '2021-08-15 16:24:36'
    """
    # normalize numerals and separators first (日 -> 天, / -> -)
    release_time = transform_lower_num(release_time)
    release_time = release_time.replace("日", "天").replace("/", "-")

    if "年前" in release_time:
        # N years ago (approximated as N * 365 days)
        years = re.compile("(\d+)\s*年前").findall(release_time)
        years_ago = datetime.datetime.now() - datetime.timedelta(
            days=int(years[0]) * 365
        )
        release_time = years_ago.strftime("%Y-%m-%d %H:%M:%S")

    elif "月前" in release_time:
        # N months ago (approximated as N * 30 days)
        months = re.compile("(\d+)[\s个]*月前").findall(release_time)
        months_ago = datetime.datetime.now() - datetime.timedelta(
            days=int(months[0]) * 30
        )
        release_time = months_ago.strftime("%Y-%m-%d %H:%M:%S")

    elif "周前" in release_time:
        # N weeks ago
        weeks = re.compile("(\d+)\s*周前").findall(release_time)
        weeks_ago = datetime.datetime.now() - datetime.timedelta(days=int(weeks[0]) * 7)
        release_time = weeks_ago.strftime("%Y-%m-%d %H:%M:%S")

    elif "天前" in release_time:
        # N days ago
        ndays = re.compile("(\d+)\s*天前").findall(release_time)
        days_ago = datetime.datetime.now() - datetime.timedelta(days=int(ndays[0]))
        release_time = days_ago.strftime("%Y-%m-%d %H:%M:%S")

    elif "小时前" in release_time:
        # N hours ago
        nhours = re.compile("(\d+)\s*小时前").findall(release_time)
        hours_ago = datetime.datetime.now() - datetime.timedelta(hours=int(nhours[0]))
        release_time = hours_ago.strftime("%Y-%m-%d %H:%M:%S")

    elif "分钟前" in release_time:
        # N minutes ago
        nminutes = re.compile("(\d+)\s*分钟前").findall(release_time)
        minutes_ago = datetime.datetime.now() - datetime.timedelta(
            minutes=int(nminutes[0])
        )
        release_time = minutes_ago.strftime("%Y-%m-%d %H:%M:%S")

    elif "前天" in release_time:
        # the day before yesterday; keeps any trailing clock time
        today = datetime.date.today()
        yesterday = today - datetime.timedelta(days=2)
        release_time = release_time.replace("前天", str(yesterday))

    elif "昨天" in release_time:
        # yesterday; keeps any trailing clock time
        today = datetime.date.today()
        yesterday = today - datetime.timedelta(days=1)
        release_time = release_time.replace("昨天", str(yesterday))

    elif "今天" in release_time:
        release_time = release_time.replace("今天", get_current_date("%Y-%m-%d"))

    elif "刚刚" in release_time:
        release_time = get_current_date()

    elif re.search("^\d\d:\d\d", release_time):
        # bare clock time -> assume today
        release_time = get_current_date("%Y-%m-%d") + " " + release_time

    elif not re.compile("\d{4}").findall(release_time):
        # no 4-digit year: prepend the current year, or the previous year
        # when the month would otherwise lie in the future
        month = re.compile("\d{1,2}").findall(release_time)
        if month and int(month[0]) <= int(get_current_date("%m")):
            release_time = get_current_date("%Y") + "-" + release_time
        else:
            release_time = str(int(get_current_date("%Y")) - 1) + "-" + release_time

    # split day and hour when they are glued together, e.g. "2021-01-0112"
    template = re.compile("(\d{4}-\d{1,2}-\d{2})(\d{1,2})")
    release_time = re.sub(template, r"\1 \2", release_time)
    release_time = format_date(release_time, new_format=date_format)

    return release_time
+
+
def to_date(date_str, date_format="%Y-%m-%d %H:%M:%S"):
    """Parse *date_str* into a datetime using *date_format*."""
    return datetime.datetime.strptime(date_str, date_format)
+
+
def get_before_date(
    current_date,
    days,
    current_date_format="%Y-%m-%d %H:%M:%S",
    return_date_format="%Y-%m-%d %H:%M:%S",
):
    """
    @summary: shift a date string by a number of days
    ---------
    @param current_date: base date, str
    @param days: offset; -1 = one day earlier, 1 = one day later
    @param current_date_format: format of *current_date*
    @param return_date_format: format of the returned date
    ---------
    @result: shifted date string
    """
    base = to_date(current_date, current_date_format)
    shifted = base + datetime.timedelta(days=days)
    return shifted.strftime(return_date_format)
+
+
def get_utcnow():
    """Naive datetime holding the current UTC time."""
    return datetime.datetime.utcnow()
+
+
def delay_time(sleep_time=60):
    """
    @summary: sleep, one minute by default
    ---------
    @param sleep_time: seconds
    ---------
    @result:
    """
    time.sleep(sleep_time)
+
+
def format_seconds(seconds):
    """
    @summary: render a second count as Chinese days/hours/minutes/seconds
    ---------
    @param seconds:
    ---------
    @result: e.g. 2天3小时2分49秒; zero components are omitted, so 0 yields ""
    """

    seconds = int(seconds + 0.5)  # round to the nearest whole second

    minutes, sec = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    days, hours = divmod(hours, 24)

    parts = []
    if days:
        parts.append("{}天".format(days))
    if hours:
        parts.append("{}小时".format(hours))
    if minutes:
        parts.append("{}分".format(minutes))
    if sec:
        parts.append("{}秒".format(sec))

    return "".join(parts)
+
+
################################################
def get_md5(*args):
    """
    @summary: 32-char md5 hex digest over the concatenated str() of all args
    ---------
    @param *args: values combined into the hash (joint dedup key)
    ---------
    @result: e.g. 7c8684bcbdfcea6697650aa53d7b1405
    """
    digest = hashlib.md5()
    for value in args:
        digest.update(str(value).encode())
    return digest.hexdigest()
+
+
def get_sha1(*args):
    """
    @summary: 40-char sha1 hex digest over the concatenated str() of all
        args, usable as a unique id
    ---------
    @param *args: values combined into the hash
    ---------
    @result: e.g. ba4868b3f277c8e387b55d9e3d0be7c045cdd89e
    """
    digest = hashlib.sha1()
    for value in args:
        digest.update(str(value).encode())
    return digest.hexdigest()  # 40 hex chars
+
+
def get_base64(secret, message):
    """
    @summary: sign *message* with HMAC-SHA256 and return the base64 digest
              see: https://www.jokecamp.com/blog/examples-of-creating-base64-hashes-using-hmac-sha256-in-different-languages/
    ---------
    @param secret: signing key
    @param message: payload
    ---------
    @result: base64-encoded signature string
    """

    import hashlib
    import hmac
    import base64

    mac = hmac.new(
        secret.encode("utf-8"), message.encode("utf-8"), digestmod=hashlib.sha256
    )
    return base64.b64encode(mac.digest()).decode("utf8")
+
+
def get_uuid(key1="", key2=""):
    """
    @summary: build a uuid string
    With no arguments a time-based uuid1 is returned; otherwise the two keys
    are hashed together into a deterministic uuid, e.g. domain + title as a
    compound index.
    ---------
    @param key1: str
    @param key2: str
    ---------
    @result: uuid string
    """
    if not key1 and not key2:
        return str(uuid.uuid1())

    digest = md5(bytes(key1, "utf-8") + bytes(key2, "utf-8")).digest()
    return str(uuid.UUID(bytes=digest[:16], version=3))
+
+
def get_hash(text):
    """Python builtin hash() of *text* (str hashes are salted per process)."""
    return hash(text)
+
+
+##################################################
+
+
def cut_string(text, length):
    """
    @summary: split text into fixed-length chunks
    ---------
    @param text: source text
    @param length: chunk size; the final chunk may be shorter
    ---------
    @result: list of chunks
    """
    return [text[pos : pos + length] for pos in range(0, len(text), length)]
+
+
def get_random_string(length=1):
    # random.sample draws WITHOUT replacement from letters+digits, so the
    # result never repeats a character and length must be <= 62 or a
    # ValueError is raised.
    random_string = "".join(random.sample(string.ascii_letters + string.digits, length))
    return random_string
+
+
def get_random_password(length=8, special_characters=""):
    """
    @summary: Generate a random password. Default length 8, guaranteed to
              contain at least one uppercase letter, one lowercase letter
              and one digit (plus one special character when provided).
    ---------
    @param length: password length, default 8
    @param special_characters: extra characters to draw from
    ---------
    @result: password of the requested length
    """

    # NOTE(review): random.sample draws without replacement, and the loop
    # retries until digit+upper+lower are all present - with length < 3 this
    # can never succeed and the loop spins forever. Also, `random` is not a
    # CSPRNG; use `secrets` for security-sensitive passwords.
    while True:
        random_password = "".join(
            random.sample(
                string.ascii_letters + string.digits + special_characters, length
            )
        )
        if (
            re.search("[0-9]", random_password)
            and re.search("[A-Z]", random_password)
            and re.search("[a-z]", random_password)
        ):
            if not special_characters:
                break
            elif set(random_password).intersection(special_characters):
                break

    return random_password
+
+
def get_random_email(length=None, email_types: list = None, special_characters=""):
    """
    Generate a random email address.
    :param length: length of the local part (default: random 4-12; values < 3
        would make get_random_password loop forever)
    :param email_types: candidate mail domains (default: common CN providers)
    :param special_characters: extra characters allowed in the local part
    :return: e.g. "aB3kZ9@qq.com"
    """
    if not length:
        length = random.randint(4, 12)
    if not email_types:
        email_types = [
            "qq.com",
            "163.com",
            "gmail.com",
            "yahoo.com",
            "hotmail.com",
            "yeah.net",
            "126.com",
            "139.com",
            "sohu.com",
        ]

    # Local part reuses the password generator, so it always mixes cases+digits.
    email_body = get_random_password(length, special_characters)
    email_type = random.choice(email_types)

    email = email_body + "@" + email_type
    return email
+
+
+#################################
+
+
def dumps_obj(obj):
    # Serialize an arbitrary Python object to bytes via pickle.
    return pickle.dumps(obj)


def loads_obj(obj_str):
    # SECURITY: pickle.loads executes arbitrary code during deserialization -
    # never call this on data from an untrusted source.
    return pickle.loads(obj_str)
+
+
def get_method(obj, name):
    """Look up attribute *name* on *obj*; log and return None when it is missing."""
    attr_name = str(name)
    method = getattr(obj, attr_name, None)
    if method is None and not hasattr(obj, attr_name):
        log.error("Method %r not found in: %s" % (attr_name, obj))
        return None
    return method
+
+
def witch_workspace(project_path):
    """
    @summary: Change the current working directory to *project_path*.
              (Name looks like a typo for "switch_workspace"; kept for
              backward compatibility with existing callers.)
    ---------
    @param project_path: directory to switch into
    ---------
    @result: None; affects the whole process (os.chdir is global)
    """

    os.chdir(project_path)  # switch working directory
+
+
+############### 数据库相关 #######################
def format_sql_value(value):
    """Normalize a Python value for interpolation into a SQL statement."""
    if isinstance(value, str):
        value = value.strip()

    elif isinstance(value, (list, dict)):
        # containers are stored as compact JSON text
        value = dumps_json(value, indent=None)

    elif isinstance(value, (datetime.date, datetime.time)):
        # covers datetime.datetime too (subclass of date)
        value = str(value)

    elif isinstance(value, bool):
        # True/False -> 1/0; checked here since bool never matches the branches above
        value = int(value)

    return value
+
+
def list2str(datas):
    """
    Render a list as a SQL-style tuple literal.
    :param datas: [1, 2]
    :return: "(1, 2)"; single-element lists render without Python's
        trailing comma, i.e. [1] -> "(1)"
    """
    data_str = str(tuple(datas))
    # Raw string: ",\)" in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python). Strips the trailing comma Python
    # adds to one-element tuples: "(1,)" -> "(1)".
    data_str = re.sub(r",\)$", ")", data_str)
    return data_str
+
+
def make_insert_sql(
    table, data, auto_update=False, update_columns=(), insert_ignore=False
):
    """
    @summary: Build a MySQL INSERT statement (Oracle would need to_date handling - TODO)
    ---------
    @param table: table name
    @param data: row data as a dict (column -> value)
    @param auto_update: use REPLACE INTO, fully overwriting an existing row
    @param update_columns: columns to update on duplicate key; when given,
        auto_update is ignored and only these columns are updated
    @param insert_ignore: skip rows that already exist (INSERT IGNORE)
    ---------
    @result: the SQL string
    """

    # Backtick-quote column names; list2str renders ("'`a`'", ...) and the
    # replace() strips the quotes, leaving (`a`, `b`).
    keys = ["`{}`".format(key) for key in data.keys()]
    keys = list2str(keys).replace("'", "")

    # NOTE(review): values are interpolated directly into the SQL text, not
    # parameterized - do not feed untrusted data through this helper.
    values = [format_sql_value(value) for value in data.values()]
    values = list2str(values)

    if update_columns:
        if not isinstance(update_columns, (tuple, list)):
            update_columns = [update_columns]
        update_columns_ = ", ".join(
            ["{key}=values({key})".format(key=key) for key in update_columns]
        )
        sql = (
            "insert%s into `{table}` {keys} values {values} on duplicate key update %s"
            % (" ignore" if insert_ignore else "", update_columns_)
        )

    elif auto_update:
        sql = "replace into `{table}` {keys} values {values}"
    else:
        sql = "insert%s into `{table}` {keys} values {values}" % (
            " ignore" if insert_ignore else ""
        )

    # NOTE(review): this textual replace rewrites every "None" in the final
    # SQL - including one occurring inside a legitimate string value.
    sql = sql.format(table=table, keys=keys, values=values).replace("None", "null")
    return sql
+
+
def make_update_sql(table, data, condition):
    """
    @summary: Build a MySQL UPDATE statement (Oracle would need to_date handling - TODO)
    ---------
    @param table: table name
    @param data: column -> new value mapping
    @param condition: raw WHERE clause text (interpolated verbatim -
        caller is responsible for escaping / injection safety)
    ---------
    @result: the SQL string
    """
    key_values = []

    for key, value in data.items():
        value = format_sql_value(value)
        if isinstance(value, str):
            # repr() adds quotes and escapes embedded quotes/backslashes
            key_values.append("`{}`={}".format(key, repr(value)))
        elif value is None:
            key_values.append("`{}`={}".format(key, "null"))
        else:
            key_values.append("`{}`={}".format(key, value))

    key_values = ", ".join(key_values)

    sql = "update `{table}` set {key_values} where {condition}"
    sql = sql.format(table=table, key_values=key_values, condition=condition)
    return sql
+
+
def make_batch_sql(
    table, datas, auto_update=False, update_columns=(), update_columns_value=()
):
    """
    @summary: Build a parameterized batch INSERT for executemany().
    ---------
    @param table: table name
    @param datas: rows as a list of dicts [{...}]; column set is taken from
        the FIRST row - later rows missing a key contribute None
    @param auto_update: use REPLACE INTO, fully overwriting existing rows
    @param update_columns: columns to update on duplicate key; when given,
        auto_update is ignored
    @param update_columns_value: literal values for update_columns (string
        literals must include their own quotes, e.g. ("'test'",))
    ---------
    @result: (sql, values) where sql uses %s placeholders, or None when
        datas is empty
    """
    if not datas:
        return

    keys = list(datas[0].keys())
    values_placeholder = ["%s"] * len(keys)

    # Build the per-row value lists in the key order of the first row.
    values = []
    for data in datas:
        value = []
        for key in keys:
            current_data = data.get(key)
            current_data = format_sql_value(current_data)

            value.append(current_data)

        values.append(value)

    keys = ["`{}`".format(key) for key in keys]
    keys = list2str(keys).replace("'", "")

    # "('%s', '%s')" -> "(%s, %s)" so the driver sees bare placeholders
    values_placeholder = list2str(values_placeholder).replace("'", "")

    if update_columns:
        if not isinstance(update_columns, (tuple, list)):
            update_columns = [update_columns]
        if update_columns_value:
            update_columns_ = ", ".join(
                [
                    "`{key}`={value}".format(key=key, value=value)
                    for key, value in zip(update_columns, update_columns_value)
                ]
            )
        else:
            update_columns_ = ", ".join(
                ["`{key}`=values(`{key}`)".format(key=key) for key in update_columns]
            )
        sql = "insert into `{table}` {keys} values {values_placeholder} on duplicate key update {update_columns}".format(
            table=table,
            keys=keys,
            values_placeholder=values_placeholder,
            update_columns=update_columns_,
        )
    elif auto_update:
        sql = "replace into `{table}` {keys} values {values_placeholder}".format(
            table=table, keys=keys, values_placeholder=values_placeholder
        )
    else:
        sql = "insert ignore into `{table}` {keys} values {values_placeholder}".format(
            table=table, keys=keys, values_placeholder=values_placeholder
        )

    return sql, values
+
+
+############### json相关 #######################
+
+
def key2underline(key: str, strict=True):
    """
    Convert a CamelCase key to snake_case.

    strict=True splits every capital letter; strict=False keeps runs of
    capitals (acronyms) together.

    >>> key2underline("HelloWord")
    'hello_word'
    >>> key2underline("SHData", strict=True)
    's_h_data'
    >>> key2underline("SHData", strict=False)
    'sh_data'
    >>> key2underline("SHDataHi", strict=False)
    'sh_data_hi'
    >>> key2underline("SHDataHi", strict=True)
    's_h_data_hi'
    >>> key2underline("dataHi", strict=True)
    'data_hi'
    """
    # strict: each single capital is a boundary; non-strict: a whole run of
    # capitals is treated as one unit (last capital joins the next word).
    regex = "[A-Z]*" if not strict else "[A-Z]"
    capitals = re.findall(regex, key)

    if capitals:
        for capital in capitals:
            if not capital:
                continue
            # Replacements are applied one occurrence at a time (count=1) in
            # match order, so earlier rewrites shift what later ones see.
            if key.startswith(capital):
                if len(capital) > 1:
                    key = key.replace(
                        capital, capital[:-1].lower() + "_" + capital[-1].lower(), 1
                    )
                else:
                    key = key.replace(capital, capital.lower(), 1)
            else:
                if len(capital) > 1:
                    key = key.replace(capital, "_" + capital.lower() + "_", 1)
                else:
                    key = key.replace(capital, "_" + capital.lower(), 1)

    return key.strip("_")
+
+
def key2hump(key):
    """Convert snake_case to CamelCase, e.g. "foo_bar" -> "FooBar"."""
    return "".join(key.title().split("_"))
+
+
def format_json_key(json_data):
    """Return a copy of *json_data* with every key converted to snake_case."""
    return {key2underline(key): value for key, value in json_data.items()}
+
+
def quick_to_json(text):
    """
    @summary: Quickly convert header text copied from a browser into a dict.
    ---------
    @param text: raw header block, one "Key: value" pair per line
    ---------
    @result: dict of header names to (possibly eval'ed) values
    """

    contents = text.split("\n")
    json = {}
    for content in contents:
        # note: str.split("\n") never yields "\n" itself; blank lines arrive
        # as "" and fall through to the regexes below
        if content == "\n":
            continue

        content = content.strip()
        regex = ["(:?.*?):(.*)", "(.*?):? +(.*)", "([^:]*)"]

        result = get_info(content, regex)
        result = result[0] if isinstance(result[0], tuple) else result
        try:
            # SECURITY: eval() executes arbitrary expressions. This helper is
            # meant for developer-pasted headers only; never feed it untrusted
            # input (ast.literal_eval would be the safe alternative).
            json[result[0]] = eval(result[1].strip())
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; non-evaluable values stay as strings
            json[result[0]] = result[1].strip()

    return json
+
+
+##############################
+
+
def print_pretty(object):
    # Thin wrapper over pprint; note the parameter name shadows the builtin
    # `object` (kept for interface compatibility).
    pprint(object)
+
+
def print_params2json(url):
    """
    Print the query-string parameters of *url* as pretty JSON.

    :param url: full URL; everything after the last "?" is parsed as
        "&"-separated k=v pairs.
    """
    params_json = {}
    params = url.split("?")[-1].split("&")
    for param in params:
        key_value = param.split("=", 1)
        # skip malformed fragments without "=" instead of raising IndexError
        if len(key_value) != 2:
            continue
        params_json[key_value[0]] = key_value[1]

    print(dumps_json(params_json))
+
+
def print_cookie2json(cookie_str_or_list):
    """
    Print cookies as pretty JSON.

    :param cookie_str_or_list: either a raw "k=v; k2=v2" cookie header
        string or a selenium-style list of cookie dicts.
    """
    if isinstance(cookie_str_or_list, str):
        cookie_json = {}
        cookies = cookie_str_or_list.split("; ")
        for cookie in cookies:
            # Split on the FIRST "=" only: cookie values (e.g. base64) may
            # themselves contain "=", which previously raised
            # "too many values to unpack" here.
            name, value = cookie.split("=", 1)
            cookie_json[name] = value
    else:
        cookie_json = get_cookies_from_selenium_cookie(cookie_str_or_list)

    print(dumps_json(cookie_json))
+
+
+###############################
+
+
def flatten(x):
    """flatten(sequence) -> list
    Returns a single, flat list which contains all elements retrieved
    from the sequence and all recursively contained sub-sequences
    (iterables). Strings and bytes are treated as atoms, not sequences.
    Examples:
    >>> flatten([1, 2, [3,4], (5,6)])
    [1, 2, 3, 4, 5, 6]
    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
    >>> flatten(["foo", "bar"])
    ['foo', 'bar']
    >>> flatten(["foo", ["baz", 42], "bar"])
    ['foo', 'baz', 42, 'bar']
    """
    return list(iflatten(x))


def iflatten(x):
    """iflatten(sequence) -> iterator
    Similar to ``.flatten()``, but returns iterator instead"""
    # mutually recursive with flatten(); each nested list-like is fully
    # flattened before its elements are yielded
    for el in x:
        if _is_listlike(el):
            for el_ in flatten(el):
                yield el_
        else:
            yield el


def _is_listlike(x):
    """
    True for any iterable except text/bytes (which iterate per character).

    >>> _is_listlike("foo")
    False
    >>> _is_listlike(5)
    False
    >>> _is_listlike(b"foo")
    False
    >>> _is_listlike([b"foo"])
    True
    >>> _is_listlike((b"foo",))
    True
    >>> _is_listlike({})
    True
    >>> _is_listlike(set())
    True
    >>> _is_listlike((x for x in range(3)))
    True
    >>> _is_listlike(six.moves.xrange(5))
    True
    """
    return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
+
+
+###################
+
+
def re_def_supper_class(obj, supper_class):
    """
    Re-assign the base class of a class at runtime.
    @param obj: a class (e.g. A) or an instance's class (a.__class__)
    @param supper_class: the new (single) base class
    @return: None; mutates obj in place
    """
    # NOTE(review): assigning __bases__ replaces ALL existing bases with this
    # one, and CPython rejects it for some layouts (e.g. classes inheriting
    # directly from object with incompatible slots) - use with care.
    obj.__bases__ = (supper_class,)
+
+
+###################
# In-memory fallback store used when redis is unreachable: key -> last-hit time
freq_limit_record = {}


def reach_freq_limit(rate_limit, *key):
    """
    Rate limiting backed by redis, with an in-process fallback.
    :param rate_limit: window length in seconds; 0 disables limiting
    :param key: values joined (via md5) into the rate-limit key
    :return: True when the key was hit within the window, else False
        (and the hit is recorded as a side effect)
    """
    if rate_limit == 0:
        return False

    msg_md5 = get_md5(*key)
    key = "rate_limit:{}".format(msg_md5)
    try:
        if get_redisdb().get(key):
            return True

        # redis TTL expires the key automatically after rate_limit seconds
        get_redisdb().set(key, time.time(), ex=rate_limit)
    except redis.exceptions.ConnectionError as e:
        # redis unavailable: fall back to the process-local dict above
        # (per-process only, and entries are never purged)
        global freq_limit_record

        if key not in freq_limit_record:
            freq_limit_record[key] = time.time()
            return False

        if time.time() - freq_limit_record.get(key) < rate_limit:
            return True
        else:
            freq_limit_record[key] = time.time()

    return False
+
+
def wechat_warning(
    message,
    message_prefix=None,
    rate_limit=None,
    url=None,
    user_phone=None,
    all_users: bool = None,
):
    """Send an alert to an Enterprise WeChat (WeCom) group-bot webhook.

    :param message: alert text
    :param message_prefix: key used for rate limiting (defaults to message)
    :param rate_limit: min seconds between identical alerts
    :param url: webhook URL (defaults to setting.WECHAT_WARNING_URL)
    :param user_phone: phone number(s) to @-mention
    :param all_users: mention @all when True or when no phones are given
    :return: True on success, False on send failure, None when skipped
    """

    # Resolve defaults at call time so the latest settings are picked up.
    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
    url = url or setting.WECHAT_WARNING_URL
    user_phone = user_phone or setting.WECHAT_WARNING_PHONE
    all_users = all_users if all_users is not None else setting.WECHAT_WARNING_ALL

    if isinstance(user_phone, str):
        user_phone = [user_phone] if user_phone else []

    if all_users is True or not user_phone:
        user_phone = ["@all"]

    if not all([url, message]):
        return

    if reach_freq_limit(rate_limit, url, user_phone, message_prefix or message):
        log.info("报警时间间隔过短,此次报警忽略。 内容 {}".format(message))
        return

    data = {
        "msgtype": "text",
        "text": {"content": message, "mentioned_mobile_list": user_phone},
    }

    headers = {"Content-Type": "application/json"}

    try:
        response = requests.post(
            url, headers=headers, data=json.dumps(data).encode("utf8")
        )
        result = response.json()
        response.close()
        # WeCom webhook responds {"errcode": 0, "errmsg": "ok"} on success
        if result.get("errcode") == 0:
            return True
        else:
            raise Exception(result.get("errmsg"))
    except Exception as e:
        log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
        return False
+
+
+###################
+
+
def make_item(cls, data: dict):
    """Instantiate *cls* and copy every entry of *data* onto it as attributes.

    :param cls: the Item class to instantiate (no-arg constructor)
    :param data: attribute values keyed by attribute name
    :return: the populated instance
    """
    instance = cls()
    for field, value in data.items():
        setattr(instance, field, value)
    return instance
+
+
+###################
+
+
def aio_wrap(loop=None, executor=None):
    """
    Decorator factory: wrap a normal sync function into an async version
    that runs the call in an executor.

    :param loop: event loop to use (default: the current one at call time)
    :param executor: executor to run in (default: the loop's default executor)
    """
    outer_loop = loop
    outer_executor = executor

    def wrap(fn):
        @wraps(fn)
        async def run(*args, loop=None, executor=None, **kwargs):
            # per-call loop/executor override the factory-level ones
            if loop is None:
                if outer_loop is None:
                    # NOTE(review): get_event_loop() is deprecated outside a
                    # running loop on Python 3.10+ - confirm call context
                    loop = asyncio.get_event_loop()
                else:
                    loop = outer_loop
            if executor is None:
                executor = outer_executor
            # bind args now; run_in_executor only accepts a plain callable
            pfunc = partial(fn, *args, **kwargs)
            return await loop.run_in_executor(executor, pfunc)

        return run

    return wrap
+
+
+######### number ##########
+
+
def ensure_int(n):
    """
    Coerce *n* to int, mapping falsy values (None, False, "", 0) to 0.

    >>> ensure_int(None)
    0
    >>> ensure_int(False)
    0
    >>> ensure_int(12)
    12
    >>> ensure_int("72")
    72
    >>> ensure_int('')
    0
    >>> ensure_int('1')
    1
    """
    return int(n) if n else 0
+
+
def ensure_float(n):
    """
    Coerce *n* to float, mapping falsy values (None, False, "", 0) to 0.0.

    >>> ensure_float(None)
    0.0
    >>> ensure_float(False)
    0.0
    >>> ensure_float(12)
    12.0
    >>> ensure_float("72")
    72.0
    """
    return float(n) if n else 0.0
+
+
def ensure_int64(n):
    """
    Coerce *n* to a BSON Int64, mapping falsy values to Int64(0).
    (Doctests previously called ensure_float by mistake; fixed.)

    >>> ensure_int64(None)
    0
    >>> ensure_int64(False)
    0
    >>> ensure_int64(12)
    12
    >>> ensure_int64("72")
    72
    """
    if not n:
        return bson.int64.Int64(0)
    return bson.int64.Int64(n)
+
+
def import_cls(cls_info):
    """Import and return an attribute by dotted path, e.g. "pkg.mod.Class"."""
    module_path, attr_name = cls_info.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), attr_name)

+ 12 - 0
A数据处理/site_monitor/utils/webdriver/__init__.py

@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/9/7 4:39 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+from .playwright_driver import PlaywrightDriver
+from .webdirver import InterceptRequest, InterceptResponse
+from .webdriver_pool import WebDriverPool

+ 300 - 0
A数据处理/site_monitor/utils/webdriver/playwright_driver.py

@@ -0,0 +1,300 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/9/7 4:11 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import json
+import os
+import re
+from collections import defaultdict
+from typing import Union, List
+
+try:
+    from typing import Literal  # python >= 3.8
+except ImportError:  # python <3.8
+    from typing_extensions import Literal
+
+
+from playwright.sync_api import Page, BrowserContext, ViewportSize, ProxySettings
+from playwright.sync_api import Playwright, Browser
+from playwright.sync_api import Response
+from playwright.sync_api import sync_playwright
+
+from utils import tools
+from utils.log import logger as log
+from utils.webdriver.webdirver import *
+
+
class PlaywrightDriver(WebDriver):
    """Sync-Playwright implementation of the WebDriver base class, with
    optional response interception keyed by URL regexes."""

    def __init__(
        self,
        *,
        page_on_event_callback: dict = None,
        storage_state_path: str = None,
        driver_type: Literal["chromium", "firefox", "webkit"] = "webkit",
        url_regexes: list = None,
        save_all: bool = False,
        **kwargs
    ):
        """

        Args:
            page_on_event_callback: page.on() event callbacks, e.g. page_on_event_callback={"dialog": lambda dialog: dialog.accept()}
            storage_state_path: path where browser storage state (cookies etc.) is persisted
            driver_type: browser engine: chromium, firefox or webkit
            url_regexes: URL patterns (regex list) whose responses are intercepted
            save_all: keep every intercepted response per pattern; default keeps only the last one
            **kwargs: forwarded to the WebDriver base class
        """
        super(PlaywrightDriver, self).__init__(**kwargs)
        self.driver: Playwright = None
        self.browser: Browser = None
        self.context: BrowserContext = None
        self.page: Page = None
        self.url = None
        self.storage_state_path = storage_state_path

        self._driver_type = driver_type
        self._page_on_event_callback = page_on_event_callback
        self._url_regexes = url_regexes
        self._save_all = save_all

        if self._save_all and self._url_regexes:
            log.warning(
                "获取完拦截的数据后, 请主动调用PlaywrightDriver的clear_cache()方法清空拦截的数据,否则数据会一直累加,导致内存溢出"
            )
            # save_all mode: regex -> list[InterceptResponse]
            self._cache_data = defaultdict(list)
        else:
            # default mode: regex -> last InterceptResponse only
            self._cache_data = {}

        self._setup()

    def _setup(self):
        # Normalize constructor parameters (proxy / user_agent may be callables)
        if self._proxy:
            proxy = self._proxy() if callable(self._proxy) else self._proxy
            proxy = self.format_context_proxy(proxy)
        else:
            proxy = None

        user_agent = (
            self._user_agent() if callable(self._user_agent) else self._user_agent
        )

        view_size = ViewportSize(
            width=self._window_size[0], height=self._window_size[1]
        )

        # Create the browser / context / page chain
        self.driver = sync_playwright().start()
        self.browser = getattr(self.driver, self._driver_type).launch(
            headless=self._headless,
            args=["--no-sandbox"],
            proxy=proxy,
            executable_path=self._executable_path,
            downloads_path=self._download_path,
        )

        # Reuse a previously saved storage state when the file exists
        if self.storage_state_path and os.path.exists(self.storage_state_path):
            self.context = self.browser.new_context(
                user_agent=user_agent,
                screen=view_size,
                viewport=view_size,
                proxy=proxy,
                storage_state=self.storage_state_path,
                ignore_https_errors=True
            )
        else:
            self.context = self.browser.new_context(
                user_agent=user_agent,
                screen=view_size,
                viewport=view_size,
                proxy=proxy,
                ignore_https_errors=True
            )

        if self._use_stealth_js:
            # stealth.min.js masks common headless-automation fingerprints
            path = os.path.join(os.path.dirname(__file__), "../js/stealth.min.js")
            self.context.add_init_script(path=path)

        self.page = self.context.new_page()
        # Playwright timeouts are in milliseconds
        self.page.set_default_timeout(self._timeout * 1000)

        if self._page_on_event_callback:
            for event, callback in self._page_on_event_callback.items():
                self.page.on(event, callback)

        if self._url_regexes:
            self.page.on("response", self.on_response)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_val:
            log.error(exc_val)

        self.quit()
        # NOTE(review): returning True suppresses ANY exception raised inside
        # the with-block - callers only ever see the log line above.
        return True

    def format_context_proxy(self, proxy) -> ProxySettings:
        """
        Args:
            proxy: username:password@ip:port / ip:port
        Returns:
            {
                "server": "ip:port"
                "username": username,
                "password": password,
            }
            server: http://ip:port or socks5://ip:port. Short form ip:port is considered an HTTP proxy.
        """

        if "@" in proxy:
            certification, _proxy = proxy.split("@")
            username, password = certification.split(":")

            context_proxy = ProxySettings(
                server=_proxy,
                username=username,
                password=password,
            )
        else:
            context_proxy = ProxySettings(server=proxy)

        return context_proxy

    def save_storage_stage(self):
        # Persist cookies/localStorage to storage_state_path (no-op when the
        # path was not configured). Name keeps the original "stage" typo for
        # backward compatibility with existing callers.
        if self.storage_state_path:
            os.makedirs(os.path.dirname(self.storage_state_path), exist_ok=True)
            self.context.storage_state(path=self.storage_state_path)

    def quit(self):
        # NOTE(review): not wrapped in try/finally - if page.close() raises,
        # the browser process and playwright driver are left running.
        self.page.close()
        self.context.close()
        self.browser.close()
        self.driver.stop()

    @property
    def domain(self):
        # Domain of the explicitly tracked url, falling back to the page's url
        return tools.get_domain(self.url or self.page.url)

    @property
    def cookies(self):
        # Flatten the context's cookie list into a simple {name: value} dict
        cookies_json = {}
        for cookie in self.page.context.cookies():
            cookies_json[cookie["name"]] = cookie["value"]

        return cookies_json

    @cookies.setter
    def cookies(self, val: Union[dict, List[dict]]):
        """
        Set cookies on the browser context.
        Args:
            val: List[{name: str, value: str, url: Union[str, NoneType], domain: Union[str, NoneType], path: Union[str, NoneType], expires: Union[float, NoneType], httpOnly: Union[bool, NoneType], secure: Union[bool, NoneType], sameSite: Union["Lax", "None", "Strict", NoneType]}]
                or a plain {name: value} dict, scoped to the current url.

        Returns:

        """
        if isinstance(val, list):
            self.page.context.add_cookies(val)
        else:
            cookies = []
            for key, value in val.items():
                cookies.append(
                    {"name": key, "value": value, "url": self.url or self.page.url}
                )
            self.page.context.add_cookies(cookies)

    @property
    def user_agent(self):
        # Actual UA as seen by the page's JS environment
        return self.page.evaluate("() => navigator.userAgent")

    def on_response(self, response: Response):
        # Called for EVERY response; records those whose request URL matches
        # one of the configured regexes.
        for regex in self._url_regexes:
            if re.search(regex, response.request.url):
                intercept_request = InterceptRequest(
                    url=response.request.url,
                    headers=response.request.headers,
                    data=response.request.post_data,
                )

                intercept_response = InterceptResponse(
                    request=intercept_request,
                    url=response.url,
                    headers=response.headers,
                    content=response.body(),
                    status_code=response.status,
                )
                if self._save_all:
                    self._cache_data[regex].append(intercept_response)
                else:
                    self._cache_data[regex] = intercept_response

    def get_response(self, url_regex) -> InterceptResponse:
        # Last intercepted response for the given pattern (or None)
        if self._save_all:
            response_list = self._cache_data.get(url_regex)
            if response_list:
                return response_list[-1]
        return self._cache_data.get(url_regex)

    def get_all_response(self, url_regex) -> List[InterceptResponse]:
        """
        Return every matched response; only meaningful when save_all=True
        (otherwise the single cached response is wrapped in a list).
        Args:
            url_regex:

        Returns:

        """
        response_list = self._cache_data.get(url_regex, [])
        if not isinstance(response_list, list):
            return [response_list]
        return response_list

    def get_text(self, url_regex):
        # Decoded body of the last matched response, or None
        return (
            self.get_response(url_regex).content.decode()
            if self.get_response(url_regex)
            else None
        )

    def get_all_text(self, url_regex):
        """
        Decoded bodies of every matched response; only meaningful when
        save_all=True.
        Args:
            url_regex:

        Returns:

        """
        return [
            response.content.decode() for response in self.get_all_response(url_regex)
        ]

    def get_json(self, url_regex):
        # JSON-parsed body of the last matched response, or None
        return (
            json.loads(self.get_text(url_regex))
            if self.get_response(url_regex)
            else None
        )

    def get_all_json(self, url_regex):
        """
        JSON-parsed bodies of every matched response; only meaningful when
        save_all=True.
        Args:
            url_regex:

        Returns:

        """
        return [json.loads(text) for text in self.get_all_text(url_regex)]

    def clear_cache(self):
        # Drop all intercepted data. Always resets to a defaultdict(list);
        # harmless in non-save_all mode since access goes through .get().
        self._cache_data = defaultdict(list)

+ 81 - 0
A数据处理/site_monitor/utils/webdriver/webdirver.py

@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2022/9/7 4:27 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import abc
+
+import setting as setting
+
+
class InterceptRequest:
    """Immutable-by-convention snapshot of an intercepted outgoing request."""

    def __init__(self, url, data, headers):
        self.url = url
        # request post body (None for GET requests)
        self.data = data
        self.headers = headers


class InterceptResponse:
    """Snapshot of an intercepted response, paired with its request."""

    def __init__(self, request: InterceptRequest, url, headers, content, status_code):
        # the InterceptRequest that produced this response
        self.request = request
        self.url = url
        self.headers = headers
        # raw response body (bytes)
        self.content = content
        self.status_code = status_code
+
+
class WebDriver:
    # NOTE(review): @abc.abstractmethod below is NOT enforced because this
    # class does not use abc.ABCMeta / inherit abc.ABC - subclasses are
    # expected, but instantiation is not blocked.
    def __init__(
        self,
        load_images=True,
        user_agent=None,
        proxy=None,
        headless=False,
        driver_type=None,
        timeout=16,
        window_size=(1024, 800),
        executable_path=None,
        custom_argument=None,
        download_path=None,
        auto_install_driver=True,
        use_stealth_js=True,
        **kwargs,
    ):
        """
        Webdriver wrapper base class (chrome, phantomjs and firefox).
        Args:
            load_images: whether to load images
            user_agent: a string, or a no-arg callable returning the user agent
            proxy: xxx.xxx.xxx.xxx:xxxx, or a no-arg callable returning a proxy address
            headless: run the browser headless
            driver_type: CHROME, PHANTOMJS or FIREFOX
            timeout: request timeout in seconds
            window_size: browser window size
            executable_path: browser binary path (default: system default)
            custom_argument: extra args for webdriver.Chrome(options=chrome_options, **kwargs)
            download_path: download directory; when set, suppresses the keep/discard
                prompt (Chrome only)
            auto_install_driver: auto-download the browser driver (chrome and firefox)
            use_stealth_js: inject stealth.min.js to hide automation fingerprints
            **kwargs: stored for subclasses; not interpreted here
        """
        self._load_images = load_images
        self._user_agent = user_agent or setting.DEFAULT_USERAGENT
        self._proxy = proxy
        self._headless = headless
        self._timeout = timeout
        self._window_size = window_size
        self._executable_path = executable_path
        self._custom_argument = custom_argument
        self._download_path = download_path
        self._auto_install_driver = auto_install_driver
        self._use_stealth_js = use_stealth_js
        self._driver_type = driver_type
        self._kwargs = kwargs

    @abc.abstractmethod
    def quit(self):
        # Subclasses must release all browser resources here.
        pass

+ 115 - 0
A数据处理/site_monitor/utils/webdriver/webdriver_pool.py

@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/3/18 4:59 下午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import queue
+import threading
+
+from utils.log import logger as log
+from utils.tools import Singleton
+from utils.webdriver.playwright_driver import PlaywrightDriver
+
+
+@Singleton
+class WebDriverPool:
+    """Process-wide (Singleton-decorated) pool of browser drivers.
+
+    Two modes:
+      * pooled (thread_safe=False): up to ``pool_size`` drivers shared via a
+        blocking queue;
+      * thread-local (thread_safe=True): one driver per thread, ``pool_size``
+        is ignored.
+    """
+
+    def __init__(
+        self, pool_size=5, driver_cls=PlaywrightDriver, thread_safe=False, **kwargs
+    ):
+        """
+
+        Args:
+            pool_size: size of the driver pool
+            driver_cls: driver class to instantiate
+            thread_safe: whether to be thread safe
+                If True, each thread owns its own driver; pool_size is ignored
+                and the driver count equals the number of threads.
+                If False, threads borrow drivers from the shared pool.
+            **kwargs: default construction arguments for driver_cls
+        """
+        self.pool_size = pool_size
+        self.driver_cls = driver_cls
+        self.thread_safe = thread_safe
+        self.kwargs = kwargs
+
+        # Shared pool for the non-thread-safe mode; get() blocks when empty.
+        self.queue = queue.Queue(maxsize=pool_size)
+        self.lock = threading.RLock()
+        # Total drivers ever created and not yet removed (both modes).
+        self.driver_count = 0
+        # Thread-local storage for the thread_safe mode.
+        self.ctx = threading.local()
+
+    @property
+    def driver(self):
+        # Lazily initialize the thread-local slot so first access never raises.
+        if not hasattr(self.ctx, "driver"):
+            self.ctx.driver = None
+        return self.ctx.driver
+
+    @driver.setter
+    def driver(self, driver):
+        self.ctx.driver = driver
+
+    @property
+    def is_full(self):
+        # True once the pool has reached its configured capacity.
+        return self.driver_count >= self.pool_size
+
+    def create_driver(self, user_agent: str = None, proxy: str = None):
+        # Copy so per-call overrides never mutate the pool-wide defaults.
+        kwargs = self.kwargs.copy()
+        if user_agent:
+            kwargs["user_agent"] = user_agent
+        if proxy:
+            kwargs["proxy"] = proxy
+        return self.driver_cls(**kwargs)
+
+    def get(self, user_agent: str = None, proxy: str = None):
+        """
+        Fetch a webdriver.
+        user_agent and proxy are only applied when a brand-new driver has to
+        be created; a reused driver keeps its original settings.
+        Args:
+            user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
+            proxy: xxx.xxx.xxx.xxx
+        Returns:
+            A driver instance (from the pool, or thread-local when thread_safe).
+        """
+        # Double-checked locking: re-test is_full under the lock so only one
+        # thread creates a driver when the pool is near capacity.
+        if not self.is_full and not self.thread_safe:
+            with self.lock:
+                if not self.is_full:
+                    driver = self.create_driver(user_agent, proxy)
+                    self.queue.put(driver)
+                    self.driver_count += 1
+        elif self.thread_safe:
+            if not self.driver:
+                driver = self.create_driver(user_agent, proxy)
+                self.driver = driver
+                self.driver_count += 1
+
+        if self.thread_safe:
+            driver = self.driver
+        else:
+            # Blocks until a driver is available (returned via put()).
+            driver = self.queue.get()
+
+        return driver
+
+    def put(self, driver):
+        # Return a borrowed driver to the pool; no-op for thread-local drivers.
+        if not self.thread_safe:
+            self.queue.put(driver)
+
+    def remove(self, driver):
+        # Quit a driver and drop it from the pool's bookkeeping.
+        # NOTE(review): in thread_safe mode driver_count is decremented even
+        # when the current thread has no driver — confirm this is intended.
+        if self.thread_safe:
+            if self.driver:
+                self.driver.quit()
+                self.driver = None
+        else:
+            driver.quit()
+        self.driver_count -= 1
+
+    def close(self):
+        # Drain the shared queue and quit every pooled driver. In thread_safe
+        # mode the queue is unused (empty), so only the log line takes effect.
+        if self.thread_safe:
+            log.info("Closing thread-safe (per-thread) drivers is not supported yet")
+
+        while not self.queue.empty():
+            driver = self.queue.get()
+            driver.quit()
+            self.driver_count -= 1

部分文件因为文件数量过多而无法显示