
Merge branch 'master' of http://192.168.3.207:8080/data_processing/crawlab_feader

lizongze 2 years ago
parent commit 193312b55e
30 files changed, 0 additions and 7242 deletions
  1. A数据处理/site_monitor/.gitignore  +0 -141
  2. A数据处理/site_monitor/README.md  +0 -19
  3. A数据处理/site_monitor/db/__init__.py  +0 -9
  4. A数据处理/site_monitor/db/mongodb.py  +0 -422
  5. A数据处理/site_monitor/db/redisdb.py  +0 -924
  6. A数据处理/site_monitor/docker/Dockerfile  +0 -35
  7. A数据处理/site_monitor/docker/docker-compose.yml  +0 -17
  8. A数据处理/site_monitor/monitor.py  +0 -205
  9. A数据处理/site_monitor/network/__init__.py  +0 -8
  10. A数据处理/site_monitor/network/downloader/__init__.py  +0 -3
  11. A数据处理/site_monitor/network/downloader/_playwright.py  +0 -104
  12. A数据处理/site_monitor/network/downloader/_requests.py  +0 -46
  13. A数据处理/site_monitor/network/downloader/base.py  +0 -41
  14. A数据处理/site_monitor/network/proxy_file/de9f83d546a39eca6979d2a6dca3407a.txt  +0 -32
  15. A数据处理/site_monitor/network/proxy_pool.py  +0 -746
  16. A数据处理/site_monitor/network/request.py  +0 -524
  17. A数据处理/site_monitor/network/response.py  +0 -414
  18. A数据处理/site_monitor/network/user_agent.py  +0 -389
  19. A数据处理/site_monitor/requirements.txt  +0 -14
  20. A数据处理/site_monitor/setting.py  +0 -65
  21. A数据处理/site_monitor/utils/__init__.py  +0 -8
  22. A数据处理/site_monitor/utils/clean_html.py  +0 -147
  23. A数据处理/site_monitor/utils/js/intercept.js  +0 -0
  24. A数据处理/site_monitor/utils/js/stealth.min.js  +0 -6
  25. A数据处理/site_monitor/utils/log.py  +0 -14
  26. A数据处理/site_monitor/utils/tools.py  +0 -2401
  27. A数据处理/site_monitor/utils/webdriver/__init__.py  +0 -12
  28. A数据处理/site_monitor/utils/webdriver/playwright_driver.py  +0 -300
  29. A数据处理/site_monitor/utils/webdriver/webdirver.py  +0 -81
  30. A数据处理/site_monitor/utils/webdriver/webdriver_pool.py  +0 -115

+ 0 - 141
A数据处理/site_monitor/.gitignore

@@ -1,141 +0,0 @@
-### Python template
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-.idea

+ 0 - 19
A数据处理/site_monitor/README.md

@@ -1,19 +0,0 @@
-# Source website monitoring
-
-#### Build the image
-```shell
-$ cd site_monitor
-$ docker build -t site_monitor:v1.0 -f docker/Dockerfile .
-```
-
-#### Start the containers
-```shell
-$ cd site_monitor
-$ docker-compose -f docker/docker-compose.yml up -d
-```
-
-#### Stop the containers
-```shell
-$ cd site_monitor
-$ docker-compose -f docker/docker-compose.yml down
-```

+ 0 - 9
A数据处理/site_monitor/db/__init__.py

@@ -1,9 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/23 12:09 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""

+ 0 - 422
A数据处理/site_monitor/db/mongodb.py

@@ -1,422 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: MongoDB database operations
----------
-@author: Mkdir700
-@email:  mkdir700@gmail.com
-"""
-import re
-from typing import List, Dict, Optional
-from urllib import parse
-
-import pymongo
-from pymongo import MongoClient
-from pymongo.collection import Collection
-from pymongo.database import Database
-from pymongo.errors import DuplicateKeyError, BulkWriteError
-
-import setting as setting
-from utils.log import logger as log
-
-
-class MongoDB:
-    def __init__(
-        self,
-        ip=None,
-        port=None,
-        db=None,
-        user_name=None,
-        user_pass=None,
-        url=None,
-        **kwargs,
-    ):
-        if url:
-            self.client = MongoClient(url, **kwargs)
-        else:
-            if not ip:
-                ip = setting.MONGO_IP
-            if not port:
-                port = setting.MONGO_PORT
-            if not db:
-                db = setting.MONGO_DB
-            if not user_name:
-                user_name = setting.MONGO_USER_NAME
-            if not user_pass:
-                user_pass = setting.MONGO_USER_PASS
-            self.client = MongoClient(
-                host=ip, port=port, username=user_name, password=user_pass
-            )
-
-        self.db = self.get_database(db)
-
-        # Cache of index information
-        self.__index__cached = {}
-
-    @classmethod
-    def from_url(cls, url, **kwargs):
-        """
-        Args:
-            url: mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
-                 Reference: http://mongodb.github.io/mongo-java-driver/3.4/javadoc/com/mongodb/MongoClientURI.html
-            **kwargs:
-
-        Returns:
-
-        """
-        url_parsed = parse.urlparse(url)
-
-        db_type = url_parsed.scheme.strip()
-        if db_type != "mongodb":
-            raise Exception(
-                "url error, expect mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]], but get {}".format(
-                    url
-                )
-            )
-
-        return cls(url=url, **kwargs)
-
-    def get_database(self, database, **kwargs) -> Database:
-        """
-        Get a database object
-        @param database: database name
-        @return:
-        """
-        return self.client.get_database(database, **kwargs)
-
-    def get_collection(self, coll_name, **kwargs) -> Collection:
-        """
-        Get a collection object by collection name
-        @param coll_name: collection name
-        @return:
-        """
-        return self.db.get_collection(coll_name, **kwargs)
-
-    def find(
-        self, coll_name: str, condition: Optional[Dict] = None, limit: int = 0, **kwargs
-    ) -> List[Dict]:
-        """
-        @summary:
-        No data: returns []
-        With data: [{'_id': 'xx', ...}, ...]
-        ---------
-        @param coll_name: collection name (table name)
-        @param condition: query condition
-        @param limit: number of results
-        @param kwargs:
-            more parameters: https://docs.mongodb.com/manual/reference/command/find/#command-fields
-
-        ---------
-        @result:
-        """
-        condition = {} if condition is None else condition
-        command = {"find": coll_name, "filter": condition, "limit": limit}
-        command.update(kwargs)
-        result = self.run_command(command)
-        cursor = result["cursor"]
-        cursor_id = cursor["id"]
-        dataset = cursor["firstBatch"]
-        while True:
-            if cursor_id == 0:
-                break
-            result = self.run_command(
-                {
-                    "getMore": cursor_id,
-                    "collection": coll_name,
-                    "batchSize": kwargs.get("batchSize", 100),
-                }
-            )
-            cursor = result["cursor"]
-            cursor_id = cursor["id"]
-            dataset.extend(cursor["nextBatch"])
-        return dataset
-
-    def add(
-        self,
-        coll_name,
-        data: Dict,
-        replace=False,
-        update_columns=(),
-        update_columns_value=(),
-        insert_ignore=False,
-    ):
-        """
-        Add a single record
-        Args:
-            coll_name: collection name
-            data: a single record
-            replace: overwrite the old record on a unique-index conflict, default False
-            update_columns: columns to update on a unique-index conflict, e.g. update_columns = ["name", "title"]
-            update_columns_value: values for the updated columns; if omitted, values from the record itself are used
-            insert_ignore: whether to ignore index conflicts, default False
-
-        Returns: number of rows inserted
-
-        """
-        affect_count = 1
-        collection = self.get_collection(coll_name)
-        try:
-            collection.insert_one(data)
-        except DuplicateKeyError as e:
-            # Record already exists, update it instead
-            if update_columns:
-                if not isinstance(update_columns, (tuple, list)):
-                    update_columns = [update_columns]
-
-                condition = self.__get_update_condition(
-                    coll_name, data, e.details.get("errmsg")
-                )
-
-                # Update the specified columns
-                if update_columns_value:
-                    # Update with the provided values
-                    doc = {
-                        key: value
-                        for key, value in zip(update_columns, update_columns_value)
-                    }
-                else:
-                    # Update with values taken from the record itself
-                    doc = {key: data[key] for key in update_columns}
-
-                collection.update_one(condition, {"$set": doc})
-
-            # Overwrite update
-            elif replace:
-                condition = self.__get_update_condition(
-                    coll_name, data, e.details.get("errmsg")
-                )
-                # Replace the existing record
-                collection.replace_one(condition, data)
-
-            elif not insert_ignore:
-                raise e
-
-        return affect_count
-
-    def add_batch(
-        self,
-        coll_name: str,
-        datas: List[Dict],
-        replace=False,
-        update_columns=(),
-        update_columns_value=(),
-        condition_fields: dict = None,
-    ):
-        """
-        批量添加数据
-        Args:
-            coll_name: 集合名
-            datas: 数据 [{'_id': 'xx'}, ... ]
-            replace:  唯一索引冲突时直接覆盖旧数据,默认为False
-            update_columns: 更新指定的列(如果数据的唯一索引存在,则更新指定字段,如 update_columns = ["name", "title"]
-            update_columns_value: 指定更新的字段对应的值, 不指定则用数据本身的值更新
-            condition_fields: 用于条件查找的字段,不指定则用索引冲突中的字段查找
-
-        Returns: 添加行数,不包含更新
-
-        """
-        add_count = 0
-
-        if not datas:
-            return add_count
-
-        collection = self.get_collection(coll_name)
-        if not isinstance(update_columns, (tuple, list)):
-            update_columns = [update_columns]
-
-        try:
-            add_count = len(datas)
-            collection.insert_many(datas, ordered=False)
-        except BulkWriteError as e:
-            write_errors = e.details.get("writeErrors")
-            for error in write_errors:
-                if error.get("code") == 11000:
-                    # Duplicate record
-                    # Fetch the duplicated record
-                    data = error.get("op")
-
-                    def get_condition():
-                        # Build the update condition
-                        if condition_fields:
-                            condition = {
-                                condition_field: data[condition_field]
-                                for condition_field in condition_fields
-                            }
-                        else:
-                            # Build the update condition from the duplicated values
-                            condition = self.__get_update_condition(
-                                coll_name, data, error.get("errmsg")
-                            )
-
-                        return condition
-
-                    if update_columns:
-                        # Update the specified columns
-                        if update_columns_value:
-                            # Update with the provided values
-                            doc = {
-                                key: value
-                                for key, value in zip(
-                                    update_columns, update_columns_value
-                                )
-                            }
-                        else:
-                            # Update with values taken from the record itself
-                            doc = {key: data.get(key) for key in update_columns}
-
-                        collection.update_one(get_condition(), {"$set": doc})
-                        add_count -= 1
-
-                    elif replace:
-                        # Overwrite update
-                        collection.replace_one(get_condition(), data)
-                        add_count -= 1
-
-                    else:
-                        # log.error(error)
-                        add_count -= 1
-
-        return add_count
-
-    def count(self, coll_name, condition: Optional[Dict], limit=0, **kwargs):
-        """
-        Count
-        @param coll_name: collection name
-        @param condition: query condition
-        @param limit: limit on the count
-        @param kwargs:
-        ----
-        command = {
-          count: <collection or view>,
-          query: <document>,
-          limit: <integer>,
-          skip: <integer>,
-          hint: <hint>,
-          readConcern: <document>,
-          collation: <document>,
-          comment: <any>
-        }
-        https://docs.mongodb.com/manual/reference/command/count/#mongodb-dbcommand-dbcmd.count
-        @return: number of records
-        """
-        command = {"count": coll_name, "query": condition, "limit": limit, **kwargs}
-        result = self.run_command(command)
-        return result["n"]
-
-    def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
-        """
-        Update
-        Args:
-            coll_name: collection name
-            data: a single record, e.g. {"xxx":"xxx"}
-            condition: update condition, e.g. {"_id": "xxxx"}
-            upsert: insert the record if it does not exist, default False
-
-        Returns: True / False
-        """
-        try:
-            collection = self.get_collection(coll_name)
-            collection.update_one(condition, {"$set": data}, upsert=upsert)
-        except Exception as e:
-            log.error(
-                """
-                error:{}
-                condition: {}
-            """.format(
-                    e, condition
-                )
-            )
-            return False
-        else:
-            return True
-
-    def delete(self, coll_name, condition: Dict) -> bool:
-        """
-        Delete
-        Args:
-            coll_name: collection name
-            condition: lookup condition
-        Returns: True / False
-
-        """
-        try:
-            collection = self.get_collection(coll_name)
-            collection.delete_one(condition)
-        except Exception as e:
-            log.error(
-                """
-                error:{}
-                condition: {}
-            """.format(
-                    e, condition
-                )
-            )
-            return False
-        else:
-            return True
-
-    def run_command(self, command: Dict):
-        """
-        Run a database command
-        Reference: https://www.geek-book.com/src/docs/mongodb/mongodb/docs.mongodb.com/manual/reference/command/index.html
-        @param command:
-        @return:
-        """
-        return self.db.command(command)
-
-    def create_index(self, coll_name, keys, unique=True):
-        collection = self.get_collection(coll_name)
-        _keys = [(key, pymongo.ASCENDING) for key in keys]
-        collection.create_index(_keys, unique=unique)
-
-    def get_index(self, coll_name):
-        return self.get_collection(coll_name).index_information()
-
-    def drop_collection(self, coll_name):
-        return self.db.drop_collection(coll_name)
-
-    def get_index_key(self, coll_name, index_name):
-        """
-        Get the keys that participate in an index
-        Args:
-            index_name: index name
-
-        Returns:
-
-        """
-        cache_key = f"{coll_name}:{index_name}"
-
-        if cache_key in self.__index__cached:
-            return self.__index__cached.get(cache_key)
-
-        index = self.get_index(coll_name)
-        index_detail = index.get(index_name)
-        if not index_detail:
-            errmsg = f"not found index {index_name} in collection {coll_name}"
-            raise Exception(errmsg)
-
-        index_keys = [val[0] for val in index_detail.get("key")]
-        self.__index__cached[cache_key] = index_keys
-        return index_keys
-
-    def __get_update_condition(
-        self, coll_name: str, data: dict, duplicate_errmsg: str
-    ) -> dict:
-        """
-        Build the update condition from a duplicate-key error message
-        Args:
-            duplicate_errmsg: E11000 duplicate key error collection: feapder.test index: a_1_b_1 dup key: { : 1, : "你好" }
-            data: {"a": 1, "b": "你好", "c": "嘻嘻"}
-
-        Returns: {"a": 1, "b": "你好"}
-
-        """
-        index_name = re.search(r"index: (\w+)", duplicate_errmsg).group(1)
-        index_keys = self.get_index_key(coll_name, index_name)
-
-        condition = {key: data.get(key) for key in index_keys}
-        return condition
-
-    def __getattr__(self, name):
-        return getattr(self.db, name)
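
For reference, a minimal usage sketch of the MongoDB wrapper in the diff above (not part of the original repository; the connection URL, collection name and field names are hypothetical, and constructor arguments omitted here would otherwise fall back to the values in setting.py):

```python
# Hypothetical usage of the MongoDB wrapper; URL, collection and fields are examples only.
from db.mongodb import MongoDB

db = MongoDB.from_url("mongodb://user:pass@127.0.0.1:27017/site_monitor")

# add() wraps insert_one(); on a duplicate-key error it updates the listed columns instead of failing
db.add(
    "site_monitor",
    {"url": "http://example.com", "tags_count": 120},
    update_columns=["tags_count"],
)

# find() issues the "find" command and keeps calling "getMore" until the cursor is exhausted
rows = db.find("site_monitor", condition={"channel_ischange": True}, limit=10)
print(len(rows))
```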

+ 0 - 924
A数据处理/site_monitor/db/redisdb.py

@@ -1,924 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2016-11-16 16:25
----------
-@summary: Redis database operations
----------
-@author: Boris
-"""
-
-import time
-
-import redis
-from redis._compat import unicode, long, basestring
-from redis.connection import Encoder as _Encoder
-from redis.exceptions import ConnectionError, TimeoutError
-from redis.exceptions import DataError
-from redis.sentinel import Sentinel
-from rediscluster import RedisCluster
-
-import setting as setting
-from utils.log import logger as log
-
-
-class Encoder(_Encoder):
-    def encode(self, value):
-        "Return a bytestring or bytes-like representation of the value"
-        if isinstance(value, (bytes, memoryview)):
-            return value
-        # elif isinstance(value, bool):
-        #     # special case bool since it is a subclass of int
-        #     raise DataError(
-        #         "Invalid input of type: 'bool'. Convert to a "
-        #         "bytes, string, int or float first."
-        #     )
-        elif isinstance(value, float):
-            value = repr(value).encode()
-        elif isinstance(value, (int, long)):
-            # python 2 repr() on longs is '123L', so use str() instead
-            value = str(value).encode()
-        elif isinstance(value, (list, dict, tuple)):
-            value = unicode(value)
-        elif not isinstance(value, basestring):
-            # a value we don't know how to deal with. throw an error
-            typename = type(value).__name__
-            raise DataError(
-                "Invalid input of type: '%s'. Convert to a "
-                "bytes, string, int or float first." % typename
-            )
-        if isinstance(value, unicode):
-            value = value.encode(self.encoding, self.encoding_errors)
-        return value
-
-
-redis.connection.Encoder = Encoder
-
-
-class RedisDB:
-    def __init__(
-        self,
-        ip_ports=None,
-        db=None,
-        user_pass=None,
-        url=None,
-        decode_responses=True,
-        service_name=None,
-        max_connections=1000,
-        **kwargs,
-    ):
-        """
-        Wrapper around redis
-        Args:
-            ip_ports: ip:port; multiple entries may be written as a list or comma separated, e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
-            db:
-            user_pass:
-            url:
-            decode_responses:
-            service_name: used for Redis sentinel mode
-            max_connections: concurrency for one redis object (maximum size of the connection pool); exceeding it raises redis.ConnectionError
-        """
-
-        # Values in setting may change at runtime, so defaults are loaded here rather than bound in the signature
-        if ip_ports is None:
-            ip_ports = setting.REDISDB_IP_PORTS
-        if db is None:
-            db = setting.REDISDB_DB
-        if user_pass is None:
-            user_pass = setting.REDISDB_USER_PASS
-        if service_name is None:
-            service_name = setting.REDISDB_SERVICE_NAME
-
-        self._is_redis_cluster = False
-
-        self.__redis = None
-        self._url = url
-        self._ip_ports = ip_ports
-        self._db = db
-        self._user_pass = user_pass
-        self._decode_responses = decode_responses
-        self._service_name = service_name
-        self._max_connections = max_connections
-        self._kwargs = kwargs
-        self.get_connect()
-
-    def __repr__(self):
-        if self._url:
-            return "<Redisdb url:{}>".format(self._url)
-
-        return "<Redisdb ip_ports: {} db:{} user_pass:{}>".format(
-            self._ip_ports, self._db, self._user_pass
-        )
-
-    @property
-    def _redis(self):
-        try:
-            if not self.__redis.ping():
-                raise ConnectionError("unable to connect to redis")
-        except:
-            self._reconnect()
-
-        return self.__redis
-
-    @_redis.setter
-    def _redis(self, val):
-        self.__redis = val
-
-    def get_connect(self):
-        # Get a database connection
-        try:
-            if not self._url:
-                if not self._ip_ports:
-                    raise ConnectionError("未设置 redis 连接信息")
-
-                ip_ports = (
-                    self._ip_ports
-                    if isinstance(self._ip_ports, list)
-                    else self._ip_ports.split(",")
-                )
-                if len(ip_ports) > 1:
-                    startup_nodes = []
-                    for ip_port in ip_ports:
-                        ip, port = ip_port.split(":")
-                        startup_nodes.append({"host": ip, "port": port})
-
-                    if self._service_name:
-                        # log.debug("使用redis哨兵模式")
-                        hosts = [(node["host"], node["port"]) for node in startup_nodes]
-                        sentinel = Sentinel(hosts, socket_timeout=3, **self._kwargs)
-                        self._redis = sentinel.master_for(
-                            self._service_name,
-                            password=self._user_pass,
-                            db=self._db,
-                            redis_class=redis.StrictRedis,
-                            decode_responses=self._decode_responses,
-                            max_connections=self._max_connections,
-                            **self._kwargs,
-                        )
-
-                    else:
-                        # log.debug("使用redis集群模式")
-                        self._redis = RedisCluster(
-                            startup_nodes=startup_nodes,
-                            decode_responses=self._decode_responses,
-                            password=self._user_pass,
-                            max_connections=self._max_connections,
-                            **self._kwargs,
-                        )
-
-                    self._is_redis_cluster = True
-                else:
-                    ip, port = ip_ports[0].split(":")
-                    self._redis = redis.StrictRedis(
-                        host=ip,
-                        port=port,
-                        db=self._db,
-                        password=self._user_pass,
-                        decode_responses=self._decode_responses,
-                        max_connections=self._max_connections,
-                        **self._kwargs,
-                    )
-                    self._is_redis_cluster = False
-            else:
-                self._redis = redis.StrictRedis.from_url(
-                    self._url, decode_responses=self._decode_responses
-                )
-                self._is_redis_cluster = False
-
-        except Exception as e:
-            raise e
-
-        # Do not write self._redis.ping() here, otherwise the property recurses
-        return self.__redis.ping()
-
-    @classmethod
-    def from_url(cls, url):
-        """
-
-        Args:
-            url: redis://[[username]:[password]]@[host]:[port]/[db]
-
-        Returns:
-
-        """
-        return cls(url=url)
-
-    def sadd(self, table, values):
-        """
-        @summary: Store data in an unordered set, deduplicated
-        ---------
-        @param table:
-        @param values: value(s); a list or a single value
-        ---------
-        @result: returns 0 if the value already exists, otherwise stores it and returns 1. Batch adds return None
-        """
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.sadd(table, value)
-            pipe.execute()
-
-        else:
-            return self._redis.sadd(table, values)
-
-    def sget(self, table, count=1, is_pop=True):
-        """
-        Returns a list, e.g. ['1'] or []
-        @param table:
-        @param count:
-        @param is_pop:
-        @return:
-        """
-
-        datas = []
-        if is_pop:
-            count = count if count <= self.sget_count(table) else self.sget_count(table)
-            if count:
-                if count > 1:
-                    pipe = self._redis.pipeline()
-
-                    if not self._is_redis_cluster:
-                        pipe.multi()
-                    while count:
-                        pipe.spop(table)
-                        count -= 1
-                    datas = pipe.execute()
-
-                else:
-                    datas.append(self._redis.spop(table))
-
-        else:
-            datas = self._redis.srandmember(table, count)
-
-        return datas
-
-    def srem(self, table, values):
-        """
-        @summary: Remove the given members from the set
-        ---------
-        @param table:
-        @param values: a single value or a list
-        ---------
-        @result:
-        """
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.srem(table, value)
-            pipe.execute()
-        else:
-            self._redis.srem(table, values)
-
-    def sget_count(self, table):
-        return self._redis.scard(table)
-
-    def sdelete(self, table):
-        """
-        @summary: Delete a large set key (a table holding a lot of data)
-        Large set keys are deleted with SSCAN, scanning 500 members at a time and removing each with SREM.
-        Deleting such a key directly with DELETE can block Redis and lead to failovers and application crashes.
-        ---------
-        @param table:
-        ---------
-        @result:
-        """
-
-        # When the SCAN cursor argument is 0 the server starts a new iteration; when the server returns a cursor of 0, the iteration has finished
-        cursor = "0"
-        while cursor != 0:
-            cursor, data = self._redis.sscan(table, cursor=cursor, count=500)
-            for item in data:
-                # pipe.srem(table, item)
-                self._redis.srem(table, item)
-
-            # pipe.execute()
-
-    def sismember(self, table, key):
-        "Return a boolean indicating if ``value`` is a member of set ``name``"
-        return self._redis.sismember(table, key)
-
-    def zadd(self, table, values, prioritys=0):
-        """
-        @summary: Store data in a sorted set, deduplicated (existing values are updated)
-        ---------
-        @param table:
-        @param values: value(s); a list or a single value
-        @param prioritys: priority; double, a list or a single value. Members are ordered by this score, lower is higher priority. Optional, defaults to 0
-        ---------
-        @result: returns 0 if the value already exists, otherwise stores it and returns 1. Batch adds return [0, 1 ...]
-        """
-        if isinstance(values, list):
-            if not isinstance(prioritys, list):
-                prioritys = [prioritys] * len(values)
-            else:
-                assert len(values) == len(prioritys), "values值要与prioritys值一一对应"
-
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value, priority in zip(values, prioritys):
-                pipe.execute_command(
-                    "ZADD", table, priority, value
-                )  # 为了兼容2.x与3.x版本的redis
-            return pipe.execute()
-
-        else:
-            return self._redis.execute_command(
-                "ZADD", table, prioritys, values
-            )  # 为了兼容2.x与3.x版本的redis
-
-    def zget(self, table, count=1, is_pop=True):
-        """
-        @summary: Fetch data from the sorted set, lowest scores (highest priority) first
-        ---------
-        @param table:
-        @param count: number of items; -1 returns everything
-        @param is_pop: whether to delete the fetched items from the set, default True
-        ---------
-        @result: list
-        """
-
-        start_pos = 0  # 包含
-        end_pos = count - 1 if count > 0 else count
-
-        pipe = self._redis.pipeline()
-
-        if not self._is_redis_cluster:
-            pipe.multi()  # 标记事务的开始 参考 http://www.runoob.com/redis/redis-transactions.html
-        pipe.zrange(table, start_pos, end_pos)  # 取值
-        if is_pop:
-            pipe.zremrangebyrank(table, start_pos, end_pos)  # 删除
-        results, *count = pipe.execute()
-        return results
-
-    def zremrangebyscore(self, table, priority_min, priority_max):
-        """
-        根据分数移除成员 闭区间
-        @param table:
-        @param priority_min:
-        @param priority_max:
-        @return: 被移除的成员个数
-        """
-        return self._redis.zremrangebyscore(table, priority_min, priority_max)
-
-    def zrangebyscore(self, table, priority_min, priority_max, count=None, is_pop=True):
-        """
-        @summary: Return data within the given score range (inclusive on both ends)
-        ---------
-        @param table:
-        @param priority_min: lower scores have higher priority
-        @param priority_max:
-        @param count: number of items to fetch; empty means everything within the score range
-        @param is_pop: whether to delete the fetched items
-        ---------
-        @result:
-        """
-
-        # Use a Lua script to keep the operation atomic
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[2]
-            local max_score = ARGV[3]
-            local is_pop = ARGV[4]
-            local count = ARGV[5]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
-            end
-
-            -- 删除redis中刚取到的值
-            if (is_pop=='True' or is_pop=='1') then
-                for i=1, #datas do
-                    redis.call('zrem', KEYS[1], datas[i])
-                end
-            end
-
-
-            return datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(
-                keys=[table], args=[table, priority_min, priority_max, is_pop, count]
-            )
-        else:
-            res = cmd(keys=[table], args=[table, priority_min, priority_max, is_pop])
-
-        return res
-
-    def zrangebyscore_increase_score(
-        self, table, priority_min, priority_max, increase_score, count=None
-    ):
-        """
-        @summary: 返回指定分数区间的数据 闭区间, 同时修改分数
-        ---------
-        @param table:
-        @param priority_min: 最小分数
-        @param priority_max: 最大分数
-        @param increase_score: 分数值增量 正数则在原有的分数上叠加,负数则相减
-        @param count: 获取的数量,为空则表示分数区间内的全部数据
-        ---------
-        @result:
-        """
-
-        # 使用lua脚本, 保证操作的原子性
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[1]
-            local max_score = ARGV[2]
-            local increase_score = ARGV[3]
-            local count = ARGV[4]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
-            end
-
-            --修改优先级
-            for i=1, #datas do
-                redis.call('zincrby', KEYS[1], increase_score, datas[i])
-            end
-
-            return datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(
-                keys=[table], args=[priority_min, priority_max, increase_score, count]
-            )
-        else:
-            res = cmd(keys=[table], args=[priority_min, priority_max, increase_score])
-
-        return res
-
-    def zrangebyscore_set_score(
-        self, table, priority_min, priority_max, score, count=None
-    ):
-        """
-        @summary: 返回指定分数区间的数据 闭区间, 同时修改分数
-        ---------
-        @param table:
-        @param priority_min: 最小分数
-        @param priority_max: 最大分数
-        @param score: 分数值
-        @param count: 获取的数量,为空则表示分数区间内的全部数据
-        ---------
-        @result:
-        """
-
-        # 使用lua脚本, 保证操作的原子性
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[1]
-            local max_score = ARGV[2]
-            local set_score = ARGV[3]
-            local count = ARGV[4]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores','limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores')
-            end
-
-            local real_datas = {} -- 数据
-            --修改优先级
-            for i=1, #datas, 2 do
-               local data = datas[i]
-               local score = datas[i+1]
-
-               table.insert(real_datas, data) -- 添加数据
-
-               redis.call('zincrby', KEYS[1], set_score - score, datas[i])
-            end
-
-            return real_datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(keys=[table], args=[priority_min, priority_max, score, count])
-        else:
-            res = cmd(keys=[table], args=[priority_min, priority_max, score])
-
-        return res
-
-    def zincrby(self, table, amount, value):
-        return self._redis.zincrby(table, amount, value)
-
-    def zget_count(self, table, priority_min=None, priority_max=None):
-        """
-        @summary: 获取表数据的数量
-        ---------
-        @param table:
-        @param priority_min:优先级范围 最小值(包含)
-        @param priority_max:优先级范围 最大值(包含)
-        ---------
-        @result:
-        """
-
-        if priority_min != None and priority_max != None:
-            return self._redis.zcount(table, priority_min, priority_max)
-        else:
-            return self._redis.zcard(table)
-
-    def zrem(self, table, values):
-        """
-        @summary: 移除集合中的指定元素
-        ---------
-        @param table:
-        @param values: 一个或者列表
-        ---------
-        @result:
-        """
-
-        if isinstance(values, list):
-            self._redis.zrem(table, *values)
-        else:
-            self._redis.zrem(table, values)
-
-    def zexists(self, table, values):
-        """
-        利用zscore判断某元素是否存在
-        @param values:
-        @return:
-        """
-
-        is_exists = []
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-            pipe.multi()
-            for value in values:
-                pipe.zscore(table, value)
-            is_exists_temp = pipe.execute()
-            for is_exist in is_exists_temp:
-                if is_exist != None:
-                    is_exists.append(1)
-                else:
-                    is_exists.append(0)
-
-        else:
-            is_exists = self._redis.zscore(table, values)
-            is_exists = 1 if is_exists != None else 0
-
-        return is_exists
-
-    def lpush(self, table, values):
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.rpush(table, value)
-            pipe.execute()
-
-        else:
-            return self._redis.rpush(table, values)
-
-    def lpop(self, table, count=1):
-        """
-        @summary:
-        ---------
-        @param table:
-        @param count:
-        ---------
-        @result: count>1时返回列表
-        """
-
-        datas = None
-        lcount = self.lget_count(table)
-        count = count if count <= lcount else lcount
-
-        if count:
-            if count > 1:
-                pipe = self._redis.pipeline()
-
-                if not self._is_redis_cluster:
-                    pipe.multi()
-                while count:
-                    pipe.lpop(table)
-                    count -= 1
-                datas = pipe.execute()
-
-            else:
-                datas = self._redis.lpop(table)
-
-        return datas
-
-    def rpoplpush(self, from_table, to_table=None):
-        """
-        Pop the last element (tail) of the list from_table and return it to the client.
-        The popped element is inserted at the head of the list to_table.
-        If from_table and to_table are the same, the tail element is moved to the head and returned; this special case can be seen as a rotation of the list.
-        @param from_table:
-        @param to_table:
-        @return:
-        """
-
-        if not to_table:
-            to_table = from_table
-
-        return self._redis.rpoplpush(from_table, to_table)
-
-    def lget_count(self, table):
-        return self._redis.llen(table)
-
-    def lrem(self, table, value, num=0):
-        """
-        @summary:
-        删除value
-        ---------
-        @param table:
-        @param value:
-        @param num:
-        ---------
-        @result: 删除的条数
-        """
-        return self._redis.lrem(table, num, value)
-
-    def lrange(self, table, start=0, end=-1):
-        return self._redis.lrange(table, start, end)
-
-    def hset(self, table, key, value):
-        """
-        @summary:
-        If the key does not exist, a new hash is created and the HSET operation is performed.
-        If the field already exists in the hash, the old value is overwritten.
-        ---------
-        @param table:
-        @param key:
-        @param value:
-        ---------
-        @result: 1 newly inserted; 0 overwritten
-        """
-        return self._redis.hset(table, key, value)
-
-    def hset_batch(self, table, datas):
-        """
-        批量插入
-        Args:
-            datas:
-                [[key, value]]
-        Returns:
-
-        """
-        pipe = self._redis.pipeline()
-
-        if not self._is_redis_cluster:
-            pipe.multi()
-        for key, value in datas:
-            pipe.hset(table, key, value)
-        return pipe.execute()
-
-    def hincrby(self, table, key, increment):
-        return self._redis.hincrby(table, key, increment)
-
-    def hget(self, table, key, is_pop=False):
-        if not is_pop:
-            return self._redis.hget(table, key)
-        else:
-            lua = """
-                -- local key = KEYS[1]
-                local field = ARGV[1]
-
-                -- 取值
-                local datas = redis.call('hget', KEYS[1], field)
-                -- 删除值
-                redis.call('hdel', KEYS[1], field)
-
-                return datas
-
-                    """
-            cmd = self._redis.register_script(lua)
-            res = cmd(keys=[table], args=[key])
-
-            return res
-
-    def hgetall(self, table):
-        return self._redis.hgetall(table)
-
-    def hexists(self, table, key):
-        return self._redis.hexists(table, key)
-
-    def hdel(self, table, *keys):
-        """
-        @summary: 删除对应的key 可传多个
-        ---------
-        @param table:
-        @param *keys:
-        ---------
-        @result:
-        """
-        self._redis.hdel(table, *keys)
-
-    def hget_count(self, table):
-        return self._redis.hlen(table)
-
-    def hkeys(self, table):
-        return self._redis.hkeys(table)
-
-    def setbit(self, table, offsets, values):
-        """
-        Set the bit at the given offset(s) of the string and return the previous value
-        @param table:
-        @param offsets: a list or a single value
-        @param values: a list or a single value
-        @return: list / single value
-        """
-        if isinstance(offsets, list):
-            if not isinstance(values, list):
-                values = [values] * len(offsets)
-            else:
-                assert len(offsets) == len(values), "offsets值要与values值一一对应"
-
-            pipe = self._redis.pipeline()
-            pipe.multi()
-
-            for offset, value in zip(offsets, values):
-                pipe.setbit(table, offset, value)
-
-            return pipe.execute()
-
-        else:
-            return self._redis.setbit(table, offsets, values)
-
-    def getbit(self, table, offsets):
-        """
-        取字符串数组某一位的值
-        @param table:
-        @param offsets: 支持列表
-        @return: list / 单个值
-        """
-        if isinstance(offsets, list):
-            pipe = self._redis.pipeline()
-            pipe.multi()
-            for offset in offsets:
-                pipe.getbit(table, offset)
-
-            return pipe.execute()
-
-        else:
-            return self._redis.getbit(table, offsets)
-
-    def bitcount(self, table):
-        return self._redis.bitcount(table)
-
-    def strset(self, table, value, **kwargs):
-        return self._redis.set(table, value, **kwargs)
-
-    def str_incrby(self, table, value):
-        return self._redis.incrby(table, value)
-
-    def strget(self, table):
-        return self._redis.get(table)
-
-    def strlen(self, table):
-        return self._redis.strlen(table)
-
-    def getkeys(self, regex):
-        return self._redis.keys(regex)
-
-    def exists_key(self, key):
-        return self._redis.exists(key)
-
-    def set_expire(self, key, seconds):
-        """
-        @summary: 设置过期时间
-        ---------
-        @param key:
-        @param seconds: 秒
-        ---------
-        @result:
-        """
-        self._redis.expire(key, seconds)
-
-    def get_expire(self, key):
-        """
-        @summary: 查询过期时间
-        ---------
-        @param key:
-        @param seconds: 秒
-        ---------
-        @result:
-        """
-        return self._redis.ttl(key)
-
-    def clear(self, table):
-        try:
-            self._redis.delete(table)
-        except Exception as e:
-            log.error(e)
-
-    def get_redis_obj(self):
-        return self._redis
-
-    def _reconnect(self):
-        # Check the connection state and reconnect automatically when the server restarts or a timeout drops the connection
-        retry_count = 0
-        while True:
-            try:
-                retry_count += 1
-                log.error(f"redis 连接断开, 重新连接 {retry_count}")
-                if self.get_connect():
-                    log.info(f"redis 连接成功")
-                    return True
-            except (ConnectionError, TimeoutError) as e:
-                log.error(f"连接失败 e: {e}")
-
-            time.sleep(2)
-
-    def __getattr__(self, name):
-        return getattr(self._redis, name)
-
-    def current_status(self, show_key=True, filter_key_by_used_memory=10 * 1024 * 1024):
-        """
-        Report current Redis usage
-        Args:
-            show_key: whether to report memory usage per key
-            filter_key_by_used_memory: filter keys by memory usage, showing only keys above the given size
-
-        Returns:
-
-        """
-        from prettytable import PrettyTable
-        from tqdm import tqdm
-
-        status_msg = ""
-
-        print("正在查询最大连接数...")
-        clients_count = self._redis.execute_command("info clients")
-        max_clients_count = self._redis.execute_command("config get maxclients")
-        status_msg += ": ".join(max_clients_count) + "\n"
-        status_msg += clients_count + "\n"
-
-        print("正在查询整体内存使用情况...")
-        total_status = self._redis.execute_command("info memory")
-        status_msg += total_status + "\n"
-
-        if show_key:
-            print("正在查询每个key占用内存情况等信息...")
-            table = PrettyTable(
-                field_names=[
-                    "type",
-                    "key",
-                    "value_count",
-                    "used_memory_human",
-                    "used_memory",
-                ],
-                sortby="used_memory",
-                reversesort=True,
-                header_style="title",
-            )
-
-            keys = self._redis.execute_command("keys *")
-            for key in tqdm(keys):
-                key_type = self._redis.execute_command("type {}".format(key))
-                if key_type == "set":
-                    value_count = self._redis.scard(key)
-                elif key_type == "zset":
-                    value_count = self._redis.zcard(key)
-                elif key_type == "list":
-                    value_count = self._redis.llen(key)
-                elif key_type == "hash":
-                    value_count = self._redis.hlen(key)
-                elif key_type == "string":
-                    value_count = self._redis.strlen(key)
-                elif key_type == "none":
-                    continue
-                else:
-                    raise TypeError("尚不支持 {} 类型的key".format(key_type))
-
-                used_memory = self._redis.execute_command("memory usage {}".format(key))
-                if used_memory >= filter_key_by_used_memory:
-                    used_memory_human = (
-                        "%0.2fMB" % (used_memory / 1024 / 1024) if used_memory else 0
-                    )
-
-                    table.add_row(
-                        [key_type, key, value_count, used_memory_human, used_memory]
-                    )
-
-            status_msg += str(table)
-
-        return status_msg
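
A small sketch of how the RedisDB wrapper in the diff above can act as a priority queue through its sorted-set helpers (not from the original repository; the key name, URL and member values are hypothetical):

```python
# Hypothetical use of RedisDB as a priority queue; key and URL are examples only.
from db.redisdb import RedisDB

rdb = RedisDB.from_url("redis://:password@127.0.0.1:6379/0")

# zadd() stores members in a sorted set; lower scores mean higher priority
rdb.zadd("site_monitor:tasks", ["task_a", "task_b"], prioritys=[10, 1])

# zget() returns members ordered by score and, with is_pop=True, removes them in the same pipeline
print(rdb.zget("site_monitor:tasks", count=2))  # expected: ['task_b', 'task_a']
```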

+ 0 - 35
A数据处理/site_monitor/docker/Dockerfile

@@ -1,35 +0,0 @@
-# Pull the base image
-FROM ubuntu:22.04
-
-# Configure the container timezone
-RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone
-
-# Switch apt sources to the Aliyun mirror
-RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
-RUN sed -i s@/security.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list
-RUN apt-get clean && apt-get update
-RUN apt-get install -y wget unzip curl vim
-
-# Install gcc and build dependencies for Python 3.8.10
-WORKDIR /opt
-RUN apt-get install -y gcc build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libbz2-dev liblzma-dev sqlite3 libsqlite3-dev tk-dev uuid-dev libgdbm-compat-dev libncurses-dev libnspr4-dev
-
-# Download and extract Python 3.8.10
-RUN curl -o python3.8.10.tgz https://mirrors.huaweicloud.com/python/3.8.10/Python-3.8.10.tgz && tar -zxvf python3.8.10.tgz
-# Create the install prefix and configure the build location
-RUN mkdir /usr/local/python38
-WORKDIR /opt/Python-3.8.10
-RUN ./configure --prefix=/usr/local/python38 && make && make install
-# Add symlinks for python3 and pip3
-RUN rm -rf /usr/bin/python3 /usr/bin/pip3 && ln -s /usr/local/python38/bin/python3 /usr/bin/python3 && ln -s /usr/local/python38/bin/pip3.8 /usr/bin/pip3
-# Switch the pip index and upgrade pip
-RUN pip3 config set global.index-url https://mirrors.bfsu.edu.cn/pypi/web/simple && pip3 install --upgrade pip
-
-# Install project dependencies
-COPY requirements.txt requirements.txt
-RUN pip3 install -r requirements.txt
-# Install the Playwright WebKit browser and its dependencies
-RUN python3 -m playwright install --with-deps webkit
-
-# Set the working directory
-WORKDIR /mnt

+ 0 - 17
A数据处理/site_monitor/docker/docker-compose.yml

@@ -1,17 +0,0 @@
-version: "3"
-services: # a group of containers
-  worker01:
-    container_name: site_monitor
-    image: site_monitor:v1.0
-    volumes: # mapped directories
-      - /mnt/site_monitor:/mnt
-    network_mode: "host" # use the host network
-    restart: always
-    privileged: true
-    shm_size: 2GB
-    logging:
-      driver: "json-file"
-      options:
-        max-size: "200k"
-        max-file: "10"
-    command: 'python3 /mnt/monitor.py'

+ 0 - 205
A数据处理/site_monitor/monitor.py

@@ -1,205 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-05-10 
----------
-@summary:  Website monitoring
----------
-@author: Dzr
-"""
-import copy
-import threading
-
-import bson
-import numpy as np
-import requests
-import requests.exceptions as requests_exceptions
-from playwright._impl._api_types import Error
-
-import utils.tools as tools
-from db.mongodb import MongoDB
-from network.request import Request
-from network.response import Response
-from utils.log import logger
-
-
-class MonitorParser(threading.Thread):
-
-    def __init__(self, mongo_db, coll_name):
-        threading.Thread.__init__(self)
-        self.mgo_db = mongo_db
-        self.coll_name = coll_name
-
-        self.monitor_api = 'http://cc.spdata.jianyu360.com/crawl/site_monitor/task/fetch'
-
-    def get_task(self):
-        items = {}
-        try:
-            response = requests.get(self.monitor_api, timeout=5)
-            items = response.json()['data']
-            if '_id' in items:
-                items['_id'] = bson.ObjectId(items['_id'])
-        finally:
-            return items
-
-    def get_response(self, url, render=False, **kwargs):
-        response = Response.from_dict({
-            "url": url,
-            "_content": b"",
-            "cookies": {},
-            "status_code": -1,
-            "elapsed": 666,
-            "headers": {}
-        })
-        request = Request(url=url, render=render, **kwargs)
-        for i in range(3):
-            try:
-                response = request.get_response()
-                if response.status_code != 200:
-                    if any([
-                        response.text is None,
-                        len(response.plain_text) == 0,
-                        response.tags()['tags_count'] == 0
-                    ]):
-                        continue
-                break
-            except Error as e:
-                if 'The certificate for this server is invalid.' in e.message:
-                    url = url.replace('https', 'http')
-                    request = Request(url=url, render=render, **kwargs)
-            except requests_exceptions.SSLError:
-                url = url.replace('https', 'http')
-                request = Request(url=url, render=True, **kwargs)
-            except requests_exceptions.ConnectionError:
-                kw = copy.deepcopy(kwargs)
-                kw.pop('proxies', '')
-                request = Request(url=url, render=True, **kw)
-            except requests_exceptions.RequestException as e:
-                logger.exception(e)
-                break
-
-        # Release the browser
-        response.close_browser(request)
-
-        logger.debug(
-            """
-                -------------- %s response for ----------------
-                url = %s
-                title = %s
-                response = %s
-            """
-            % (
-                self.getName(),
-                url,
-                response.title(),
-                response
-            )
-        )
-
-        if response.status_code != -1:
-            response = Response(response)
-        # Set the encoding
-        response.encoding = response.encoding or "utf-8"
-        return response
-
-    def __add_items_to_db(self, task, items):
-        result = self.mgo_db.update(
-            coll_name=self.coll_name,
-            condition={'_id': task['_id']},
-            data=items
-        )
-        # print({'_id': task['_id']})
-        return result
-
-    def deal_task(self, task):
-        # Channel page
-        url = task['url']
-        response = self.get_response(url, render=True, proxies=False)
-        status_code = response.status_code
-
-        # Tag count of the channel page
-        tags_count = response.tags()['tags_count']
-        tags_count_diff = abs(tags_count - task['tags_count'])
-        tags_count_diff_lst = list(task['tags_count_diff_lst'])
-
-        # Has the channel been redesigned?
-        channel_ischange = task['channel_ischange']
-        if len(tags_count_diff_lst) >= 3 and not channel_ischange:
-            mean = np.mean(tags_count_diff_lst)  # mean
-            std = np.std(tags_count_diff_lst, ddof=1)  # sample standard deviation
-            std_range = [mean - (2 * std), mean + (2 * std)]
-            if not (std_range[0] <= tags_count_diff <= std_range[1]):
-                channel_ischange = True
-
-        if len(tags_count_diff_lst) > 3 and sum(tags_count_diff_lst) == 0:
-            channel_ischange = True
-            status_code = 500
-
-        # Visit frequency
-        update_dt = tools.timestamp_to_date(task['update_at'], '%Y-%m-%d')
-        is_first_monitor = tools.get_current_date('%Y-%m-%d') != update_dt
-        if is_first_monitor:
-            visit_count, failure_count = 1, 0
-            if status_code != 200:
-                failure_count = 1
-
-            tags_count_diff_lst = []
-            tags_count_diff_lst.insert(0, tags_count_diff)
-        else:
-            visit_count = task['visit_count'] + 1
-            failure_count = task['failure_count']
-            if status_code != 200:
-                failure_count += 1
-
-            tags_count_diff_lst.insert(0, tags_count_diff)
-
-        items = {
-            'title': response.title(),  # page title
-            'tags_count': tags_count,
-            'tags_count_diff': tags_count_diff,
-            'tags_count_diff_lst': tags_count_diff_lst,
-            'channel_ischange': channel_ischange,
-            'status_code': status_code,
-            'visit_count': visit_count,
-            'failure_count': failure_count,
-            'update_at': tools.ensure_int64(tools.get_current_timestamp())
-        }
-        self.__add_items_to_db(task, items)
-
-    def run(self):
-        while True:
-            task = self.get_task()
-            if not task:
-                logger.debug(f"[{self.getName()}]暂无监控任务")
-                tools.delay_time(2)
-                continue
-
-            try:
-                self.deal_task(task)
-            except Exception as e:
-                logger.exception(e)
-
-
-class MonitorServer(threading.Thread):
-
-    def __init__(self, thread_nums=1):
-        threading.Thread.__init__(self)
-        self.mongo_db = MongoDB()
-        self.coll_name = 'site_monitor'
-
-        self.thread_nums = thread_nums
-
-        self.parser_control_obj = MonitorParser
-        self.parser_controls = []
-
-    def run(self):
-        for _ in range(self.thread_nums):
-            parser_control = self.parser_control_obj(
-                mongo_db=self.mongo_db,
-                coll_name=self.coll_name
-            )
-            parser_control.start()
-            self.parser_controls.append(parser_control)
-
-
-if __name__ == '__main__':
-    MonitorServer(thread_nums=5).start()
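
The redesign check in MonitorParser.deal_task flags a channel when the newest tag-count delta falls outside a band of two sample standard deviations around the recent deltas. A standalone sketch of that rule, as read from the code above (the function name and sample numbers are illustrative, not from the repository):

```python
# Standalone sketch of the 2-sigma change-detection rule used in deal_task.
import numpy as np

def is_channel_changed(tags_count_diff, tags_count_diff_lst):
    """Return True when the latest delta is outside mean +/- 2 * sample std of past deltas."""
    if len(tags_count_diff_lst) < 3:
        return False  # not enough history yet
    mean = np.mean(tags_count_diff_lst)
    std = np.std(tags_count_diff_lst, ddof=1)  # sample standard deviation, as in deal_task
    lower, upper = mean - 2 * std, mean + 2 * std
    return not (lower <= tags_count_diff <= upper)

print(is_channel_changed(40, [1, 2, 0, 3]))  # True: far outside the usual range
print(is_channel_changed(2, [1, 2, 0, 3]))   # False: within the band
```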

+ 0 - 8
A数据处理/site_monitor/network/__init__.py

@@ -1,8 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-05-10 
----------
-@summary:  
----------
-@author: Dzr
-"""

+ 0 - 3
A数据处理/site_monitor/network/downloader/__init__.py

@@ -1,3 +0,0 @@
-from ._requests import RequestsDownloader
-from ._requests import RequestsSessionDownloader
-from ._playwright import PlaywrightDownloader

+ 0 - 104
A数据处理/site_monitor/network/downloader/_playwright.py

@@ -1,104 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/9/7 4:05 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import setting as setting
-import utils.tools as tools
-from network.downloader.base import RenderDownloader
-from network.response import Response
-from utils.webdriver import WebDriverPool, PlaywrightDriver
-
-
-class PlaywrightDownloader(RenderDownloader):
-    webdriver_pool: WebDriverPool = None
-
-    @property
-    def _webdriver_pool(self):
-        if not self.__class__.webdriver_pool:
-            self.__class__.webdriver_pool = WebDriverPool(
-                **setting.PLAYWRIGHT, driver_cls=PlaywrightDriver, thread_safe=True
-            )
-
-        return self.__class__.webdriver_pool
-
-    def download(self, request) -> Response:
-        # Proxy priority: custom > config file > random
-        if request.custom_proxies:
-            proxy = request.get_proxy()
-        elif setting.PLAYWRIGHT.get("proxy"):
-            proxy = setting.PLAYWRIGHT.get("proxy")
-        else:
-            proxy = request.get_proxy()
-
-        # User-Agent priority: custom > config file > random
-        if request.custom_ua:
-            user_agent = request.get_user_agent()
-        elif setting.PLAYWRIGHT.get("user_agent"):
-            user_agent = setting.PLAYWRIGHT.get("user_agent")
-        else:
-            user_agent = request.get_user_agent()
-
-        cookies = request.get_cookies()
-        url = request.url
-        render_time = request.render_time or setting.PLAYWRIGHT.get("render_time")
-        wait_until = setting.PLAYWRIGHT.get("wait_until") or "domcontentloaded"
-        if request.get_params():
-            url = tools.joint_url(url, request.get_params())
-
-        driver: PlaywrightDriver = self._webdriver_pool.get(
-            user_agent=user_agent, proxy=proxy
-        )
-        try:
-            if cookies:
-                driver.url = url
-                driver.cookies = cookies
-            driver.page.goto(url, wait_until=wait_until)
-
-            if render_time:
-                tools.delay_time(render_time)
-
-            html = driver.page.content()
-            response = Response.from_dict(
-                {
-                    "url": driver.page.url,
-                    "cookies": driver.cookies,
-                    "_content": html.encode(),
-                    "status_code": 200,
-                    "elapsed": 666,
-                    "headers": {
-                        "User-Agent": driver.user_agent,
-                        "Cookie": tools.cookies2str(driver.cookies),
-                    },
-                }
-            )
-
-            response.driver = driver
-            response.browser = driver
-            return response
-        except Exception as e:
-            self._webdriver_pool.remove(driver)
-            raise e
-
-    def close(self, driver):
-        if driver:
-            self._webdriver_pool.remove(driver)
-
-    def put_back(self, driver):
-        """
-        Release the browser object back to the pool
-        """
-        self._webdriver_pool.put(driver)
-
-    def close_all(self):
-        """
-        Close all browsers
-        """
-        # Not supported
-        # self._webdriver_pool.close()
-        pass

+ 0 - 46
A数据处理/site_monitor/network/downloader/_requests.py

@@ -1,46 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/4/10 5:57 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import requests
-from requests.adapters import HTTPAdapter
-
-from network.downloader.base import Downloader
-from network.response import Response
-
-
-class RequestsDownloader(Downloader):
-    def download(self, request) -> Response:
-        response = requests.request(
-            request.method, request.url, **request.requests_kwargs
-        )
-        response = Response(response)
-        return response
-
-
-class RequestsSessionDownloader(Downloader):
-    session = None
-
-    @property
-    def _session(self):
-        if not self.__class__.session:
-            self.__class__.session = requests.Session()
-            # pool_connections: number of urllib3 connection pools to cache; pool_maxsize: max connections kept per pool
-            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
-            # Any HTTP request made through this session whose URL starts with the given prefix uses this adapter.
-            self.__class__.session.mount("http", http_adapter)
-
-        return self.__class__.session
-
-    def download(self, request) -> Response:
-        response = self._session.request(
-            request.method, request.url, **request.requests_kwargs
-        )
-        response = Response(response)
-        return response

+ 0 - 41
A数据处理/site_monitor/network/downloader/base.py

@@ -1,41 +0,0 @@
-import abc
-from abc import ABC
-
-from network.response import Response
-
-
-class Downloader:
-    @abc.abstractmethod
-    def download(self, request) -> Response:
-        """
-
-        Args:
-            request: Request
-
-        Returns: Response
-
-        """
-        raise NotImplementedError
-
-    def close(self, response: Response):
-        pass
-
-
-class RenderDownloader(Downloader, ABC):
-    def put_back(self, driver):
-        """
-        Release the browser object back to the pool
-        """
-        pass
-
-    def close(self, driver):
-        """
-        Close the browser
-        """
-        pass
-
-    def close_all(self):
-        """
-        Close all browsers
-        """
-        pass
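
The two abstract bases above define the downloader contract: `Downloader.download` turns a framework `Request` into a framework `Response`, and `RenderDownloader` adds browser lifecycle hooks. A minimal sketch of a custom implementation, assuming this package is importable; `HeadDownloader` is a hypothetical name used only for illustration:

    import requests

    from network.downloader.base import Downloader
    from network.response import Response


    class HeadDownloader(Downloader):
        """Hypothetical downloader that issues HEAD requests, reusing the framework Response wrapper."""

        def download(self, request) -> Response:
            raw = requests.request("HEAD", request.url, **request.requests_kwargs)
            return Response(raw)

Plugging such a class in would follow the same pattern as RequestsDownloader, which request.py selects through setting.DOWNLOADER.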

+ 0 - 32
A数据处理/site_monitor/network/proxy_file/de9f83d546a39eca6979d2a6dca3407a.txt

@@ -1,32 +0,0 @@
-180.105.104.247:8860&&1684743244
-115.208.199.134:8860&&1684742848
-42.84.93.124:8861&&1684742999
-180.127.72.88:8860&&1684743979
-144.255.48.89:8860&&1684744166
-180.106.242.48:8860&&1684743307
-121.207.84.107:8860&&1684742787
-180.127.72.79:8860&&1684743262
-182.107.181.130:8860&&1684742689
-218.67.90.253:8860&&1684743824
-59.61.165.88:8860&&1684742786
-114.233.0.176:8860&&1684742924
-113.93.224.26:8860&&1684743064
-123.169.34.24:8860&&1684743176
-182.34.27.242:8860&&1684744210
-125.69.91.209:8860&&1684743202
-36.27.184.4:8860&&1684743545
-49.69.209.246:8860&&1684742763
-123.146.150.68:8860&&1684742715
-114.235.254.245:8860&&1684742840
-106.32.10.20:8860&&1684743120
-140.250.148.156:8860&&1684742873
-180.111.177.16:8860&&1684743024
-180.108.151.90:8860&&1684743675
-121.238.107.47:8860&&1684742780
-123.160.96.180:8860&&1684742820
-223.215.119.152:8860&&1684742729
-182.34.102.138:8860&&1684743505
-59.58.211.240:8860&&1684744113
-180.140.47.156:8860&&1684743073
-125.123.136.247:8861&&1684743189
-49.86.182.103:8860&&1684742719
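
Each line of this cached file is `host:port&&expiry-timestamp`, the layout written by `get_proxy_from_http` and read back by `get_proxy_from_file` in `network/proxy_pool.py` below. A minimal parsing sketch under that assumption, using the first line above as sample data:

    import time

    line = "180.105.104.247:8860&&1684743244"
    addr, expiry = line.split("&&")
    ip, port = addr.split(":")
    if time.time() < int(expiry):
        # Still valid: expose it in the socks5 form the pool expects
        proxies = {"http": f"socks5://{ip}:{port}", "https": f"socks5://{ip}:{port}"}
    else:
        proxies = None  # expired, skip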

+ 0 - 746
A数据处理/site_monitor/network/proxy_pool.py

@@ -1,746 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Proxy pool
-"""
-import datetime
-import json
-import os
-import random
-import socket
-import time
-from urllib import parse
-
-import redis
-import requests
-
-import setting
-from utils import tools
-from utils.log import logger as log
-
-
-def decrypt(input_str: str) -> str:
-    """
-    Rewrite: newly added.
-    Custom base64-style decode function (uses a shuffled alphabet).
-
-    :param input_str:
-    :return:
-    """
-    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
-    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
-    output_str = ''
-    # The comprehension above looks up the index of each non-'=' character and converts it to 6-bit binary
-    # Count the '=' padding characters
-    equal_num = input_str.count('=')
-    while ascii_list:
-        temp_list = ascii_list[:4]
-        # Join the 6-bit groups into one binary string
-        temp_str = ''.join(temp_list)
-        # Drop the bits contributed by '=' padding when the length is not a multiple of 8
-        if len(temp_str) % 8 != 0:
-            temp_str = temp_str[0:-1 * equal_num * 2]
-        # Convert four 6-bit groups into three 8-bit groups
-        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
-        # Binary to decimal
-        temp_str_list = [int(x, 2) for x in temp_str_list if x]
-        # Join the decoded characters onto the output string
-        output_str += ''.join([chr(x) for x in temp_str_list])
-        ascii_list = ascii_list[4:]
-    return output_str
-
-
-# Create the local proxy cache directory
-proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
-if not os.path.exists(proxy_path):
-    os.mkdir(proxy_path)
-
-
-def get_proxy_from_url(**kwargs):
-    """
-    Fetch proxies from the configured source URL(s)
-    :param kwargs:
-    :return:
-    """
-    proxy_source_url = kwargs.get("proxy_source_url", [])
-    # proxy_source_url = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"
-
-    if not isinstance(proxy_source_url, list):
-        proxy_source_url = [proxy_source_url]
-        proxy_source_url = [x for x in proxy_source_url if x]
-    if not proxy_source_url:
-        raise ValueError("no specify proxy_source_url: {}".format(proxy_source_url))
-    kwargs = kwargs.copy()
-    kwargs.pop("proxy_source_url")
-    proxies_list = []
-    for url in proxy_source_url:
-        if url.startswith("http"):
-            proxies_list.extend(get_proxy_from_http(url, **kwargs))
-        elif url.startswith("redis"):
-            proxies_list.extend(get_proxy_from_redis(url, **kwargs))
-
-    if proxies_list:
-        # Shuffle the order
-        random.shuffle(proxies_list)
-    return proxies_list
-
-
-def get_proxy_from_http(proxy_source_url, **kwargs):
-    """
-    Fetch proxies from the given HTTP endpoint
-    :param proxy_source_url:
-    :param kwargs:
-    :return:
-    """
-    filename = tools.get_md5(proxy_source_url) + ".txt"
-    abs_filename = os.path.join(proxy_path, filename)
-    update_interval = kwargs.get("local_proxy_file_cache_timeout", 30)
-    update_flag = 0
-    if not update_interval:
-        # Force an update
-        update_flag = 1
-    elif not os.path.exists(abs_filename):
-        # Update when the cache file does not exist
-        update_flag = 1
-    elif time.time() - os.stat(abs_filename).st_mtime > update_interval:
-        # Update interval exceeded
-        update_flag = 1
-    if update_flag:
-        pool = []
-        response = requests.get(proxy_source_url, timeout=20)
-        # Rewrite: handle the response returned by the socks proxy API
-        for proxy in response.json():
-            host = decrypt(proxy['ip'])
-            port = proxy['ports'][0]
-            endTime = proxy['lifetime']
-            pool.append(f"{host}:{port}&&{endTime}")
-
-        with open(os.path.join(proxy_path, filename), "w") as f:
-            f.write('\n'.join(pool))
-    return get_proxy_from_file(filename)
-
-
-def get_proxy_from_file(filename, **kwargs):
-    """
-    Load proxies from the given local cache file
-        File format:
-        ip:port:https
-        ip:port:http
-        ip:port
-    :param filename:
-    :param kwargs:
-    :return:
-    """
-    proxies_list = []
-    with open(os.path.join(proxy_path, filename), "r") as f:
-        lines = f.readlines()
-
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        # Parse
-        auth = ""
-        if "@" in line:
-            auth, line = line.split("@")
-        # Rewrite: parse the proxy expiry timestamp
-        line, end = line.split("&&")
-
-        items = line.split(":")
-        if len(items) < 2:
-            continue
-
-        ip, port, *protocol = items
-        if not all([port, ip]):
-            continue
-        if auth:
-            ip = "{}@{}".format(auth, ip)
-        if not protocol:
-            # Rewrite: keep only proxies still within their validity window and convert the http format to socks5 format
-            if time.time() < int(end):
-                proxies = {
-                    "https": "socks5://%s:%s" % (ip, port),
-                    "http": "socks5://%s:%s" % (ip, port),
-                    # "end":end
-                }
-            else:
-                continue
-        else:
-            proxies = {protocol[0]: "%s://%s:%s" % (protocol[0], ip, port)}
-        proxies_list.append(proxies)
-
-    return proxies_list
-
-
-def get_proxy_from_redis(proxy_source_url, **kwargs):
-    """
-    Fetch proxies from the given redis address
-    @param proxy_source_url: redis://:passwd@host:ip/db
-        redis storage structure: zset
-        ip:port ts
-    @param kwargs:
-        {"redis_proxies_key": "xxx"}
-    @return: [{'http':'http://xxx.xxx.xxx:xxx', 'https':'https://xxx.xxx.xxx.xxx:xxx'}]
-    """
-
-    redis_conn = redis.StrictRedis.from_url(proxy_source_url)
-    key = kwargs.get("redis_proxies_key")
-    assert key, "redis_proxies_key must be specified when fetching proxies from redis"
-    proxies = redis_conn.zrange(key, 0, -1)
-    proxies_list = []
-    for proxy in proxies:
-        proxy = proxy.decode()
-        proxies_list.append(
-            {"https": "https://%s" % proxy, "http": "http://%s" % proxy}
-        )
-    return proxies_list
-
-
-def check_proxy(
-        ip="",
-        port="",
-        proxies=None,
-        type=0,
-        timeout=5,
-        logger=None,
-        show_error_log=True,
-        **kwargs,
-):
-    """
-    Check whether a proxy is usable
-    :param ip:
-    :param port:
-    :param type: 0:socket  1:requests
-    :param timeout:
-    :param logger:
-    :return:
-    """
-    if not logger:
-        logger = log
-    ok = 0
-    if type == 0 and ip and port:
-        # A successful socket connection does not guarantee the proxy works, e.g. "Connection closed by foreign host."
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
-            sk.settimeout(timeout)
-            try:
-                # Must actually connect, otherwise the proxy list is never refreshed
-                sk.connect((ip, int(port)))
-                ok = 1
-            except Exception as e:
-                if show_error_log:
-                    logger.debug("check proxy failed: {} {}:{}".format(e, ip, port))
-            sk.close()
-    else:
-        if not proxies:
-            proxies = {
-                "http": "socks5://{}:{}".format(ip, port),
-                "https": "socks5//{}:{}".format(ip, port),
-            }
-        try:
-            # Rewrite: URL used for the proxy check
-            r = requests.get(
-                "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
-            )
-            ok = 1
-            r.close()
-        except Exception as e:
-            if show_error_log:
-                logger.debug(
-                    "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
-                )
-    return ok
-
-
-class ProxyItem(object):
-    """单个代理对象"""
-
-    # 代理标记
-    proxy_tag_list = (-1, 0, 1)
-
-    def __init__(
-            self,
-            proxies=None,
-            valid_timeout=20,
-            check_interval=180,
-            max_proxy_use_num=10000,
-            delay=30,
-            use_interval=None,
-            **kwargs,
-    ):
-        """
-        :param proxies:
-        :param valid_timeout: proxy check timeout in seconds (since 20181008 validity is no longer checked by default)
-        :param check_interval:
-        :param max_proxy_use_num:
-        :param delay:
-        :param use_interval: minimum interval between uses, in seconds; unlimited by default
-        :param logger: log handler, defaults to log.get_logger()
-        :param kwargs:
-        """
-        # {"http": ..., "https": ...}
-        self.proxies = proxies
-        # Check timeout, seconds
-        self.valid_timeout = valid_timeout
-        # Check interval, seconds
-        self.check_interval = check_interval
-
-        # Flag  0: normal  -1: discard  1: use later ...
-        self.flag = 0
-        # Timestamp of the last flag change
-        self.flag_ts = 0
-        # Timestamp of the last validity check
-        self.update_ts = 0
-        # Maximum number of times the proxy may be used
-        self.max_proxy_use_num = max_proxy_use_num
-        # Number of times the proxy has been used
-        self.use_num = 0
-        # Delay before reuse, seconds
-        self.delay = delay
-        # Minimum interval between uses, seconds
-        self.use_interval = use_interval
-        # Timestamp of last use
-        self.use_ts = 0
-
-        self.proxy_args = self.parse_proxies(self.proxies)
-        self.proxy_ip = self.proxy_args["ip"]
-        self.proxy_port = self.proxy_args["port"]
-        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
-        if self.proxy_args["user"]:
-            self.proxy_id = "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
-        else:
-            self.proxy_id = self.proxy_ip_port
-
-        # Log handler
-        self.logger = log
-
-    def get_proxies(self):
-        self.use_num += 1
-        return self.proxies
-
-    def is_delay(self):
-        return self.flag == 1
-
-    def is_valid(self, force=0, type=0):
-        """
-        Check whether the proxy is valid
-            1: valid
-            2: delayed, use later
-            0: invalid, removed from the pool immediately
-        :param force:
-        :param type:
-        :return:
-        """
-        if self.use_num > self.max_proxy_use_num > 0:
-            self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
-            return 0
-        if self.flag == -1:
-            self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
-            return 0
-        if self.delay > 0 and self.flag == 1:
-            if time.time() - self.flag_ts < self.delay:
-                self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
-                return 2
-            else:
-                self.flag = 0
-                self.logger.debug("延迟代理释放: {}".format(self.proxies))
-        if self.use_interval:
-            if time.time() - self.use_ts < self.use_interval:
-                return 2
-        if not force:
-            if time.time() - self.update_ts < self.check_interval:
-                return 1
-        if self.valid_timeout > 0:
-            ok = check_proxy(
-                proxies=self.proxies,
-                type=type,
-                timeout=self.valid_timeout,
-                logger=self.logger,
-            )
-        else:
-            ok = 1
-        self.update_ts = time.time()
-        return ok
-
-    @classmethod
-    def parse_proxies(cls, proxies):
-        """
-        Split a proxies dict into its component parts
-        :param proxies:
-        :return:
-        """
-        if not proxies:
-            return {}
-        if isinstance(proxies, (str, bytes)):
-            proxies = json.loads(proxies)
-        protocol = list(proxies.keys())
-        if not protocol:
-            return {}
-        _url = proxies.get(protocol[0])
-        # Rewrite: the http:// prefixing below is commented out so socks5 proxies are parsed correctly
-        # if not _url.startswith("http"):
-        #     _url = "http://" + _url
-        _url_parse = parse.urlparse(_url)
-        netloc = _url_parse.netloc
-        if "@" in netloc:
-            netloc_auth, netloc_host = netloc.split("@")
-        else:
-            netloc_auth, netloc_host = "", netloc
-        ip, *port = netloc_host.split(":")
-        port = port[0] if port else "80"
-        user, *password = netloc_auth.split(":")
-        password = password[0] if password else ""
-        return {
-            "protocol": protocol,
-            "ip": ip,
-            "port": port,
-            "user": user,
-            "password": password,
-            "ip_port": "{}:{}".format(ip, port),
-        }
-
-
-class ProxyPoolBase(object):
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def get(self, *args, **kwargs):
-        raise NotImplementedError
-
-
-class ProxyPool(ProxyPoolBase):
-    """代理池"""
-
-    def __init__(self, **kwargs):
-        """
-        :param size: pool size, -1 for unlimited
-        :param proxy_source_url: proxy source address, a single URL or a list
-        :param proxy_instance: instance that provides proxies
-        :param reset_interval: minimum interval between pool resets
-        :param reset_interval_max: maximum interval between pool resets, defaults to 180 s
-        :param check_valid: whether to check validity when fetching a proxy
-        :param local_proxy_file_cache_timeout: timeout of the locally cached proxy file
-        :param logger: log handler, defaults to log.get_logger()
-        :param kwargs: other parameters
-        """
-        kwargs.setdefault("size", -1)
-        kwargs.setdefault("proxy_source_url", setting.PROXY_EXTRACT_API)
-
-        super(ProxyPool, self).__init__(**kwargs)
-        # Maximum queue length
-        self.max_queue_size = kwargs.get("size", -1)
-        # Actual number of proxies available
-        self.real_max_proxy_count = 1000
-        # Maximum number of times a proxy may be used
-        # Proxy source address, e.g. http://localhost/proxy.txt
-        self.proxy_source_url = kwargs.get("proxy_source_url", [])
-        if not isinstance(self.proxy_source_url, list):
-            self.proxy_source_url = [self.proxy_source_url]
-            self.proxy_source_url = [x for x in self.proxy_source_url if x]
-            self.proxy_source_url = list(set(self.proxy_source_url))
-            kwargs.update({"proxy_source_url": self.proxy_source_url})
-        # Log handler
-        self.logger = kwargs.get("logger") or log
-        kwargs["logger"] = self.logger
-        if not self.proxy_source_url:
-            self.logger.warn("need set proxy_source_url or proxy_instance")
-
-        # Minimum interval between pool resets
-        self.reset_interval = kwargs.get("reset_interval", 5)
-        # Force a reset after this long to pull in fresh proxies instead of reusing old banned ones
-        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
-        # Whether to check proxy validity
-        self.check_valid = kwargs.get("check_valid", True)
-
-        # Proxy queue
-        self.proxy_queue = None
-        # {proxy_id: ProxyItem, ...}
-        self.proxy_dict = {}
-        # Invalid proxies
-        self.invalid_proxy_dict = {}
-
-        self.kwargs = kwargs
-
-        # Lock used when resetting the pool
-        self.reset_lock = None
-        # Last reset time
-        self.last_reset_time = 0
-        # Counter for resets happening too fast
-        self.reset_fast_count = 0
-        # Number of times fetching a proxy still failed after 3 retries
-        self.no_valid_proxy_times = 0
-
-        # Last time a proxy was fetched
-        self.last_get_ts = time.time()
-
-        # Record each ProxyItem's update_ts to avoid re-checking validity when resets happen too fast
-        self.proxy_item_update_ts_dict = {}
-
-        # Warning flag
-        self.warn_flag = False
-
-    def warn(self):
-        if not self.warn_flag:
-            for url in self.proxy_source_url:
-                if "zhima" in url:
-                    continue
-            self.warn_flag = True
-        return
-
-    @property
-    def queue_size(self):
-        """
-        Number of proxies currently in the pool
-        :return:
-        """
-        return self.proxy_queue.qsize() if self.proxy_queue is not None else 0
-
-    def clear(self):
-        """
-        Clear internal state
-        :return:
-        """
-        self.proxy_queue = None
-        # {proxy_ip: ProxyItem, ...}
-        self.proxy_dict = {}
-        # Purge invalid proxies older than 10 minutes
-        _limit = datetime.datetime.now() - datetime.timedelta(minutes=10)
-        self.invalid_proxy_dict = {
-            k: v for k, v in self.invalid_proxy_dict.items() if v > _limit
-        }
-        # Purge stale update_ts records
-        _limit = time.time() - 600
-        self.proxy_item_update_ts_dict = {
-            k: v for k, v in self.proxy_item_update_ts_dict.items() if v > _limit
-        }
-        return
-
-    def get(self, retry: int = 0) -> dict:
-        """
-        Fetch a proxy from the pool
-        :param retry:
-        :return:
-        """
-        retry += 1
-        if retry > 3:
-            self.no_valid_proxy_times += 1
-            return None
-        # if time.time() - self.last_get_ts > 3 * 60:
-        #     # No fetch for 3 minutes, reset the pool
-        #     try:
-        #         self.reset_proxy_pool()
-        #     except Exception as e:
-        #         self.logger.exception(e)
-        # Record fetch time
-        self.last_get_ts = time.time()
-        #
-        self.warn()
-        proxy_item = self.get_random_proxy()
-        if proxy_item:
-            # Skip validity check
-            if not self.check_valid:  #
-                # Put it back into the queue
-                proxies = proxy_item.get_proxies()
-                self.put_proxy_item(proxy_item)
-                return proxies
-            else:
-                is_valid = proxy_item.is_valid()
-                if is_valid:
-                    # Record update_ts
-                    self.proxy_item_update_ts_dict[
-                        proxy_item.proxy_id
-                    ] = proxy_item.update_ts
-                    # Put it back into the queue
-                    proxies = proxy_item.get_proxies()
-                    self.put_proxy_item(proxy_item)
-                    if is_valid == 1:
-                        if proxy_item.use_interval:
-                            proxy_item.use_ts = time.time()
-                        return proxies
-                else:
-                    # Handle the invalid proxy
-                    self.proxy_dict.pop(proxy_item.proxy_id, "")
-                    self.invalid_proxy_dict[
-                        proxy_item.proxy_id
-                    ] = datetime.datetime.now()
-        else:
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        if self.no_valid_proxy_times >= 5:
-            # Bug fix: when only one task remains, a single thread checks proxies while many proxies may be unusable
-            # (more over time), so the crawler could fail to get a proxy indefinitely and stall
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                self.logger.exception(e)
-        return self.get(retry)
-
-    get_proxy = get
-
-    def get_random_proxy(self) -> ProxyItem:
-        """
-        Get a proxy at random
-        :return:
-        """
-        if self.proxy_queue is not None:
-            if random.random() < 0.5:
-                # Check with 50% probability; this is a hot path, so keep it cheap
-                if time.time() - self.last_reset_time > self.reset_interval_max:
-                    time.sleep(3)
-                    self.reset_proxy_pool(force=True)
-                else:
-                    min_q_size = (
-                        min(self.max_queue_size / 2, self.real_max_proxy_count / 2)
-                        if self.max_queue_size > 0
-                        else self.real_max_proxy_count / 2
-                    )
-                    if self.proxy_queue.qsize() < min_q_size:
-                        time.sleep(3)
-                        self.reset_proxy_pool()
-            try:
-                return self.proxy_queue.get_nowait()
-            except Exception:
-                pass
-        return None
-
-    def append_proxies(self, proxies_list: list) -> int:
-        """
-        Add proxies to the pool
-        :param proxies_list:
-        :return:
-        """
-        count = 0
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if proxies:
-                proxy_item = ProxyItem(proxies=proxies, **self.kwargs)
-                # Added invalid-proxy check 2018/12/18
-                if proxy_item.proxy_id in self.invalid_proxy_dict:
-                    continue
-                if proxy_item.proxy_id not in self.proxy_dict:
-                    # Backfill update_ts
-                    if not proxy_item.update_ts:
-                        proxy_item.update_ts = self.proxy_item_update_ts_dict.get(
-                            proxy_item.proxy_id, 0
-                        )
-                    self.put_proxy_item(proxy_item)
-                    self.proxy_dict[proxy_item.proxy_id] = proxy_item
-                    count += 1
-        return count
-
-    def put_proxy_item(self, proxy_item: ProxyItem):
-        """
-        Add a ProxyItem to the pool queue
-        :param proxy_item:
-        :return:
-        """
-        return self.proxy_queue.put_nowait(proxy_item)
-
-    def reset_proxy_pool(self, force: bool = False):
-        """
-        Reset the proxy pool
-        :param force: whether to force the reset
-        :return:
-        """
-        if not self.reset_lock:
-            # Import lazily; otherwise threading may be imported before the gevent patch, leaving RLock unpatched
-            import threading
-
-            self.reset_lock = threading.RLock()
-        with self.reset_lock:
-            if (
-                    force
-                    or self.proxy_queue is None
-                    or (
-                    self.max_queue_size > 0
-                    and self.proxy_queue.qsize() < self.max_queue_size / 2
-            )
-                    or (
-                    self.max_queue_size < 0
-                    and self.proxy_queue.qsize() < self.real_max_proxy_count / 2
-            )
-                    or self.no_valid_proxy_times >= 5
-            ):
-                if time.time() - self.last_reset_time < self.reset_interval:
-                    self.reset_fast_count += 1
-                    if self.reset_fast_count % 10 == 0:
-                        self.logger.debug(
-                            "代理池重置的太快了:) {}".format(self.reset_fast_count)
-                        )
-                        time.sleep(1)
-                else:
-                    self.clear()
-                    if self.proxy_queue is None:
-                        import queue
-
-                        self.proxy_queue = queue.Queue()
-                    # TODO the proxies fetched here may contain duplicates
-                    proxies_list = get_proxy_from_url(**self.kwargs)
-                    self.real_max_proxy_count = len(proxies_list)
-                    if 0 < self.max_queue_size < self.real_max_proxy_count:
-                        proxies_list = random.sample(proxies_list, self.max_queue_size)
-                    _valid_count = self.append_proxies(proxies_list)
-                    self.last_reset_time = time.time()
-                    self.no_valid_proxy_times = 0
-                    self.logger.debug(
-                        "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
-                            len(proxies_list),
-                            _valid_count,
-                            len(self.invalid_proxy_dict),
-                            len(self.proxy_dict),
-                        )
-                    )
-        return
-
-    def tag_proxy(self, proxies_list: list, flag: int, *, delay=30) -> bool:
-        """
-        Tag proxies with a flag
-        :param proxies_list:
-        :param flag:
-                    -1  discard
-                    1   delay before reuse
-        :param delay: delay time in seconds
-        :return:
-        """
-        if int(flag) not in ProxyItem.proxy_tag_list or not proxies_list:
-            return False
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if not proxies:
-                continue
-            proxy_id = ProxyItem(proxies).proxy_id
-            if proxy_id not in self.proxy_dict:
-                continue
-            self.proxy_dict[proxy_id].flag = flag
-            self.proxy_dict[proxy_id].flag_ts = time.time()
-            self.proxy_dict[proxy_id].delay = delay
-
-        return True
-
-    def get_proxy_item(self, proxy_id="", proxies=None):
-        """
-        Get the ProxyItem object
-        :param proxy_id:
-        :param proxies:
-        :return:
-        """
-        if proxy_id:
-            return self.proxy_dict.get(proxy_id)
-        if proxies:
-            proxy_id = ProxyItem(proxies).proxy_id
-            return self.proxy_dict.get(proxy_id)
-        return
-
-    def copy(self):
-        return ProxyPool(**self.kwargs)
-
-    def all(self) -> list:
-        """
-        Get all proxies available from the configured source
-        :return:
-        """
-        return get_proxy_from_url(**self.kwargs)
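
A minimal usage sketch of the pool above, assuming the module is importable as `network.proxy_pool` and that `setting.PROXY_EXTRACT_API` points at a reachable proxy source (neither is verified here):

    from network.proxy_pool import ProxyPool

    pool = ProxyPool(size=50, check_valid=False)   # small pool, skip per-fetch validity checks
    proxies = pool.get()                           # e.g. {"http": "socks5://ip:port", "https": "socks5://ip:port"}
    if proxies:
        # Flag a misbehaving proxy so it is delayed for 60 seconds before reuse
        pool.tag_proxy([proxies], flag=1, delay=60)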

+ 0 - 524
A数据处理/site_monitor/network/request.py

@@ -1,524 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-25 11:49:08
----------
-@summary: Request structure
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import copy
-import re
-
-import requests
-from requests.cookies import RequestsCookieJar
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
-
-import setting as setting
-import utils.tools as tools
-from db.redisdb import RedisDB
-from network import user_agent
-from network.downloader.base import Downloader, RenderDownloader
-from network.proxy_pool import ProxyPool
-from network.response import Response
-from utils.log import logger as log
-
-# Suppress warning messages
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
-
-
-class Request:
-    user_agent_pool = user_agent
-    proxies_pool: ProxyPool = None
-
-    cache_db = None  # redis / pika
-    cached_redis_key = None  # redis key prefix for cached responses: response_cached:cached_redis_key:md5
-    cached_expire_time = 1200  # cache expiry time, seconds
-
-    # Downloaders
-    downloader: Downloader = None
-    session_downloader: Downloader = None
-    render_downloader: RenderDownloader = None
-
-    __REQUEST_ATTRS__ = {
-        # "method",
-        # "url",
-        "params",
-        "data",
-        "headers",
-        "cookies",
-        "files",
-        "auth",
-        "timeout",
-        "allow_redirects",
-        "proxies",
-        "hooks",
-        "stream",
-        "verify",
-        "cert",
-        "json",
-    }
-
-    _DEFAULT_KEY_VALUE_ = dict(
-        url="",
-        method=None,
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-        make_absolute_links=None,
-    )
-
-    _CUSTOM_PROPERTIES_ = {
-        "requests_kwargs",
-        "custom_ua",
-        "custom_proxies",
-    }
-
-    def __init__(
-        self,
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-        make_absolute_links=None,
-        **kwargs,
-    ):
-        """
-        @summary: Request parameters
-        ---------
-        Framework parameters
-        @param url: url to crawl
-        @param retry_times: current retry count
-        @param priority: priority, lower values run first, default 300
-        @param parser_name: name of the class that holds the callback, defaults to the current class
-        @param callback: callback, either a function or a function name (for cross-class callbacks set parser_name to the target class and callback to the method name)
-        @param filter_repeat: whether to deduplicate (True/False); only takes effect when REQUEST_FILTER_ENABLE is True in setting, default True
-        @param auto_request: whether to download the page automatically, default True. If False the response is empty and you must fetch the page yourself
-        @param request_sync: whether to download synchronously, default asynchronous. Set True for short-lived urls so the yielded request is handled immediately instead of being queued
-        @param use_session: whether to use a session
-        @param random_user_agent: whether to use a random User-Agent (True/False); only takes effect when RANDOM_HEADERS is True in setting, default True
-        @param download_midware: download middleware, defaults to the parser's download_midware
-        @param is_abandoned: whether to give up retrying when an exception occurs, True/False, default False
-        @param render: whether to render with a browser
-        @param render_time: render time, i.e. how long to wait after opening the page before grabbing the source
-        @param make_absolute_links: whether to convert links to absolute links, default True
-        --
-        The following parameters are used the same way as in requests
-        @param method: request method such as POST or GET; by default decided by whether data is empty
-        @param params: query parameters
-        @param data: request body
-        @param json: request json string, equivalent to json.dumps(data)
-        @param headers:
-        @param cookies: dict or CookieJar object
-        @param files:
-        @param auth:
-        @param timeout: (float or tuple) timeout waiting for server data, either a float or a (connect timeout, read timeout) tuple
-        @param allow_redirects: Boolean. True allows following redirects for POST/PUT/DELETE
-        @param proxies: proxies {"http":"http://xxx", "https":"https://xxx"}
-        @param verify: verify the SSL certificate when True
-        @param stream: if False, the response content is downloaded immediately
-        @param cert:
-        --
-        @param **kwargs: other values, e.g. Request(item=item) makes item accessible as request.item
-        ---------
-        @result:
-        """
-
-        self.url = url
-        self.method = None
-        self.retry_times = retry_times
-        self.priority = priority
-        self.parser_name = parser_name
-        self.callback = callback
-        self.filter_repeat = filter_repeat
-        self.auto_request = auto_request
-        self.request_sync = request_sync
-        self.use_session = use_session
-        self.random_user_agent = random_user_agent
-        self.download_midware = download_midware
-        self.is_abandoned = is_abandoned
-        self.render = render
-        self.render_time = render_time
-        self.make_absolute_links = (
-            make_absolute_links
-            if make_absolute_links is not None
-            else setting.MAKE_ABSOLUTE_LINKS
-        )
-
-        # Custom attributes, excluded from serialization
-        self.requests_kwargs = {}
-        for key, value in kwargs.items():
-            if key in self.__class__.__REQUEST_ATTRS__:  # collect requests parameters
-                self.requests_kwargs[key] = value
-
-            self.__dict__[key] = value
-
-        self.custom_ua = False
-        self.custom_proxies = False
-
-    def __repr__(self):
-        try:
-            return "<Request {}>".format(self.url)
-        except:
-            return "<Request {}>".format(str(self.to_dict)[:40])
-
-    def __setattr__(self, key, value):
-        """
-        For request.xxx = xxx assignments, update the request and its internal parameters
-        @param key:
-        @param value:
-        @return:
-        """
-        self.__dict__[key] = value
-
-        if key in self.__class__.__REQUEST_ATTRS__:
-            self.requests_kwargs[key] = value
-
-    def __lt__(self, other):
-        return self.priority < other.priority
-
-    @property
-    def _proxies_pool(self):
-        if not self.__class__.proxies_pool:
-            self.__class__.proxies_pool = ProxyPool()
-
-        return self.__class__.proxies_pool
-
-    @property
-    def _downloader(self):
-        if not self.__class__.downloader:
-            self.__class__.downloader = tools.import_cls(setting.DOWNLOADER)()
-
-        return self.__class__.downloader
-
-    @property
-    def _session_downloader(self):
-        if not self.__class__.session_downloader:
-            self.__class__.session_downloader = tools.import_cls(
-                setting.SESSION_DOWNLOADER
-            )()
-
-        return self.__class__.session_downloader
-
-    @property
-    def _render_downloader(self):
-        if not self.__class__.render_downloader:
-            self.__class__.render_downloader = tools.import_cls(
-                setting.RENDER_DOWNLOADER
-            )()
-
-        return self.__class__.render_downloader
-
-    @property
-    def to_dict(self):
-        request_dict = {}
-
-        self.callback = (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-
-        if isinstance(self.download_midware, (tuple, list)):
-            self.download_midware = [
-                getattr(download_midware, "__name__")
-                if callable(download_midware)
-                else download_midware
-                for download_midware in self.download_midware
-            ]
-        else:
-            self.download_midware = (
-                getattr(self.download_midware, "__name__")
-                if callable(self.download_midware)
-                else self.download_midware
-            )
-
-        for key, value in self.__dict__.items():
-            if (
-                key in self.__class__._DEFAULT_KEY_VALUE_
-                and self.__class__._DEFAULT_KEY_VALUE_.get(key) == value
-                or key in self.__class__._CUSTOM_PROPERTIES_
-            ):
-                continue
-
-            if value is not None:
-                if key in self.__class__.__REQUEST_ATTRS__:
-                    if not isinstance(
-                        value, (bytes, bool, float, int, str, tuple, list, dict)
-                    ):
-                        value = tools.dumps_obj(value)
-                else:
-                    if not isinstance(value, (bytes, bool, float, int, str)):
-                        value = tools.dumps_obj(value)
-
-            request_dict[key] = value
-
-        return request_dict
-
-    @property
-    def callback_name(self):
-        return (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-
-    def make_requests_kwargs(self):
-        """
-        Prepare request keyword arguments
-        """
-        # Default timeout
-        self.requests_kwargs.setdefault(
-            "timeout", setting.REQUEST_TIMEOUT
-        )  # connect=22 read=22
-
-        # stream setting
-        # By default the response body is downloaded immediately after the request.
-        # With stream=True the body is only downloaded when Response.content is accessed; at first only the headers are returned.
-        # Downside: with stream=True, requests cannot release the connection back to the pool until all data is consumed or Response.close is called, which hurts connection efficiency.
-        self.requests_kwargs.setdefault("stream", True)
-
-        # Disable certificate verification
-        self.requests_kwargs.setdefault("verify", False)
-
-        # Determine the request method
-        method = self.__dict__.get("method")
-        if not method:
-            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
-                method = "POST"
-            else:
-                method = "GET"
-        self.method = method
-
-        # Set the User-Agent
-        headers = self.requests_kwargs.get("headers", {})
-        if "user-agent" not in headers and "User-Agent" not in headers:
-            if self.random_user_agent and setting.RANDOM_HEADERS:
-                # Random User-Agent
-                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-                headers.update({"User-Agent": ua})
-                self.requests_kwargs.update(headers=headers)
-            else:
-                # Use the default User-Agent
-                self.requests_kwargs.setdefault(
-                    "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
-                )
-        else:
-            self.custom_ua = True
-
-        # Proxies
-        proxies = self.requests_kwargs.get("proxies", -1)
-        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
-            while True:
-                proxies = self._proxies_pool.get()
-                if proxies:
-                    self.requests_kwargs.update(proxies=proxies)
-                    break
-                else:
-                    log.debug("暂无可用代理 ...")
-        else:
-            self.custom_proxies = True
-
-    def get_response(self, save_cached=False):
-        """
-        Get a response with selector support
-        @param save_cached: save to cache so the page does not have to be re-downloaded every time while debugging
-        @return:
-        """
-        self.make_requests_kwargs()
-
-        log.debug(
-            """
-                -------------- %srequest for ----------------
-                url  = %s
-                method = %s
-                args = %s
-                """
-            % (
-                ""
-                if not self.parser_name
-                else "%s.%s "
-                % (
-                    self.parser_name,
-                    (
-                        self.callback
-                        and callable(self.callback)
-                        and getattr(self.callback, "__name__")
-                        or self.callback
-                    )
-                    or "parse",
-                ),
-                self.url,
-                self.method,
-                self.requests_kwargs,
-            )
-        )
-
-        # def hooks(response, *args, **kwargs):
-        #     print(response.url)
-        #
-        # self.requests_kwargs.update(hooks={'response': hooks})
-
-        # self.use_session takes precedence
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )
-
-        if self.render:
-            response = self._render_downloader.download(self)
-        elif use_session:
-            response = self._session_downloader.download(self)
-        else:
-            response = self._downloader.download(self)
-
-        response.make_absolute_links = self.make_absolute_links
-
-        if save_cached:
-            self.save_cached(response, expire_time=self.__class__.cached_expire_time)
-
-        return response
-
-    def get_params(self):
-        return self.requests_kwargs.get("params")
-
-    def get_proxies(self) -> dict:
-        """
-
-        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
-
-        """
-        return self.requests_kwargs.get("proxies")
-
-    def get_proxy(self) -> str:
-        """
-
-        Returns: ip:port
-
-        """
-        proxies = self.get_proxies()
-        if proxies:
-            return re.sub(
-                "http.*?//", "", proxies.get("http", "") or proxies.get("https", "")
-            )
-
-    def get_headers(self) -> dict:
-        return self.requests_kwargs.get("headers", {})
-
-    def get_user_agent(self) -> str:
-        return self.get_headers().get("user_agent") or self.get_headers().get(
-            "User-Agent"
-        )
-
-    def get_cookies(self) -> dict:
-        cookies = self.requests_kwargs.get("cookies")
-        if cookies and isinstance(cookies, RequestsCookieJar):
-            cookies = cookies.get_dict()
-
-        if not cookies:
-            cookie_str = self.get_headers().get("Cookie") or self.get_headers().get(
-                "cookie"
-            )
-            if cookie_str:
-                cookies = tools.get_cookies_from_str(cookie_str)
-        return cookies
-
-    @property
-    def fingerprint(self):
-        """
-        Unique identifier of the request
-        @return:
-        """
-        url = self.__dict__.get("url", "")
-        # Canonicalize the url
-        url = tools.canonicalize_url(url)
-        args = [url]
-
-        for arg in ["params", "data", "files", "auth", "cert", "json"]:
-            if self.requests_kwargs.get(arg):
-                args.append(self.requests_kwargs.get(arg))
-
-        return tools.get_md5(*args)
-
-    @property
-    def _cache_db(self):
-        if not self.__class__.cache_db:
-            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
-
-        return self.__class__.cache_db
-
-    @property
-    def _cached_redis_key(self):
-        if self.__class__.cached_redis_key:
-            return (
-                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
-            )
-        else:
-            return f"response_cached:test:{self.fingerprint}"
-
-    def save_cached(self, response, expire_time=1200):
-        """
-        Save the response in redis for debugging so it does not have to be downloaded every time
-        @param response:
-        @param expire_time: expiry time
-        @return:
-        """
-
-        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
-
-    def get_response_from_cached(self, save_cached=True):
-        """
-        Get the response from the cache
-        Note:
-            Attributes that will be empty:
-                -raw : urllib3.response.HTTPResponse
-                -connection: requests.adapters.HTTPAdapter
-                -history
-
-            Attributes whose meaning changes:
-                - request changes from requests' type to Request
-        @param: save_cached when there is no cache, download directly; whether to save the cache after downloading
-        @return:
-        """
-        response_dict = self._cache_db.strget(self._cached_redis_key)
-        if not response_dict:
-            log.info("无response缓存  重新下载")
-            response_obj = self.get_response(save_cached=save_cached)
-        else:
-            response_dict = eval(response_dict)
-            response_obj = Response.from_dict(response_dict)
-        return response_obj
-
-    def del_response_cached(self):
-        self._cache_db.clear(self._cached_redis_key)
-
-    @classmethod
-    def from_dict(cls, request_dict):
-        for key, value in request_dict.items():
-            if isinstance(value, bytes):  # deserialize, e.g. item
-                request_dict[key] = tools.loads_obj(value)
-
-        return cls(**request_dict)
-
-    def copy(self):
-        return self.__class__.from_dict(copy.deepcopy(self.to_dict))
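
A minimal usage sketch of the Request class above, assuming the package layout is importable and `setting.py` defines the constants it references (REQUEST_TIMEOUT, DOWNLOADER, PROXY_ENABLE, etc.); the URL is an illustrative placeholder:

    from network.request import Request

    request = Request(
        url="https://example.com",
        render=False,                              # True would route through PlaywrightDownloader
        use_session=False,
        headers={"User-Agent": "Mozilla/5.0"},     # custom UA, so the random UA pool is skipped
    )
    response = request.get_response()
    print(response.status_code, response.title())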

+ 0 - 414
A数据处理/site_monitor/network/response.py

@@ -1,414 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-05-10 
----------
-@summary:  
----------
-@author: Dzr
-"""
-import copy
-import datetime
-import os
-import re
-import time
-from urllib.parse import urlparse, urlunparse, urljoin
-
-from bs4 import BeautifulSoup
-from bs4.dammit import UnicodeDammit
-from lxml.html import fromstring, HtmlElement
-from lxml.html.clean import Cleaner
-from parsel import Selector
-from requests.cookies import RequestsCookieJar
-from requests.models import Response as res
-from w3lib.encoding import (
-    http_content_type_encoding,
-    html_body_declared_encoding
-)
-import utils.tools as tools
-from utils.log import logger as log
-
-FAIL_ENCODING = "ISO-8859-1"
-
-# Special characters in html source that must be removed, otherwise they break etree construction
-SPECIAL_CHARACTERS = [
-    # Remove control characters; full list: https://zh.wikipedia.org/wiki/%E6%8E%A7%E5%88%B6%E5%AD%97%E7%AC%A6
-    "[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]"
-]
-
-SPECIAL_CHARACTER_PATTERNS = [
-    re.compile(special_character) for special_character in SPECIAL_CHARACTERS
-]
-
-
-def iter_node(element: HtmlElement):
-    yield element
-    for sub_element in element:
-        if isinstance(sub_element, HtmlElement):
-            yield from iter_node(sub_element)
-
-
-class Response(res):
-    def __init__(self, response):
-        super(Response, self).__init__()
-        self.__dict__.update(response.__dict__)
-
-        self._cached_selector = None
-        self._cached_text = None
-        self._cached_json = None
-
-        self._encoding = None
-
-        self.encoding_errors = "strict"  # strict / replace / ignore
-
-    @classmethod
-    def from_dict(cls, response_dict):
-        """
-        Build a Response object from a dict
-        @param response_dict: the raw response.__dict__
-        @return:
-        """
-        cookie_jar = RequestsCookieJar()
-        cookie_jar.update(other=response_dict["cookies"])
-        response_dict["cookies"] = cookie_jar
-
-        response_dict["elapsed"] = datetime.timedelta(
-            0, 0, response_dict["elapsed"]
-        )  # elapsed time
-        response_dict["connection"] = None
-        response_dict["_content_consumed"] = True
-
-        response = res()
-        response.__dict__.update(response_dict)
-        return cls(response)
-
-    @property
-    def to_dict(self):
-        response_dict = {
-            "_content": self.content,
-            "cookies": self.cookies.get_dict(),
-            "encoding": self.encoding,
-            "headers": self.headers,
-            "status_code": self.status_code,
-            "elapsed": self.elapsed.microseconds,  # 耗时
-            "url": self.url,
-        }
-
-        return response_dict
-
-    def __clear_cache(self):
-        self.__dict__["_cached_selector"] = None
-        self.__dict__["_cached_text"] = None
-        self.__dict__["_cached_json"] = None
-
-    @property
-    def encoding(self):
-        """
-        Encoding priority: custom encoding > header encoding > page-declared encoding > encoding guessed from content
-        """
-        self._encoding = (
-            self._encoding
-            or self._headers_encoding()
-            or self._body_declared_encoding()
-            or self.apparent_encoding
-        )
-        return self._encoding
-
-    @encoding.setter
-    def encoding(self, val):
-        self.__clear_cache()
-        self._encoding = val
-
-    code = encoding
-
-    def _headers_encoding(self):
-        """
-        Get the charset encoding from the headers
-        """
-        content_type = self.headers.get("Content-Type") or self.headers.get(
-            "content-type"
-        )
-        if content_type:
-            return (
-                http_content_type_encoding(content_type) or "utf-8"
-                if "application/json" in content_type
-                else None
-            )
-
-    def _body_declared_encoding(self):
-        """
-        Get <meta charset="..."> from html, xml, etc.
-        """
-
-        return html_body_declared_encoding(self.content)
-
-    def _get_unicode_html(self, html):
-        if not html or not isinstance(html, bytes):
-            return html
-
-        converted = UnicodeDammit(html, is_html=True)
-        if not converted.unicode_markup:
-            raise Exception(
-                "Failed to detect encoding of article HTML, tried: %s"
-                % ", ".join(converted.tried_encodings)
-            )
-
-        html = converted.unicode_markup
-        return html
-
-    def _make_absolute(self, link):
-        """Makes a given link absolute."""
-        try:
-
-            link = link.strip()
-
-            # Parse the link with stdlib.
-            parsed = urlparse(link)._asdict()
-
-            # If link is relative, then join it with base_url.
-            if not parsed["netloc"]:
-                return urljoin(self.url, link)
-
-            # Link is absolute; if it lacks a scheme, add one from base_url.
-            if not parsed["scheme"]:
-                parsed["scheme"] = urlparse(self.url).scheme
-
-                # Reconstruct the URL to incorporate the new scheme.
-                parsed = (v for v in parsed.values())
-                return urlunparse(parsed)
-
-        except Exception as e:
-            log.error(
-                "Invalid URL <{}> can't make absolute_link. exception: {}".format(
-                    link, e
-                )
-            )
-
-        # Link is absolute and complete with scheme; nothing to be done here.
-        return link
-
-    def _absolute_links(self, text):
-        regexs = [
-            r'(<(?i)a.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # a
-            r'(<(?i)img.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # img
-            r'(<(?i)link.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # css
-            r'(<(?i)script.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # js
-        ]
-
-        for regex in regexs:
-
-            def replace_href(text):
-                # html = text.group(0)
-                link = text.group(2)
-                absolute_link = self._make_absolute(link)
-
-                # return re.sub(regex, r'\1{}\3'.format(absolute_link), html) # regex replacement fails on some characters, e.g. the source of http://permit.mep.gov.cn/permitExt/syssb/xxgk/xxgk!showImage.action?dataid=0b092f8115ff45c5a50947cdea537726
-                return text.group(1) + absolute_link + text.group(3)
-
-            text = re.sub(regex, replace_href, text, flags=re.S)
-
-        return text
-
-    def _del_special_character(self, text):
-        """
-        Remove special characters
-        """
-        for special_character_pattern in SPECIAL_CHARACTER_PATTERNS:
-            text = special_character_pattern.sub("", text)
-
-        return text
-
-    @property
-    def __text(self):
-        """Content of the response, in unicode.
-
-        If Response.encoding is None, encoding will be guessed using
-        ``chardet``.
-
-        The encoding of the response content is determined based solely on HTTP
-        headers, following RFC 2616 to the letter. If you can take advantage of
-        non-HTTP knowledge to make a better guess at the encoding, you should
-        set ``r.encoding`` appropriately before accessing this property.
-        """
-
-        if not self.content:
-            return ""
-
-        # Decode unicode from given encoding.
-        try:
-            content = str(self.content, self.encoding, errors=self.encoding_errors)
-        except (LookupError, TypeError):
-            # A LookupError is raised if the encoding was not found which could
-            # indicate a misspelling or similar mistake.
-            #
-            # A TypeError can be raised if encoding is None
-            #
-            # So we try blindly encoding.
-            content = str(self.content, errors=self.encoding_errors)
-
-        return content
-
-    @property
-    def text(self):
-        if self._cached_text is None:
-            if self.encoding and self.encoding.upper() != FAIL_ENCODING:
-                try:
-                    self._cached_text = self.__text
-                except UnicodeDecodeError:
-                    self._cached_text = self._get_unicode_html(self.content)
-            else:
-                self._cached_text = self._get_unicode_html(self.content)
-
-            if self._cached_text:
-                self._cached_text = self._absolute_links(self._cached_text)
-                self._cached_text = self._del_special_character(self._cached_text)
-
-        return self._cached_text
-
-    @text.setter
-    def text(self, html):
-        self._cached_text = html
-        self._cached_text = self._absolute_links(self._cached_text)
-        self._cached_text = self._del_special_character(self._cached_text)
-        self._cached_selector = Selector(self.text)
-
-    @property
-    def json(self, **kwargs):
-        if self._cached_json is None:
-            self.encoding = self.encoding or "utf-8"
-            self._cached_json = super(Response, self).json(**kwargs)
-
-        return self._cached_json
-
-    @property
-    def content(self):
-        content = super(Response, self).content
-        return content
-
-    @property
-    def is_html(self):
-        content_type = self.headers.get("Content-Type", "")
-        if "text/html" in content_type:
-            return True
-        else:
-            return False
-
-    @property
-    def selector(self):
-        if self._cached_selector is None:
-            self._cached_selector = Selector(self.text)
-        return self._cached_selector
-
-    def bs4(self, features="html.parser"):
-        soup = BeautifulSoup(self.text, features)
-        return soup
-
-    def extract(self):
-        return self.selector.get()
-
-    def xpath(self, query, **kwargs):
-        return self.selector.xpath(query, **kwargs)
-
-    def css(self, query):
-        return self.selector.css(query)
-
-    def re(self, regex, replace_entities=False):
-        """
-        @summary: regex matching
-        Note: page source like <a class='page-numbers'... is normalized to <a class="page-numbers"; write the regex as <a class="(.*?)". Quote style in non-html text is not changed.
-        For convenience the regex is automatically made insensitive to single vs double quotes
-        ---------
-        @param regex: regex string or re.compile object
-        @param replace_entities: when True, strips characters like &nbsp; and unescapes &quot; to ", which changes the page structure. When extracting json from page source, set it to False
-        ---------
-        @result: list
-        """
-
-        # Make single and double quotes interchangeable in the regex
-        if isinstance(regex, str):
-            regex = re.sub("['\"]", "['\"]", regex)
-
-        return self.selector.re(regex, replace_entities)
-
-    def re_first(self, regex, default=None, replace_entities=False):
-        """
-        @summary: regex matching
-        Note: page source like <a class='page-numbers'... is normalized to <a class="page-numbers"; write the regex as <a class="(.*?)". Quote style in non-html text is not changed.
-        For convenience the regex is automatically made insensitive to single vs double quotes
-        ---------
-        @param regex: regex string or re.compile object
-        @param default: default value when nothing matches
-        @param replace_entities: when True, strips characters like &nbsp; and unescapes &quot; to ", which changes the page structure. When extracting json from page source, set it to False
-        ---------
-        @result: first match or the default value
-        """
-
-        # Make single and double quotes interchangeable in the regex
-        if isinstance(regex, str):
-            regex = re.sub("['\"]", "['\"]", regex)
-
-        return self.selector.re_first(regex, default, replace_entities)
-
-    def close_browser(self, request):
-        if hasattr(self, "browser"):
-            request._render_downloader.webdriver_pool.remove(self.browser)
-            del self.browser
-
-    def __del__(self):
-        self.close()
-
-    def open(self, delete_temp_file=False):
-        with open("temp.html", "w", encoding=self.encoding, errors="replace") as html:
-            self.encoding_errors = "replace"
-            html.write(self.text)
-
-        os.system("open temp.html")
-
-        if delete_temp_file:
-            time.sleep(1)
-            os.remove("temp.html")
-
-    @property
-    def plain_text(self):
-        return re.findall('[\u4e00-\u9fa5]', self.text, re.S)
-
-    def tags(self):
-        tags_dict = {}
-
-        html = copy.deepcopy(self.text)
-        if len(html) == 0:
-            tags_dict['tags_count'] = 0
-            return tags_dict
-
-        cleaner = Cleaner()
-        html = cleaner.clean_html(html)
-
-        count = 0
-        node = fromstring(html)
-        for elem in iter_node(node.xpath('/html')[0]):
-            count += 1
-            tag = elem.tag
-            if not tags_dict.get(tag):
-                tags_dict[tag] = 1
-            else:
-                tags_dict[tag] += 1
-
-        tags_dict['tags_count'] = count
-        return tags_dict
-
-    def title(self):
-        """Best-effort page title: the longest common substring of <title> and the <h1>-<h4> texts (kept only if longer than 8 characters), falling back to <title> or the first heading."""
-        title_text = self.xpath('//title/text()').extract_first("")
-
-        htag = '//h1//text() | //h2//text() | //h3//text() | //h4//text()'
-        h_tag_texts_list = self.xpath(htag).extract()
-        htag_text = h_tag_texts_list[0] if len(h_tag_texts_list) > 0 else ''
-
-        news_title = ''
-        for h_tag_text in h_tag_texts_list:
-            lcs = tools.get_longest_common_sub_string(title_text, h_tag_text)
-            if len(lcs) > len(news_title):
-                news_title = lcs
-
-        news_title = news_title if len(news_title) > 8 else ''
-
-        title = (news_title or title_text or htag_text)
-        return title.strip()
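
For reference, a standalone sketch of the quote rewrite performed by Response.re()/re_first() above, using only the standard library (the sample HTML is made up):

    import re

    # The same substitution used by Response.re()/re_first(): every literal quote in the
    # pattern becomes the character class ['"], so one pattern matches both quoting styles.
    pattern = '<a class="(.*?)"'
    quote_insensitive = re.sub("['\"]", "['\"]", pattern)
    print(quote_insensitive)  # <a class=['"](.*?)['"]

    html = "<a class='page-numbers' href='/p/2'>2</a>"
    print(re.findall(quote_insensitive, html))  # ['page-numbers']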

+ 0 - 389
A数据处理/site_monitor/network/user_agent.py

@@ -1,389 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2016-12-28 17:55
----------
-@summary: pools of User-Agent strings grouped by browser type
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import random
-
-USER_AGENTS = {
-    "chrome": [
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
-        "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
-        "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
-        "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
-        "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
-    ],
-    "opera": [
-        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
-        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
-        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
-        "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
-        "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
-        "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
-        "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
-        "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
-        "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
-        "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
-        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
-        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
-        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
-        "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
-        "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
-        "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
-        "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
-        "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
-        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
-        "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
-        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
-        "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
-        "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
-        "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
-        "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
-        "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
-        "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
-    ],
-    "firefox": [
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
-        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
-        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
-        "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
-        "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
-        "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101  Firefox/28.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
-        "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
-        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
-        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
-        "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
-        "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0)  Gecko/20100101 Firefox/18.0",
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
-    ],
-    "internetexplorer": [
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
-        "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0;  rv:11.0) like Gecko",
-        "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
-        "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
-        "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
-        "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
-        "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
-        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)",
-        "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)",
-    ],
-    "safari": [
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-    ],
-    "mobile": [
-        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
-        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
-        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
-        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
-        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
-        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
-        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
-        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
-        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
-        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
-        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
-        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
-        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
-        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
-        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
-        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Safari/605.1.15",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
-    ],
-}
-
-
-def get(ua_type: str = None):
-    if not ua_type:
-        ua_type = random.choice(list(USER_AGENTS.keys()))
-    elif ua_type not in USER_AGENTS:
-        raise ValueError(
-            "ua_type error, expect one of {}".format(list(USER_AGENTS.keys()))
-        )
-
-    return random.choice(USER_AGENTS[ua_type])
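
For reference, a short usage sketch of the get() helper above (assuming the project root is on sys.path, as the module paths in setting.py imply, so that network.user_agent is importable):

    from network import user_agent

    headers = {"User-Agent": user_agent.get("chrome")}  # random Chrome UA
    any_ua = user_agent.get()  # random browser type, then a random UA of that type
    # user_agent.get("edge") would raise ValueError: only the keys of USER_AGENTS are accepted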

+ 0 - 14
A数据处理/site_monitor/requirements.txt

@@ -1,14 +0,0 @@
-beautifulsoup4==4.9.3
-bs4==0.0.1
-loguru==0.5.3
-lxml==4.9.1
-numpy==1.24.1
-parsel==1.7.0
-playwright==1.24.1
-pymongo==3.12.0
-redis==3.5.3
-requests==2.30.0
-six==1.16.0
-w3lib==2.1.1
-PyExecJS>=1.5.1
-redis-py-cluster>=2.1.0

+ 0 - 65
A数据处理/site_monitor/setting.py

@@ -1,65 +0,0 @@
-# -*- coding: utf-8 -*-
-"""爬虫配置文件"""
-import os
-
-# MONGODB
-MONGO_IP = "172.17.4.87"
-MONGO_PORT = 27080
-MONGO_DB = "py_spider"
-MONGO_USER_NAME = os.getenv("MONGO_USER_NAME")
-MONGO_USER_PASS = os.getenv("MONGO_USER_PASS")
-
-# REDIS
-# ip:port; multiple entries may be a list or a comma-separated string, e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
-REDISDB_IP_PORTS = "172.17.4.232:7361"
-REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
-REDISDB_DB = 4
-# used in Redis sentinel mode
-REDISDB_SERVICE_NAME = os.getenv("REDISDB_SERVICE_NAME")
-
-# Browser rendering (see the sketch at the end of this file's diff for how these options map onto Playwright's API)
-PLAYWRIGHT = dict(
-    user_agent=None,  # string, or a no-argument function that returns the user agent
-    proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a no-argument function that returns the proxy address
-    headless=True,  # run the browser headless
-    driver_type="webkit",  # chromium, firefox or webkit
-    timeout=60,  # request timeout
-    window_size=(1024, 800),  # window size
-    executable_path=None,  # browser executable path; None uses the default
-    download_path=None,  # directory for downloaded files
-    render_time=0,  # render time: wait this long after opening the page before grabbing the source
-    wait_until="networkidle",  # page-load event to wait for; one of "commit", "domcontentloaded", "load", "networkidle"
-    use_stealth_js=False,  # use stealth.min.js to hide browser automation fingerprints
-    page_on_event_callback=None,  # callbacks for page.on() events, e.g. page_on_event_callback={"dialog": lambda dialog: dialog.accept()}
-    storage_state_path=None,  # path for persisted browser storage state
-    url_regexes=None,  # API URLs to intercept; a list of regexes
-    save_all=False,  # save all intercepted responses (used with url_regexes); when False only the last interception is kept
-)
-
-# requests network timeout
-REQUEST_TIMEOUT = 30  # time to wait for the server response: a float, or a (connect timeout, read timeout) tuple
-
-# Proxy settings
-PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"  # proxy extraction API; returned proxies are separated by \r\n
-PROXY_ENABLE = True
-
-# Random headers
-RANDOM_HEADERS = True
-# UserAgent type: one of 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari', 'mobile'; a random type is used if unset
-USER_AGENT_TYPE = "chrome"
-# default User-Agent
-DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
-# use a requests session
-USE_SESSION = False
-
-# Downloaders
-DOWNLOADER = "network.downloader.RequestsDownloader"
-SESSION_DOWNLOADER = "network.downloader.RequestsSessionDownloader"
-RENDER_DOWNLOADER = "network.downloader.PlaywrightDownloader"
-MAKE_ABSOLUTE_LINKS = True  # automatically convert links to absolute URLs
-
-# WeChat Work alerting
-WECHAT_WARNING_URL = ""  # WeChat Work bot webhook API
-WECHAT_WARNING_PHONE = ""  # person to @-mention in the group; a list of multiple people is supported
-WECHAT_WARNING_ALL = False  # whether to notify everyone; default False
-WARNING_INTERVAL = 3600  # minimum interval between identical alerts, to avoid flooding; 0 disables deduplication
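
The PLAYWRIGHT block above only declares options; how they are consumed is up to the project's PlaywrightDownloader (not shown here). A minimal sketch of how a few of the keys map onto Playwright's own sync API (the target URL is illustrative, and setting.py is assumed to be importable):

    from playwright.sync_api import sync_playwright

    import setting

    cfg = setting.PLAYWRIGHT
    with sync_playwright() as p:
        # driver_type selects the engine; headless is passed straight to launch()
        browser = getattr(p, cfg["driver_type"]).launch(headless=cfg["headless"])
        width, height = cfg["window_size"]
        context = browser.new_context(viewport={"width": width, "height": height})
        page = context.new_page()
        # wait_until/timeout correspond to page.goto(); Playwright timeouts are in milliseconds
        page.goto("https://example.com", wait_until=cfg["wait_until"], timeout=cfg["timeout"] * 1000)
        html = page.content()
        browser.close()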

+ 0 - 8
A数据处理/site_monitor/utils/__init__.py

@@ -1,8 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-05-10 
----------
-@summary:  
----------
-@author: Dzr
-"""

+ 0 - 147
A数据处理/site_monitor/utils/clean_html.py

@@ -1,147 +0,0 @@
-import re
-__all__ = ['cleaner']
-
-# Standalone elements
-INDEPENDENT_TAGS = {
-    '<head>[\s\S]*?</head>': '',
-    '<html>|<html [^>]*>|</html>': '',
-    '<body>|<body [^>]*>|</body>': '',
-    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
-    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
-    '\\xa0|\\u3000': '',  # whitespace characters
-    '<!--[\s\S]*?-->': '',  # comments
-    '<style[^<>]*>[\s\S]*?</style>': '',  # styles
-    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
-    '<input>': '',  # input boxes
-    '<img[^>]*>': '<br>',  # images
-}
-# Inline elements
-INLINE_TAGS = {
-    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
-    '<link>|<link [^>]*>|</link>': '',  # link tags
-    '<span>|<span [^>]*>|</span>': '',  # span
-    '<label>|<label [^>]*>|</label>': '<br>',  # label
-    '<font>|<font [^>]*>|</font>': '',  # font
-    'data:image(.*?) ': '',            # base64 image data
-}
-# Block-level elements
-BLOCK_TAGS = {
-    '<div>\s*?</div>': '',
-    '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
-    '<p>|<p [^>]*>': '<br>',  # paragraphs
-    '</p>': '',  # paragraphs
-    '<div>|<div [^>]*>': '<br>',  # division
-    '</div>': '',  # division
-    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Word (Office) paragraphs
-}
-# Miscellaneous
-OTHER = {
-    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
-    '<epointform>': '',
-    '<!doctype html>|<!doctype html [^>]*>': '',
-    '【关闭】|关闭': '',
-    '【打印】|打印本页': '',
-    '【字体:[\s\S]*】': '',
-    '文章来源:[\u4e00-\u9fa5]+': '',
-    '浏览次数:.*[<]+': '',
-    '(责任编辑:.*?)': '',
-    '分享到[:]': '',
-
-}
-# Styles
-CSS_STYLE = {
-    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
-    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
-    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
-    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
-    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
-    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
-
-}
-# Whitespace
-BLANKS = {
-    '\n\s*\n': '\n',
-    '\s*\n\s*': '\n',
-    '[^\S\n]': ' ',
-    '\s+': ' ',
-}
-# CSS tag set
-TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
-# CSS attribute set
-ATTRS = {'id', 'class', 'style', 'width'}
-
-
-def _repair_tag():
-    """异常的标签组合,用来替换非标准页面的标签"""
-    _repairs = {}
-    for tag in TAGS:
-        for attr in ATTRS:
-            key = '{}{}'.format(tag, attr)
-            val = '{} {}'.format(tag, attr)
-            _repairs[key] = val
-    return _repairs
-
-
-def _escape_character(html):
-    """转义字符"""
-    html = html.replace('&lt;', '<')
-    html = html.replace('&gt;', '>')
-    html = html.replace('&quot;', '"')
-    html = html.replace('&amp;', '&')
-    # hide the borders of input boxes
-    html = html.replace('<input', '<input style="border-color: transparent;"')
-    return html
-
-
-def _lowercase_tag(html):
-    """标签归一化处理(全部小写 + 标签修复)"""
-    tags = re.findall("<[^>]+>", html)
-    tag_sets = set(tags)
-
-    if len(tag_sets) > 10000:
-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(html, "lxml")
-        html = str(soup.body.next_element)
-    else:
-        for tag in tag_sets:
-            html = html.replace(tag, str(tag).lower())
-
-    repair_tags = _repair_tag()
-    for err, right in repair_tags.items():
-        html = html.replace(err, right)
-
-    return html
-
-
-def cleaner(html, special=None, completely=False):
-    """
-    Clean page HTML.
-
-    :param html: the page to clean
-    :param special: extra, page-specific cleaning rules
-    :param completely: whether to clean the page aggressively
-    :return: cleaned page source
-    """
-    if special is None:
-        special = {}
-
-    OTHER.update(special)
-    remove_tags = {
-        **INDEPENDENT_TAGS,
-        **INLINE_TAGS,
-        **BLOCK_TAGS,
-        **OTHER,
-        **CSS_STYLE,
-        **BLANKS,
-    }
-    html = _lowercase_tag(html)
-    for tag, repl in remove_tags.items():
-        html = re.sub(tag, repl, html)
-
-    if completely:
-        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
-        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframe
-        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
-
-    html = _escape_character(html)
-    return html
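
A short usage sketch of cleaner() (assuming the module is importable as utils.clean_html, per the file path; the sample markup is made up):

    from utils.clean_html import cleaner

    raw = '<div class="body"><p>公告内容</p><script>alert(1)</script></div>'
    print(cleaner(raw))
    # -> the script block is stripped, <p>/<div> collapse to <br>, class attributes are removed
    # page-specific rules can be passed via special, and completely=True also drops canvas/iframe blocks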

File diff suppressed because it is too large
+ 0 - 0
A数据处理/site_monitor/utils/js/intercept.js


File diff suppressed because it is too large
+ 0 - 6
A数据处理/site_monitor/utils/js/stealth.min.js


+ 0 - 14
A数据处理/site_monitor/utils/log.py

@@ -1,14 +0,0 @@
-from pathlib import Path
-
-from loguru import logger
-
-_absolute = Path(__file__).absolute().parent.parent
-_log_path = (_absolute / 'logs/log_{time:YYYY-MM-DD}.log').resolve()
-logger.add(
-    _log_path,
-    format='{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}',
-    level='INFO',
-    rotation='00:00',
-    retention='1 week',
-    encoding='utf-8',
-)
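
Other modules import this pre-configured logger directly (tools.py below does "from utils.log import logger as log"); a minimal usage sketch:

    from utils.log import logger

    logger.info("site monitor started")
    logger.error("request failed: {}", "timeout")  # loguru formats positional args with {}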

+ 0 - 2401
A数据处理/site_monitor/utils/tools.py

@@ -1,2401 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-09-06 14:21
----------
-@summary: utilities
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import asyncio
-import calendar
-import codecs
-import configparser  # for reading config files
-import datetime
-import functools
-import hashlib
-import html
-import importlib
-import json
-import os
-import pickle
-import random
-import re
-import socket
-import ssl
-import string
-import sys
-import time
-import traceback
-import urllib
-import urllib.parse
-import uuid
-import weakref
-from functools import partial, wraps
-from hashlib import md5
-from pprint import pformat
-from pprint import pprint
-from urllib import request
-from urllib.parse import urljoin
-
-import bson
-import execjs  # pip install PyExecJS
-import redis
-import requests
-import six
-from requests.cookies import RequestsCookieJar
-from w3lib.url import canonicalize_url as _canonicalize_url
-
-import setting as setting
-from db.redisdb import RedisDB
-from utils.log import logger as log
-
-os.environ["EXECJS_RUNTIME"] = "Node"  # 设置使用node执行js
-
-# globally disable SSL certificate verification
-ssl._create_default_https_context = ssl._create_unverified_context
-
-TIME_OUT = 30
-TIMER_TIME = 5
-
-redisdb = None
-
-
-def get_redisdb():
-    global redisdb
-    if not redisdb:
-        redisdb = RedisDB()
-    return redisdb
-
-
-# Decorators
-class Singleton(object):
-    def __init__(self, cls):
-        self._cls = cls
-        self._instance = {}
-
-    def __call__(self, *args, **kwargs):
-        if self._cls not in self._instance:
-            self._instance[self._cls] = self._cls(*args, **kwargs)
-        return self._instance[self._cls]
-
-
-def log_function_time(func):
-    try:
-
-        @functools.wraps(func)  # copy the original function's metadata onto the wrapper
-        def calculate_time(*args, **kw):
-            began_time = time.time()
-            callfunc = func(*args, **kw)
-            end_time = time.time()
-            log.debug(func.__name__ + " run time  = " + str(end_time - began_time))
-            return callfunc
-
-        return calculate_time
-    except:
-        log.debug("求取时间无效 因为函数参数不符")
-        return func
-
-
-def run_safe_model(module_name):
-    def inner_run_safe_model(func):
-        try:
-
-            @functools.wraps(func)  # copy the original function's metadata onto the wrapper
-            def run_func(*args, **kw):
-                callfunc = None
-                try:
-                    callfunc = func(*args, **kw)
-                except Exception as e:
-                    log.error(module_name + ": " + func.__name__ + " - " + str(e))
-                    traceback.print_exc()
-                return callfunc
-
-            return run_func
-        except Exception as e:
-            log.error(module_name + ": " + func.__name__ + " - " + str(e))
-            traceback.print_exc()
-            return func
-
-    return inner_run_safe_model
-
-
-def memoizemethod_noargs(method):
-    """Decorator to cache the result of a method (without arguments) using a
-    weak reference to its object
-    """
-    cache = weakref.WeakKeyDictionary()
-
-    @functools.wraps(method)
-    def new_method(self, *args, **kwargs):
-        if self not in cache:
-            cache[self] = method(self, *args, **kwargs)
-        return cache[self]
-
-    return new_method
-
-
-######################## web page parsing helpers ###############################
-def get_longest_common_sub_string(str1: str, str2: str) -> str:
-    """
-    Get the longest common substring of two strings.
-
-    Build a matrix with string 1 along the horizontal axis and string 2 along the vertical axis, for example:
-
-      青南是天才!?
-    听0 0 0 0 00 0
-    说0 0 0 0 00 0
-    青1 0 0 0 00 0
-    南0 1 0 0 00 0
-    是0 0 1 0 00 0
-    天0 0 0 1 00 0
-    才0 0 0 0 10 0
-    !0 0 0 0 01 0
-
-    Clearly, the longest diagonal run of 1s marks the longest common substring.
-
-    :param str1:
-    :param str2:
-    :return:
-    """
-    if not all([str1, str2]):
-        return ''
-    matrix = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]
-    max_length = 0
-    start_position = 0
-    for index_of_str1 in range(1, len(str1) + 1):
-        for index_of_str2 in range(1, len(str2) + 1):
-            if str1[index_of_str1 - 1] == str2[index_of_str2 - 1]:
-                matrix[index_of_str1][index_of_str2] = matrix[index_of_str1 - 1][index_of_str2 - 1] + 1
-                if matrix[index_of_str1][index_of_str2] > max_length:
-                    max_length = matrix[index_of_str1][index_of_str2]
-                    start_position = index_of_str1 - max_length
-            else:
-                matrix[index_of_str1][index_of_str2] = 0
-    return str1[start_position: start_position + max_length]
-
-
-def get_cookies(response):
-    cookies = requests.utils.dict_from_cookiejar(response.cookies)
-    return cookies
-
-
-def get_cookies_from_str(cookie_str):
-    """
-    >>> get_cookies_from_str("key=value; key2=value2; key3=; key4=; ")
-    {'key': 'value', 'key2': 'value2', 'key3': '', 'key4': ''}
-
-    Args:
-        cookie_str: key=value; key2=value2; key3=; key4=
-
-    Returns:
-
-    """
-    cookies = {}
-    for cookie in cookie_str.split(";"):
-        cookie = cookie.strip()
-        if not cookie:
-            continue
-        key, value = cookie.split("=", 1)
-        key = key.strip()
-        value = value.strip()
-        cookies[key] = value
-
-    return cookies
-
-
-def get_cookies_jar(cookies):
-    """
-    @summary: convert cookies produced by selenium into requests cookies
-    requests.get(xxx, cookies=jar)
-    Reference: https://www.cnblogs.com/small-bud/p/9064674.html
-
-    ---------
-    @param cookies: [{},{}]
-    ---------
-    @result: cookie jar
-    """
-
-    cookie_jar = RequestsCookieJar()
-    for cookie in cookies:
-        cookie_jar.set(cookie["name"], cookie["value"])
-
-    return cookie_jar
-
-
-def get_cookies_from_selenium_cookie(cookies):
-    """
-    @summary: convert selenium-style cookies to a plain requests cookie dict
-    requests.get(xxx, cookies=cookie_dict)
-    reference: https://www.cnblogs.com/small-bud/p/9064674.html
-
-    ---------
-    @param cookies: [{},{}]
-    ---------
-    @result: cookie dict
-    """
-
-    cookie_dict = {}
-    for cookie in cookies:
-        if cookie.get("name"):
-            cookie_dict[cookie["name"]] = cookie["value"]
-
-    return cookie_dict
-
-
-def cookiesjar2str(cookies):
-    str_cookie = ""
-    for k, v in requests.utils.dict_from_cookiejar(cookies).items():
-        str_cookie += k
-        str_cookie += "="
-        str_cookie += v
-        str_cookie += "; "
-    return str_cookie
-
-
-def cookies2str(cookies):
-    str_cookie = ""
-    for k, v in cookies.items():
-        str_cookie += k
-        str_cookie += "="
-        str_cookie += v
-        str_cookie += "; "
-    return str_cookie
-
-
-def get_urls(
-    html,
-    stop_urls=(
-        "javascript",
-        "+",
-        ".css",
-        ".js",
-        ".rar",
-        ".xls",
-        ".exe",
-        ".apk",
-        ".doc",
-        ".jpg",
-        ".png",
-        ".flv",
-        ".mp4",
-    ),
-):
-    # skip urls such as javascript:, + and pure anchors
-    regex = r'<a.*?href.*?=.*?["|\'](.*?)["|\']'
-
-    urls = get_info(html, regex)
-    urls = sorted(set(urls), key=urls.index)
-    if stop_urls:
-        stop_urls = isinstance(stop_urls, str) and [stop_urls] or stop_urls
-        use_urls = []
-        for url in urls:
-            for stop_url in stop_urls:
-                if stop_url in url:
-                    break
-            else:
-                use_urls.append(url)
-
-        urls = use_urls
-    return urls
-
-
-def get_full_url(root_url, sub_url):
-    """
-    @summary: build the full url
-    ---------
-    @param root_url: base url (the page url)
-    @param sub_url:  relative url, joined onto the base to form a full url
-    ---------
-    @result: the full url
-    """
-
-    return urljoin(root_url, sub_url)
-
-
-def joint_url(url, params):
-    # param_str = "?"
-    # for key, value in params.items():
-    #     value = isinstance(value, str) and value or str(value)
-    #     param_str += key + "=" + value + "&"
-    #
-    # return url + param_str[:-1]
-
-    if not params:
-        return url
-
-    params = urlencode(params)
-    separator = "?" if "?" not in url else "&"
-    return url + separator + params
-
-
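-# Illustrative usage sketch (not part of the original module): joint_url chooses "?" or
-# "&" depending on whether the url already carries a query string.
-def _example_joint_url():
-    assert joint_url("http://a.com/list", {"page": 1}) == "http://a.com/list?page=1"
-    assert (
-        joint_url("http://a.com/list?page=1", {"size": 20})
-        == "http://a.com/list?page=1&size=20"
-    )
-
-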
-def canonicalize_url(url):
-    """
-    canonicalize the url: sorts the query parameters and strips the fragment
-    """
-    return _canonicalize_url(url)
-
-
-def get_url_md5(url):
-    url = canonicalize_url(url)
-    url = re.sub("^http://", "https://", url)
-    return get_md5(url)
-
-
-def fit_url(urls, identis):
-    identis = isinstance(identis, str) and [identis] or identis
-    fit_urls = []
-    for link in urls:
-        for identi in identis:
-            if identi in link:
-                fit_urls.append(link)
-    return list(set(fit_urls))
-
-
-def get_param(url, key):
-    params = url.split("?")[-1].split("&")
-    for param in params:
-        key_value = param.split("=", 1)
-        if key == key_value[0]:
-            return key_value[1]
-    return None
-
-
-def urlencode(params):
-    """
-    字典类型的参数转为字符串
-    @param params:
-    {
-        'a': 1,
-        'b': 2
-    }
-    @return: a=1&b=2
-    """
-    return urllib.parse.urlencode(params)
-
-
-def urldecode(url):
-    """
-    将字符串类型的参数转为json
-    @param url: xxx?a=1&b=2
-    @return:
-    {
-        'a': 1,
-        'b': 2
-    }
-    """
-    params_json = {}
-    params = url.split("?")[-1].split("&")
-    for param in params:
-        key, value = param.split("=")
-        params_json[key] = unquote_url(value)
-
-    return params_json
-
-
-def unquote_url(url, encoding="utf-8"):
-    """
-    @summary: 将url解码
-    ---------
-    @param url:
-    ---------
-    @result:
-    """
-
-    return urllib.parse.unquote(url, encoding=encoding)
-
-
-def quote_url(url, encoding="utf-8"):
-    """
-    @summary: 将url编码 编码意思http://www.w3school.com.cn/tags/html_ref_urlencode.html
-    ---------
-    @param url:
-    ---------
-    @result:
-    """
-
-    return urllib.parse.quote(url, safe="%;/?:@&=+$,", encoding=encoding)
-
-
-def quote_chinese_word(text, encoding="utf-8"):
-    def quote_chinese_word_func(text):
-        chinese_word = text.group(0)
-        return urllib.parse.quote(chinese_word, encoding=encoding)
-
-    return re.sub("([\u4e00-\u9fa5]+)", quote_chinese_word_func, text, flags=re.S)
-
-
-def unescape(text):
-    """
-    unescape html entities
-    """
-    return html.unescape(text)
-
-
-def excape(text):
-    """
-    escape html special characters (function name kept as-is for compatibility)
-    """
-    return html.escape(text)
-
-
-_regexs = {}
-
-
-# @log_function_time
-def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None):
-    regexs = isinstance(regexs, str) and [regexs] or regexs
-
-    infos = []
-    for regex in regexs:
-        if regex == "":
-            continue
-
-        if regex not in _regexs.keys():
-            _regexs[regex] = re.compile(regex, re.S)
-
-        if fetch_one:
-            infos = _regexs[regex].search(html)
-            if infos:
-                infos = infos.groups()
-            else:
-                continue
-        else:
-            infos = _regexs[regex].findall(str(html))
-
-        if len(infos) > 0:
-            # print(regex)
-            break
-
-    if fetch_one:
-        infos = infos if infos else ("",)
-        return infos if len(infos) > 1 else infos[0]
-    else:
-        infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
-        infos = split.join(infos) if split else infos
-        return infos
-
-
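-# Illustrative usage sketch (not part of the original module): get_info compiles and
-# caches each regex; it can return all matches, only the first one, or a joined string.
-def _example_get_info():
-    html_snippet = '<a href="/a.html">A</a> <a href="/b.html">B</a>'
-    assert get_info(html_snippet, 'href="(.*?)"') == ["/a.html", "/b.html"]
-    assert get_info(html_snippet, 'href="(.*?)"', fetch_one=True) == "/a.html"
-    assert get_info(html_snippet, 'href="(.*?)"', split=",") == "/a.html,/b.html"
-
-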
-def table_json(table, save_one_blank=True):
-    """
-    convert a table to json; suited to tables where key:value pairs sit in one row
-    @param table: a selector object that supports xpath
-    @param save_one_blank: keep a single blank character
-    @return:
-    """
-    data = {}
-
-    trs = table.xpath(".//tr")
-    for tr in trs:
-        tds = tr.xpath("./td|./th")
-
-        for i in range(0, len(tds), 2):
-            if i + 1 > len(tds) - 1:
-                break
-
-            key = tds[i].xpath("string(.)").extract_first(default="").strip()
-            value = tds[i + 1].xpath("string(.)").extract_first(default="").strip()
-            value = replace_str(value, "[\f\n\r\t\v]", "")
-            value = replace_str(value, " +", " " if save_one_blank else "")
-
-            if key:
-                data[key] = value
-
-    return data
-
-
-def get_table_row_data(table):
-    """
-    get the data of every row in a table
-    @param table: a selector object that supports xpath
-    @return: [[],[]..]
-    """
-
-    datas = []
-    rows = table.xpath(".//tr")
-    for row in rows:
-        cols = row.xpath("./td|./th")
-        row_datas = []
-        for col in cols:
-            data = col.xpath("string(.)").extract_first(default="").strip()
-            row_datas.append(data)
-        datas.append(row_datas)
-
-    return datas
-
-
-def rows2json(rows, keys=None):
-    """
-    convert row data to json
-    @param rows: the data of each row
-    @param keys: json keys; when empty, the first row of rows is used as the keys
-    @return:
-    """
-    data_start_pos = 0 if keys else 1
-    datas = []
-    keys = keys or rows[0]
-    for values in rows[data_start_pos:]:
-        datas.append(dict(zip(keys, values)))
-
-    return datas
-
-
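-# Illustrative usage sketch (not part of the original module): rows2json uses the first
-# row as the header when no keys are supplied.
-def _example_rows2json():
-    rows = [["name", "price"], ["apple", "3"], ["pear", "2"]]
-    expected = [{"name": "apple", "price": "3"}, {"name": "pear", "price": "2"}]
-    assert rows2json(rows) == expected
-    assert rows2json(rows[1:], keys=["name", "price"]) == expected
-
-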
-def get_form_data(form):
-    """
-    extract the data submitted by a form
-    :param form: a selector object that supports xpath
-    :return:
-    """
-    data = {}
-    inputs = form.xpath(".//input")
-    for input in inputs:
-        name = input.xpath("./@name").extract_first()
-        value = input.xpath("./@value").extract_first()
-        if name:
-            data[name] = value
-
-    return data
-
-
-def get_domain(url):
-    return urllib.parse.urlparse(url).netloc
-
-
-def get_index_url(url):
-    return "/".join(url.split("/")[:3])
-
-
-def get_ip(domain):
-    ip = socket.getaddrinfo(domain, "http")[0][4][0]
-    return ip
-
-
-def get_localhost_ip():
-    """
-    implemented with UDP: a UDP socket is "connected" to an external address so the OS
-    picks a local IP, which is then read back from the socket. no packet is actually
-    sent, so a packet sniffer will not see anything
-    :return:
-    """
-    s = None
-    try:
-        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        s.connect(("8.8.8.8", 80))
-        ip = s.getsockname()[0]
-    finally:
-        if s:
-            s.close()
-
-    return ip
-
-
-def ip_to_num(ip):
-    import struct
-
-    ip_num = socket.ntohl(struct.unpack("I", socket.inet_aton(str(ip)))[0])
-    return ip_num
-
-
-def is_valid_proxy(proxy, check_url=None):
-    """
-    check whether a proxy is usable
-    @param proxy: xxx.xxx.xxx:xxx
-    @param check_url: target site url used for the check. defaults to None, in which case only the proxy's socket is probed, which cannot rule out "Connection closed by foreign host"
-    @return: True / False
-    """
-    is_valid = False
-
-    if check_url:
-        proxies = {"http": f"http://{proxy}", "https": f"https://{proxy}"}
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
-        }
-        response = None
-        try:
-            response = requests.get(
-                check_url, headers=headers, proxies=proxies, stream=True, timeout=20
-            )
-            is_valid = True
-
-        except Exception as e:
-            log.error("check proxy failed: {} {}".format(e, proxy))
-
-        finally:
-            if response:
-                response.close()
-
-    else:
-        ip, port = proxy.split(":")
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
-            sk.settimeout(7)
-            try:
-                sk.connect((ip, int(port)))  # check whether the proxy server is up
-                is_valid = True
-
-            except Exception as e:
-                log.error("check proxy failed: {} {}:{}".format(e, ip, port))
-
-    return is_valid
-
-
-def is_valid_url(url):
-    """
-    验证url是否合法
-    :param url:
-    :return:
-    """
-    if re.match(r"(^https?:/{2}\w.+$)|(ftp://)", url):
-        return True
-    else:
-        return False
-
-
-def get_text(soup, *args):
-    try:
-        return soup.get_text()
-    except Exception as e:
-        log.error(e)
-        return ""
-
-
-def del_html_tag(content, except_line_break=False, save_img=False, white_replaced=""):
-    """
-    strip html tags
-    @param content: html content
-    @param except_line_break: keep <p> tags (preserve line breaks)
-    @param save_img: keep <img> tags
-    @param white_replaced: replacement for whitespace characters
-    @return:
-    """
-    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?)忽略大小写
-    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
-    content = replace_str(content, "<!--(.|\n)*?-->")
-    content = replace_str(
-        content, "(?!&[a-z]+=)&[a-z]+;?"
-    )  # drop useless entities such as &nbsp, but keep &xxx= style parameters
-    if except_line_break:
-        content = content.replace("</p>", "/p")
-        content = replace_str(content, "<[^p].*?>")
-        content = content.replace("/p", "</p>")
-        content = replace_str(content, "[ \f\r\t\v]")
-
-    elif save_img:
-        content = replace_str(content, "(?!<img.+?>)<.+?>")  # 替换掉除图片外的其他标签
-        content = replace_str(content, "(?! +)\s+", "\n")  # 保留空格
-        content = content.strip()
-
-    else:
-        content = replace_str(content, "<(.|\n)*?>")
-        content = replace_str(content, "\s", white_replaced)
-        content = content.strip()
-
-    return content
-
-
-def del_html_js_css(content):
-    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?)忽略大小写
-    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
-    content = replace_str(content, "<!--(.|\n)*?-->")
-
-    return content
-
-
-def is_have_chinese(content):
-    regex = "[\u4e00-\u9fa5]+"
-    chinese_word = get_info(content, regex)
-    return chinese_word and True or False
-
-
-def is_have_english(content):
-    regex = "[a-zA-Z]+"
-    english_words = get_info(content, regex)
-    return english_words and True or False
-
-
-def get_chinese_word(content):
-    regex = "[\u4e00-\u9fa5]+"
-    chinese_word = get_info(content, regex)
-    return chinese_word
-
-
-def get_english_words(content):
-    regex = "[a-zA-Z]+"
-    english_words = get_info(content, regex)
-    return english_words or ""
-
-
-##################################################
-def get_json(json_str):
-    """
-    @summary: parse a json object
-    ---------
-    @param json_str: a json formatted string
-    ---------
-    @result: the parsed json object (empty dict on failure)
-    """
-
-    try:
-        return json.loads(json_str) if json_str else {}
-    except Exception as e1:
-        try:
-            json_str = json_str.strip()
-            json_str = json_str.replace("'", '"')
-            keys = get_info(json_str, "(\w+):")
-            for key in keys:
-                json_str = json_str.replace(key, '"%s"' % key)
-
-            return json.loads(json_str) if json_str else {}
-
-        except Exception as e2:
-            pass
-
-        return {}
-
-
-def jsonp2json(jsonp):
-    """
-    convert jsonp to json
-    @param jsonp: jQuery172013600082560040794_1553230569815({})
-    @return:
-    """
-    try:
-        return json.loads(re.match(".*?({.*}).*", jsonp, re.S).group(1))
-    except:
-        raise ValueError("Invalid Input")
-
-
-def dumps_json(data, indent=4, sort_keys=False):
-    """
-    @summary: pretty-format json for printing
-    ---------
-    @param data: a json formatted string or a json object
-    ---------
-    @result: the formatted string
-    """
-    try:
-        if isinstance(data, str):
-            data = get_json(data)
-
-        data = json.dumps(
-            data,
-            ensure_ascii=False,
-            indent=indent,
-            skipkeys=True,
-            sort_keys=sort_keys,
-            default=str,
-        )
-
-    except Exception as e:
-        data = pformat(data)
-
-    return data
-
-
-def get_json_value(json_object, key):
-    """
-    @summary:
-    ---------
-    @param json_object: a json object or a json formatted string
-    @param key: the key; nested levels can be written as key1.key2, e.g. {'key1':{'key2':3}}
-    ---------
-    @result: the matching value, or '' when not found
-    """
-    current_key = ""
-    value = ""
-    try:
-        json_object = (
-            isinstance(json_object, str) and get_json(json_object) or json_object
-        )
-
-        current_key = key.split(".")[0]
-        value = json_object[current_key]
-
-        key = key[key.find(".") + 1 :]
-    except Exception as e:
-        return value
-
-    if key == current_key:
-        return value
-    else:
-        return get_json_value(value, key)
-
-
-def get_all_keys(datas, depth=None, current_depth=0):
-    """
-    @summary: get all keys in a json structure
-    ---------
-    @param datas: dict / list
-    @param depth: key depth limit; unlimited by default, depth starts at 1
-    @param current_depth: current key depth, not meant to be passed by callers
-    ---------
-    @result: all keys of the json structure
-    """
-
-    keys = []
-    if depth and current_depth >= depth:
-        return keys
-
-    if isinstance(datas, list):
-        for data in datas:
-            keys.extend(get_all_keys(data, depth, current_depth=current_depth + 1))
-    elif isinstance(datas, dict):
-        for key, value in datas.items():
-            keys.append(key)
-            if isinstance(value, dict):
-                keys.extend(get_all_keys(value, depth, current_depth=current_depth + 1))
-
-    return keys
-
-
-def to_chinese(unicode_str):
-    format_str = json.loads('{"chinese":"%s"}' % unicode_str)
-    return format_str["chinese"]
-
-
-##################################################
-def replace_str(source_str, regex, replace_str=""):
-    """
-    @summary: replace substrings
-    ---------
-    @param source_str: the source string
-    @param regex: regular expression
-    @param replace_str: the replacement, '' by default
-    ---------
-    @result: the string after replacement
-    """
-    str_info = re.compile(regex)
-    return str_info.sub(replace_str, source_str)
-
-
-def del_redundant_blank_character(text):
-    """
-    删除冗余的空白符, 只保留一个
-    :param text:
-    :return:
-    """
-    return re.sub("\s+", " ", text)
-
-
-##################################################
-def get_conf_value(config_file, section, key):
-    cp = configparser.ConfigParser(allow_no_value=True)
-    with codecs.open(config_file, "r", encoding="utf-8") as f:
-        cp.read_file(f)
-    return cp.get(section, key)
-
-
-def mkdir(path):
-    try:
-        if not os.path.exists(path):
-            os.makedirs(path)
-    except OSError as exc:  # Python >2.5
-        pass
-
-
-def write_file(filename, content, mode="w", encoding="utf-8"):
-    """
-    @summary: write a file
-    ---------
-    @param filename: file name (with path)
-    @param content: content
-    @param mode: mode, w (overwrite) / a (append)
-    ---------
-    @result:
-    """
-
-    directory = os.path.dirname(filename)
-    mkdir(directory)
-    with open(filename, mode, encoding=encoding) as file:
-        file.writelines(content)
-
-
-def read_file(filename, readlines=False, encoding="utf-8"):
-    """
-    @summary: read a file
-    ---------
-    @param filename: file name (with path)
-    @param readlines: read line by line (False by default)
-    ---------
-    @result: a list when reading line by line, otherwise a string
-    """
-
-    content = None
-    try:
-        with open(filename, "r", encoding=encoding) as file:
-            content = file.readlines() if readlines else file.read()
-    except Exception as e:
-        log.error(e)
-
-    return content
-
-
-def get_oss_file_list(oss_handler, prefix, date_range_min, date_range_max=None):
-    """
-    list files on oss
-    @param prefix: path prefix, e.g. data/car_service_line/yiche/yiche_serial_zongshu_info
-    @param date_range_min: lower bound of the time range, '/'-separated, e.g. 2019/03/01 or 2019/03/01/00/00/00
-    @param date_range_max: upper bound of the time range, '/'-separated, e.g. 2019/03/01 or 2019/03/01/00/00/00
-    @return: each file path, e.g. html/e_commerce_service_line/alibaba/alibaba_shop_info/2019/03/22/15/53/15/8ca8b9e4-4c77-11e9-9dee-acde48001122.json.snappy
-    """
-
-    # work out the time range
-    date_range_max = date_range_max or date_range_min
-    date_format = "/".join(
-        ["%Y", "%m", "%d", "%H", "%M", "%S"][: date_range_min.count("/") + 1]
-    )
-    time_interval = [
-        {"days": 365},
-        {"days": 31},
-        {"days": 1},
-        {"hours": 1},
-        {"minutes": 1},
-        {"seconds": 1},
-    ][date_range_min.count("/")]
-    date_range = get_between_date(
-        date_range_min, date_range_max, date_format=date_format, **time_interval
-    )
-
-    for date in date_range:
-        file_folder_path = os.path.join(prefix, date)
-        objs = oss_handler.list(prefix=file_folder_path)
-        for obj in objs:
-            filename = obj.key
-            yield filename
-
-
-def is_html(url):
-    if not url:
-        return False
-
-    try:
-        content_type = request.urlopen(url).info().get("Content-Type", "")
-
-        if "text/html" in content_type:
-            return True
-        else:
-            return False
-    except Exception as e:
-        log.error(e)
-        return False
-
-
-def is_exist(file_path):
-    """
-    @summary: 文件是否存在
-    ---------
-    @param file_path:
-    ---------
-    @result:
-    """
-
-    return os.path.exists(file_path)
-
-
-def download_file(url, file_path, *, call_func=None, proxies=None, data=None):
-    """
-    download a file; the destination directory is created automatically
-    Args:
-        url: the url
-        file_path: destination path
-        call_func: callback invoked after a successful download
-        proxies: proxies
-        data: request body
-
-    Returns:
-
-    """
-    directory = os.path.dirname(file_path)
-    mkdir(directory)
-
-    # progress bar
-    def progress_callfunc(blocknum, blocksize, totalsize):
-        """progress callback
-        @blocknum : number of blocks downloaded so far
-        @blocksize : size of each block
-        @totalsize: size of the remote file
-        """
-        percent = 100.0 * blocknum * blocksize / totalsize
-        if percent > 100:
-            percent = 100
-        # print ('progress %.2f%%' % percent, end = '\r')
-        sys.stdout.write("progress %.2f%%" % percent + "\r")
-        sys.stdout.flush()
-
-    if url:
-        try:
-            if proxies:
-                # create the object, assign it to a variable
-                proxy = request.ProxyHandler(proxies)
-                # construct a new opener using your proxy settings
-                opener = request.build_opener(proxy)
-                # install the openen on the module-level
-                request.install_opener(opener)
-
-            request.urlretrieve(url, file_path, progress_callfunc, data)
-
-            if callable(call_func):
-                call_func()
-            return 1
-        except Exception as e:
-            log.error(e)
-            return 0
-    else:
-        return 0
-
-
-def get_file_list(path, ignore=[]):
-    templist = path.split("*")
-    path = templist[0]
-    file_type = templist[1] if len(templist) >= 2 else ""
-
-    # 递归遍历文件
-    def get_file_list_(path, file_type, ignore, all_file=[]):
-        file_list = os.listdir(path)
-
-        for file_name in file_list:
-            if file_name in ignore:
-                continue
-
-            file_path = os.path.join(path, file_name)
-            if os.path.isdir(file_path):
-                get_file_list_(file_path, file_type, ignore, all_file)
-            else:
-                if not file_type or file_name.endswith(file_type):
-                    all_file.append(file_path)
-
-        return all_file
-
-    return get_file_list_(path, file_type, ignore) if os.path.isdir(path) else [path]
-
-
-def rename_file(old_name, new_name):
-    os.rename(old_name, new_name)
-
-
-def del_file(path, ignore=()):
-    files = get_file_list(path, ignore)
-    for file in files:
-        try:
-            os.remove(file)
-        except Exception as e:
-            log.error(
-                """
-                failed to delete: %s
-                Exception : %s
-                """
-                % (file, str(e))
-            )
-        finally:
-            pass
-
-
-def get_file_type(file_name):
-    """
-    @summary: 取文件后缀名
-    ---------
-    @param file_name:
-    ---------
-    @result:
-    """
-    try:
-        return os.path.splitext(file_name)[1]
-    except Exception as e:
-        log.exception(e)
-
-
-def get_file_path(file_path):
-    """
-    @summary: 取文件路径
-    ---------
-    @param file_path: /root/a.py
-    ---------
-    @result: /root
-    """
-    try:
-        return os.path.split(file_path)[0]
-    except Exception as e:
-        log.exception(e)
-
-
-#############################################
-
-
-def exec_js(js_code):
-    """
-    @summary: execute js code
-    ---------
-    @param js_code: js code
-    ---------
-    @result: the execution result
-    """
-
-    return execjs.eval(js_code)
-
-
-def compile_js(js_func):
-    """
-    @summary: compile a js function
-    ---------
-    @param js_func: js function source
-    ---------
-    @result: a callable; invoke as fun('js_funName', param1, param2)
-    """
-
-    ctx = execjs.compile(js_func)
-    return ctx.call
-
-
-###############################################
-
-#############################################
-
-
-def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
-    """
-    @summary:
-    ---------
-    @param date: a date string such as "2011-09-28 10:00:00" to convert to a timestamp
-    @param time_format: the date format
-    ---------
-    @result: the timestamp
-    """
-
-    timestamp = time.mktime(time.strptime(date, time_format))
-    return int(timestamp)
-
-
-def timestamp_to_date(timestamp, time_format="%Y-%m-%d %H:%M:%S"):
-    """
-    @summary:
-    ---------
-    @param timestamp: the timestamp to convert to a date string
-    @param time_format: the date format
-    ---------
-    @result: the date string
-    """
-    if timestamp is None:
-        raise ValueError("timestamp is null")
-
-    date = time.localtime(timestamp)
-    return time.strftime(time_format, date)
-
-
-def get_current_timestamp():
-    return int(time.time())
-
-
-def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
-    return datetime.datetime.now().strftime(date_format)
-    # return time.strftime(date_format, time.localtime(time.time()))
-
-
-def get_date_number(year=None, month=None, day=None):
-    """
-    @summary: get the iso calendar numbers for a given date
-    defaults to the current date
-    ---------
-    @param year: 2010
-    @param month: 6
-    @param day: 16
-    ---------
-    @result: (year, week number, weekday), e.g. (2010, 24, 3)
-    """
-    if year and month and day:
-        return datetime.date(year, month, day).isocalendar()
-    elif not any([year, month, day]):
-        return datetime.datetime.now().isocalendar()
-    else:
-        assert year, "year 不能为空"
-        assert month, "month 不能为空"
-        assert day, "day 不能为空"
-
-
-def get_between_date(
-    begin_date, end_date=None, date_format="%Y-%m-%d", **time_interval
-):
-    """
-    @summary: get the dates within a time span, one per day by default
-    ---------
-    @param begin_date: start date, str, e.g. 2018-10-01
-    @param end_date: defaults to today
-    @param date_format: date format, must match the format of begin_date
-    @param time_interval: step size, one day by default; supports days, seconds, microseconds, milliseconds, minutes, hours, weeks
-    ---------
-    @result: list of strings
-    """
-
-    date_list = []
-
-    begin_date = datetime.datetime.strptime(begin_date, date_format)
-    end_date = (
-        datetime.datetime.strptime(end_date, date_format)
-        if end_date
-        else datetime.datetime.strptime(
-            time.strftime(date_format, time.localtime(time.time())), date_format
-        )
-    )
-    time_interval = time_interval or dict(days=1)
-
-    while begin_date <= end_date:
-        date_str = begin_date.strftime(date_format)
-        date_list.append(date_str)
-
-        begin_date += datetime.timedelta(**time_interval)
-
-    if end_date.strftime(date_format) not in date_list:
-        date_list.append(end_date.strftime(date_format))
-
-    return date_list
-
-
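-# Illustrative usage sketch (not part of the original module): stepping day by day and
-# hour by hour between two fixed dates.
-def _example_get_between_date():
-    assert get_between_date("2021-01-30", "2021-02-01") == [
-        "2021-01-30",
-        "2021-01-31",
-        "2021-02-01",
-    ]
-    assert get_between_date(
-        "2021-01-01 22", "2021-01-02 01", date_format="%Y-%m-%d %H", hours=1
-    ) == ["2021-01-01 22", "2021-01-01 23", "2021-01-02 00", "2021-01-02 01"]
-
-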
-def get_between_months(begin_date, end_date=None):
-    """
-    @summary: get the months within a time span
-    whole months only
-    ---------
-    @param begin_date: start date, e.g. 2018-01-01
-    @param end_date: defaults to now
-    ---------
-    @result: list, e.g. ['2018-01', '2018-02']
-    """
-
-    def add_months(dt, months):
-        month = dt.month - 1 + months
-        year = dt.year + month // 12
-        month = month % 12 + 1
-        day = min(dt.day, calendar.monthrange(year, month)[1])
-        return dt.replace(year=year, month=month, day=day)
-
-    date_list = []
-    begin_date = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
-    end_date = (
-        datetime.datetime.strptime(end_date, "%Y-%m-%d")
-        if end_date
-        else datetime.datetime.strptime(
-            time.strftime("%Y-%m-%d", time.localtime(time.time())), "%Y-%m-%d"
-        )
-    )
-    while begin_date <= end_date:
-        date_str = begin_date.strftime("%Y-%m")
-        date_list.append(date_str)
-        begin_date = add_months(begin_date, 1)
-    return date_list
-
-
-def get_today_of_day(day_offset=0):
-    return str(datetime.date.today() + datetime.timedelta(days=day_offset))
-
-
-def get_days_of_month(year, month):
-    """
-    返回天数
-    """
-
-    return calendar.monthrange(year, month)[1]
-
-
-def get_firstday_of_month(date):
-    """''
-    date format = "YYYY-MM-DD"
-    """
-
-    year, month, day = date.split("-")
-    year, month, day = int(year), int(month), int(day)
-
-    days = "01"
-    if int(month) < 10:
-        month = "0" + str(int(month))
-    arr = (year, month, days)
-    return "-".join("%s" % i for i in arr)
-
-
-def get_lastday_of_month(date):
-    """''
-    get the last day of month
-    date format = "YYYY-MM-DD"
-    """
-    year, month, day = date.split("-")
-    year, month, day = int(year), int(month), int(day)
-
-    days = calendar.monthrange(year, month)[1]
-    month = add_zero(month)
-    arr = (year, month, days)
-    return "-".join("%s" % i for i in arr)
-
-
-def get_firstday_month(month_offset=0):
-    """''
-    get the first day of month from today
-    month_offset is how many months
-    """
-    (y, m, d) = get_year_month_and_days(month_offset)
-    d = "01"
-    arr = (y, m, d)
-    return "-".join("%s" % i for i in arr)
-
-
-def get_lastday_month(month_offset=0):
-    """''
-    get the last day of month from today
-    month_offset is how many months
-    """
-    return "-".join("%s" % i for i in get_year_month_and_days(month_offset))
-
-
-def get_last_month(month_offset=0):
-    """''
-    get the last day of month from today
-    month_offset is how many months
-    """
-    return "-".join("%s" % i for i in get_year_month_and_days(month_offset)[:2])
-
-
-def get_year_month_and_days(month_offset=0):
-    """
-    @summary:
-    ---------
-    @param month_offset: 月份偏移量
-    ---------
-    @result: ('2019', '04', '30')
-    """
-
-    today = datetime.datetime.now()
-    year, month = today.year, today.month
-
-    this_year = int(year)
-    this_month = int(month)
-    total_month = this_month + month_offset
-    if month_offset >= 0:
-        if total_month <= 12:
-            days = str(get_days_of_month(this_year, total_month))
-            total_month = add_zero(total_month)
-            return (year, total_month, days)
-        else:
-            i = total_month // 12
-            j = total_month % 12
-            if j == 0:
-                i -= 1
-                j = 12
-            this_year += i
-            days = str(get_days_of_month(this_year, j))
-            j = add_zero(j)
-            return (str(this_year), str(j), days)
-    else:
-        if (total_month > 0) and (total_month < 12):
-            days = str(get_days_of_month(this_year, total_month))
-            total_month = add_zero(total_month)
-            return (year, total_month, days)
-        else:
-            i = total_month // 12
-            j = total_month % 12
-            if j == 0:
-                i -= 1
-                j = 12
-            this_year += i
-            days = str(get_days_of_month(this_year, j))
-            j = add_zero(j)
-            return (str(this_year), str(j), days)
-
-
-def add_zero(n):
-    return "%02d" % n
-
-
-def get_month(month_offset=0):
-    """''
-    get the date N months before or after today
-    if month_offset>0, the date N months after today
-    if month_offset<0, the date N months before today
-    date format = "YYYY-MM-DD"
-    """
-    today = datetime.datetime.now()
-    day = add_zero(today.day)
-
-    (y, m, d) = get_year_month_and_days(month_offset)
-    arr = (y, m, d)
-    if int(day) < int(d):
-        arr = (y, m, day)
-    return "-".join("%s" % i for i in arr)
-
-
-@run_safe_model("format_date")
-def format_date(date, old_format="", new_format="%Y-%m-%d %H:%M:%S"):
-    """
-    @summary: normalize a date string into another format
-    ---------
-    @param date: the date, e.g. 2017年4月17日 3时27分12秒
-    @param old_format: the original date format, e.g. '%Y年%m月%d日 %H时%M分%S秒'
-        %y two-digit year (00-99)
-        %Y four-digit year (000-9999)
-        %m month (01-12)
-        %d day of month (0-31)
-        %H hour, 24-hour clock (0-23)
-        %I hour, 12-hour clock (01-12)
-        %M minutes (00-59)
-        %S seconds (00-59)
-    @param new_format: the output date format
-    ---------
-    @result: the reformatted date as a string, e.g. 2017-4-17 03:27:12
-    """
-    if not date:
-        return ""
-
-    if not old_format:
-        regex = "(\d+)"
-        numbers = get_info(date, regex, allow_repeat=True)
-        formats = ["%Y", "%m", "%d", "%H", "%M", "%S"]
-        old_format = date
-        for i, number in enumerate(numbers[:6]):
-            if i == 0 and len(number) == 2:  # the year may be two digits, use lowercase %y
-                old_format = old_format.replace(
-                    number, formats[i].lower(), 1
-                )  # replace once, so that in '2017年11月30日 11:49' the month 11 does not clobber the hour 11
-            else:
-                old_format = old_format.replace(number, formats[i], 1)  # replace once
-
-    try:
-        date_obj = datetime.datetime.strptime(date, old_format)
-        if "T" in date and "Z" in date:
-            date_obj += datetime.timedelta(hours=8)
-            date_str = date_obj.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            date_str = datetime.datetime.strftime(date_obj, new_format)
-
-    except Exception as e:
-        log.error("日期格式化出错,old_format = %s 不符合 %s 格式" % (old_format, date))
-        date_str = date
-
-    return date_str
-
-
-def transform_lower_num(data_str: str):
-    num_map = {
-        "一": "1",
-        "二": "2",
-        "三": "3",
-        "四": "4",
-        "五": "5",
-        "六": "6",
-        "七": "7",
-        "八": "8",
-        "九": "9",
-        "十": "0",
-    }
-    pattern = f'[{"|".join(num_map.keys())}|零]'
-    res = re.search(pattern, data_str)
-    if not res:
-        #  no chinese numerals in the string, return it unchanged
-        return data_str
-
-    data_str = data_str.replace("0", "零")
-    for n in num_map:
-        data_str = data_str.replace(n, num_map[n])
-
-    re_data_str = re.findall("\d+", data_str)
-    for i in re_data_str:
-        if len(i) == 3:
-            new_i = i.replace("0", "")
-            data_str = data_str.replace(i, new_i, 1)
-        elif len(i) == 4:
-            new_i = i.replace("10", "")
-            data_str = data_str.replace(i, new_i, 1)
-        elif len(i) == 2 and int(i) < 10:
-            new_i = int(i) + 10
-            data_str = data_str.replace(i, str(new_i), 1)
-        elif len(i) == 1 and int(i) == 0:
-            new_i = int(i) + 10
-            data_str = data_str.replace(i, str(new_i), 1)
-
-    return data_str.replace("零", "0")
-
-
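-# Illustrative usage sketch (not part of the original module): Chinese numerals inside
-# relative dates are rewritten to Arabic digits before format_time parses them.
-def _example_transform_lower_num():
-    assert transform_lower_num("三天前") == "3天前"
-    assert transform_lower_num("十分钟前") == "10分钟前"
-    assert transform_lower_num("二十一小时前") == "21小时前"
-
-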
-@run_safe_model("format_time")
-def format_time(release_time, date_format="%Y-%m-%d %H:%M:%S"):
-    """
-    >>> format_time("2个月前")
-    '2021-08-15 16:24:21'
-    >>> format_time("2月前")
-    '2021-08-15 16:24:36'
-    """
-    release_time = transform_lower_num(release_time)
-    release_time = release_time.replace("日", "天").replace("/", "-")
-
-    if "年前" in release_time:
-        years = re.compile("(\d+)\s*年前").findall(release_time)
-        years_ago = datetime.datetime.now() - datetime.timedelta(
-            days=int(years[0]) * 365
-        )
-        release_time = years_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "月前" in release_time:
-        months = re.compile("(\d+)[\s个]*月前").findall(release_time)
-        months_ago = datetime.datetime.now() - datetime.timedelta(
-            days=int(months[0]) * 30
-        )
-        release_time = months_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "周前" in release_time:
-        weeks = re.compile("(\d+)\s*周前").findall(release_time)
-        weeks_ago = datetime.datetime.now() - datetime.timedelta(days=int(weeks[0]) * 7)
-        release_time = weeks_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "天前" in release_time:
-        ndays = re.compile("(\d+)\s*天前").findall(release_time)
-        days_ago = datetime.datetime.now() - datetime.timedelta(days=int(ndays[0]))
-        release_time = days_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "小时前" in release_time:
-        nhours = re.compile("(\d+)\s*小时前").findall(release_time)
-        hours_ago = datetime.datetime.now() - datetime.timedelta(hours=int(nhours[0]))
-        release_time = hours_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "分钟前" in release_time:
-        nminutes = re.compile("(\d+)\s*分钟前").findall(release_time)
-        minutes_ago = datetime.datetime.now() - datetime.timedelta(
-            minutes=int(nminutes[0])
-        )
-        release_time = minutes_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "前天" in release_time:
-        today = datetime.date.today()
-        yesterday = today - datetime.timedelta(days=2)
-        release_time = release_time.replace("前天", str(yesterday))
-
-    elif "昨天" in release_time:
-        today = datetime.date.today()
-        yesterday = today - datetime.timedelta(days=1)
-        release_time = release_time.replace("昨天", str(yesterday))
-
-    elif "今天" in release_time:
-        release_time = release_time.replace("今天", get_current_date("%Y-%m-%d"))
-
-    elif "刚刚" in release_time:
-        release_time = get_current_date()
-
-    elif re.search("^\d\d:\d\d", release_time):
-        release_time = get_current_date("%Y-%m-%d") + " " + release_time
-
-    elif not re.compile("\d{4}").findall(release_time):
-        month = re.compile("\d{1,2}").findall(release_time)
-        if month and int(month[0]) <= int(get_current_date("%m")):
-            release_time = get_current_date("%Y") + "-" + release_time
-        else:
-            release_time = str(int(get_current_date("%Y")) - 1) + "-" + release_time
-
-    # split the day and the hour when they are glued together
-    template = re.compile("(\d{4}-\d{1,2}-\d{2})(\d{1,2})")
-    release_time = re.sub(template, r"\1 \2", release_time)
-    release_time = format_date(release_time, new_format=date_format)
-
-    return release_time
-
-
-def to_date(date_str, date_format="%Y-%m-%d %H:%M:%S"):
-    return datetime.datetime.strptime(date_str, date_format)
-
-
-def get_before_date(
-    current_date,
-    days,
-    current_date_format="%Y-%m-%d %H:%M:%S",
-    return_date_format="%Y-%m-%d %H:%M:%S",
-):
-    """
-    @summary: get a date offset from a given date
-    ---------
-    @param current_date: the reference date, str
-    @param days: offset in days, -1 means the previous day, 1 the next day
-    @param return_date_format: the output date format
-    ---------
-    @result: string
-    """
-
-    current_date = to_date(current_date, current_date_format)
-    date_obj = current_date + datetime.timedelta(days=days)
-    return datetime.datetime.strftime(date_obj, return_date_format)
-
-
-def get_utcnow():
-    """utc时间"""
-    return datetime.datetime.utcnow()
-
-
-def delay_time(sleep_time=60):
-    """
-    @summary: sleep, one minute by default
-    ---------
-    @param sleep_time: in seconds
-    ---------
-    @result:
-    """
-
-    time.sleep(sleep_time)
-
-
-def format_seconds(seconds):
-    """
-    @summary: convert seconds into a days/hours/minutes/seconds string
-    ---------
-    @param seconds:
-    ---------
-    @result: 2天3小时2分49秒
-    """
-
-    seconds = int(seconds + 0.5)  # round to the nearest second
-
-    m, s = divmod(seconds, 60)
-    h, m = divmod(m, 60)
-    d, h = divmod(h, 24)
-
-    times = ""
-    if d:
-        times += "{}天".format(d)
-    if h:
-        times += "{}小时".format(h)
-    if m:
-        times += "{}分".format(m)
-    if s:
-        times += "{}秒".format(s)
-
-    return times
-
-
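-# Illustrative usage sketch (not part of the original module): seconds are rounded to the
-# nearest second and rendered as a days/hours/minutes/seconds string, omitting zero parts.
-def _example_format_seconds():
-    assert format_seconds(65) == "1分5秒"
-    assert format_seconds(3600) == "1小时"
-    assert format_seconds(90061) == "1天1小时1分1秒"
-
-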
-################################################
-def get_md5(*args):
-    """
-    @summary: get a unique 32-character md5
-    ---------
-    @param *args: the values combined into the digest
-    ---------
-    @result: 7c8684bcbdfcea6697650aa53d7b1405
-    """
-
-    m = hashlib.md5()
-    for arg in args:
-        m.update(str(arg).encode())
-
-    return m.hexdigest()
-
-
-def get_sha1(*args):
-    """
-    @summary: get a unique 40-character sha1, usable as a unique id
-    ---------
-    @param *args: the values combined into the digest
-    ---------
-    @result: ba4868b3f277c8e387b55d9e3d0be7c045cdd89e
-    """
-
-    sha1 = hashlib.sha1()
-    for arg in args:
-        sha1.update(str(arg).encode())
-    return sha1.hexdigest()  # 40位
-
-
-def get_base64(secret, message):
-    """
-    @summary: signature algorithm: "HMAC-SHA256"
-              reference: https://www.jokecamp.com/blog/examples-of-creating-base64-hashes-using-hmac-sha256-in-different-languages/
-    ---------
-    @param secret: secret key
-    @param message: message
-    ---------
-    @result: the signature, base64 encoded
-    """
-
-    import hashlib
-    import hmac
-    import base64
-
-    message = bytes(message, "utf-8")
-    secret = bytes(secret, "utf-8")
-
-    signature = base64.b64encode(
-        hmac.new(secret, message, digestmod=hashlib.sha256).digest()
-    ).decode("utf8")
-    return signature
-
-
-def get_uuid(key1="", key2=""):
-    """
-    @summary: compute a uuid
-    can combine two strings into one unique value, e.g. a domain and a news title, for a compound index
-    ---------
-    @param key1: str
-    @param key2: str
-    ---------
-    @result:
-    """
-
-    uuid_object = ""
-
-    if not key1 and not key2:
-        uuid_object = uuid.uuid1()
-    else:
-        hash = md5(bytes(key1, "utf-8") + bytes(key2, "utf-8")).digest()
-        uuid_object = uuid.UUID(bytes=hash[:16], version=3)
-
-    return str(uuid_object)
-
-
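-# Illustrative usage sketch (not part of the original module): with two keys the uuid is
-# derived from their md5, so the same pair always yields the same id; with no keys a
-# time-based uuid1 is generated instead.
-def _example_get_uuid():
-    a = get_uuid("news.example.com", "some title")  # illustrative values
-    b = get_uuid("news.example.com", "some title")
-    assert a == b and len(a) == 36
-    assert get_uuid() != get_uuid()  # uuid1-based ids are time dependent
-
-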
-def get_hash(text):
-    return hash(text)
-
-
-##################################################
-
-
-def cut_string(text, length):
-    """
-    @summary: split text into chunks of a given length
-    ---------
-    @param text: the text
-    @param length: chunk length
-    ---------
-    @result: the list of chunks
-    """
-
-    text_list = re.findall(".{%d}" % length, text, re.S)
-    leave_text = text[len(text_list) * length :]
-    if leave_text:
-        text_list.append(leave_text)
-
-    return text_list
-
-
-def get_random_string(length=1):
-    random_string = "".join(random.sample(string.ascii_letters + string.digits, length))
-    return random_string
-
-
-def get_random_password(length=8, special_characters=""):
-    """
-    @summary: create a random password, 8 characters by default, containing upper case, lower case and digits
-    ---------
-    @param length: password length, 8 by default
-    @param special_characters: special characters
-    ---------
-    @result: a password of the given length
-    """
-
-    while True:
-        random_password = "".join(
-            random.sample(
-                string.ascii_letters + string.digits + special_characters, length
-            )
-        )
-        if (
-            re.search("[0-9]", random_password)
-            and re.search("[A-Z]", random_password)
-            and re.search("[a-z]", random_password)
-        ):
-            if not special_characters:
-                break
-            elif set(random_password).intersection(special_characters):
-                break
-
-    return random_password
-
-
-def get_random_email(length=None, email_types: list = None, special_characters=""):
-    """
-    generate a random email address
-    :param length: length of the local part
-    :param email_types: email domains
-    :param special_characters: special characters
-    :return:
-    """
-    if not length:
-        length = random.randint(4, 12)
-    if not email_types:
-        email_types = [
-            "qq.com",
-            "163.com",
-            "gmail.com",
-            "yahoo.com",
-            "hotmail.com",
-            "yeah.net",
-            "126.com",
-            "139.com",
-            "sohu.com",
-        ]
-
-    email_body = get_random_password(length, special_characters)
-    email_type = random.choice(email_types)
-
-    email = email_body + "@" + email_type
-    return email
-
-
-#################################
-
-
-def dumps_obj(obj):
-    return pickle.dumps(obj)
-
-
-def loads_obj(obj_str):
-    return pickle.loads(obj_str)
-
-
-def get_method(obj, name):
-    name = str(name)
-    try:
-        return getattr(obj, name)
-    except AttributeError:
-        log.error("Method %r not found in: %s" % (name, obj))
-        return None
-
-
-def witch_workspace(project_path):
-    """
-    @summary:
-    ---------
-    @param project_path:
-    ---------
-    @result:
-    """
-
-    os.chdir(project_path)  # switch the working directory
-
-
-############### database helpers #######################
-def format_sql_value(value):
-    if isinstance(value, str):
-        value = value.strip()
-
-    elif isinstance(value, (list, dict)):
-        value = dumps_json(value, indent=None)
-
-    elif isinstance(value, (datetime.date, datetime.time)):
-        value = str(value)
-
-    elif isinstance(value, bool):
-        value = int(value)
-
-    return value
-
-
-def list2str(datas):
-    """
-    列表转字符串
-    :param datas: [1, 2]
-    :return: (1, 2)
-    """
-    data_str = str(tuple(datas))
-    data_str = re.sub(",\)$", ")", data_str)
-    return data_str
-
-
-def make_insert_sql(
-    table, data, auto_update=False, update_columns=(), insert_ignore=False
-):
-    """
-    @summary: targets mysql; for oracle, datetime values would need to_date handling (TODO)
-    ---------
-    @param table:
-    @param data: row data as a dict
-    @param auto_update: uses replace into, fully overwriting existing rows
-    @param update_columns: columns to update; when set, auto_update is ignored and the listed columns are updated on duplicate key
-    @param insert_ignore: skip rows that already exist
-    ---------
-    @result:
-    """
-
-    keys = ["`{}`".format(key) for key in data.keys()]
-    keys = list2str(keys).replace("'", "")
-
-    values = [format_sql_value(value) for value in data.values()]
-    values = list2str(values)
-
-    if update_columns:
-        if not isinstance(update_columns, (tuple, list)):
-            update_columns = [update_columns]
-        update_columns_ = ", ".join(
-            ["{key}=values({key})".format(key=key) for key in update_columns]
-        )
-        sql = (
-            "insert%s into `{table}` {keys} values {values} on duplicate key update %s"
-            % (" ignore" if insert_ignore else "", update_columns_)
-        )
-
-    elif auto_update:
-        sql = "replace into `{table}` {keys} values {values}"
-    else:
-        sql = "insert%s into `{table}` {keys} values {values}" % (
-            " ignore" if insert_ignore else ""
-        )
-
-    sql = sql.format(table=table, keys=keys, values=values).replace("None", "null")
-    return sql
-
-
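-# Illustrative usage sketch (not part of the original module): the statement produced for
-# a plain insert and for an upsert that only refreshes selected columns.
-def _example_make_insert_sql():
-    data = {"id": 1, "title": "hello"}
-    assert make_insert_sql("news", data) == (
-        "insert into `news` (`id`, `title`) values (1, 'hello')"
-    )
-    assert make_insert_sql("news", data, update_columns=["title"]) == (
-        "insert into `news` (`id`, `title`) values (1, 'hello')"
-        " on duplicate key update title=values(title)"
-    )
-
-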
-def make_update_sql(table, data, condition):
-    """
-    @summary: targets mysql; for oracle, datetime values would need to_date handling (TODO)
-    ---------
-    @param table:
-    @param data: row data as a dict
-    @param condition: where condition
-    ---------
-    @result:
-    """
-    key_values = []
-
-    for key, value in data.items():
-        value = format_sql_value(value)
-        if isinstance(value, str):
-            key_values.append("`{}`={}".format(key, repr(value)))
-        elif value is None:
-            key_values.append("`{}`={}".format(key, "null"))
-        else:
-            key_values.append("`{}`={}".format(key, value))
-
-    key_values = ", ".join(key_values)
-
-    sql = "update `{table}` set {key_values} where {condition}"
-    sql = sql.format(table=table, key_values=key_values, condition=condition)
-    return sql
-
-
-def make_batch_sql(
-    table, datas, auto_update=False, update_columns=(), update_columns_value=()
-):
-    """
-    @summary: build a batch sql statement
-    ---------
-    @param table:
-    @param datas: row data, [{...}]
-    @param auto_update: uses replace into, fully overwriting existing rows
-    @param update_columns: columns to update; when set, auto_update is ignored and the listed columns are updated on duplicate key
-    @param update_columns_value: values for the updated columns; defaults to the values in datas. note: string values need explicit single quotes, e.g. update_columns_value=("'test'",)
-    ---------
-    @result:
-    """
-    if not datas:
-        return
-
-    keys = list(datas[0].keys())
-    values_placeholder = ["%s"] * len(keys)
-
-    values = []
-    for data in datas:
-        value = []
-        for key in keys:
-            current_data = data.get(key)
-            current_data = format_sql_value(current_data)
-
-            value.append(current_data)
-
-        values.append(value)
-
-    keys = ["`{}`".format(key) for key in keys]
-    keys = list2str(keys).replace("'", "")
-
-    values_placeholder = list2str(values_placeholder).replace("'", "")
-
-    if update_columns:
-        if not isinstance(update_columns, (tuple, list)):
-            update_columns = [update_columns]
-        if update_columns_value:
-            update_columns_ = ", ".join(
-                [
-                    "`{key}`={value}".format(key=key, value=value)
-                    for key, value in zip(update_columns, update_columns_value)
-                ]
-            )
-        else:
-            update_columns_ = ", ".join(
-                ["`{key}`=values(`{key}`)".format(key=key) for key in update_columns]
-            )
-        sql = "insert into `{table}` {keys} values {values_placeholder} on duplicate key update {update_columns}".format(
-            table=table,
-            keys=keys,
-            values_placeholder=values_placeholder,
-            update_columns=update_columns_,
-        )
-    elif auto_update:
-        sql = "replace into `{table}` {keys} values {values_placeholder}".format(
-            table=table, keys=keys, values_placeholder=values_placeholder
-        )
-    else:
-        sql = "insert ignore into `{table}` {keys} values {values_placeholder}".format(
-            table=table, keys=keys, values_placeholder=values_placeholder
-        )
-
-    return sql, values
-
-
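-# Illustrative usage sketch (not part of the original module): make_batch_sql returns a
-# template with %s placeholders plus the row values, ready for cursor.executemany().
-def _example_make_batch_sql():
-    datas = [{"id": 1, "title": "a"}, {"id": 2, "title": "b"}]
-    sql, values = make_batch_sql("news", datas)
-    assert sql == "insert ignore into `news` (`id`, `title`) values (%s, %s)"
-    assert values == [[1, "a"], [2, "b"]]
-
-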
-############### json helpers #######################
-
-
-def key2underline(key: str, strict=True):
-    """
-    >>> key2underline("HelloWord")
-    'hello_word'
-    >>> key2underline("SHData", strict=True)
-    's_h_data'
-    >>> key2underline("SHData", strict=False)
-    'sh_data'
-    >>> key2underline("SHDataHi", strict=False)
-    'sh_data_hi'
-    >>> key2underline("SHDataHi", strict=True)
-    's_h_data_hi'
-    >>> key2underline("dataHi", strict=True)
-    'data_hi'
-    """
-    regex = "[A-Z]*" if not strict else "[A-Z]"
-    capitals = re.findall(regex, key)
-
-    if capitals:
-        for capital in capitals:
-            if not capital:
-                continue
-            if key.startswith(capital):
-                if len(capital) > 1:
-                    key = key.replace(
-                        capital, capital[:-1].lower() + "_" + capital[-1].lower(), 1
-                    )
-                else:
-                    key = key.replace(capital, capital.lower(), 1)
-            else:
-                if len(capital) > 1:
-                    key = key.replace(capital, "_" + capital.lower() + "_", 1)
-                else:
-                    key = key.replace(capital, "_" + capital.lower(), 1)
-
-    return key.strip("_")
-
-
-def key2hump(key):
-    """
-    convert snake_case to CamelCase
-    """
-    return key.title().replace("_", "")
-
-
-def format_json_key(json_data):
-    json_data_correct = {}
-    for key, value in json_data.items():
-        key = key2underline(key)
-        json_data_correct[key] = value
-
-    return json_data_correct
-
-
-def quick_to_json(text):
-    """
-    @summary: quickly convert headers copied from the browser into a dict
-    ---------
-    @param text:
-    ---------
-    @result:
-    """
-
-    contents = text.split("\n")
-    json = {}
-    for content in contents:
-        if content == "\n":
-            continue
-
-        content = content.strip()
-        regex = ["(:?.*?):(.*)", "(.*?):? +(.*)", "([^:]*)"]
-
-        result = get_info(content, regex)
-        result = result[0] if isinstance(result[0], tuple) else result
-        try:
-            json[result[0]] = eval(result[1].strip())
-        except:
-            json[result[0]] = result[1].strip()
-
-    return json
-
-
-##############################
-
-
-def print_pretty(object):
-    pprint(object)
-
-
-def print_params2json(url):
-    params_json = {}
-    params = url.split("?")[-1].split("&")
-    for param in params:
-        key_value = param.split("=", 1)
-        params_json[key_value[0]] = key_value[1]
-
-    print(dumps_json(params_json))
-
-
-def print_cookie2json(cookie_str_or_list):
-    if isinstance(cookie_str_or_list, str):
-        cookie_json = {}
-        cookies = cookie_str_or_list.split("; ")
-        for cookie in cookies:
-            name, value = cookie.split("=")
-            cookie_json[name] = value
-    else:
-        cookie_json = get_cookies_from_selenium_cookie(cookie_str_or_list)
-
-    print(dumps_json(cookie_json))
-
-
-###############################
-
-
-def flatten(x):
-    """flatten(sequence) -> list
-    Returns a single, flat list which contains all elements retrieved
-    from the sequence and all recursively contained sub-sequences
-    (iterables).
-    Examples:
-    >>> [1, 2, [3,4], (5,6)]
-    [1, 2, [3, 4], (5, 6)]
-    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
-    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
-    >>> flatten(["foo", "bar"])
-    ['foo', 'bar']
-    >>> flatten(["foo", ["baz", 42], "bar"])
-    ['foo', 'baz', 42, 'bar']
-    """
-    return list(iflatten(x))
-
-
-def iflatten(x):
-    """iflatten(sequence) -> iterator
-    Similar to ``.flatten()``, but returns iterator instead"""
-    for el in x:
-        if _is_listlike(el):
-            for el_ in flatten(el):
-                yield el_
-        else:
-            yield el
-
-
-def _is_listlike(x):
-    """
-    >>> _is_listlike("foo")
-    False
-    >>> _is_listlike(5)
-    False
-    >>> _is_listlike(b"foo")
-    False
-    >>> _is_listlike([b"foo"])
-    True
-    >>> _is_listlike((b"foo",))
-    True
-    >>> _is_listlike({})
-    True
-    >>> _is_listlike(set())
-    True
-    >>> _is_listlike((x for x in range(3)))
-    True
-    >>> _is_listlike(six.moves.xrange(5))
-    True
-    """
-    return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
-
-
-###################
-
-
-def re_def_supper_class(obj, supper_class):
-    """
-    redefine the parent class
-    @param obj: a class, e.g. for class A pass A, or an instance's a.__class__
-    @param supper_class: the parent class
-    @return:
-    """
-    obj.__bases__ = (supper_class,)
-
-
-###################
-freq_limit_record = {}
-
-
-def reach_freq_limit(rate_limit, *key):
-    """
-    rate limiting
-    :param rate_limit: window length in seconds
-    :param key: the key the rate limit applies to
-    :return: True / False
-    """
-    if rate_limit == 0:
-        return False
-
-    msg_md5 = get_md5(*key)
-    key = "rate_limit:{}".format(msg_md5)
-    try:
-        if get_redisdb().get(key):
-            return True
-
-        get_redisdb().set(key, time.time(), ex=rate_limit)
-    except redis.exceptions.ConnectionError as e:
-        # fall back to in-memory rate limiting
-        global freq_limit_record
-
-        if key not in freq_limit_record:
-            freq_limit_record[key] = time.time()
-            return False
-
-        if time.time() - freq_limit_record.get(key) < rate_limit:
-            return True
-        else:
-            freq_limit_record[key] = time.time()
-
-    return False
-
-
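-# Illustrative usage sketch (not part of the original module), assuming get_redisdb()
-# either reaches redis or raises ConnectionError so the in-memory fallback is used: the
-# first hit inside the window passes, repeats within the window are throttled.
-def _example_reach_freq_limit():
-    key = ("warning", "spider_a")  # illustrative key parts
-    if not reach_freq_limit(60, *key):
-        pass  # first hit in the 60s window: send the alert here
-    assert reach_freq_limit(60, *key)  # second hit inside the window is throttled
-
-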
-def wechat_warning(
-    message,
-    message_prefix=None,
-    rate_limit=None,
-    url=None,
-    user_phone=None,
-    all_users: bool = None,
-):
-    """企业微信报警"""
-
-    # 为了加载最新的配置
-    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
-    url = url or setting.WECHAT_WARNING_URL
-    user_phone = user_phone or setting.WECHAT_WARNING_PHONE
-    all_users = all_users if all_users is not None else setting.WECHAT_WARNING_ALL
-
-    if isinstance(user_phone, str):
-        user_phone = [user_phone] if user_phone else []
-
-    if all_users is True or not user_phone:
-        user_phone = ["@all"]
-
-    if not all([url, message]):
-        return
-
-    if reach_freq_limit(rate_limit, url, user_phone, message_prefix or message):
-        log.info("报警时间间隔过短,此次报警忽略。 内容 {}".format(message))
-        return
-
-    data = {
-        "msgtype": "text",
-        "text": {"content": message, "mentioned_mobile_list": user_phone},
-    }
-
-    headers = {"Content-Type": "application/json"}
-
-    try:
-        response = requests.post(
-            url, headers=headers, data=json.dumps(data).encode("utf8")
-        )
-        result = response.json()
-        response.close()
-        if result.get("errcode") == 0:
-            return True
-        else:
-            raise Exception(result.get("errmsg"))
-    except Exception as e:
-        log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
-        return False
-
-
-###################
-
-
-def make_item(cls, data: dict):
-    """提供Item类与原数据,快速构建Item实例
-    :param cls: Item类
-    :param data: 字典格式的数据
-    """
-    item = cls()
-    for key, val in data.items():
-        setattr(item, key, val)
-    return item
-
-
-###################
-
-
-def aio_wrap(loop=None, executor=None):
-    """
-    wrap a normal sync version of a function to an async version
-    """
-    outer_loop = loop
-    outer_executor = executor
-
-    def wrap(fn):
-        @wraps(fn)
-        async def run(*args, loop=None, executor=None, **kwargs):
-            if loop is None:
-                if outer_loop is None:
-                    loop = asyncio.get_event_loop()
-                else:
-                    loop = outer_loop
-            if executor is None:
-                executor = outer_executor
-            pfunc = partial(fn, *args, **kwargs)
-            return await loop.run_in_executor(executor, pfunc)
-
-        return run
-
-    return wrap
-
-
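-# Illustrative usage sketch (not part of the original module): a blocking function is
-# wrapped so it runs in the loop's default executor and can simply be awaited.
-def _example_aio_wrap():
-    @aio_wrap()
-    def blocking_add(a, b):
-        time.sleep(0.1)  # stand-in for blocking I/O
-        return a + b
-
-    assert asyncio.run(blocking_add(1, 2)) == 3
-
-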
-######### number ##########
-
-
-def ensure_int(n):
-    """
-    >>> ensure_int(None)
-    0
-    >>> ensure_int(False)
-    0
-    >>> ensure_int(12)
-    12
-    >>> ensure_int("72")
-    72
-    >>> ensure_int('')
-    0
-    >>> ensure_int('1')
-    1
-    """
-    if not n:
-        return 0
-    return int(n)
-
-
-def ensure_float(n):
-    """
-    >>> ensure_float(None)
-    0.0
-    >>> ensure_float(False)
-    0.0
-    >>> ensure_float(12)
-    12.0
-    >>> ensure_float("72")
-    72.0
-    """
-    if not n:
-        return 0.0
-    return float(n)
-
-
-def ensure_int64(n):
-    """
-    >>> ensure_int64(None)
-    0
-    >>> ensure_int64(False)
-    0
-    >>> ensure_int64(12)
-    12
-    >>> ensure_int64("72")
-    72
-    """
-    if not n:
-        return bson.int64.Int64(0)
-    return bson.int64.Int64(n)
-
-
-def import_cls(cls_info):
-    module, class_name = cls_info.rsplit(".", 1)
-    cls = importlib.import_module(module).__getattribute__(class_name)
-    return cls

+ 0 - 12
A数据处理/site_monitor/utils/webdriver/__init__.py

@@ -1,12 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/9/7 4:39 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-from .playwright_driver import PlaywrightDriver
-from .webdirver import InterceptRequest, InterceptResponse
-from .webdriver_pool import WebDriverPool

+ 0 - 300
A数据处理/site_monitor/utils/webdriver/playwright_driver.py

@@ -1,300 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/9/7 4:11 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import json
-import os
-import re
-from collections import defaultdict
-from typing import Union, List
-
-try:
-    from typing import Literal  # python >= 3.8
-except ImportError:  # python <3.8
-    from typing_extensions import Literal
-
-
-from playwright.sync_api import Page, BrowserContext, ViewportSize, ProxySettings
-from playwright.sync_api import Playwright, Browser
-from playwright.sync_api import Response
-from playwright.sync_api import sync_playwright
-
-from utils import tools
-from utils.log import logger as log
-from utils.webdriver.webdirver import *
-
-
-class PlaywrightDriver(WebDriver):
-    def __init__(
-        self,
-        *,
-        page_on_event_callback: dict = None,
-        storage_state_path: str = None,
-        driver_type: Literal["chromium", "firefox", "webkit"] = "webkit",
-        url_regexes: list = None,
-        save_all: bool = False,
-        **kwargs
-    ):
-        """
-
-        Args:
-            page_on_event_callback: callbacks for page.on() events, e.g. page_on_event_callback={"dialog": lambda dialog: dialog.accept()}
-            storage_state_path: path used to persist the browser storage state
-            driver_type: browser type: chromium, firefox or webkit
-            url_regexes: URLs to intercept; regular expressions, passed as a list
-            save_all: whether to keep every intercepted response; by default only the last one is kept
-            **kwargs:
-        """
-        super(PlaywrightDriver, self).__init__(**kwargs)
-        self.driver: Playwright = None
-        self.browser: Browser = None
-        self.context: BrowserContext = None
-        self.page: Page = None
-        self.url = None
-        self.storage_state_path = storage_state_path
-
-        self._driver_type = driver_type
-        self._page_on_event_callback = page_on_event_callback
-        self._url_regexes = url_regexes
-        self._save_all = save_all
-
-        if self._save_all and self._url_regexes:
-            log.warning(
-                "获取完拦截的数据后, 请主动调用PlaywrightDriver的clear_cache()方法清空拦截的数据,否则数据会一直累加,导致内存溢出"
-            )
-            self._cache_data = defaultdict(list)
-        else:
-            self._cache_data = {}
-
-        self._setup()
-
-    def _setup(self):
-        # Process the arguments
-        if self._proxy:
-            proxy = self._proxy() if callable(self._proxy) else self._proxy
-            proxy = self.format_context_proxy(proxy)
-        else:
-            proxy = None
-
-        user_agent = (
-            self._user_agent() if callable(self._user_agent) else self._user_agent
-        )
-
-        view_size = ViewportSize(
-            width=self._window_size[0], height=self._window_size[1]
-        )
-
-        # Initialize the browser objects
-        self.driver = sync_playwright().start()
-        self.browser = getattr(self.driver, self._driver_type).launch(
-            headless=self._headless,
-            # args=["--no-sandbox"],
-            proxy=proxy,
-            executable_path=self._executable_path,
-            downloads_path=self._download_path,
-        )
-
-        if self.storage_state_path and os.path.exists(self.storage_state_path):
-            self.context = self.browser.new_context(
-                user_agent=user_agent,
-                screen=view_size,
-                viewport=view_size,
-                proxy=proxy,
-                storage_state=self.storage_state_path,
-                ignore_https_errors=True
-            )
-        else:
-            self.context = self.browser.new_context(
-                user_agent=user_agent,
-                screen=view_size,
-                viewport=view_size,
-                proxy=proxy,
-                ignore_https_errors=True
-            )
-
-        if self._use_stealth_js:
-            path = os.path.join(os.path.dirname(__file__), "../js/stealth.min.js")
-            self.context.add_init_script(path=path)
-
-        self.page = self.context.new_page()
-        self.page.set_default_timeout(self._timeout * 1000)
-
-        if self._page_on_event_callback:
-            for event, callback in self._page_on_event_callback.items():
-                self.page.on(event, callback)
-
-        if self._url_regexes:
-            self.page.on("response", self.on_response)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_val:
-            log.error(exc_val)
-
-        self.quit()
-        return True
-
-    def format_context_proxy(self, proxy) -> ProxySettings:
-        """
-        Args:
-            proxy: username:password@ip:port / ip:port
-        Returns:
-            {
-                "server": "ip:port"
-                "username": username,
-                "password": password,
-            }
-            server: http://ip:port or socks5://ip:port. Short form ip:port is considered an HTTP proxy.
-        """
-
-        if "@" in proxy:
-            certification, _proxy = proxy.split("@")
-            username, password = certification.split(":")
-
-            context_proxy = ProxySettings(
-                server=_proxy,
-                username=username,
-                password=password,
-            )
-        else:
-            context_proxy = ProxySettings(server=proxy)
-
-        return context_proxy
-
-    def save_storage_stage(self):
-        if self.storage_state_path:
-            os.makedirs(os.path.dirname(self.storage_state_path), exist_ok=True)
-            self.context.storage_state(path=self.storage_state_path)
-
-    def quit(self):
-        self.page.close()
-        self.context.close()
-        self.browser.close()
-        self.driver.stop()
-
-    @property
-    def domain(self):
-        return tools.get_domain(self.url or self.page.url)
-
-    @property
-    def cookies(self):
-        cookies_json = {}
-        for cookie in self.page.context.cookies():
-            cookies_json[cookie["name"]] = cookie["value"]
-
-        return cookies_json
-
-    @cookies.setter
-    def cookies(self, val: Union[dict, List[dict]]):
-        """
-        Set cookies
-        Args:
-            val: List[{name: str, value: str, url: Union[str, NoneType], domain: Union[str, NoneType], path: Union[str, NoneType], expires: Union[float, NoneType], httpOnly: Union[bool, NoneType], secure: Union[bool, NoneType], sameSite: Union["Lax", "None", "Strict", NoneType]}]
-
-        Returns:
-
-        """
-        if isinstance(val, list):
-            self.page.context.add_cookies(val)
-        else:
-            cookies = []
-            for key, value in val.items():
-                cookies.append(
-                    {"name": key, "value": value, "url": self.url or self.page.url}
-                )
-            self.page.context.add_cookies(cookies)
-
-    @property
-    def user_agent(self):
-        return self.page.evaluate("() => navigator.userAgent")
-
-    def on_response(self, response: Response):
-        for regex in self._url_regexes:
-            if re.search(regex, response.request.url):
-                intercept_request = InterceptRequest(
-                    url=response.request.url,
-                    headers=response.request.headers,
-                    data=response.request.post_data,
-                )
-
-                intercept_response = InterceptResponse(
-                    request=intercept_request,
-                    url=response.url,
-                    headers=response.headers,
-                    content=response.body(),
-                    status_code=response.status,
-                )
-                if self._save_all:
-                    self._cache_data[regex].append(intercept_response)
-                else:
-                    self._cache_data[regex] = intercept_response
-
-    def get_response(self, url_regex) -> InterceptResponse:
-        if self._save_all:
-            response_list = self._cache_data.get(url_regex)
-            if response_list:
-                return response_list[-1]
-        return self._cache_data.get(url_regex)
-
-    def get_all_response(self, url_regex) -> List[InterceptResponse]:
-        """
-        Get all matching responses; only effective when save_all=True
-        Args:
-            url_regex:
-
-        Returns:
-
-        """
-        response_list = self._cache_data.get(url_regex, [])
-        if not isinstance(response_list, list):
-            return [response_list]
-        return response_list
-
-    def get_text(self, url_regex):
-        return (
-            self.get_response(url_regex).content.decode()
-            if self.get_response(url_regex)
-            else None
-        )
-
-    def get_all_text(self, url_regex):
-        """
-        Get the text of all matching responses; only effective when save_all=True
-        Args:
-            url_regex:
-
-        Returns:
-
-        """
-        return [
-            response.content.decode() for response in self.get_all_response(url_regex)
-        ]
-
-    def get_json(self, url_regex):
-        return (
-            json.loads(self.get_text(url_regex))
-            if self.get_response(url_regex)
-            else None
-        )
-
-    def get_all_json(self, url_regex):
-        """
-        Get the JSON of all matching responses; only effective when save_all=True
-        Args:
-            url_regex:
-
-        Returns:
-
-        """
-        return [json.loads(text) for text in self.get_all_text(url_regex)]
-
-    def clear_cache(self):
-        self._cache_data = defaultdict(list)
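
The deleted PlaywrightDriver bundles browser start-up, proxy/user-agent handling, and response interception behind one context manager. A minimal usage sketch, assuming the package and its playwright dependency are still installed; the target URL and the regex are illustrative placeholders, not part of the original code:

    from utils.webdriver import PlaywrightDriver

    with PlaywrightDriver(
        driver_type="chromium",
        headless=True,
        url_regexes=[r"/api/list"],  # intercept responses whose URL matches this pattern
    ) as driver:
        driver.page.goto("https://example.com")
        driver.page.wait_for_load_state("networkidle")

        # Only the last matching response is kept unless save_all=True was passed.
        data = driver.get_json(r"/api/list")
        print(data)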

+ 0 - 81
A数据处理/site_monitor/utils/webdriver/webdirver.py

@@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/9/7 4:27 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import abc
-
-import setting as setting
-
-
-class InterceptRequest:
-    def __init__(self, url, data, headers):
-        self.url = url
-        self.data = data
-        self.headers = headers
-
-
-class InterceptResponse:
-    def __init__(self, request: InterceptRequest, url, headers, content, status_code):
-        self.request = request
-        self.url = url
-        self.headers = headers
-        self.content = content
-        self.status_code = status_code
-
-
-class WebDriver:
-    def __init__(
-        self,
-        load_images=True,
-        user_agent=None,
-        proxy=None,
-        headless=False,
-        driver_type=None,
-        timeout=16,
-        window_size=(1024, 800),
-        executable_path=None,
-        custom_argument=None,
-        download_path=None,
-        auto_install_driver=True,
-        use_stealth_js=True,
-        **kwargs,
-    ):
-        """
-        webdriver wrapper; supports chrome, phantomjs and firefox
-        Args:
-            load_images: whether to load images
-            user_agent: a string, or a no-argument callable that returns the user agent
-            proxy: xxx.xxx.xxx.xxx:xxxx, or a no-argument callable that returns the proxy address
-            headless: whether to run in headless mode
-            driver_type: CHROME, PHANTOMJS or FIREFOX
-            timeout: request timeout
-            window_size: window size
-            executable_path: browser executable path; defaults to the default location
-            custom_argument: custom arguments passed to webdriver.Chrome(options=chrome_options, **kwargs)
-            download_path: directory for downloaded files; if set, the "keep"/"discard" prompt no longer appears (Chrome only)
-            auto_install_driver: automatically download the browser driver; supports chrome and firefox
-            use_stealth_js: use stealth.min.js to hide browser fingerprint features
-            **kwargs:
-        """
-        self._load_images = load_images
-        self._user_agent = user_agent or setting.DEFAULT_USERAGENT
-        self._proxy = proxy
-        self._headless = headless
-        self._timeout = timeout
-        self._window_size = window_size
-        self._executable_path = executable_path
-        self._custom_argument = custom_argument
-        self._download_path = download_path
-        self._auto_install_driver = auto_install_driver
-        self._use_stealth_js = use_stealth_js
-        self._driver_type = driver_type
-        self._kwargs = kwargs
-
-    @abc.abstractmethod
-    def quit(self):
-        pass
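
The WebDriver base class above only stores configuration and leaves quit() abstract. A hypothetical subclass sketch showing how the stored settings are meant to be consumed; DummyDriver and its open() method are illustrative only and do not exist in the original code:

    from utils.webdriver.webdirver import WebDriver


    class DummyDriver(WebDriver):
        # Hypothetical subclass: resolves the stored settings the same way
        # PlaywrightDriver._setup() does, but only prints them.
        def open(self, url):
            ua = self._user_agent() if callable(self._user_agent) else self._user_agent
            print(f"GET {url} ua={ua} proxy={self._proxy} headless={self._headless}")

        def quit(self):
            print("driver closed")


    driver = DummyDriver(user_agent="Mozilla/5.0", proxy="127.0.0.1:8888", headless=True)
    driver.open("https://example.com")
    driver.quit()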

+ 0 - 115
A数据处理/site_monitor/utils/webdriver/webdriver_pool.py

@@ -1,115 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/3/18 4:59 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import queue
-import threading
-
-from utils.log import logger as log
-from utils.tools import Singleton
-from utils.webdriver.playwright_driver import PlaywrightDriver
-
-
-@Singleton
-class WebDriverPool:
-    def __init__(
-        self, pool_size=5, driver_cls=PlaywrightDriver, thread_safe=False, **kwargs
-    ):
-        """
-
-        Args:
-            pool_size: size of the driver pool
-            driver_cls: the driver class
-            thread_safe: whether the pool is thread-safe
-                if True, each thread owns its own driver, pool_size is ignored, and the number of drivers equals the number of threads
-                if False, each thread takes a driver from the shared pool
-            **kwargs:
-        """
-        self.pool_size = pool_size
-        self.driver_cls = driver_cls
-        self.thread_safe = thread_safe
-        self.kwargs = kwargs
-
-        self.queue = queue.Queue(maxsize=pool_size)
-        self.lock = threading.RLock()
-        self.driver_count = 0
-        self.ctx = threading.local()
-
-    @property
-    def driver(self):
-        if not hasattr(self.ctx, "driver"):
-            self.ctx.driver = None
-        return self.ctx.driver
-
-    @driver.setter
-    def driver(self, driver):
-        self.ctx.driver = driver
-
-    @property
-    def is_full(self):
-        return self.driver_count >= self.pool_size
-
-    def create_driver(self, user_agent: str = None, proxy: str = None):
-        kwargs = self.kwargs.copy()
-        if user_agent:
-            kwargs["user_agent"] = user_agent
-        if proxy:
-            kwargs["proxy"] = proxy
-        return self.driver_cls(**kwargs)
-
-    def get(self, user_agent: str = None, proxy: str = None):
-        """
-        Get a webdriver from the pool
-        When a new webdriver instance is created, the user_agent and proxy arguments are used to build it
-        Args:
-            user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
-            proxy: xxx.xxx.xxx.xxx
-        Returns:
-
-        """
-        if not self.is_full and not self.thread_safe:
-            with self.lock:
-                if not self.is_full:
-                    driver = self.create_driver(user_agent, proxy)
-                    self.queue.put(driver)
-                    self.driver_count += 1
-        elif self.thread_safe:
-            if not self.driver:
-                driver = self.create_driver(user_agent, proxy)
-                self.driver = driver
-                self.driver_count += 1
-
-        if self.thread_safe:
-            driver = self.driver
-        else:
-            driver = self.queue.get()
-
-        return driver
-
-    def put(self, driver):
-        if not self.thread_safe:
-            self.queue.put(driver)
-
-    def remove(self, driver):
-        if self.thread_safe:
-            if self.driver:
-                self.driver.quit()
-                self.driver = None
-        else:
-            driver.quit()
-        self.driver_count -= 1
-
-    def close(self):
-        if self.thread_safe:
-            log.info("暂不支持关闭需线程安全的driver")
-
-        while not self.queue.empty():
-            driver = self.queue.get()
-            driver.quit()
-            self.driver_count -= 1
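
The deleted WebDriverPool hands out PlaywrightDriver instances either from a bounded queue or one per thread. A minimal usage sketch for the non-thread-safe mode, assuming the utils package is still importable; the URL is an illustrative placeholder:

    from utils.webdriver import WebDriverPool

    # Extra keyword arguments are forwarded to the PlaywrightDriver constructor.
    pool = WebDriverPool(pool_size=2, headless=True)

    driver = pool.get()          # drivers are created lazily, up to pool_size
    try:
        driver.page.goto("https://example.com")
        print(driver.cookies)
    finally:
        pool.put(driver)         # return the driver so other callers can reuse it

    pool.close()                 # quit every pooled driver when done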

Some files were not shown because too many files changed in this diff