Browse Source

Remove redundant crawler framework code

dzr 1 month ago
parent
commit
8fff78da8c
100 changed files with 0 additions and 17962 deletions
  1. +0 -10  spider_frame/FworkSpider/MANIFEST.in
  2. +0 -9  spider_frame/FworkSpider/README.md
  3. +0 -0  spider_frame/FworkSpider/__init__.py
  4. +0 -54  spider_frame/FworkSpider/crawl_func/PYCCS_cookies.py
  5. +0 -59  spider_frame/FworkSpider/crawl_func/YunSuoAutoJump.py
  6. +0 -0  spider_frame/FworkSpider/crawl_func/__init__.py
  7. +0 -129  spider_frame/FworkSpider/crawl_func/ali_slide_verify.py
  8. +0 -99  spider_frame/FworkSpider/crawl_func/jsl_5s.py
  9. +0 -83  spider_frame/FworkSpider/crawl_func/jsl_clearance_s.py
  10. +0 -1  spider_frame/FworkSpider/feapder/VERSION
  11. +0 -45  spider_frame/FworkSpider/feapder/__init__.py
  12. +0 -9  spider_frame/FworkSpider/feapder/buffer/__init__.py
  13. +0 -213  spider_frame/FworkSpider/feapder/buffer/heartbeat_buffer.py
  14. +0 -10  spider_frame/FworkSpider/feapder/buffer/item_buffer/__init__.py
  15. +0 -474  spider_frame/FworkSpider/feapder/buffer/item_buffer/item_buffer.py
  16. +0 -469  spider_frame/FworkSpider/feapder/buffer/item_buffer/jy_item_buffer.py
  17. +0 -150  spider_frame/FworkSpider/feapder/buffer/request_buffer.py
  18. +0 -0  spider_frame/FworkSpider/feapder/commands/__init__.py
  19. +0 -45  spider_frame/FworkSpider/feapder/commands/cmdline.py
  20. +0 -21  spider_frame/FworkSpider/feapder/commands/create/__init__.py
  21. +0 -48  spider_frame/FworkSpider/feapder/commands/create/create_cookies.py
  22. +0 -30  spider_frame/FworkSpider/feapder/commands/create/create_init.py
  23. +0 -165  spider_frame/FworkSpider/feapder/commands/create/create_item.py
  24. +0 -52  spider_frame/FworkSpider/feapder/commands/create/create_json.py
  25. +0 -51  spider_frame/FworkSpider/feapder/commands/create/create_params.py
  26. +0 -52  spider_frame/FworkSpider/feapder/commands/create/create_project.py
  27. +0 -27  spider_frame/FworkSpider/feapder/commands/create/create_setting.py
  28. +0 -107  spider_frame/FworkSpider/feapder/commands/create/create_spider.py
  29. +0 -135  spider_frame/FworkSpider/feapder/commands/create/create_table.py
  30. +0 -117  spider_frame/FworkSpider/feapder/commands/create_builder.py
  31. +0 -93  spider_frame/FworkSpider/feapder/commands/shell.py
  32. +0 -0  spider_frame/FworkSpider/feapder/core/__init__.py
  33. +0 -111  spider_frame/FworkSpider/feapder/core/base_parser.py
  34. +0 -127  spider_frame/FworkSpider/feapder/core/collector.py
  35. +0 -96  spider_frame/FworkSpider/feapder/core/handle_failed_items.py
  36. +0 -71  spider_frame/FworkSpider/feapder/core/handle_failed_requests.py
  37. +0 -1064  spider_frame/FworkSpider/feapder/core/parser_control.py
  38. +0 -416  spider_frame/FworkSpider/feapder/core/scheduler.py
  39. +0 -27  spider_frame/FworkSpider/feapder/core/spiders/__init__.py
  40. +0 -128  spider_frame/FworkSpider/feapder/core/spiders/air_spider.py
  41. +0 -274  spider_frame/FworkSpider/feapder/core/spiders/spider.py
  42. +0 -9  spider_frame/FworkSpider/feapder/db/__init__.py
  43. +0 -40  spider_frame/FworkSpider/feapder/db/memory_db.py
  44. +0 -427  spider_frame/FworkSpider/feapder/db/mongodb.py
  45. +0 -381  spider_frame/FworkSpider/feapder/db/mysqldb.py
  46. +0 -511  spider_frame/FworkSpider/feapder/db/rabbitMq.py
  47. +0 -848  spider_frame/FworkSpider/feapder/db/redisdb.py
  48. +0 -140  spider_frame/FworkSpider/feapder/dedup/README.md
  49. +0 -177  spider_frame/FworkSpider/feapder/dedup/__init__.py
  50. +0 -41  spider_frame/FworkSpider/feapder/dedup/basefilter.py
  51. +0 -143  spider_frame/FworkSpider/feapder/dedup/bitarray.py
  52. +0 -373  spider_frame/FworkSpider/feapder/dedup/bloomfilter.py
  53. +0 -81  spider_frame/FworkSpider/feapder/dedup/expirefilter.py
  54. +0 -70  spider_frame/FworkSpider/feapder/dedup/litefilter.py
  55. +0 -138  spider_frame/FworkSpider/feapder/dedup/redisfilter.py
  56. +0 -0  spider_frame/FworkSpider/feapder/network/__init__.py
  57. +0 -786  spider_frame/FworkSpider/feapder/network/cookie_pool.py
  58. +0 -273  spider_frame/FworkSpider/feapder/network/item.py
  59. +0 -723  spider_frame/FworkSpider/feapder/network/proxy_pool.py
  60. +0 -557  spider_frame/FworkSpider/feapder/network/request.py
  61. +0 -356  spider_frame/FworkSpider/feapder/network/response.py
  62. +0 -155  spider_frame/FworkSpider/feapder/network/selector.py
  63. +0 -389  spider_frame/FworkSpider/feapder/network/user_agent.py
  64. +0 -56  spider_frame/FworkSpider/feapder/pipelines/__init__.py
  65. +0 -47  spider_frame/FworkSpider/feapder/pipelines/console_pipeline.py
  66. +0 -97  spider_frame/FworkSpider/feapder/pipelines/mongo_pipeline.py
  67. +0 -74  spider_frame/FworkSpider/feapder/pipelines/mysql_pipeline.py
  68. +0 -60  spider_frame/FworkSpider/feapder/pipelines/rabbitmq_pipeline.py
  69. +0 -45  spider_frame/FworkSpider/feapder/pipelines/redis_pipeline.py
  70. +0 -17  spider_frame/FworkSpider/feapder/requirements.txt
  71. +0 -204  spider_frame/FworkSpider/feapder/setting.py
  72. +0 -22  spider_frame/FworkSpider/feapder/templates/air_spider_template.tmpl
  73. +0 -121  spider_frame/FworkSpider/feapder/templates/detail_template.tmpl
  74. +0 -22  spider_frame/FworkSpider/feapder/templates/item_template.tmpl
  75. +0 -146  spider_frame/FworkSpider/feapder/templates/njpc_detail_template.tmpl
  76. +0 -88  spider_frame/FworkSpider/feapder/templates/njpc_list_template.tmpl
  77. +0 -49  spider_frame/FworkSpider/feapder/templates/project_template/CHECK_DATA.md
  78. +0 -8  spider_frame/FworkSpider/feapder/templates/project_template/README.md
  79. +0 -0  spider_frame/FworkSpider/feapder/templates/project_template/items/__init__.py
  80. +0 -44  spider_frame/FworkSpider/feapder/templates/project_template/main.py
  81. +0 -137  spider_frame/FworkSpider/feapder/templates/project_template/setting.py
  82. +0 -0  spider_frame/FworkSpider/feapder/templates/project_template/spiders/__init__.py
  83. +0 -88  spider_frame/FworkSpider/feapder/templates/spider_list_template.tmpl
  84. +0 -108  spider_frame/FworkSpider/feapder/templates/spider_template.tmpl
  85. +0 -9  spider_frame/FworkSpider/feapder/utils/__init__.py
  86. +0 -63  spider_frame/FworkSpider/feapder/utils/custom_argparse.py
  87. +0 -93  spider_frame/FworkSpider/feapder/utils/email_sender.py
  88. +0 -6  spider_frame/FworkSpider/feapder/utils/js/stealth.min.js
  89. +0 -278  spider_frame/FworkSpider/feapder/utils/log.py
  90. +0 -539  spider_frame/FworkSpider/feapder/utils/metrics.py
  91. +0 -94  spider_frame/FworkSpider/feapder/utils/perfect_dict.py
  92. +0 -121  spider_frame/FworkSpider/feapder/utils/redis_lock.py
  93. +0 -2683  spider_frame/FworkSpider/feapder/utils/tools.py
  94. +0 -442  spider_frame/FworkSpider/feapder/utils/webdriver.py
  95. +0 -0  spider_frame/FworkSpider/items/__init__.py
  96. +0 -196  spider_frame/FworkSpider/items/njpc_item.py
  97. +0 -160  spider_frame/FworkSpider/items/spider_item.py
  98. +0 -25  spider_frame/FworkSpider/requirements.txt
  99. +0 -129  spider_frame/FworkSpider/setting.py
  100. +0 -2  spider_frame/FworkSpider/setup.cfg

+ 0 - 10
spider_frame/FworkSpider/MANIFEST.in

@@ -1,10 +0,0 @@
-include README.md
-include LICENSE
-
-include feapder/requirements.txt
-include feapder/VERSION
-
-recursive-include feapder/utils/js *
-recursive-include feapder/templates *
-
-global-exclude __pycache__ *.py[cod]

+ 0 - 9
spider_frame/FworkSpider/README.md

@@ -1,9 +0,0 @@
-# Notes
-    1. python installation
-        Environment version: python3.8.10
-
-    2. nodejs installation
-        No special notes
-
-    3. firefox + geckodriver installation
-        Firefox 78.14 is recommended; locally, use a Firefox build and driver earlier than version 88 to avoid inconsistent results

+ 0 - 0
spider_frame/FworkSpider/__init__.py


+ 0 - 54
spider_frame/FworkSpider/crawl_func/PYCCS_cookies.py

@@ -1,54 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-09-08 
----------
-@summary: browser check before entry (PYCCS)
----------
-@author: Lzz
-"""
-import re
-
-import execjs
-import requests
-
-
-def get_PYCCS_ck(url, headers, proxies=False):
-    session = requests.session()
-    session.proxies = proxies
-    ex_js = '''
-    function get_ck(a,b,c) {
-        var x08c924 = parseInt(a);
-        x08c924 = x08c924 * parseInt(b);
-        x08c924 = x08c924 + parseInt(c);
-        x08c924 = (x08c924 * 0x3 + 0x7);
-        if (x08c924 < 0x7b)
-            x08c924 = x08c924 + 0x929;
-        if (x08c924 > 0x929)
-            x08c924 = Math['floor'](x08c924 / 0x7b);
-        return x08c924
-    }
-    '''
-    ctx = execjs.compile(ex_js)
-    count = 0
-    while count < 3:
-        try:
-            res = session.get(url, headers=headers, timeout=60,verify=False)
-
-            pm_data = "".join(re.findall('\|function\|(.*?)\|version\|',res.text,re.S)).split('|')
-
-            answer = ctx.call('get_ck',pm_data[1],pm_data[3],pm_data[-1])
-
-            data = {
-                "answer": f"{answer}"
-            }
-            resp = session.post(url.split('?')[0], headers=headers, data=data,timeout=60,verify=False)
-            cookies = session.cookies.get_dict()
-
-            if re.findall('\|function\|(.*?)\|version\|',resp.text,re.S):
-            print(f"request parse error! retrying, attempt {count}")
-                count += 1
-            else:
-                return cookies
-        except:
-            print("failed to obtain cookies_PYCCS!")
-            return {}
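The get_ck challenge above is plain integer arithmetic, so the execjs round-trip can be avoided entirely. A minimal pure-Python port of the deleted JavaScript (the sample tokens are hypothetical):

import math

# Pure-Python port of the get_ck JavaScript: multiply, add, scale, then
# clamp the result into the expected range, mirroring the JS step by step.
def get_ck(a, b, c):
    x = int(a) * int(b) + int(c)
    x = x * 3 + 7
    if x < 0x7B:           # 123
        x += 0x929         # 2345
    if x > 0x929:
        x = math.floor(x / 0x7B)
    return x

print(get_ck("12", "34", "56"))  # hypothetical tokens -> 1399, posted back as "answer"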

+ 0 - 59
spider_frame/FworkSpider/crawl_func/YunSuoAutoJump.py

@@ -1,59 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-09-12 
----------
-@summary: cookies -> security_session_mid_verify
----------
-@author: Lzz
-"""
-import time
-
-import execjs
-import requests
-
-
-def get_mid_code(security_verify_data_url, proxies=False):
-
-    session = requests.session()
-    session.proxies = proxies
-
-    headers = {
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-        "Accept-Language": "zh-CN,zh;q=0.9",
-        "Cache-Control": "no-cache",
-        "Connection": "keep-alive",
-        "Pragma": "no-cache",
-        "Upgrade-Insecure-Requests": "1",
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
-    }
-    res = session.get(security_verify_data_url, headers=headers, timeout=60, verify=False)
-
-    ex_js = '''
-    function YunSuoAutoJump(url) {
-        function stringToHex(str) {
-            var val = "";
-            for (var i = 0; i < str.length; i++) {
-                if (val == "") val = str.charCodeAt(i).toString(16); else val += str.charCodeAt(i).toString(16);
-            }
-            return val;
-        }
-        var width = 1536;
-        var height = 864;
-        var screendate = width + "," + height;
-        location = url + "?security_verify_data=" + stringToHex(screendate);
-        return location
-    }
-    '''
-    ctx = execjs.compile(ex_js)
-    yz_url = ctx.call("YunSuoAutoJump",security_verify_data_url)
-
-    num = 0
-    cookies = {}
-    while num < 10:
-        response = session.get(yz_url, headers=headers, timeout=60, verify=False)
-        cookies = session.cookies.get_dict()
-        if cookies.get('security_session_mid_verify'):
-            break
-        num += 1
-        time.sleep(2)
-    return cookies
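The deleted JavaScript only hex-encodes a fake screen resolution and appends it as the security_verify_data query parameter, which is easy to reproduce without execjs. A minimal pure-Python sketch (the URL is a placeholder):

# Pure-Python equivalent of the deleted YunSuoAutoJump JavaScript:
# hex-encode "width,height" exactly like stringToHex does.
def yunsuo_verify_url(url, width=1536, height=864):
    screen = f"{width},{height}"
    hex_data = "".join(format(ord(ch), "x") for ch in screen)
    return f"{url}?security_verify_data={hex_data}"

print(yunsuo_verify_url("https://example.com/list"))  # ...security_verify_data=313533362c383634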

+ 0 - 0
spider_frame/FworkSpider/crawl_func/__init__.py


+ 0 - 129
spider_frame/FworkSpider/crawl_func/ali_slide_verify.py

@@ -1,129 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-08-17 
----------
-@summary: Ali slider verification, acw_sc__v3 / acw_sc__v2
----------
-@author: Lzz
-"""
-import re
-
-import execjs
-from selenium.webdriver import ActionChains
-
-from feapder.network.cookie_pool import PageCookiePool
-from feapder.utils.log import log
-from feapder.utils.webdriver import WebDriver
-
-
-def get_acw_sc_v2(html):
-    try:
-        arg1 = "".join(re.findall("arg1='(.*?)'", html))
-        if arg1:
-            js_script = '''
-                function getAcw_sc__v2(obt_arg1) {
-                    String["prototype"]["hexXor"] = function (_0x4e08d8) {
-                        var _0x5a5d3b = '';
-                        for (var _0xe89588 = 0x0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
-                            var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
-                            var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
-                            var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
-                            if (_0x189e2c["length"] == 1) {
-                                _0x189e2c = '0' + _0x189e2c;
-                            }
-                            _0x5a5d3b += _0x189e2c;
-                        }
-                        return _0x5a5d3b;
-                    };
-                    String["prototype"]["unsbox"] = function () {
-                        var _0x4b082b = [15, 35,29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
-                        var _0x4da0dc = [];
-                        var _0x12605e = '';
-                        for (var _0x20a7bf = 0x0; _0x20a7bf < this["length"]; _0x20a7bf++) {
-                            var _0x385ee3 = this[_0x20a7bf];
-                            for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
-                                if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
-                                    _0x4da0dc[_0x217721] = _0x385ee3;
-                                }
-                            }
-                        }
-                        _0x12605e = _0x4da0dc["join"]('');
-                        return _0x12605e;
-                    };
-    
-                    var _0x5e8b26 = "3000176000856006061501533003690027800375";
-                    // var arg1 = "0A5F01F50F9BC66FB28038F18B99B7B10CFF4667"
-                    var arg1 = obt_arg1
-                    var _0x23a392 = arg1["unsbox"]();
-                    arg2 = _0x23a392["hexXor"](_0x5e8b26);
-                    return arg2
-                }
-            '''
-            ctx = execjs.compile(js_script)
-            arg2 = ctx.call('getAcw_sc__v2', arg1)
-            return {"acw_sc__v2": arg2}
-        else:
-            return {}
-    except:
-        return {}
-
-
-class WebCookiePool(PageCookiePool):
-    def __init__(self, redis_key, page_url=None, proxies=None, **kwargs):
-        super(WebCookiePool, self).__init__(redis_key, page_url=None,
-                                           min_cookies=10000, must_contained_keys=(), keep_alive=False, **kwargs)
-        self.page_url = page_url
-        self.proxies = proxies
-        self._kwargs = kwargs
-        self._kwargs.setdefault("load_images", False)
-        self._kwargs.setdefault("headless", True)
-        self._kwargs.setdefault("usages_local_driver", True)  # whether to load the local driver
-        # self._kwargs.setdefault("executable_path", "D:\\geckodriver.exe")
-        self._kwargs.setdefault("driver_type", "FIREFOX")
-        self._kwargs.setdefault("proxy", proxies)
-
-
-    def create_cookie(self):
-
-        with WebDriver(**self._kwargs) as driver_pool:
-            import time
-            try:
-                cookies = {}
-                js = "return navigator.userAgent"
-                driver_pool.execute_script(js)
-                driver_pool.get(self.page_url)
-                time.sleep(5)
-
-                arg2 = get_acw_sc_v2(driver_pool.page_source)
-                if arg2:
-                    cookies.update(arg2)
-
-                for i in range(3):
-                    try:
-                        slider = driver_pool.find_element_by_xpath("//span[contains(@class, 'nc_iconfont btn_slide')]")
-                        if slider.is_displayed():
-                            ActionChains(driver_pool).click_and_hold(on_element=slider).perform()
-                            ActionChains(driver_pool).move_by_offset(xoffset=252, yoffset=0).perform()
-                            ActionChains(driver_pool).pause(1).release().perform()
-
-                            cookies.update(driver_pool.cookies)
-                    except Exception as e:
-                        log.info(e)
-                    time.sleep(2)
-
-                    if 'nc_iconfont btn_slide' in driver_pool.page_source:
-                        continue
-                    else:
-                        break
-
-                return cookies
-            except Exception as e:
-                log.error(f"failed to get cookie, {e}")
-
-
-# if __name__ == '__main__':
-#     cookie_pool =WebCookiePool(redis_key='gdcookie',page_url="https://sourcing.lecaiyun.com/luban/bidding/newest?tradeModel=BIDDING")
-#     cookies = cookie_pool.get_cookie()
-#     log.debug(cookies)
-
-
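Likewise, the acw_sc__v2 script above reduces to a fixed permutation (unsbox) followed by a bytewise XOR against a constant key (hexXor). A hedged pure-Python port, untested against live pages; the sample arg1 is the value from the commented-out line in the JS:

import re

BOX = [15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13,
       6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34,
       37, 12, 36]
KEY = "3000176000856006061501533003690027800375"

def acw_sc_v2(arg1):
    # unsbox: scatter arg1's characters according to the permutation table
    out = [""] * len(BOX)
    for i, ch in enumerate(arg1):
        out[BOX.index(i + 1)] = ch
    unboxed = "".join(out)
    # hexXor: XOR against the fixed key, two hex digits at a time
    return "".join(
        format(int(unboxed[i:i + 2], 16) ^ int(KEY[i:i + 2], 16), "02x")
        for i in range(0, min(len(unboxed), len(KEY)), 2)
    )

html = "arg1='0A5F01F50F9BC66FB28038F18B99B7B10CFF4667'"  # sample from the JS comment
print({"acw_sc__v2": acw_sc_v2("".join(re.findall(r"arg1='(.*?)'", html)))})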

+ 0 - 99
spider_frame/FworkSpider/crawl_func/jsl_5s.py

@@ -1,99 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-12-25
----------
-@summary: generic template for jsl + Chuangyu cloud shield
----------
-@author: jsl, Chuangyu 5-second shield
-"""
-import json
-import re
-
-import execjs
-import requests
-
-from feapder.network.cookie_pool import PageCookiePool
-
-
-class DTCookiePool(PageCookiePool):
-    def __init__(self, redis_key, header, page_url=None, cwd=None, save_js=False, **kwargs):
-        super(DTCookiePool, self).__init__(redis_key, page_url=None,
-                                           min_cookies=10000,
-                                           must_contained_keys=(),
-                                           keep_alive=False, **kwargs)
-        self.headers = header
-        self.page_url = page_url
-        self.proxies = kwargs.get('proxies') or False
-        self.cwd = cwd
-        self.is_save_js = save_js
-
-    def create_cookie(self):
-        proxies = self.proxies
-        try:
-            session = requests.Session()
-            session.proxies = proxies
-            start_url = self.page_url
-            res = session.get(start_url, headers=self.headers,timeout=120, verify=False)
-            js_func = "".join(re.findall("document.cookie=(.*?)location.href", res.text))
-            js_func = 'function sd() { return ' + js_func + "}"
-            ctx = execjs.compile(js_func)
-            sss = ctx.call("sd")
-            cookie = {}
-            for temp, index in res.cookies.get_dict().items():
-                cookie[temp] = index
-
-            for item in sss.split(";"):
-                if '=' in item:
-                    cookie[item.split("=")[0]] = item.split("=")[-1]
-
-            res = session.get(start_url, cookies=cookie,headers=self.headers,timeout=120,verify=False)
-            html_str = res.content.decode()
-            if "<!DOCTYPE html>" in html_str:
-                html_str = re.sub("<!DOCTYPE html>[\s\S]*?</html>", "", html_str.strip(),re.S)
-
-            if self.is_save_js:
-                with open('./source_code.js', 'w+', encoding='utf-8') as f:
-                    f.write(html_str)
-
-            js_do_data = "".join(re.findall('};go\((.*?)\)', html_str))
-            js_func = re.sub("<(/*?)script>", "", html_str)
-            location = re.compile('location(.*?)}}else')
-            location2 = re.compile('location(.*?)}else')
-            setTimeout = re.compile('0x5dc;}}(.*?)setTimeout,function\(\)\{')
-            setTimeout2 = re.compile('0x5dc;}(.*?)setTimeout\(function\(\)\{')
-            gox = re.compile('};go(.*?)\)')
-            js_func = re.sub(location, "}}else", js_func)
-            js_func = re.sub(location2, "}else", js_func)
-            js_func = re.sub(setTimeout, "0x5dc;}}", js_func)
-            js_func = re.sub(setTimeout2, "0x5dc;}", js_func)
-            js_func = re.sub(gox, "return document['cookie']\n};", js_func)
-
-            js_func = '''const jsdom = require("jsdom");
-                        const {JSDOM} = jsdom;
-                        const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`,
-                                            {
-                                                url: "https://example.org/",
-                                                referrer: "https://example.com/",
-                                                contentType: "text/html",
-                                            });
-                        window = dom.window;
-                        document = window.document;
-                        location = window.location;
-                        ''' + js_func
-            ctx = execjs.compile(js_func,cwd=self.cwd)
-
-            if self.is_save_js:
-                with open('./clean_code.js', 'w+', encoding='utf-8') as f:
-                    f.write(js_func)
-
-            ss = ctx.call("go", json.loads(js_do_data))
-            for item in ss.split(";"):
-                if '=' in item:
-                    session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
-            session.get(start_url,headers=self.headers,timeout=120,verify=False)
-            cookies = requests.utils.dict_from_cookiejar(session.cookies)
-            return cookies
-        except Exception as e:
-            print("cookie generation error:", e)
-            return {}
-
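For context, a hypothetical call site for the deleted DTCookiePool; the redis key, headers, and page URL are placeholders, and the class additionally assumes a reachable Redis instance plus a node runtime with the jsdom package installed for execjs:

# Hypothetical usage of the deleted DTCookiePool (all values are placeholders).
headers = {"User-Agent": "Mozilla/5.0"}
pool = DTCookiePool(
    redis_key="jsl_example_site",      # placeholder redis key
    header=headers,
    page_url="https://example.org/",   # placeholder page protected by jsl
    save_js=True,                      # dump source_code.js / clean_code.js for debugging
)
print(pool.create_cookie())            # fetch, solve the challenge, return cookies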

+ 0 - 83
spider_frame/FworkSpider/crawl_func/jsl_clearance_s.py

@@ -1,83 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-04-24 
----------
-@summary: generic jsl template
----------
-@author: jsl
-"""
-import json
-import re
-
-import execjs
-import requests
-
-from untils.cookie_pool import PageCookiePool
-
-
-class DTCookiePool(PageCookiePool):
-    def __init__(self, redis_key, header, page_url=None, **kwargs):
-        super(DTCookiePool, self).__init__(redis_key, page_url=None,
-                                           min_cookies=10000,
-                                           must_contained_keys=(),
-                                           keep_alive=False, **kwargs)
-        self.headers = header
-        self.page_url = page_url
-        self.proxies = kwargs.get('proxies') or False
-
-    def create_cookie(self):
-        session = requests.Session()
-        session.proxies = self.proxies
-        start_url = self.page_url
-        res = session.get(start_url, headers=self.headers, timeout=120, verify=False)
-        js_func = "".join(re.findall("document.cookie=(.*?)location.href", res.text))
-        js_func = 'function sd() { return ' + js_func + "}"
-        ctx = execjs.compile(js_func)
-        sss = ctx.call("sd")
-        cookie = {}
-        for temp, index in res.cookies.get_dict().items():
-            cookie[temp] = index
-
-        for item in sss.split(";"):
-            if '=' in item:
-                cookie[item.split("=")[0]] = item.split("=")[-1]
-
-        res = session.get(start_url, cookies=cookie,headers=self.headers,timeout=120,verify=False)
-        html_str = res.content.decode()
-        js_do_data = "".join(re.findall('};go\((.*?)\)', html_str))
-        js_func = re.sub("<(/*?)script>", "", html_str)
-        location = re.compile('location(.*?)}}else')
-        location2 = re.compile('location(.*?)}else')
-        setTimeout = re.compile('setTimeout(.*?)document')
-        gox = re.compile('};go(.*?)\)')
-        js_func = re.sub(location, "}}else", js_func)
-        js_func = re.sub(location2, "}else", js_func)
-        js_func = re.sub(setTimeout, "document", js_func)
-        js_func = re.sub('0x5dc;}(.*?)\(document', "0x5dc;}document", js_func)
-        js_func = re.sub(gox, "return document['cookie']\n};", js_func)
-        js_func = '''const jsdom = require("jsdom");
-                    const {JSDOM} = jsdom;
-                    const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`,
-                                        {
-                                            url: "https://example.org/",
-                                            referrer: "https://example.com/",
-                                            contentType: "text/html",
-                                        });
-                    window = dom.window;
-                    document = window.document;
-                    location = window.location;
-                    ''' + js_func
-        ctx = execjs.compile(js_func)
-        # with open('wzjyjt_xxgg_pm.js', 'w+', encoding='utf-8') as f:
-        #     f.write(js_func)
-        try:
-            ss = ctx.call("go", json.loads(js_do_data))
-            for item in ss.split(";"):
-                if '=' in item:
-                    session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
-            session.get(start_url,headers=self.headers,timeout=120,verify=False)
-            cookies = requests.utils.dict_from_cookiejar(session.cookies)
-            return cookies
-        except Exception as e:
-            pass
-

+ 0 - 1
spider_frame/FworkSpider/feapder/VERSION

@@ -1 +0,0 @@
-1.6.9

+ 0 - 45
spider_frame/FworkSpider/feapder/__init__.py

@@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/21 10:41 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import os
-import re
-import sys
-
-sys.path.insert(0, re.sub(r"([\\/]items$)|([\\/]spiders$)", "", os.getcwd()))
-
-__all__ = [
-    "AirSpider",
-    "Spider",
-    "BiddingListSpider",
-    "BiddingDetailSpider",
-    "PlanToBuildListSpider",
-    "PlanToBuildDetailSpider",
-    "BaseParser",
-    "Request",
-    "Response",
-    "Item",
-    "UpdateItem",
-    "BaseListItem",
-    "BaseDetailItem",
-    "ArgumentParser",
-]
-
-from feapder.core.spiders import (
-    Spider,
-    AirSpider,
-    BiddingListSpider,
-    BiddingDetailSpider,
-    PlanToBuildListSpider,
-    PlanToBuildDetailSpider,
-)
-from feapder.core.base_parser import BaseParser
-from feapder.network.request import Request
-from feapder.network.response import Response
-from feapder.network.item import Item, UpdateItem, BaseListItem, BaseDetailItem
-from feapder.utils.custom_argparse import ArgumentParser

+ 0 - 9
spider_frame/FworkSpider/feapder/buffer/__init__.py

@@ -1,9 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/23 12:09 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""

+ 0 - 213
spider_frame/FworkSpider/feapder/buffer/heartbeat_buffer.py

@@ -1,213 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-11-02
----------
-@summary: heartbeat manager; buffers items bound for the database so this manager performs all inserts, preventing concurrent database access from multiple threads
----------
-@author: dzr
-"""
-
-import threading
-from queue import Queue
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.network.item import HeartBeatItem
-from feapder.pipelines import BasePipeline
-from feapder.utils.log import log
-
-MAX_ITEM_COUNT = 5000  # maximum number of items held in the buffer
-UPLOAD_BATCH_MAX_SIZE = 1000
-
-
-class HeartBeatBuffer(threading.Thread):
-
-    # aggregate totals: thread switching can make the summed results inconsistent, so summarize this batch and record what was pushed, for use in the next calculation
-    _prev_success_task_count = 0
-    _prev_failed_task_count = 0
-
-    def __init__(self, redis_key=None):
-        if not hasattr(self, "_items_queue"):
-            super(HeartBeatBuffer, self).__init__()
-
-            self._thread_stop = False
-            self._is_adding_to_db = False
-            self._redis_key = redis_key
-
-            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
-
-            self._item_tables = {
-                # 'item_name': 'table_name' # cached mapping of item names to table names
-            }
-
-            self._pipelines = self.load_pipelines()
-
-    def load_pipelines(self):
-        pipelines = []
-        for pipeline_path in setting.ITEM_PIPELINES:
-            pipeline = tools.import_cls(pipeline_path)()
-            if not isinstance(pipeline, BasePipeline):
-                raise ValueError(f"{pipeline_path} must inherit from feapder.pipelines.BasePipeline")
-            pipelines.append(pipeline)
-
-        return pipelines
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            self.flush()
-            tools.delay_time(1)
-
-        self.close()
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def put_item(self, item):
-        if isinstance(item, HeartBeatItem):
-            self._items_queue.put(item)
-
-    def flush(self):
-        try:
-            heartbeat_items = []
-            need_aggregate_items = []
-            data_count = 0
-
-            while not self._items_queue.empty():
-                data = self._items_queue.get_nowait()
-                data_count += 1
-
-                business_type = data.business_type
-                if business_type and str(business_type).endswith("Detail"):
-                    need_aggregate_items.append(data)
-                else:
-                    heartbeat_items.append(data)
-
-                if data_count >= UPLOAD_BATCH_MAX_SIZE:
-                    self.__add_item_to_db(heartbeat_items, need_aggregate_items)
-
-                    heartbeat_items = []
-                    need_aggregate_items = []
-                    data_count = 0
-
-            if data_count:
-                self.__add_item_to_db(heartbeat_items, need_aggregate_items)
-
-        except Exception as e:
-            log.exception(e)
-
-    def get_items_count(self):
-        return self._items_queue.qsize()
-
-    def is_adding_to_db(self):
-        return self._is_adding_to_db
-
-    def __pick_items(self, items, is_aggregate=False):
-        """
-        Split the data by table; the original items list is emptied by the split
-        @param items:
-        @param is_aggregate: whether the data needs to be aggregated
-        @return:
-        """
-        datas_dict = {
-            # 'table_name': [{}, {}]
-        }
-
-        while items:
-            item = items.pop(0)
-            # get the item's snake_case name
-            # look the snake_case name up in the dict first; if missing, compute it and cache it to speed up later lookups
-            item_name = item.item_name
-            table_name = self._item_tables.get(item_name)
-            if not table_name:
-                table_name = item.table_name
-                self._item_tables[item_name] = table_name
-
-            if table_name not in datas_dict:
-                datas_dict[table_name] = []
-
-            datas_dict[table_name].append(item.to_dict)
-
-        if is_aggregate:
-            aggregate_data_dict = {
-                # 'table_name': [{}, {}]
-            }
-
-            for table_name, datas in datas_dict.items():
-                latest = datas[-1]
-                latest['rel_count'] = sum([item['rel_count'] for item in datas])
-                # number of failed requests
-                max_failed_data_dict = max(datas, key=lambda x: x.get("failed_task_count", 0))
-                failed_task_count = max_failed_data_dict["failed_task_count"] - self._prev_failed_task_count
-                self._prev_failed_task_count = max_failed_data_dict["failed_task_count"]
-                latest['failed_task_count'] = failed_task_count
-                # number of successful requests
-                max_success_data_dict = max(datas, key=lambda x: x.get("success_task_count", 0))
-                success_task_count = max_success_data_dict["success_task_count"] - self._prev_success_task_count
-                self._prev_success_task_count = max_success_data_dict["success_task_count"]
-                latest['success_task_count'] = success_task_count
-                # total number of requests
-                latest['count'] = failed_task_count + success_task_count
-                if table_name not in aggregate_data_dict:
-                    aggregate_data_dict[table_name] = [latest]
-
-            datas_dict = aggregate_data_dict
-
-        return datas_dict
-
-    def __export_to_db(self, table, datas):
-        for pipeline in self._pipelines:
-            if not pipeline.save_items(table, datas):
-                log.error(
-                    f"{pipeline.__class__.__name__} failed to save heartbeat. table: {table}  items: {datas}"
-                )
-                return False
-
-        return True
-
-    def __add_item_to_db(self, items, aggregate_items):
-        self._is_adding_to_db = True
-
-        # sort items by table
-        items_dict = self.__pick_items(items)
-        aggregate_dict = self.__pick_items(aggregate_items, is_aggregate=True)
-
-        # bulk-insert heartbeat items
-        while items_dict:
-            table, datas = items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- bulk item insert --------------
-                table: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            self.__export_to_db(table, datas)
-
-        while aggregate_dict:
-            table, datas = aggregate_dict.popitem()
-
-            log.debug(
-                """
-                -------------- bulk item insert --------------
-                table: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            self.__export_to_db(table, datas)
-
-        self._is_adding_to_db = False
-
-    def close(self):
-        # call each pipeline's close method
-        for pipeline in self._pipelines:
-            try:
-                pipeline.close()
-            except:
-                pass
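The aggregation in __pick_items converts cumulative counters into per-flush increments: it takes the largest cumulative value in the batch and subtracts the snapshot recorded at the previous flush. A stripped-down sketch of that bookkeeping, with field names following the deleted code and made-up sample numbers:

# Sketch of the delta bookkeeping: heartbeat items carry cumulative
# success/failure counters, so each flush reports only the increment
# accumulated since the previous flush.
prev_success = prev_failed = 0

def aggregate(datas):
    global prev_success, prev_failed
    cur_success = max(d.get("success_task_count", 0) for d in datas)
    cur_failed = max(d.get("failed_task_count", 0) for d in datas)
    delta_success, prev_success = cur_success - prev_success, cur_success
    delta_failed, prev_failed = cur_failed - prev_failed, cur_failed
    latest = dict(datas[-1])
    latest.update(
        success_task_count=delta_success,
        failed_task_count=delta_failed,
        count=delta_success + delta_failed,  # total requests this flush
        rel_count=sum(d.get("rel_count", 0) for d in datas),
    )
    return latest

print(aggregate([{"success_task_count": 7, "failed_task_count": 1, "rel_count": 7}]))
print(aggregate([{"success_task_count": 9, "failed_task_count": 1, "rel_count": 2}]))  # deltas: 2 / 0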

+ 0 - 10
spider_frame/FworkSpider/feapder/buffer/item_buffer/__init__.py

@@ -1,10 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-08-29 
----------
-@summary:  
----------
-@author: Dzr
-"""
-from feapder.buffer.item_buffer.item_buffer import ItemBuffer
-from feapder.buffer.item_buffer.jy_item_buffer import JyItemBuffer

+ 0 - 474
spider_frame/FworkSpider/feapder/buffer/item_buffer/item_buffer.py

@@ -1,474 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-06-19 17:17
----------
-@summary: item manager; buffers items bound for the database so this manager performs all inserts, preventing concurrent database access from multiple threads
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import threading
-from queue import Queue
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.rabbitMq import RabbitMQ
-from feapder.dedup import Dedup
-from feapder.network.item import (
-    Item,
-    UpdateItem,
-    BaseListItem,
-    BaseDetailItem,
-    FailedTaskItem,
-)
-from feapder.pipelines import BasePipeline
-from feapder.utils import metrics
-from feapder.utils.log import log
-
-MAX_ITEM_COUNT = 5000  # maximum number of items held in the buffer
-UPLOAD_BATCH_MAX_SIZE = 1000
-
-
-class ItemBuffer(threading.Thread):
-    dedup = None
-
-    def __init__(self, redis_key, rabbitmq=None, user=None):
-        if not hasattr(self, "_items_queue"):
-            super(ItemBuffer, self).__init__()
-
-            self._thread_stop = False
-            self._is_adding_to_db = False
-            self._redis_key = redis_key
-            self._user = user
-
-            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
-
-            self._rabbitmq = rabbitmq or RabbitMQ()
-
-            # task queue
-            self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
-            self._rabbitmq.declare_bind(queue=self._tab_requests)
-
-            # queue for data that failed to save
-            self._tab_failed_items = setting.TAB_FAILED_ITEMS
-            self._rabbitmq.declare_bind(queue=self._tab_failed_items)
-
-            # crawl task queue (RabbitMQ)
-            self._tab_items = setting.TAB_ITEMS.format(
-                redis_key=redis_key.replace('_detailc', '')
-            )
-            self._rabbitmq.declare_bind(queue=self._tab_items)
-
-            self._item_tables = {
-                # 'item_name': 'table_name' # cached mapping of item names to table names
-            }
-
-            self._item_update_keys = {
-                # 'table_name': ['id', 'name'...] # cached mapping of table_name to __update_key__
-            }
-
-            self._pipelines = self.load_pipelines()
-            if setting.ITEM_FILTER_ENABLE and not self.__class__.dedup:
-                self.__class__.dedup = Dedup(
-                    to_md5=False, **setting.ITEM_FILTER_SETTING
-                )
-
-            # number of export retries
-            self.export_retry_times = 0
-            # number of export failures
-            self.export_falied_times = 0
-
-    def load_pipelines(self):
-        pipelines = []
-        for pipeline_path in setting.ITEM_PIPELINES:
-            pipeline = tools.import_cls(pipeline_path)()
-            if not isinstance(pipeline, BasePipeline):
-                raise ValueError(f"{pipeline_path} must inherit from feapder.pipelines.BasePipeline")
-            pipelines.append(pipeline)
-
-        return pipelines
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            self.flush()
-            tools.delay_time(1)
-
-        self.close()
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def put_item(self, item):
-        if isinstance(item, Item):
-            # pre-insert callback
-            item.pre_to_db()
-
-        self._items_queue.put(item)
-
-    def flush(self):
-        try:
-            items = []
-            update_items = []
-            failed_task_items = []
-            requests = []
-            callbacks = []
-            items_fingerprints = []
-            data_count = 0
-
-            while not self._items_queue.empty():
-                data = self._items_queue.get_nowait()
-                data_count += 1
-                update_at = tools.ensure_int64(tools.get_current_timestamp())
-
-                # classify the data
-                if callable(data):
-                    callbacks.append(data)
-
-                elif isinstance(data, UpdateItem):
-                    update_items.append(data)
-
-                elif isinstance(data, FailedTaskItem):
-                    data.queue_name = self._tab_items  # name of the crawl task queue
-                    if data.failed_retries >= setting.SPIDER_MAX_RETRY_TIMES:
-                        state = 4  # pending-task stop state [4 = crawl stopped]
-
-                        '''update the state of the failed crawl task'''
-                        update_item = UpdateItem(
-                            state=state,
-                            pyuuid=data.pyuuid,
-                            update_at=update_at,
-                            failed_retries=data.failed_retries,
-                        )
-                        update_key = ['state', 'update_at', 'failed_retries']
-
-                        '''record the details of the failed crawl task'''
-                        data.state = state
-                        data.create_at = update_at
-                        failed_task_items.append(data)
-                    else:
-                        '''update the state of the failed crawl task'''
-                        update_item = UpdateItem(
-                            state=3,  # pending-task failure state [3 = crawl failed]
-                            pyuuid=data.pyuuid,
-                            failed_retries=data.failed_retries,
-                        )
-                        update_key = ['state', 'failed_retries']
-
-                    update_item.update_key = update_key
-                    update_item.table_name = setting.TASK_REQUEST_PRODUCE
-                    update_items.append(update_item)
-
-                elif isinstance(data, Item):
-                    if isinstance(data, BaseListItem):
-                        data.queue_name = self._tab_items
-                        data.update_at = update_at
-                        if hasattr(data, 'is_delay') and data.is_delay:
-                            data.state = 5  # pending-task delayed state [5 = delayed crawl]
-                        else:
-                            data.state = 1  # pending-task waiting state [1 = waiting to crawl]
-
-                    elif isinstance(data, BaseDetailItem):
-                        if not getattr(data, 'is_mixed'):
-                            update_item = UpdateItem(
-                                state=2,  # pending-task success state [2 = crawl finished]
-                                pyuuid=data.pyuuid,
-                                update_at=update_at,
-                            )
-                            update_item.update_key = ['state', 'update_at']
-                            update_item.table_name = setting.TASK_REQUEST_PRODUCE
-                            update_items.append(update_item)
-
-                    if data.dont_save:
-                        # do not write the data to the spider production store (data_bak)
-                        continue
-
-                    items.append(data)
-                    if setting.ITEM_FILTER_ENABLE:
-                        items_fingerprints.append(data.fingerprint)
-
-                else:  # request-redis
-                    requests.append(data)
-
-                if data_count >= UPLOAD_BATCH_MAX_SIZE:
-                    self.__add_item_to_db(
-                        items, update_items, failed_task_items, requests, callbacks, items_fingerprints
-                    )
-
-                    items = []
-                    update_items = []
-                    failed_task_items = []
-                    requests = []
-                    callbacks = []
-                    items_fingerprints = []
-                    data_count = 0
-
-            if data_count:
-                self.__add_item_to_db(
-                    items, update_items, failed_task_items, requests, callbacks, items_fingerprints
-                )
-
-        except Exception as e:
-            log.exception(e)
-
-    def get_items_count(self):
-        return self._items_queue.qsize()
-
-    def is_adding_to_db(self):
-        return self._is_adding_to_db
-
-    def __dedup_items(self, items, items_fingerprints):
-        """
-        Deduplicate
-        @param items:
-        @param items_fingerprints:
-        @return: the deduplicated items and items_fingerprints
-        """
-        if not items:
-            return items, items_fingerprints
-
-        is_exists = self.__class__.dedup.get(items_fingerprints)
-        is_exists = is_exists if isinstance(is_exists, list) else [is_exists]
-
-        dedup_items = []
-        dedup_items_fingerprints = []
-        items_count = dedup_items_count = dup_items_count = 0
-
-        while is_exists:
-            item = items.pop(0)
-            items_fingerprint = items_fingerprints.pop(0)
-            is_exist = is_exists.pop(0)
-
-            items_count += 1
-
-            if not is_exist:
-                dedup_items.append(item)
-                dedup_items_fingerprints.append(items_fingerprint)
-                dedup_items_count += 1
-            else:
-                dup_items_count += 1
-
-        log.info(
-            "items pending insert: {}, duplicates: {}, items actually to insert: {}".format(
-                items_count, dup_items_count, dedup_items_count
-            )
-        )
-
-        return dedup_items, dedup_items_fingerprints
-
-    def __pick_items(self, items, is_update_item=False):
-        """
-        Split the data by table; the original items list is emptied by the split
-        @param items:
-        @param is_update_item:
-        @return:
-        """
-        datas_dict = {
-            # 'table_name': [{}, {}]
-        }
-
-        while items:
-            item = items.pop(0)
-            # get the item's snake_case name
-            # look the snake_case name up in the dict first; if missing, compute it and cache it to speed up later lookups
-            item_name = item.item_name
-            table_name = self._item_tables.get(item_name)
-            if not table_name:
-                table_name = item.table_name
-                self._item_tables[item_name] = table_name
-
-            if table_name not in datas_dict:
-                datas_dict[table_name] = []
-
-            datas_dict[table_name].append(item.to_dict)
-
-            if is_update_item and table_name not in self._item_update_keys:
-                self._item_update_keys[table_name] = item.update_key
-
-        return datas_dict
-
-    def __export_to_db(self, table, datas, is_update=False, update_keys=()):
-        for pipeline in self._pipelines:
-            if is_update:
-                if not pipeline.update_items(table, datas, update_keys=update_keys):
-                    log.error(
-                        f"{pipeline.__class__.__name__} failed to update data. table: {table}  items: {datas}"
-                    )
-                    return False
-            else:
-                if not pipeline.save_items(table, datas):
-                    log.error(
-                        f"{pipeline.__class__.__name__} failed to save data. table: {table}  items: {datas}"
-                    )
-                    return False
-
-        self.metric_datas(table=table, datas=datas)
-        return True
-
-    def __add_item_to_db(
-        self, items, update_items, failed_task_items, requests, callbacks, items_fingerprints
-    ):
-        export_success = True
-        self._is_adding_to_db = True
-
-        if setting.ITEM_FILTER_ENABLE:
-            items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
-
-        # sort items by table
-        items_dict = self.__pick_items(items)
-        update_items_dict = self.__pick_items(update_items, is_update_item=True)
-        failed_task_items_dict = self.__pick_items(failed_task_items)
-
-        # bulk-insert items
-        failed_items = {"add": [], "update": [], "requests": []}
-        while items_dict:
-            table, datas = items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- bulk item insert --------------
-                table: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            if not self.__export_to_db(table, datas):
-                export_success = False
-                failed_items["add"].append({"table": table, "datas": datas})
-
-        # run the batch update
-        while update_items_dict:
-            table, datas = update_items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- bulk item update --------------
-                table: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            update_keys = self._item_update_keys.get(table)
-            if not self.__export_to_db(
-                table, datas, is_update=True, update_keys=update_keys
-            ):
-                export_success = False
-                failed_items["update"].append({"table": table, "datas": datas})
-
-        # bulk-insert failed crawl items
-        while failed_task_items_dict:
-            table, datas = failed_task_items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- bulk crawl-failed item insert --------------
-                table: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            if not self.__export_to_db(table, datas):
-                export_success = False
-                failed_items["add"].append({"table": table, "datas": datas})
-
-        if export_success:
-            # run the callbacks
-            while callbacks:
-                try:
-                    callback = callbacks.pop(0)
-                    callback()
-                except Exception as e:
-                    log.exception(e)
-
-            # delete the processed requests
-            if requests:
-                # self._rabbitmq.add(self._tab_requests, requests)
-                pass
-
-            # record fingerprints in the dedup store
-            if setting.ITEM_FILTER_ENABLE:
-                if items_fingerprints:
-                    self.__class__.dedup.add(items_fingerprints, skip_check=True)
-        else:
-            failed_items["requests"] = requests
-            # set correlation_id, the unique identity of the MQ accessor
-            properties = dict(correlation_id=self._user or self._redis_key)
-
-            if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
-                if self._redis_key != "air_spider":
-                    # record the failed items
-                    self._rabbitmq.add_batch(self._tab_failed_items, failed_items, properties=properties)
-                    # delete the processed requests
-                    if requests:
-                        # self.redis_db.zrem(self._table_request, requests)
-                        print(f'number of processed requests: {len(requests)}')
-
-                    log.error(
-                        "exceeded the maximum insert retries; no more retries, data recorded to redis, items:\n {}".format(
-                            tools.dumps_json(failed_items)
-                        )
-                    )
-                self.export_retry_times = 0
-
-            else:
-                tip = ["insert failed"]
-                if callbacks:
-                    tip.append("callbacks skipped")
-                if requests:
-                    tip.append("tasks not deleted")
-                    self._rabbitmq.add_batch(self._tab_requests, requests, properties=properties)
-
-                if setting.ITEM_FILTER_ENABLE:
-                    tip.append("data not added to the dedup store")
-
-                if self._redis_key != "air_spider":
-                    tip.append("will retry automatically")
-
-                tip.append("failed items:\n {}".format(tools.dumps_json(failed_items)))
-                log.error(",".join(tip))
-
-                self.export_falied_times += 1
-
-                if self._redis_key != "air_spider":
-                    self.export_retry_times += 1
-
-            if self.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
-                # alert
-                msg = "spider '{}' failed to export data, failure count: {}, please check whether the spider is healthy".format(
-                    self._redis_key, self.export_falied_times
-                )
-                log.error(msg)
-                tools.send_msg(
-                    msg=msg,
-                    level="error",
-                    message_prefix="spider '%s' failed to export data" % (self._redis_key),
-                )
-
-        self._is_adding_to_db = False
-
-    def metric_datas(self, table, datas):
-        """
-        Emit metrics: record the total count and the status of each key
-        @param table: table name
-        @param datas: list of data dicts
-        @return:
-        """
-        total_count = 0
-        for data in datas:
-            total_count += 1
-            for k, v in data.items():
-                metrics.emit_counter(k, int(bool(v)), classify=table)
-        metrics.emit_counter("total count", total_count, classify=table)
-
-    def close(self):
-        # call each pipeline's close method
-        for pipeline in self._pipelines:
-            try:
-                pipeline.close()
-            except:
-                pass
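The buffer's core pattern, many producer threads enqueueing while a single daemon thread drains the queue in batches so only one thread ever writes to the database, can be reduced to a few lines. A self-contained sketch; the constants mirror the deleted code and the print is a stand-in for pipeline.save_items:

import queue
import threading
import time

buf = queue.Queue(maxsize=5000)  # mirrors MAX_ITEM_COUNT

def flusher(batch_size=1000):    # mirrors UPLOAD_BATCH_MAX_SIZE
    while True:
        batch = []
        while not buf.empty() and len(batch) < batch_size:
            batch.append(buf.get_nowait())
        if batch:
            print(f"writing {len(batch)} items in one batch")  # stand-in for save_items
        time.sleep(1)

threading.Thread(target=flusher, daemon=True).start()
for i in range(3):
    buf.put({"id": i})  # producers may call this from any thread
time.sleep(2)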

+ 0 - 469
spider_frame/FworkSpider/feapder/buffer/item_buffer/jy_item_buffer.py

@@ -1,469 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-06-19 17:17
----------
-@summary: Jianyu data buffer
----------
-@author:
-@email:
-"""
-
-import threading
-from queue import Queue
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.dedup import Dedup
-from feapder.network.item import (
-    Item,
-    UpdateItem,
-    BaseListItem,
-    BaseDetailItem,
-    FailedTaskItem,
-)
-from feapder.network.request import Request
-from feapder.pipelines import BasePipeline
-from feapder.utils.log import log
-
-MAX_ITEM_COUNT = 5000  # maximum number of items held in the buffer
-UPLOAD_BATCH_MAX_SIZE = 1000
-
-
-class JyItemBuffer(threading.Thread):
-    dedup = None
-
-    def __init__(self, redis_key, rabbitmq=None):
-        if not hasattr(self, "_items_queue"):
-            super(JyItemBuffer, self).__init__()
-
-            self._thread_stop = False
-            self._is_adding_to_db = False
-            self._redis_key = redis_key
-
-            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
-
-            # crawl task queue (RabbitMQ)
-            self._tab_items = setting.TAB_ITEMS.format(
-                redis_key=redis_key.replace("_detailc", "")
-            )
-
-            self._item_tables = {
-                # 'item_name': 'table_name' # cached mapping of item names to table names
-            }
-
-            self._item_update_keys = {
-                # 'table_name': ['id', 'name'...] # cached mapping of table_name to __update_key__
-            }
-
-            self._pipelines = self.load_pipelines()
-            if setting.ITEM_FILTER_ENABLE and not self.__class__.dedup:
-                self.__class__.dedup = Dedup(
-                    to_md5=False, **setting.ITEM_FILTER_SETTING
-                )
-
-            # number of export retries
-            self.export_retry_times = 0
-            # number of export failures
-            self.export_falied_times = 0
-            # cache queue
-            self.tasks_dict = {}
-
-    def load_pipelines(self):
-        pipelines = []
-        for pipeline_path in setting.ITEM_PIPELINES:
-            pipeline = tools.import_cls(pipeline_path)()
-            if not isinstance(pipeline, BasePipeline):
-                raise ValueError(f"{pipeline_path} must inherit from feapder.pipelines.BasePipeline")
-            pipelines.append(pipeline)
-
-        return pipelines
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            self.flush()
-            tools.delay_time(1)
-
-        self.close()
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def put_item(self, item):
-        if isinstance(item, Item):
-            # pre-insert callback
-            item.pre_to_db()
-
-        self._items_queue.put(item)
-
-    def flush(self):
-        try:
-            items = []
-            update_items = []
-            failed_task_items = []
-            requests = []
-            callbacks = []
-            items_fingerprints = []
-            data_count = 0
-
-            while not self._items_queue.empty():
-                data = self._items_queue.get_nowait()
-                data_count += 1
-                update_at = tools.ensure_int64(tools.get_current_timestamp())
-
-                # classify the data
-                if callable(data):
-                    callbacks.append(data)
-
-                elif isinstance(data, UpdateItem):
-                    update_items.append(data)
-
-                elif isinstance(data, FailedTaskItem):
-                    data.queue_name = self._tab_items  # name of the crawl task queue
-                    if data.failed_retries >= setting.SPIDER_MAX_RETRY_TIMES:
-                        state = 4  # pending-task stop state [4 = crawl stopped]
-
-                        '''update the state of the failed crawl task'''
-                        update_item = UpdateItem(
-                            state=state,
-                            pyuuid=data.pyuuid,
-                            update_at=update_at,
-                            failed_retries=data.failed_retries,
-                        )
-                        update_key = ['state', 'update_at', 'failed_retries']
-
-                        '''record the details of the failed crawl task'''
-                        data.state = state
-                        data.create_at = update_at
-                        failed_task_items.append(data)
-                    else:
-                        '''update the state of the failed crawl task'''
-                        update_item = UpdateItem(
-                            state=3,  # pending-task failure state [3 = crawl failed]
-                            pyuuid=data.pyuuid,
-                            failed_retries=data.failed_retries,
-                        )
-                        update_key = ['state', 'failed_retries']
-
-                    update_item.update_key = update_key
-                    update_item.table_name = setting.TASK_REQUEST_PRODUCE
-                    update_items.append(update_item)
-
-                elif isinstance(data, Item):
-                    if isinstance(data, BaseListItem):
-                        data.queue_name = self._tab_items
-                        data.update_at = update_at
-                        if hasattr(data, 'is_delay') and data.is_delay:
-                            data.state = 5  # pending-task delayed state [5 = delayed crawl]
-                        else:
-                            data.state = 1  # pending-task waiting state [1 = waiting to crawl]
-
-                    elif isinstance(data, BaseDetailItem):
-                        if not getattr(data, 'is_mixed'):
-                            update_item = UpdateItem(
-                                state=2,  # pending-task success state [2 = crawl finished]
-                                pyuuid=data.pyuuid,
-                                update_at=update_at,
-                            )
-                            update_item.update_key = ['state', 'update_at']
-                            update_item.table_name = setting.TASK_REQUEST_PRODUCE
-                            update_items.append(update_item)
-
-                    if data.dont_save:
-                        # do not write the data to the spider production store (data_bak)
-                        continue
-
-                    items.append(data)
-                    if setting.ITEM_FILTER_ENABLE:
-                        items_fingerprints.append(data.fingerprint)
-
-                else:  # request-redis
-                    requests.append(data)
-
-                if data_count >= UPLOAD_BATCH_MAX_SIZE:
-                    self.__add_item_to_db(
-                        items, update_items, failed_task_items, requests, callbacks, items_fingerprints
-                    )
-
-                    items = []
-                    update_items = []
-                    failed_task_items = []
-                    requests = []
-                    callbacks = []
-                    items_fingerprints = []
-                    data_count = 0
-
-            if data_count:
-                self.__add_item_to_db(
-                    items, update_items, failed_task_items, requests, callbacks, items_fingerprints
-                )
-
-        except Exception as e:
-            log.exception(e)
-
-    def get_items_count(self):
-        return self._items_queue.qsize()
-
-    def is_adding_to_db(self):
-        return self._is_adding_to_db
-
-    def __dedup_items(self, items, items_fingerprints):
-        """
-        去重
-        @param items:
-        @param items_fingerprints:
-        @return: 返回去重后的items, items_fingerprints
-        """
-        if not items:
-            return items, items_fingerprints
-
-        is_exists = self.__class__.dedup.get(items_fingerprints)
-        is_exists = is_exists if isinstance(is_exists, list) else [is_exists]
-
-        dedup_items = []
-        dedup_items_fingerprints = []
-        items_count = dedup_items_count = dup_items_count = 0
-
-        while is_exists:
-            item = items.pop(0)
-            items_fingerprint = items_fingerprints.pop(0)
-            is_exist = is_exists.pop(0)
-
-            items_count += 1
-
-            if not is_exist:
-                dedup_items.append(item)
-                dedup_items_fingerprints.append(items_fingerprint)
-                dedup_items_count += 1
-            else:
-                dup_items_count += 1
-
-        log.info(
-            "待入库数据 {} 条, 重复 {} 条,实际待入库数据 {} 条".format(
-                items_count, dup_items_count, dedup_items_count
-            )
-        )
-
-        return dedup_items, dedup_items_fingerprints
-
-    def __pick_items(self, items, is_update_item=False):
-        """
-        将每个表之间的数据分开 拆分后 原items为空
-        @param items:
-        @param is_update_item:
-        @return:
-        """
-        datas_dict = {
-            # 'table_name': [{}, {}]
-        }
-
-        while items:
-            item = items.pop(0)
-            # 取item下划线格式的名
-            # 下划线类的名先从dict中取,没有则现取,然后存入dict。加快下次取的速度
-            item_name = item.item_name
-            table_name = self._item_tables.get(item_name)
-            if not table_name:
-                table_name = item.table_name
-                self._item_tables[item_name] = table_name
-
-            if table_name not in datas_dict:
-                datas_dict[table_name] = []
-
-            datas_dict[table_name].append(item.to_dict)
-
-            if is_update_item and table_name not in self._item_update_keys:
-                self._item_update_keys[table_name] = item.update_key
-
-        return datas_dict
-
-    def release_tasks(self, datas, finished=True):
-        if not datas:
-            return
-
-        token = self.tasks_dict.get("token")
-        if not token:
-            return
-
-        all_task_dict = {}
-        if "data" in self.tasks_dict:
-            all_task_dict = self.tasks_dict["data"]
-
-        commit_task_dict = {}
-        if "data" in datas:
-            commit_task_dict = datas["data"]
-
-        if not all_task_dict and not commit_task_dict:
-            return
-
-        release_tasks = []
-        if not finished:
-            # 爬虫运行结束,释放剩余未完成采集的任务
-            for pyuuid in dict(all_task_dict):
-                release_tasks.append(all_task_dict.pop(pyuuid))
-        else:
-            # 爬虫运行中,释放已采集完成的任务
-            finished_task_pyuuid_lst = [data.get("pyuuid") for data in datas]
-            for pyuuid in finished_task_pyuuid_lst:
-                if pyuuid not in all_task_dict:
-                    continue
-                release_tasks.append(all_task_dict.pop(pyuuid))
-
-        if len(release_tasks) == 0:
-            log.debug("无回传任务")
-            return
-
-        r = None
-        url = f"{setting.JY_TASK_URL}/tasks/batch-release"
-        headers = {"Authorization": token}
-        params = dict(headers=headers, timeout=10, json=release_tasks, proxies=False)
-        try:
-            r = Request(method="DELETE", url=url, **params).get_response()
-            log.debug(f"任务回传成功,~{len(release_tasks)}")
-        except Exception as e:
-            log.error(f"任务回传失败, 原因:{e}")
-        return bool(r and r.status_code == 200)
-
-    def __export_to_db(self, table, datas, is_update=False, update_keys=()):
-        for pipeline in self._pipelines:
-            if is_update:
-                if not pipeline.update_items(table, datas, update_keys=update_keys):
-                    log.error(
-                        f"{pipeline.__class__.__name__} 更新数据失败. table: {table}  items: {datas}"
-                    )
-                    return False
-            else:
-                if not pipeline.save_items(table, datas):
-                    log.error(
-                        f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
-                    )
-                    return False
-
-        self.release_tasks(datas=datas)
-        return True
-
-    def __add_item_to_db(
-        self, items, update_items, failed_task_items, requests, callbacks, items_fingerprints
-    ):
-        export_success = True
-        self._is_adding_to_db = True
-
-        if setting.ITEM_FILTER_ENABLE:
-            items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
-
-        # 分捡
-        items_dict = self.__pick_items(items)
-        update_items_dict = self.__pick_items(update_items, is_update_item=True)
-        failed_task_items_dict = self.__pick_items(failed_task_items)
-
-        # item批量入库
-        failed_items = {"add": [], "update": [], "requests": []}
-        while items_dict:
-            table, datas = items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- item 批量入库 --------------
-                表名: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            if not self.__export_to_db(table, datas):
-                export_success = False
-                failed_items["add"].append({"table": table, "datas": datas})
-
-        # 执行批量update
-        while update_items_dict:
-            table, datas = update_items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- item 批量更新 --------------
-                表名: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            update_keys = self._item_update_keys.get(table)
-            if not self.__export_to_db(
-                table, datas, is_update=True, update_keys=update_keys
-            ):
-                export_success = False
-                failed_items["update"].append({"table": table, "datas": datas})
-
-        # 采集失败 item批量入库
-        while failed_task_items_dict:
-            table, datas = failed_task_items_dict.popitem()
-
-            log.debug(
-                """
-                -------------- crawl failed item 批量入库 --------------
-                表名: %s
-                datas: %s
-                    """
-                % (table, tools.dumps_json(datas, indent=16))
-            )
-
-            if not self.__export_to_db(table, datas):
-                export_success = False
-                failed_items["add"].append({"table": table, "datas": datas})
-
-        if export_success:
-            # 执行回调
-            while callbacks:
-                try:
-                    callback = callbacks.pop(0)
-                    callback()
-                except Exception as e:
-                    log.exception(e)
-
-            # 删除做过的request
-            if requests:
-                pass
-
-            # 去重入库
-            if setting.ITEM_FILTER_ENABLE:
-                if items_fingerprints:
-                    self.__class__.dedup.add(items_fingerprints, skip_check=True)
-        else:
-            failed_items["requests"] = requests
-
-            tip = ["入库不成功"]
-            if callbacks:
-                tip.append("不执行回调")
-            if requests:
-                tip.append("不删除任务")
-
-            if setting.ITEM_FILTER_ENABLE:
-                tip.append("数据不入去重库")
-
-            tip.append("失败items:\n {}".format(tools.dumps_json(failed_items)))
-            log.error(",".join(tip))
-
-            self.export_falied_times += 1
-
-            if self.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
-                # 报警
-                msg = "《{}》爬虫导出数据失败,失败次数:{},请检查爬虫是否正常".format(
-                    self._redis_key, self.export_falied_times
-                )
-                log.error(msg)
-                tools.send_msg(
-                    msg=msg,
-                    level="error",
-                    message_prefix="《%s》爬虫导出数据失败" % (self._redis_key),
-                )
-
-        self._is_adding_to_db = False
-
-    def close(self):
-        # 调用pipeline的close方法
-        for pipeline in self._pipelines:
-            try:
-                pipeline.close()
-            except:
-                pass
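
The close of the buffer above ends the export path: items are filtered against the dedup store first, and their fingerprints are written back only after the whole batch has persisted, so a failed export can be retried without losing data. A minimal sketch of that two-phase pattern, using a plain set in place of the framework's Dedup store (hypothetical item dicts):

def dedup_items(items, fingerprints, seen):
    # phase 1: keep only items whose fingerprint is not yet known
    kept_items, kept_fps = [], []
    for item, fp in zip(items, fingerprints):
        if fp not in seen:
            kept_items.append(item)
            kept_fps.append(fp)
    return kept_items, kept_fps

seen = {"fp-1"}
items, fps = dedup_items([{"id": 1}, {"id": 2}], ["fp-1", "fp-2"], seen)
# ... export `items` to the database ...
seen.update(fps)  # phase 2: only after a successful export (dedup.add(..., skip_check=True))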

+ 0 - 150
spider_frame/FworkSpider/feapder/buffer/request_buffer.py

@@ -1,150 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-06-19 17:17
----------
-@summary: request 管理器, 负责缓冲添加到数据库中的request
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import collections
-import threading
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.rabbitMq import RabbitMQ
-from feapder.dedup import Dedup
-from feapder.utils.log import log
-
-MAX_URL_COUNT = 1000  # 缓存中最大request数
-
-
-class RequestBuffer(threading.Thread):
-    dedup = None
-
-    def __init__(self, redis_key, rabbitmq=None, user=None):
-        if not hasattr(self, "_requests_deque"):
-            super(RequestBuffer, self).__init__()
-
-            self._thread_stop = False
-            self._is_adding_to_db = False
-            self._redis_key = redis_key
-            self._user = user
-
-            self._requests_deque = collections.deque()
-            self._del_requests_deque = collections.deque()
-
-            self._rabbitmq = rabbitmq or RabbitMQ()
-            # 任务队列
-            self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
-            self._rabbitmq.declare_bind(queue=self._tab_requests)
-            # 失败任务队列
-            self._tab_failed_requests = setting.TAB_FAILED_REQUESTS
-            self._rabbitmq.declare_bind(queue=self._tab_failed_requests)
-
-            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
-                self.__class__.dedup = Dedup(
-                    name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
-                )  # 默认过期时间为一个月
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            try:
-                self.__add_request_to_db()
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def get_failed_requests_count(self):
-        return self._rabbitmq.get_message_count(self._tab_failed_requests)
-
-    def put_request(self, request):
-        self._requests_deque.append(request)
-
-        if self.get_requests_count() > MAX_URL_COUNT:  # 超过最大缓存,主动调用
-            self.flush()
-
-    def put_failed_request(self, request, table=None):
-        try:
-            request_dict = request.to_dict
-            if table is not None:
-                self._rabbitmq.declare_bind(queue=table)  # 声明额外的队列
-
-            queue = table or self._tab_failed_requests
-            # 设置访问者的唯一标识
-            properties = dict(correlation_id=self._user or self._redis_key)
-            self._rabbitmq.add(request_dict, queue=queue, properties=properties)
-        except Exception as e:
-            log.exception(e)
-
-    def flush(self):
-        try:
-            self.__add_request_to_db()
-        except Exception as e:
-            log.exception(e)
-
-    def get_requests_count(self):
-        return len(self._requests_deque)
-
-    def is_adding_to_db(self):
-        return self._is_adding_to_db
-
-    def __add_request_to_db(self):
-        kw = {"properties": dict(correlation_id=self._user) if self._user else None}
-
-        request_list = []
-        prioritys = []
-        callbacks = []
-
-        while self._requests_deque:
-            request = self._requests_deque.popleft()
-            self._is_adding_to_db = True
-
-            if callable(request):
-                # 函数
-                # 注意:应该考虑闭包情况。闭包情况可写成
-                # def test(xxx = xxx):
-                #     # TODO 业务逻辑 使用 xxx
-                # 这么写不会导致xxx为循环结束后的最后一个值
-                callbacks.append(request)
-                continue
-
-            priority = request.priority
-
-            # 如果需要去重并且库中已重复,则continue
-            if (
-                request.filter_repeat
-                and setting.REQUEST_FILTER_ENABLE
-                and not self.__class__.dedup.add(request.fingerprint)
-            ):
-                log.debug("request已存在 url = %s" % request.url)
-                continue
-            else:
-                request_list.append(str(request.to_dict))
-                prioritys.append(priority)
-
-            # 入库(超过上限[MAX_URL_COUNT]执行)
-            if len(request_list) > MAX_URL_COUNT:
-                self._rabbitmq.add_batch(self._tab_requests, request_list, **kw)
-                request_list = []
-                prioritys = []
-
-        # 入库(小于上限[MAX_URL_COUNT]执行)
-        if request_list:
-            self._rabbitmq.add_batch(self._tab_requests, request_list, **kw)
-
-        # 执行回调
-        for callback in callbacks:
-            try:
-                callback()
-            except Exception as e:
-                log.exception(e)
-
-        self._is_adding_to_db = False
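
The closure note in __add_request_to_db refers to Python's late-binding rule: a nested function resolves free variables when it is called, not when it is defined, so callbacks created in a loop all see the final loop value unless that value is frozen as a default argument. A minimal demonstration:

callbacks = []
for i in range(3):
    def late():
        return i          # free variable: resolved at call time
    def bound(i=i):
        return i          # default argument: bound at definition time
    callbacks.append((late, bound))

print([f() for f, _ in callbacks])  # [2, 2, 2]
print([f() for _, f in callbacks])  # [0, 1, 2]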

+ 0 - 0
spider_frame/FworkSpider/feapder/commands/__init__.py


+ 0 - 45
spider_frame/FworkSpider/feapder/commands/cmdline.py

@@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/5/8 2:24 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import sys
-from os.path import dirname, join
-
-from feapder.commands import create_builder
-from feapder.commands import shell
-
-
-def _print_commands():
-    with open(join(dirname(dirname(__file__)), "VERSION"), "rb") as f:
-        version = f.read().decode("ascii").strip()
-
-    print("feapder {}".format(version))
-    print("\nUsage:")
-    print("  feapder <command> [options] [args]\n")
-    print("Available commands:")
-    cmds = {"create": "create project、spider、item and so on", "shell": "debug response"}
-    for cmdname, cmdclass in sorted(cmds.items()):
-        print("  %-13s %s" % (cmdname, cmdclass))
-
-    print('\nUse "feapder <command> -h" to see more info about a command')
-
-
-def execute():
-    args = sys.argv
-    if len(args) < 2:
-        _print_commands()
-        return
-
-    command = args.pop(1)
-    if command == "create":
-        create_builder.main()
-    elif command == "shell":
-        shell.main()
-    else:
-        _print_commands()

+ 0 - 21
spider_frame/FworkSpider/feapder/commands/create/__init__.py

@@ -1,21 +0,0 @@
-__all__ = [
-    "CreateProject",
-    "CreateSpider",
-    "CreateItem",
-    "CreateInit",
-    "CreateJson",
-    "CreateTable",
-    "CreateCookies",
-    "CreateSetting",
-    "CreateParams",
-]
-
-from .create_table import CreateTable
-from .create_json import CreateJson
-from .create_spider import CreateSpider
-from .create_init import CreateInit
-from .create_item import CreateItem
-from .create_project import CreateProject
-from .create_cookies import CreateCookies
-from .create_setting import CreateSetting
-from .create_params import CreateParams

+ 0 - 48
spider_frame/FworkSpider/feapder/commands/create/create_cookies.py

@@ -1,48 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/4/25 10:22 上午
----------
-@summary: 将浏览器的cookie转为request的cookie
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import json
-import sys
-
-from feapder.utils.tools import get_cookies_from_str, print_pretty
-
-
-class CreateCookies:
-    def get_data(self):
-        """
-        @summary: 从控制台读取多行
-        ---------
-        ---------
-        @result:
-        """
-        print("请输入浏览器cookie (列表或字符串格式)")
-        data = []
-        while True:
-            line = sys.stdin.readline().strip()
-            if not line:
-                break
-
-            data.append(line)
-
-        return "".join(data)
-
-    def create(self):
-        data = self.get_data()
-        cookies = {}
-        try:
-            data_json = json.loads(data)
-
-            for data in data_json:
-                cookies[data.get("name")] = data.get("value")
-
-        except:
-            cookies = get_cookies_from_str(data)
-
-        print_pretty(cookies)
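
For reference, the two accepted input shapes (a sketch; the list form matches a browser's exported cookie list, the string form a raw Cookie header, which get_cookies_from_str handles):

import json

exported = '[{"name": "sessionid", "value": "abc123"}, {"name": "lang", "value": "zh"}]'
cookies = {c.get("name"): c.get("value") for c in json.loads(exported)}
# {'sessionid': 'abc123', 'lang': 'zh'}

header = "sessionid=abc123; lang=zh"
cookies = dict(kv.split("=", 1) for kv in header.split("; "))  # rough equivalent of the fallback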

+ 0 - 30
spider_frame/FworkSpider/feapder/commands/create/create_init.py

@@ -1,30 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 创建__init__.py
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-from feapder.utils.tools import dumps_json
-
-
-class CreateInit:
-    def create(self):
-        __all__ = []
-
-        import os
-
-        path = os.getcwd()
-        for file in os.listdir(path):
-            if file.endswith(".py") and not file.startswith("__init__"):
-                model = file.split(".")[0]
-                __all__.append(model)
-
-        del os
-
-        with open("__init__.py", "w", encoding="utf-8") as file:
-            text = "__all__ = %s" % dumps_json(__all__)
-            file.write(text)
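
Run inside a package of spider modules, the command above rewrites __init__.py to export every sibling module; the result looks roughly like (hypothetical module names):

__all__ = [
    "news_spider",
    "detail_spider"
]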

+ 0 - 165
spider_frame/FworkSpider/feapder/commands/create/create_item.py

@@ -1,165 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 创建item
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import getpass
-import os
-
-import feapder.utils.tools as tools
-from feapder import setting
-from feapder.db.mysqldb import MysqlDB
-from .create_init import CreateInit
-
-
-def deal_file_info(file):
-    file = file.replace("{DATE}", tools.get_current_date())
-    file = file.replace("{USER}", getpass.getuser())
-
-    return file
-
-
-class CreateItem:
-    def __init__(self):
-        self._db = MysqlDB()
-        self._create_init = CreateInit()
-
-    def select_columns(self, table_name):
-        # sql = 'SHOW COLUMNS FROM ' + table_name
-        sql = f"SELECT COLUMN_NAME, COLUMN_TYPE, IS_NULLABLE, COLUMN_DEFAULT, EXTRA, COLUMN_KEY, COLUMN_COMMENT FROM INFORMATION_SCHEMA.Columns WHERE table_name = '{table_name}' and table_schema = '{setting.MYSQL_DB}'"
-        columns = self._db.find(sql)
-
-        return columns
-
-    def select_tables_name(self, tables_name):
-        """
-        @summary:
-        ---------
-        @param tables_name: 一类tables 如 qidian*
-        ---------
-        @result:
-        """
-        sql = f"select table_name from information_schema.tables where table_name like '{tables_name}' and table_schema = '{setting.MYSQL_DB}'"
-        tables_name = self._db.find(sql)
-
-        return tables_name
-
-    def convert_table_name_to_hump(self, table_name):
-        """
-        @summary: 格式化表名为驼峰格式
-        ---------
-        @param table_name:
-        ---------
-        @result:
-        """
-        table_hump_format = ""
-
-        words = table_name.split("_")
-        for word in words:
-            table_hump_format += word.capitalize()  # 首字母大写
-
-        return table_hump_format
-
-    def get_item_template(self):
-        template_path = os.path.abspath(
-            os.path.join(__file__, "../../../templates/item_template.tmpl")
-        )
-        with open(template_path, "r", encoding="utf-8") as file:
-            item_template = file.read()
-
-        return item_template
-
-    def create_item(self, item_template, columns, table_name, support_dict):
-        table_name_hump_format = self.convert_table_name_to_hump(table_name)
-        # 组装 类名
-        item_template = item_template.replace("${item_name}", table_name_hump_format)
-        if support_dict:
-            item_template = item_template.replace("${table_name}", table_name + " 1")
-        else:
-            item_template = item_template.replace("${table_name}", table_name)
-
-        # 组装 属性
-        propertys = ""
-        for column in columns:
-            column_name = column[0]
-            column_type = column[1]
-            is_nullable = column[2]
-            column_default = column[3]
-            column_extra = column[4]
-            column_key = column[5]
-            column_comment = column[6]
-
-            try:
-                value = (
-                    "kwargs.get('{column_name}')".format(column_name=column_name)
-                    if support_dict
-                    else (
-                        column_default != "CURRENT_TIMESTAMP" and column_default or None
-                    )
-                    and eval(column_default)
-                )
-            except:
-                value = (
-                    "kwargs.get('{column_name}')".format(column_name=column_name)
-                    if support_dict
-                    else (
-                        column_default != "CURRENT_TIMESTAMP" and column_default or None
-                    )
-                    and column_default
-                )
-
-            if column_extra == "auto_increment" or column_default is not None:
-                propertys += f"# self.{column_name} = {value}"
-
-            else:
-                if value is None or isinstance(value, (float, int)) or support_dict:
-                    propertys += f"self.{column_name} = {value}"
-                else:
-                    propertys += f"self.{column_name} = '{value}'"
-
-            if column_comment:
-                propertys += f"  # {column_comment}"
-            propertys += "\n" + " " * 8
-
-        item_template = item_template.replace("${propertys}", propertys.strip())
-        item_template = deal_file_info(item_template)
-
-        return item_template
-
-    def save_template_to_file(self, item_template, table_name):
-        item_file = table_name + "_item.py"
-        if os.path.exists(item_file):
-            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % item_file)
-            if confirm != "y":
-                print("取消覆盖  退出")
-                return
-
-        with open(item_file, "w", encoding="utf-8") as file:
-            file.write(item_template)
-            print("\n%s 生成成功" % item_file)
-
-        self._create_init.create()
-
-    def create(self, tables_name, support_dict):
-        input_tables_name = tables_name
-
-        tables_name = self.select_tables_name(tables_name)
-        if not tables_name:
-            print(tables_name)
-            tip = "mysql数据库中无 %s 表 " % input_tables_name
-            raise KeyError(tip)
-
-        for table_name in tables_name:
-            table_name = table_name[0]
-
-            columns = self.select_columns(table_name)
-            item_template = self.get_item_template()
-            item_template = self.create_item(
-                item_template, columns, table_name, support_dict
-            )
-            self.save_template_to_file(item_template, table_name)
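
The class name is derived from the table name by capitalizing each underscore-separated word; a condensed equivalent of convert_table_name_to_hump (illustrative table name):

def to_hump(table_name):
    return "".join(word.capitalize() for word in table_name.split("_"))

assert to_hump("news_detail") == "NewsDetail"  # substituted for ${item_name} in the template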

+ 0 - 52
spider_frame/FworkSpider/feapder/commands/create/create_json.py

@@ -1,52 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 字符串转json
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import sys
-
-import feapder.utils.tools as tools
-
-
-class CreateJson:
-    def get_data(self):
-        """
-        @summary: 从控制台读取多行
-        ---------
-        ---------
-        @result:
-        """
-        print("请输入需要转换的内容: (xxx:xxx格式,支持多行)")
-        data = []
-        while True:
-            line = sys.stdin.readline().strip().replace("\t", " " * 4)
-            if not line:
-                break
-
-            data.append(line)
-
-        return data
-
-    def create(self, sort_keys=False):
-        contents = self.get_data()
-
-        json = {}
-        for content in contents:
-            content = content.strip()
-            if not content or content.startswith(":"):
-                continue
-
-            regex = r"([^:\s]*)[:|\s]*(.*)"
-
-            result = tools.get_info(content, regex, fetch_one=True)
-            if result[0] in json:
-                json[result[0]] = json[result[0]] + "&" + result[1]
-            else:
-                json[result[0]] = result[1].strip()
-
-        print(tools.dumps_json(json, sort_keys=sort_keys))
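
Each pasted line is split at the first colon/whitespace run by the regex above (tools.get_info behaves roughly like re.search().groups()), so pasted request headers convert like this (illustrative input):

import re

result = {}
for line in ["Host: example.com", "Accept: text/html"]:
    key, value = re.search(r"([^:\s]*)[:|\s]*(.*)", line, re.S).groups()
    result[key] = value.strip()
# {'Host': 'example.com', 'Accept': 'text/html'}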

+ 0 - 51
spider_frame/FworkSpider/feapder/commands/create/create_params.py

@@ -1,51 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/4/25 10:22 上午
----------
-@summary: 解析请求地址中的参数
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import sys
-
-from feapder.utils.tools import dumps_json
-
-
-class CreateParams:
-    def get_data(self):
-        """
-        @summary: 从控制台读取多行
-        ---------
-        ---------
-        @result:
-        """
-        print("请输入请求地址")
-        data = []
-        while True:
-            line = sys.stdin.readline().strip()
-            if not line:
-                break
-
-            data.append(line)
-
-        return "".join(data)
-
-    def get_params(self, url):
-        params_json = {}
-        params = url.split("?")[-1].split("&")
-        for param in params:
-            key, _, value = param.partition("=")
-            params_json[key] = value
-
-        return params_json
-
-    def create(self):
-        data = self.get_data()
-
-        params = self.get_params(data)
-        url = data.split("?")[0]
-
-        print(f'url = "{url}"')
-        print(f"params = {dumps_json(params)}")
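
Given a full URL, the command prints the base address and a params dict; a partition-based sketch of the same split (illustrative URL):

url = "https://example.com/list?page=2&kw=test"
base, _, query = url.partition("?")
params = {}
for p in query.split("&"):
    key, _, value = p.partition("=")
    params[key] = value
# base == 'https://example.com/list'; params == {'page': '2', 'kw': 'test'}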

+ 0 - 52
spider_frame/FworkSpider/feapder/commands/create/create_project.py

@@ -1,52 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 创建项目
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import getpass
-import os
-import shutil
-
-import feapder.utils.tools as tools
-
-
-def deal_file_info(file):
-    file = file.replace("{DATE}", tools.get_current_date())
-    file = file.replace("{USER}", getpass.getuser())
-
-    return file
-
-
-class CreateProject:
-    def copy_callback(self, src, dst, *, follow_symlinks=True):
-        if src.endswith(".py"):
-            with open(src, "r", encoding="utf-8") as src_file, open(
-                dst, "w", encoding="utf8"
-            ) as dst_file:
-                content = src_file.read()
-                content = deal_file_info(content)
-                dst_file.write(content)
-
-        else:
-            shutil.copy2(src, dst, follow_symlinks=follow_symlinks)
-
-    def create(self, project_name):
-        if os.path.exists(project_name):
-            print("%s 项目已经存在" % project_name)
-        else:
-            template_path = os.path.abspath(
-                os.path.join(__file__, "../../../templates/project_template")
-            )
-            shutil.copytree(
-                template_path, project_name, copy_function=self.copy_callback
-            )
-
-            print("\n%s 项目生成成功" % project_name)
-
-
-

+ 0 - 27
spider_frame/FworkSpider/feapder/commands/create/create_setting.py

@@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/4/23 13:20
----------
-@summary: 生成配置文件
----------
-@author: mkdir700
-@email:  mkdir700@gmail.com
-"""
-
-import os
-import shutil
-
-
-class CreateSetting:
-    def create(self):
-        if os.path.exists("setting.py"):
-            confirm = input("配置文件已存在 是否覆盖 (y/n).  ")
-            if confirm != "y":
-                print("取消覆盖  退出")
-                return
-
-        template_file_path = os.path.abspath(
-            os.path.join(__file__, "../../../templates/project_template/setting.py")
-        )
-        shutil.copy(template_file_path, "./", follow_symlinks=False)
-        print("配置文件生成成功")

+ 0 - 107
spider_frame/FworkSpider/feapder/commands/create/create_spider.py

@@ -1,107 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 创建spider
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import getpass
-import os
-import re
-
-import feapder.utils.tools as tools
-from .create_init import CreateInit
-
-
-def deal_file_info(file, author=None):
-    file = file.replace("{DATE}", tools.get_current_date())
-    author = author or getpass.getuser()
-    file = file.replace("{USER}", author)
-    return file
-
-
-class CreateSpider:
-    def __init__(self):
-        self._create_init = CreateInit()
-
-    def cover_to_underline(self, key):
-        regex = "[A-Z]*"
-        capitals = re.findall(regex, key)
-
-        if capitals:
-            for pos, capital in enumerate(capitals):
-                if not capital:
-                    continue
-                if pos == 0:
-                    if len(capital) > 1:
-                        key = key.replace(capital, capital.lower() + "_", 1)
-                    else:
-                        key = key.replace(capital, capital.lower(), 1)
-                else:
-                    if len(capital) > 1:
-                        key = key.replace(capital, "_" + capital.lower() + "_", 1)
-                    else:
-                        key = key.replace(capital, "_" + capital.lower(), 1)
-
-        return key
-
-    def get_spider_template(self, spider_type):
-        if spider_type == 1:
-            template_path = "air_spider_template.tmpl"
-        elif spider_type == 2:
-            template_path = "spider_template.tmpl"
-        elif spider_type == 3:
-            template_path = "batch_spider_template.tmpl"
-        elif spider_type == 4:
-            template_path = "spider_list_template.tmpl"
-        elif spider_type == 5:
-            template_path = "detail_template.tmpl"
-        elif spider_type == 6:
-            template_path = "njpc_list_template.tmpl"
-        elif spider_type == 7:
-            template_path = "njpc_detail_template.tmpl"
-        else:
-            raise ValueError("spider type error, support 1 2 3 4 5 6 7")
-
-        template_path = os.path.abspath(
-            os.path.join(__file__, "../../../templates", template_path)
-        )
-        with open(template_path, "r", encoding="utf-8") as file:
-            spider_template = file.read()
-
-        return spider_template
-
-    def create_spider(self, spider_template, spider_name, author=None):
-        spider_template = spider_template.replace("${spider_name}", spider_name)
-        spider_template = deal_file_info(spider_template, author)
-        return spider_template
-
-    def save_spider_to_file(self, spider, spider_name):
-        spider_underline = self.cover_to_underline(spider_name)
-        spider_file = spider_underline + ".py"
-
-        if os.path.exists(spider_file):
-            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % spider_file)
-            if confirm != "y":
-                print("取消覆盖  退出")
-                return
-
-        with open(spider_file, "w", encoding="utf-8") as file:
-            file.write(spider)
-            print("\n%s 生成成功" % spider_name)
-
-        self._create_init.create()
-
-    def create(self, spider_name, spider_type, author=None):
-        # 检查spider_name
-        if not re.search("^[a-zA-Z][a-zA-Z0-9_]*$", spider_name):
-            raise Exception("爬虫名不符合命名规范,请用下划线命名或驼峰命名方式")
-
-        if spider_name.islower():
-            spider_name = tools.key2hump(spider_name)
-        spider_template = self.get_spider_template(spider_type)
-        spider = self.create_spider(spider_template, spider_name, author)
-        self.save_spider_to_file(spider, spider_name)

+ 0 - 135
spider_frame/FworkSpider/feapder/commands/create/create_table.py

@@ -1,135 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-28 17:38:43
----------
-@summary: 根据json生成表
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import sys
-import time
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.mysqldb import MysqlDB
-from feapder.utils.tools import key2underline
-
-
-class CreateTable:
-    def __init__(self):
-        self._db = MysqlDB()
-
-    def is_valid_date(self, date):
-        try:
-            if ":" in date:
-                time.strptime(date, "%Y-%m-%d %H:%M:%S")
-            else:
-                time.strptime(date, "%Y-%m-%d")
-            return True
-        except ValueError:
-            return False
-
-    def get_key_type(self, value):
-        # 先判断日期串,避免 eval("2021-01-01") 被当作减法算式求值为 int
-        if isinstance(value, str) and self.is_valid_date(value):
-            return "datetime" if ":" in value else "date"
-
-        try:
-            value = eval(value)
-        except Exception:
-            pass
-
-        key_type = "varchar(255)"
-        if isinstance(value, int):
-            key_type = "int"
-        elif isinstance(value, float):
-            key_type = "double"
-        elif isinstance(value, str) and len(value) > 255:
-            key_type = "text"
-
-        return key_type
-
-    def get_data(self):
-        """
-        @summary: 从控制台读取多行
-        ---------
-        ---------
-        @result:
-        """
-        data = ""
-        while True:
-            line = sys.stdin.readline().strip()
-            if not line:
-                break
-            data += line
-
-        return tools.get_json(data)
-
-    def create(self, table_name):
-        # 输入表字段
-        print('请输入表数据 json格式 如 {"name":"张三"}\n等待输入:\n')
-        data = self.get_data()
-
-        if not isinstance(data, dict):
-            raise Exception("表数据格式不正确")
-
-        # 拼接表结构
-        sql = """
-            CREATE TABLE `{db}`.`{table_name}` (
-                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT 'id 自动递增',
-                {other_key}
-                `gtime` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '抓取时间',
-                PRIMARY KEY (`id`),
-                {unique}
-            ) COMMENT='';
-        """
-
-        print("请设置注释 回车跳过")
-        other_key = ""
-        for key, value in data.items():
-            key = key2underline(key)
-            key_type = self.get_key_type(value)
-
-            comment = input("%s : %s  -> comment:" % (key, key_type))
-
-            other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
-                key=key, key_type=key_type, comment=comment
-            )
-
-        print("\n")
-
-        while True:
-            is_need_batch_date = input("是否添加batch_date 字段 (y/n):")
-            if is_need_batch_date == "y":
-                other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
-                    key="batch_date", key_type="date", comment="批次时间"
-                )
-                break
-            elif is_need_batch_date == "n":
-                break
-
-        print("\n")
-
-        while True:
-            unique = input("请设置唯一索引, 多个逗号间隔\n等待输入:\n").replace(",", ",")
-            if unique:
-                break
-        unique = "UNIQUE `idx` USING BTREE (`%s`) comment ''" % "`,`".join(
-            unique.split(",")
-        )
-
-        sql = sql.format(
-            db=setting.MYSQL_DB,
-            table_name=table_name,
-            other_key=other_key,
-            unique=unique,
-        )
-        print(sql)
-        self._db.execute(sql)
-        print("\n%s 创建成功" % table_name)
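
Quick reference for the column-type inference above (illustrative sample values):

# get_key_type("18")                  -> "int"
# get_key_type("3.14")                -> "double"
# get_key_type("2021-01-01")          -> "date"
# get_key_type("2021-01-01 08:00:00") -> "datetime"
# get_key_type("x" * 300)             -> "text"          (len > 255)
# get_key_type("张三")                 -> "varchar(255)"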

+ 0 - 117
spider_frame/FworkSpider/feapder/commands/create_builder.py

@@ -1,117 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/2/8 11:21 上午
----------
-@summary: 生成器
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import argparse
-
-import feapder.setting as setting
-from feapder.commands.create import *
-
-
-def main():
-    spider = argparse.ArgumentParser(description="生成器")
-
-    spider.add_argument(
-        "-p", "--project", help="创建项目 如 feapder create -p <project_name>", metavar=""
-    )
-    spider.add_argument(
-        "-s",
-        "--spider",
-        nargs="+",
-        help="创建爬虫\n"
-        "如 feapder create -s <spider_name> <spider_type> "
-        "spider_type=1  AirSpider; "
-        "spider_type=2  Spider; ",
-        metavar="",
-    )
-    spider.add_argument(
-        "-i",
-        "--item",
-        nargs="+",
-        help="创建item 如 feapder create -i test 则生成test表对应的item。 "
-        "支持like语法模糊匹配所要生产的表。 "
-        "若想生成支持字典方式赋值的item,则create -item test 1",
-        metavar="",
-    )
-    spider.add_argument(
-        "-t", "--table", help="根据json创建表 如 feapder create -t <table_name>", metavar=""
-    )
-    spider.add_argument(
-        "-init", help="创建__init__.py 如 feapder create -init", action="store_true"
-    )
-    spider.add_argument("-j", "--json", help="创建json", action="store_true")
-    spider.add_argument("-sj", "--sort_json", help="创建有序json", action="store_true")
-    spider.add_argument("-c", "--cookies", help="创建cookie", action="store_true")
-    spider.add_argument("--params", help="解析地址中的参数", action="store_true")
-    spider.add_argument(
-        "--setting", help="创建全局配置文件 如 feapder create --setting", action="store_true"
-    )
-
-    # 指定数据库
-    spider.add_argument("--host", type=str, help="mysql 连接地址", metavar="")
-    spider.add_argument("--port", type=str, help="mysql 端口", metavar="")
-    spider.add_argument("--username", type=str, help="mysql 用户名", metavar="")
-    spider.add_argument("--password", type=str, help="mysql 密码", metavar="")
-    spider.add_argument("--db", type=str, help="mysql 数据库名", metavar="")
-    args = spider.parse_args()
-
-    if args.host:
-        setting.MYSQL_IP = args.host
-    if args.port:
-        setting.MYSQL_PORT = int(args.port)
-    if args.username:
-        setting.MYSQL_USER_NAME = args.username
-    if args.password:
-        setting.MYSQL_USER_PASS = args.password
-    if args.db:
-        setting.MYSQL_DB = args.db
-
-    if args.item:
-        item_name, *support_dict = args.item
-        support_dict = bool(support_dict)
-        CreateItem().create(item_name, support_dict)
-
-    elif args.spider:
-        spider_name, *spider_type = args.spider
-        if not spider_type:
-            spider_type = 1
-        else:
-            spider_type = spider_type[0]
-        try:
-            spider_type = int(spider_type)
-        except:
-            raise ValueError("spider_type error, support 1~7")
-        CreateSpider().create(spider_name, spider_type, None)
-
-    elif args.project:
-        CreateProject().create(args.project)
-
-    elif args.table:
-        CreateTable().create(args.table)
-
-    elif args.init:
-        CreateInit().create()
-
-    elif args.json:
-        CreateJson().create()
-
-    elif args.sort_json:
-        CreateJson().create(sort_keys=True)
-
-    elif args.cookies:
-        CreateCookies().create()
-
-    elif args.setting:
-        CreateSetting().create()
-
-    elif args.params:
-        CreateParams().create()
-
-
-if __name__ == "__main__":
-    main()
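
Typical invocations of the generator, matching the flags above (illustrative names):

feapder create -p my_project                              # 创建项目
feapder create -s my_spider 2                             # 创建 Spider 类型爬虫
feapder create -i news 1                                  # 依据 news 表生成支持字典赋值的 item
feapder create -t news --host 127.0.0.1 --db spider_db    # 依据 json 建表,指定数据库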

+ 0 - 93
spider_frame/FworkSpider/feapder/commands/shell.py

@@ -1,93 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/5/9 12:37 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import json
-import re
-import sys
-
-import IPython
-
-from feapder import Request
-
-
-def request(**kwargs):
-    kwargs.setdefault("proxies", None)
-    response = Request(**kwargs).get_response()
-    print(response)
-
-    IPython.embed(header="now you can use response")
-
-
-def fetch_url(url):
-    request(url=url)
-
-
-def fetch_curl(curl_args):
-    """
-    解析及抓取curl请求
-    :param curl_args:
-    [url, '-H', 'xxx', '-H', 'xxx', '--data-binary', '{"xxx":"xxx"}', '--compressed']
-    :return:
-    """
-    url = curl_args[0]
-    curl_args.pop(0)
-
-    headers = {}
-    data = {}
-    for i in range(0, len(curl_args), 2):
-        if curl_args[i] == "-H":
-            regex = r"([^:\s]*)[:|\s]*(.*)"
-            result = re.search(regex, curl_args[i + 1], re.S).groups()
-            if result[0] in headers:
-                headers[result[0]] = headers[result[0]] + "&" + result[1]
-            else:
-                headers[result[0]] = result[1].strip()
-
-        elif curl_args[i] == "--data-binary":
-            data = json.loads(curl_args[i + 1])
-
-    request(url=url, data=data, headers=headers)
-
-
-def usage():
-    """
-    下载调试器
-
-    usage: feapder shell [options] [args]
-
-    optional arguments:
-      -u, --url     抓取指定url
-      -c, --curl    抓取curl格式的请求
-
-    """
-    print(usage.__doc__)
-    sys.exit()
-
-
-def main():
-    args = sys.argv
-    if len(args) < 3:
-        usage()
-
-    elif args[1] in ("-h", "--help"):
-        usage()
-
-    elif args[1] in ("-u", "--url"):
-        fetch_url(args[2])
-
-    elif args[1] in ("-c", "--curl"):
-        fetch_curl(args[2:])
-
-    else:
-        usage()
-
-
-if __name__ == "__main__":
-    main()
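
Example sessions (illustrative URL; the -c form takes the pieces of a copied cURL command in the order shown in fetch_curl's docstring):

feapder shell -u https://www.example.com
feapder shell -c https://www.example.com -H 'User-Agent: Mozilla/5.0' --data-binary '{"k":"v"}'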

+ 0 - 0
spider_frame/FworkSpider/feapder/core/__init__.py


+ 0 - 111
spider_frame/FworkSpider/feapder/core/base_parser.py

@@ -1,111 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-25 11:41:57
----------
-@summary: parser 的基类
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-
-class BaseParser(object):
-    def start_requests(self):
-        """
-        @summary: 添加初始url
-        ---------
-        ---------
-        @result: yield Request()
-        """
-
-        pass
-
-    def download_midware(self, request):
-        """
-        @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载,然后返回 request, response
-        ---------
-        @param request:
-        ---------
-        @result: return request / request, response
-        """
-
-        pass
-
-    def validate(self, request, response):
-        """
-        @summary: 校验函数, 可用于校验response是否正确
-        若函数内抛出异常,则重试请求
-        若返回True 或 None,则进入解析函数
-        若返回False,则抛弃当前请求
-        可通过request.callback_name 区分不同的回调函数,编写不同的校验逻辑
-        ---------
-        @param request:
-        @param response:
-        ---------
-        @result: True / None / False
-        """
-        pass
-
-    def parse(self, request, response):
-        """
-        @summary: 默认的解析函数
-        ---------
-        @param request:
-        @param response:
-        ---------
-        @result:
-        """
-
-        pass
-
-    def exception_request(self, request, response):
-        """
-        @summary: 请求或者parser里解析出异常的request
-        ---------
-        @param request:
-        @param response:
-        ---------
-        @result: request / callback / None (返回值必须可迭代)
-        """
-
-        pass
-
-    def failed_request(self, request, response):
-        """
-        @summary: 超过最大重试次数的request
-        可返回修改后的request 若不返回request,则将传进来的request直接入redis的failed表。否则将修改后的request入failed表
-        ---------
-        @param request:
-        @param response:
-        ---------
-        @result: request / item / callback / None (返回值必须可迭代)
-        """
-
-        pass
-
-    def start_callback(self):
-        """
-        @summary: 程序开始的回调
-        ---------
-        ---------
-        @result: None
-        """
-
-        pass
-
-    def end_callback(self):
-        """
-        @summary: 程序结束的回调
-        ---------
-        ---------
-        @result: None
-        """
-
-        pass
-
-    @property
-    def name(self):
-        return self.__class__.__name__
-
-    def close(self):
-        pass
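
A minimal sketch of a parser implementing this contract (hypothetical spider; in practice the base class is usually inherited indirectly via AirSpider/Spider):

from feapder.core.base_parser import BaseParser
from feapder.network.request import Request


class DemoParser(BaseParser):
    def start_requests(self):
        yield Request("https://www.example.com")

    def validate(self, request, response):
        if response.status_code != 200:
            raise Exception("bad status")  # raising inside validate triggers a retry

    def parse(self, request, response):
        # default callback: yield new Requests or Items parsed from the response
        yield Request("https://www.example.com/page2")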

+ 0 - 127
spider_frame/FworkSpider/feapder/core/collector.py

@@ -1,127 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-09-21 11:24
----------
-@summary: request 管理
----------
-@author: dzr
-"""
-import threading
-import time
-from queue import Queue, Empty
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.rabbitMq import RabbitMQ, RabbitMQMessage
-from feapder.network.request import Request
-from feapder.utils.log import log
-
-# 执行 eval 需要的全局对象
-tools.load_globals(RabbitMQMessage)
-
-
-class Collector(threading.Thread):
-
-    def __init__(self, redis_key, rabbitmq=None, user=None):
-        """
-        @summary:
-        ---------
-        @param redis_key:
-        ---------
-        @result:
-        """
-        super(Collector, self).__init__()
-
-        self._thread_stop = False
-        self._user = user
-
-        self._todo_requests = Queue(maxsize=setting.COLLECTOR_TASK_COUNT)
-
-        self._rabbitmq = rabbitmq or RabbitMQ()
-
-        # 任务队列
-        self._tab_requests = setting.TAB_REQUESTS.format(redis_key=redis_key)
-        self._rabbitmq.declare_bind(queue=self._tab_requests)
-
-        self._interval = setting.COLLECTOR_SLEEP_TIME
-        self._request_count = setting.COLLECTOR_TASK_COUNT
-        self._is_collector_task = False
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            try:
-                self.__input_data()
-            except Exception as e:
-                log.exception(e)
-
-            self._is_collector_task = False
-
-            time.sleep(self._interval)
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def __get_messages(self, request_count):
-        kwargs = dict(correlation_id=self._user)
-        messages = self._rabbitmq.get(self._tab_requests, request_count, **kwargs)
-        messages = [eval(message) for message in messages]
-        return messages
-
-    def __input_data(self):
-        if self._request_count / setting.SPIDER_THREAD_COUNT > 1 and (
-            self._todo_requests.qsize() > setting.SPIDER_THREAD_COUNT
-            or self._todo_requests.qsize() >= self._todo_requests.maxsize
-        ):  # 当任务总数大于线程数 且 内存队列持有任务总数大于线程数 此时不添加任务
-            time.sleep(0.1)
-            return
-
-        # 取任务
-        message_list = self.__get_messages(self._request_count)
-        if message_list:
-            self._is_collector_task = True
-            self.__put_messages(message_list)
-        else:
-            time.sleep(0.1)
-
-    def __put_messages(self, message_list):
-        for message in message_list:
-            delivery_tag = message.delivery_tag
-            request = message.body
-            try:
-                request_dict = {
-                    "request_obj": Request.from_dict(request),
-                    "request_redis": request,
-                }
-            except Exception as e:
-                log.exception(
-                    """
-                    error %s
-                    request %s
-                    """
-                    % (e, request)
-                )
-                request_dict = None
-
-            if request_dict:
-                self._todo_requests.put(request_dict)
-                self._rabbitmq.ack(delivery_tag)
-
-    def get_request(self):
-        try:
-            request = self._todo_requests.get(timeout=1)
-            return request
-        except Empty:
-            return None
-
-    def get_requests_count(self):
-        arguments = dict(queue=self._tab_requests, user=self._user)
-        return (
-            self._todo_requests.qsize()
-            or self._rabbitmq.get_message_count(**arguments)
-            or 0
-        )
-
-    def is_collector_task(self):
-        return self._is_collector_task
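
Downstream, each parser-control thread drains the collector roughly like this (simplified sketch; see parser_control.py below):

collector = Collector(redis_key="demo:spider")
collector.start()

while True:
    task = collector.get_request()  # {"request_obj": Request, "request_redis": dict} or None
    if task is None:
        continue                    # queue momentarily empty
    request = task["request_obj"]
    # ... download and parse the request ...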

+ 0 - 96
spider_frame/FworkSpider/feapder/core/handle_failed_items.py

@@ -1,96 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/11/18 11:33 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import bson
-from bson import ObjectId
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.item_buffer import ItemBuffer
-from feapder.db.rabbitMq import RabbitMQ, RabbitMQMessage
-from feapder.network.item import Item, UpdateItem
-from feapder.utils.log import log
-
-# 执行 eval 需要的全局作用域
-tools.load_globals(RabbitMQMessage, ObjectId)
-
-
-class HandleFailedItems:
-    def __init__(self, redis_key, rabbitmq=None, item_buffer=None, user=None):
-        if redis_key.endswith(":s_failed_items"):
-            redis_key = redis_key.replace(":s_failed_items", "")
-
-        self._redis_key = redis_key
-        self._user = user
-        self._rabbitmq = rabbitmq or RabbitMQ()
-        self._item_buffer = item_buffer or ItemBuffer(redis_key, user=user)
-
-        # 数据保存失败队列
-        self._tab_failed_items = setting.TAB_FAILED_ITEMS
-        self._rabbitmq.declare_bind(queue=self._tab_failed_items)
-
-    def get_failed_items(self, count=10000):
-        kwargs = dict(correlation_id=self._user or self._redis_key)
-        failed_items = self._rabbitmq.get(self._tab_failed_items, count, **kwargs)
-        failed_items = [eval(message) for message in failed_items]
-        return failed_items
-
-    def reput_failed_items_to_db(self):
-        log.debug("正在重新写入失败的items...")
-        total_count = 0
-        while True:
-            try:
-                failed_items = self.get_failed_items()
-                if not failed_items:
-                    break
-
-                for message in failed_items:
-                    delivery_tag = message.delivery_tag
-                    data = message.body
-                    for add in data.get("add"):
-                        table = add.get("table")
-                        datas = add.get("datas")
-                        for _data in datas:
-                            _data = {k: v for k, v in _data.items() if k != '_id'}
-                            if "comeintime" in _data:
-                                _data["comeintime"] = bson.Int64(tools.get_current_timestamp())  # 重置入库时间
-
-                            item = Item(**_data)
-                            item.table_name = table
-                            self._item_buffer.put_item(item)  # 异步推送
-                            total_count += 1
-
-                    for update in data.get("update"):
-                        table = update.get("table")
-                        datas = update.get("datas")
-                        update_keys = update.get("update_keys")
-                        for _data in datas:
-                            item = UpdateItem(**_data)
-                            item.table_name = table
-                            item.update_keys = update_keys
-                            self._item_buffer.put_item(item)
-                            total_count += 1
-
-                    # 入库成功后删除(用默认参数绑定 delivery_tag,避免闭包晚绑定)
-                    def delete_item(delivery_tag=delivery_tag):
-                        self._rabbitmq.ack(delivery_tag)
-
-                    self._item_buffer.put_item(delete_item)
-                    self._item_buffer.flush()
-
-            except Exception as e:
-                log.exception(e)
-
-        if total_count:
-            log.debug("导入%s条失败item到数据库" % total_count)
-        else:
-            log.debug("没有失败的item")
-
-    def close(self):
-        self._item_buffer.close()

+ 0 - 71
spider_frame/FworkSpider/feapder/core/handle_failed_requests.py

@@ -1,71 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-08-13 11:43:01
----------
-@summary:
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.request_buffer import RequestBuffer
-from feapder.db.rabbitMq import RabbitMQ, RabbitMQMessage
-from feapder.network.request import Request
-from feapder.utils.log import log
-
-# 执行 eval 需要的全局作用域
-tools.load_globals(RabbitMQMessage)
-
-
-class HandleFailedRequests(object):
-    """docstring for HandleFailedRequests"""
-
-    def __init__(self, redis_key, rabbitmq=None, user=None):
-        super(HandleFailedRequests, self).__init__()
-
-        self._redis_key = redis_key
-        self._user = user
-
-        self._rabbitmq = rabbitmq or RabbitMQ()
-        self._request_buffer = RequestBuffer(self._redis_key, rabbitmq)
-
-        # 失败任务队列
-        self._tab_failed_requests = setting.TAB_FAILED_REQUESTS
-        self._rabbitmq.declare_bind(queue=self._tab_failed_requests)
-
-    def get_failed_messages(self, count=10000):
-        kwargs = dict(correlation_id=self._user or self._redis_key)
-        failed_messages = self._rabbitmq.get(self._tab_failed_requests, count, **kwargs)
-        failed_messages = [eval(message) for message in failed_messages]
-        return failed_messages
-
-    def reput_failed_requests_to_requests(self):
-        log.debug("正在重置失败的requests...")
-        total_count = 0
-        while True:
-            try:
-                failed_messages = self.get_failed_messages()
-                if not failed_messages:
-                    break
-
-                for message in failed_messages:
-                    delivery_tag = message.delivery_tag
-                    request = message.body
-                    request["retry_times"] = 0
-                    request_obj = Request.from_dict(request)
-                    self._request_buffer.put_request(request_obj)
-
-                    # 入库成功后删除(用默认参数绑定 delivery_tag,避免所有闭包统一指向最后一条消息)
-                    def delete_request(delivery_tag=delivery_tag):
-                        self._rabbitmq.ack(delivery_tag)
-
-                    self._request_buffer.put_request(delete_request)
-                    total_count += 1
-
-            except Exception as e:
-                log.exception(e)
-
-        self._request_buffer.flush()
-        self._request_buffer.stop()
-        log.debug("重置%s条失败requests为待抓取requests" % total_count)

+ 0 - 1064
spider_frame/FworkSpider/feapder/core/parser_control.py

@@ -1,1064 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2017-01-03 16:06
----------
-@summary: parser 控制类
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import pathlib
-import random
-import threading
-import time
-from collections.abc import Iterable
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.item_buffer import ItemBuffer
-from feapder.buffer.item_buffer import JyItemBuffer
-from feapder.db.memory_db import MemoryDB
-from feapder.network.item import Item, HeartBeatItem
-from feapder.network.request import Request
-from feapder.utils import metrics
-from feapder.utils.log import log
-
-
-class PaserControl(threading.Thread):
-    DOWNLOAD_EXCEPTION = "download_exception"
-    DOWNLOAD_SUCCESS = "download_success"
-    DOWNLOAD_TOTAL = "download_total"
-    PAESERS_EXCEPTION = "parser_exception"
-
-    is_show_tip = False
-
-    # 实时统计已做任务数及失败任务数,若失败任务数/已做任务数>0.5 则报警
-    _success_task_count = 0
-    _failed_task_count = 0
-
-    def __init__(self, collector, redis_key, request_buffer, item_buffer, heartbeat_buffer):
-        super(PaserControl, self).__init__()
-        self._parsers = []
-        self._collector = collector
-        self._redis_key = redis_key
-        self._request_buffer = request_buffer
-        self._item_buffer = item_buffer
-        self._heartbeat_buffer = heartbeat_buffer
-
-        self._thread_stop = False
-
-    def is_not_task(self):
-        return self.is_show_tip
-
-    @classmethod
-    def get_task_status_count(cls):
-        return cls._failed_task_count, cls._success_task_count
-
-    def run(self):
-        self._thread_stop = False
-        while not self._thread_stop:
-            try:
-                request = self._collector.get_request()  # step 2 获取任务
-                if not request:
-                    if not self.is_show_tip:
-                        log.debug("等待任务...")
-                        self.is_show_tip = True
-                    continue
-
-                self.is_show_tip = False
-                self.deal_request(request)
-            except BaseException as e:
-                log.exception(e)
-
-    def deal_request(self, request):
-        response = None
-        request_redis = request["request_redis"]
-        request = request["request_obj"]
-        now_page = request.page or -1
-
-        for parser in self._parsers:
-            counter = {
-                'now_page': now_page,
-                'extract_count': 0,  # 列表页抽取的列表数量
-                'rel_count': 0,  # 去重后实际入库数量
-            }
-            if parser.name == request.parser_name:
-                used_download_midware_enable = False
-                try:
-                    # record a document that needs downloading
-                    self.record_download_status(
-                        PaserControl.DOWNLOAD_TOTAL, parser.name
-                    )
-
-                    # process the request
-                    if request.auto_request:
-                        request_temp = None
-                        response = None
-
-                        # download middleware
-                        if request.download_midware:
-                            if isinstance(request.download_midware, (list, tuple)):
-                                request_temp = request
-                                for download_midware in request.download_midware:
-                                    download_midware = (
-                                        download_midware
-                                        if callable(download_midware)
-                                        else tools.get_method(
-                                            parser, download_midware
-                                        )
-                                    )
-                                    request_temp = download_midware(request_temp)
-                            else:
-                                download_midware = (
-                                    request.download_midware
-                                    if callable(request.download_midware)
-                                    else tools.get_method(
-                                        parser, request.download_midware
-                                    )
-                                )
-                                request_temp = download_midware(request)
-                        elif request.download_midware != False:
-                            request_temp = parser.download_midware(request)
-
-                        # perform the request
-                        if request_temp:
-                            if (
-                                isinstance(request_temp, (tuple, list))
-                                and len(request_temp) == 2
-                            ):
-                                request_temp, response = request_temp
-
-                            if not isinstance(request_temp, Request):
-                                raise Exception(
-                                    "download_midware need return a request, but received type: {}".format(
-                                        type(request_temp)
-                                    )
-                                )
-                            used_download_midware_enable = True
-                            if not response:
-                                response = (
-                                    request_temp.get_response()
-                                    if not setting.RESPONSE_CACHED_USED
-                                    else request_temp.get_response_from_cached(
-                                        save_cached=False
-                                    )
-                                )
-                        else:
-                            response = (
-                                request.get_response()
-                                if not setting.RESPONSE_CACHED_USED
-                                else request.get_response_from_cached(
-                                    save_cached=False
-                                )
-                            )
-
-                        if response == None:
-                            raise Exception(
-                                "连接超时 url: %s" % (request.url or request_temp.url)
-                            )
-
-                        # validate the response
-                        if parser.validate(request, response) == False:
-                            break
-
-                    else:
-                        response = None
-
-                    if request.callback:  # if the request carries a callback, handle the response with it
-                        callback_parser = (
-                            request.callback
-                            if callable(request.callback)
-                            else tools.get_method(parser, request.callback)
-                        )
-
-                        results = callback_parser(request, response)
-                    else:  # otherwise fall back to the parser's default parse
-                        results = parser.parse(request, response)
-
-                    if results and not isinstance(results, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代"
-                            % (parser.name, request.callback or "parse")
-                        )
-
-                    # track what the previous result was
-                    result_type = 0  # 0/1/2 (initial / request / item)
-                    # decide whether each result is a request or an item
-                    for result in results or []:
-                        if isinstance(result, Request):
-                            result_type = 1
-                            # assign parser_name to the request
-                            result.parser_name = result.parser_name or parser.name
-
-                            # synchronous or asynchronous callback?
-                            if result.request_sync:  # synchronous
-                                request_dict = {
-                                    "request_obj": result,
-                                    "request_redis": None,
-                                }
-                                self.deal_request(request_dict)  # since version 1.8.5
-                            else:  # asynchronous
-                                # enqueue the next request
-                                self._request_buffer.put_request(result)
-
-                        elif isinstance(result, Item):
-                            result_type = 2
-
-                            # collection mode:
-                            #   True  = mixed collection (list page + detail page)
-                            #   False = separate collection (list page / detail page)
-                            result.is_mixed = False
-                            if "List" in parser.__business_type__ and hasattr(result, 'contenthtml'):
-                                result.is_mixed = True
-
-                            counter['extract_count'] += 1  # count extracted list entries
-                            if not self.is_duplicate(result):
-                                counter['rel_count'] += 1  # count entries actually stored
-
-                            # enqueue the item for storage (asynchronous)
-                            self._item_buffer.put_item(result)
-
-                        elif callable(result):  # result is a zero-argument callable
-                            if result_type == 2:  # item callback: runs after all buffered items are stored
-                                self._item_buffer.put_item(result)
-
-                            else:  # result_type == 1: request callback: runs after all buffered requests are stored; some parsers return the callback directly
-                                self._request_buffer.put_request(result)
-
-                        elif result is not None:
-                            function_name = "{}.{}".format(
-                                parser.name,
-                                (
-                                    request.callback
-                                    and callable(request.callback)
-                                    and getattr(request.callback, "__name__")
-                                    or request.callback
-                                )
-                                or "parse",
-                            )
-                            raise TypeError(
-                                f"{function_name} result expect Request、Item or callback, but get type: {type(result)}"
-                            )
-
-                except (Exception, BaseException) as e:
-                    exception_type = (
-                        str(type(e)).replace("<class '", "").replace("'>", "")
-                    )
-                    if exception_type.startswith("requests"):
-                        # record a failed download
-                        self.record_download_status(
-                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
-                        )
-
-                    else:
-                        # record a parser exception
-                        self.record_download_status(
-                            PaserControl.PAESERS_EXCEPTION, parser.name
-                        )
-
-                    if setting.LOG_LEVEL == "DEBUG":  # print only in debug mode; timeout tracebacks are too verbose
-                        log.exception(e)
-
-                    log.error(
-                        """
-                        -------------- %s.%s error -------------
-                        error          %s
-                        response       %s
-                        deal request   %s
-                        """
-                        % (
-                            parser.name,
-                            (
-                                request.callback
-                                and callable(request.callback)
-                                and getattr(request.callback, "__name__")
-                                or request.callback
-                            )
-                            or "parse",
-                            str(e),
-                            response,
-                            tools.dumps_json(request.to_dict, indent=28)
-                            if setting.LOG_LEVEL == "DEBUG"
-                            else request,
-                        )
-                    )
-
-                    request.error_msg = "%s: %s" % (exception_type, e)
-                    request.response = str(response)
-
-                    if "Invalid URL" in str(e):
-                        request.is_abandoned = True
-
-                    requests = parser.exception_request(request, response) or [request]
-                    if not isinstance(requests, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代" % (parser.name, "exception_request")
-                        )
-                    for request in requests:
-                        if callable(request):
-                            self._request_buffer.put_request(request)
-                            continue
-
-                        if not isinstance(request, Request):
-                            raise Exception("exception_request 需 yield request")
-
-                        if request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES or request.is_abandoned:
-                            self.__class__._failed_task_count += 1  # count a failed task
-
-                            # handle failed_request's return value: a request or a function
-                            results = parser.failed_request(request, response) or [request]
-                            if not isinstance(results, Iterable):
-                                raise Exception(
-                                    "%s.%s返回值必须可迭代"
-                                    % (parser.name, "failed_request")
-                                )
-
-                            for result in results:
-                                if isinstance(result, Request):
-                                    if setting.SAVE_FAILED_REQUEST:
-                                        if used_download_midware_enable:
-                                            # strip attributes added by download_midware
-                                            original_request = (
-                                                Request.from_dict(
-                                                    eval(request_redis)
-                                                )
-                                                if request_redis
-                                                else result
-                                            )
-                                            original_request.error_msg = request.error_msg
-                                            original_request.response = request.response
-                                            self._request_buffer.put_failed_request(original_request)
-                                        else:
-                                            self._request_buffer.put_failed_request(result)
-
-                                elif callable(result):
-                                    self._request_buffer.put_request(result)
-
-                                elif isinstance(result, Item):
-                                    self._item_buffer.put_item(result)
-
-                        else:
-                            # re-enqueue the request for another crawl attempt
-                            request.retry_times += 1
-                            request.filter_repeat = False
-                            log.info(
-                                """
-                                入库 等待重试
-                                url     %s
-                                重试次数 %s
-                                最大允许重试次数 %s"""
-                                % (
-                                    request.url,
-                                    request.retry_times,
-                                    setting.SPIDER_MAX_RETRY_TIMES,
-                                )
-                            )
-                            if used_download_midware_enable:
-                                # strip attributes added by download_midware; reuse the original request
-                                original_request = (
-                                    Request.from_dict(eval(request_redis))
-                                    if request_redis
-                                    else request
-                                )
-                                if hasattr(request, "error_msg"):
-                                    original_request.error_msg = request.error_msg
-                                if hasattr(request, "response"):
-                                    original_request.response = request.response
-
-                                original_request.retry_times = request.retry_times
-                                original_request.filter_repeat = request.filter_repeat
-                                self._request_buffer.put_request(original_request)
-                            else:
-                                self._request_buffer.put_request(request)
-
-                else:
-                    # record a successful download
-                    self.record_download_status(
-                        PaserControl.DOWNLOAD_SUCCESS, parser.name
-                    )
-                    # count a successful task
-                    self.__class__._success_task_count += 1
-
-                    # cache the successfully downloaded document
-                    if setting.RESPONSE_CACHED_ENABLE:
-                        request.save_cached(
-                            response=response,
-                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
-                        )
-
-                finally:
-                    # release the browser back to the pool
-                    if response and hasattr(response, "browser"):
-                        request._webdriver_pool.put(response.browser)
-
-                    # publish a heartbeat
-                    self.publish_heartbeat(parser, request, response, **counter)
-                break
-
-        if setting.SPIDER_SLEEP_TIME:
-            if (
-                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
-                and len(setting.SPIDER_SLEEP_TIME) == 2
-            ):
-                sleep_time = random.randint(
-                    int(setting.SPIDER_SLEEP_TIME[0]), int(setting.SPIDER_SLEEP_TIME[1])
-                )
-                time.sleep(sleep_time)
-            else:
-                time.sleep(setting.SPIDER_SLEEP_TIME)
-
-    def record_download_status(self, status, spider):
-        """
-        Record the download status of html and other documents
-
-        @return:
-        """
-        metrics.emit_counter(f"{spider}:{status}", 1, classify="document")
-
-    def stop(self):
-        self._thread_stop = True
-        self._started.clear()
-
-    def add_parser(self, parser):
-        self._parsers.append(parser)
-
-    def is_duplicate(self, item):
-        """item入库前是否会被过滤"""
-        if setting.ITEM_FILTER_ENABLE:
-            if self._item_buffer.__class__.dedup.get(item.fingerprint):
-                return True
-        return False
-
-    def publish_heartbeat(self, parser, request, response, **kwargs):
-        request_item = getattr(request, "item")
-        business_type: str = parser.__business_type__  # spider business type
-        if business_type.endswith("List"):
-            site = getattr(parser, "site")
-            spidercode = request_item["code"]
-            count = kwargs["extract_count"]  # number of list entries extracted
-        else:
-            site = request_item["site"]
-            spidercode = request_item["spidercode"]
-            count = 0  # detail-page task totals are aggregated by the heartbeat manager
-
-        run_time = tools.get_current_date(date_format="%Y-%m-%d")  # run date, day granularity
-        heartbeat_item = HeartBeatItem(
-            batch_no=tools.get_md5(spidercode + business_type + run_time),
-            node_ip=tools.os.environ.get("CRAWLAB_SERVER_REGISTER_IP"),  # crawlab node name
-            crawlab_taskid=tools.os.environ.get("CRAWLAB_TASK_ID"),  # id of the crawlab task running this crawl
-            filepath=str(pathlib.Path(setting.sys.argv[0])),  # script file path
-            site=site,
-            channel=request_item["channel"],
-            spidercode=spidercode,
-            business_type=business_type,
-            runtime=run_time,
-            url=request.url,
-            status_code=getattr(response, "status_code", -1),
-            nowpage=kwargs["now_page"],  # 当前列表页页码
-            count=count,
-            failed_retry_times=request.retry_times,  # 失败重试次数
-            rel_count=kwargs["rel_count"],  # 实际入库总数
-            failed_task_count=self._failed_task_count,
-            success_task_count=self._success_task_count,
-            create_at=tools.ensure_int64(tools.get_current_timestamp()),  # 创建时间, 单位:秒
-            expire_at=tools.get_utcnow(),  # 设置utc时间,定期删除(5天)
-        )
-        # total tasks for this run (all requests the spider issued) = failed_task_count + success_task_count
-        heartbeat_item.table_name = setting.SPIDER_HEARTBEAT_RECORD  # set the table name
-        return self._heartbeat_buffer.put_item(heartbeat_item)
-
-
-class AirSpiderParserControl(PaserControl):
-    is_show_tip = False
-
-    # track completed and failed task counts in real time; alert when failed / completed > 0.5
-    _success_task_count = 0
-    _failed_task_count = 0
-
-    def __init__(self, memory_db: MemoryDB, item_buffer: ItemBuffer):
-        super(PaserControl, self).__init__()
-        self._parsers = []
-        self._memory_db = memory_db
-        self._thread_stop = False
-        self._wait_task_time = 0
-        self._item_buffer = item_buffer
-
-    def run(self):
-        while not self._thread_stop:
-            try:
-                request = self._memory_db.get()
-                if not request:
-                    if not self.is_show_tip:
-                        log.debug("等待任务...")
-                        self.is_show_tip = True
-                    continue
-
-                self.is_show_tip = False
-                self.deal_request(request)
-
-            except (Exception, BaseException) as e:
-                log.exception(e)
-
-    def deal_request(self, request):
-        response = None
-
-        for parser in self._parsers:
-            if parser.name == request.parser_name:
-                try:
-                    # record a document that needs downloading
-                    self.record_download_status(
-                        PaserControl.DOWNLOAD_TOTAL, parser.name
-                    )
-
-                    # process the request
-                    if request.auto_request:
-                        request_temp = None
-                        response = None
-
-                        # download middleware
-                        if request.download_midware:
-                            if isinstance(request.download_midware, (list, tuple)):
-                                request_temp = request
-                                for download_midware in request.download_midware:
-                                    download_midware = (
-                                        download_midware
-                                        if callable(download_midware)
-                                        else tools.get_method(
-                                            parser, download_midware
-                                        )
-                                    )
-                                    request_temp = download_midware(request_temp)
-                            else:
-                                download_midware = (
-                                    request.download_midware
-                                    if callable(request.download_midware)
-                                    else tools.get_method(
-                                        parser, request.download_midware
-                                    )
-                                )
-                                request_temp = download_midware(request)
-                        elif request.download_midware != False:
-                            request_temp = parser.download_midware(request)
-
-                        # perform the request
-                        if request_temp:
-                            if (
-                                isinstance(request_temp, (tuple, list))
-                                and len(request_temp) == 2
-                            ):
-                                request_temp, response = request_temp
-
-                            if not isinstance(request_temp, Request):
-                                raise Exception(
-                                    "download_midware need return a request, but received type: {}".format(
-                                        type(request_temp)
-                                    )
-                                )
-                            request = request_temp
-
-                        if not response:
-                            response = (
-                                request.get_response()
-                                if not setting.RESPONSE_CACHED_USED
-                                else request.get_response_from_cached(save_cached=False)
-                            )
-
-                        # validate the response
-                        if parser.validate(request, response) == False:
-                            break
-
-                    else:
-                        response = None
-
-                    if request.callback:  # if the request carries a callback, handle the response with it
-                        callback_parser = (
-                            request.callback
-                            if callable(request.callback)
-                            else tools.get_method(parser, request.callback)
-                        )
-                        results = callback_parser(request, response)
-                    else:  # otherwise fall back to the parser's default parse
-                        results = parser.parse(request, response)
-
-                    if results and not isinstance(results, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代"
-                            % (parser.name, request.callback or "parse")
-                        )
-
-                    # decide whether each result is a request or an item
-                    for result in results or []:
-                        if isinstance(result, Request):
-                            # assign parser_name to the request
-                            result.parser_name = result.parser_name or parser.name
-
-                            # synchronous or asynchronous callback?
-                            if result.request_sync:  # synchronous
-                                self.deal_request(result)
-                            else:  # asynchronous
-                                # enqueue the next request
-                                self._memory_db.add(result)
-
-                        elif isinstance(result, Item):
-                            self._item_buffer.put_item(result)
-
-                        elif result is not None:
-                            function_name = "{}.{}".format(
-                                parser.name,
-                                (
-                                    request.callback
-                                    and callable(request.callback)
-                                    and getattr(request.callback, "__name__")
-                                    or request.callback
-                                )
-                                or "parse",
-                            )
-                            raise TypeError(
-                                f"{function_name} result expect Request or Item, bug get type: {type(result)}"
-                            )
-
-                except Exception as e:
-                    exception_type = (
-                        str(type(e)).replace("<class '", "").replace("'>", "")
-                    )
-                    if exception_type.startswith("requests"):
-                        # record a failed download
-                        self.record_download_status(
-                            PaserControl.DOWNLOAD_EXCEPTION, parser.name
-                        )
-
-                    else:
-                        # record a parser exception
-                        self.record_download_status(
-                            PaserControl.PAESERS_EXCEPTION, parser.name
-                        )
-
-                    if setting.LOG_LEVEL == "DEBUG":  # print only in debug mode; timeout tracebacks are too verbose
-                        log.exception(e)
-
-                    log.error(
-                        """
-                            -------------- %s.%s error -------------
-                            error          %s
-                            response       %s
-                            deal request   %s
-                            """
-                        % (
-                            parser.name,
-                            (
-                                request.callback
-                                and callable(request.callback)
-                                and getattr(request.callback, "__name__")
-                                or request.callback
-                            )
-                            or "parse",
-                            str(e),
-                            response,
-                            tools.dumps_json(request.to_dict, indent=28)
-                            if setting.LOG_LEVEL == "DEBUG"
-                            else request,
-                        )
-                    )
-
-                    request.error_msg = "%s: %s" % (exception_type, e)
-                    request.response = str(response)
-
-                    if "Invalid URL" in str(e):
-                        request.is_abandoned = True
-
-                    requests = parser.exception_request(request, response) or [
-                        request
-                    ]
-                    if not isinstance(requests, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代" % (parser.name, "exception_request")
-                        )
-                    for request in requests:
-                        if not isinstance(request, Request):
-                            raise Exception("exception_request 需 yield request")
-
-                        if (
-                            request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
-                            or request.is_abandoned
-                        ):
-                            self.__class__._failed_task_count += 1  # count a failed task
-
-                            # handle failed_request's return value: a request or a function
-                            results = parser.failed_request(request, response) or [
-                                request
-                            ]
-                            if not isinstance(results, Iterable):
-                                raise Exception(
-                                    "%s.%s返回值必须可迭代"
-                                    % (parser.name, "failed_request")
-                                )
-
-                            log.info(
-                                """
-                                任务超过最大重试次数,丢弃
-                                url     %s
-                                重试次数 %s
-                                最大允许重试次数 %s"""
-                                % (
-                                    request.url,
-                                    request.retry_times,
-                                    setting.SPIDER_MAX_RETRY_TIMES,
-                                )
-                            )
-
-                        else:
-                            # re-enqueue the request for another crawl attempt
-                            request.retry_times += 1
-                            request.filter_repeat = False
-                            log.info(
-                                """
-                                    入库 等待重试
-                                    url     %s
-                                    重试次数 %s
-                                    最大允许重试次数 %s"""
-                                % (
-                                    request.url,
-                                    request.retry_times,
-                                    setting.SPIDER_MAX_RETRY_TIMES,
-                                )
-                            )
-                            self._memory_db.add(request)
-
-                else:
-                    # record a successful download
-                    self.record_download_status(
-                        PaserControl.DOWNLOAD_SUCCESS, parser.name
-                    )
-                    # count a successful task
-                    self.__class__._success_task_count += 1
-
-                    # cache the successfully downloaded document
-                    if setting.RESPONSE_CACHED_ENABLE:
-                        request.save_cached(
-                            response=response,
-                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
-                        )
-
-                finally:
-                    # release the browser back to the pool
-                    if response and hasattr(response, "browser"):
-                        request._webdriver_pool.put(response.browser)
-
-                break
-
-        if setting.SPIDER_SLEEP_TIME:
-            if (
-                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
-                and len(setting.SPIDER_SLEEP_TIME) == 2
-            ):
-                sleep_time = random.randint(
-                    int(setting.SPIDER_SLEEP_TIME[0]), int(setting.SPIDER_SLEEP_TIME[1])
-                )
-                time.sleep(sleep_time)
-            else:
-                time.sleep(setting.SPIDER_SLEEP_TIME)
-
-
-class JySpiderParserControl(PaserControl):
-    is_show_tip = False
-
-    _success_task_count = 0
-    _failed_task_count = 0
-
-    def __init__(self, memory_db: MemoryDB, item_buffer: JyItemBuffer, heartbeat_buffer):
-        super(PaserControl, self).__init__()
-        self._parsers = []
-
-        self._memory_db = memory_db
-        self._item_buffer = item_buffer
-        self._heartbeat_buffer = heartbeat_buffer
-
-        self._thread_stop = False
-        self._selenium_stop = False
-
-    def run(self):
-        while not self._thread_stop:
-            try:
-                request = self._memory_db.get()
-                if not request:
-                    if not self.is_show_tip:
-                        log.debug("等待任务...")
-                        self.is_show_tip = True
-                    continue
-
-                self.is_show_tip = False
-                self.deal_request(request)
-            except (Exception, BaseException) as e:
-                log.exception(e)
-
-            finally:
-                if self._selenium_stop and not self.is_show_tip:
-                    log.debug("暂无可用浏览器,释放任务...")
-                    self._memory_db.clear()
-
-    def deal_request(self, request):
-        response = None
-        now_page = request.page or -1
-
-        for parser in self._parsers:
-            counter = {
-                "now_page": now_page,
-                "extract_count": 0,  # 列表页抽取的列表数量
-                "rel_count": 0,  # 去重后实际入库数量
-            }
-            if parser.name == request.parser_name:
-                try:
-                    # process the request
-                    if request.auto_request:
-                        request_temp = None
-                        response = None
-
-                        # download middleware
-                        if request.download_midware:
-                            if isinstance(request.download_midware, (list, tuple)):
-                                request_temp = request
-                                for download_midware in request.download_midware:
-                                    download_midware = (
-                                        download_midware
-                                        if callable(download_midware)
-                                        else tools.get_method(
-                                            parser, download_midware
-                                        )
-                                    )
-                                    request_temp = download_midware(request_temp)
-                            else:
-                                download_midware = (
-                                    request.download_midware
-                                    if callable(request.download_midware)
-                                    else tools.get_method(
-                                        parser, request.download_midware
-                                    )
-                                )
-                                request_temp = download_midware(request)
-
-                        elif request.download_midware != False:
-                            request_temp = parser.download_midware(request)
-
-                        # perform the request
-                        if request_temp:
-                            if (
-                                isinstance(request_temp, (tuple, list))
-                                and len(request_temp) == 2
-                            ):
-                                request_temp, response = request_temp
-
-                            if not isinstance(request_temp, Request):
-                                raise Exception(
-                                    "download_midware need return a request,"
-                                    "but received type: {}".format(
-                                        type(request_temp)
-                                    )
-                                )
-                            request = request_temp
-
-                        if not response:
-                            response = (
-                                request.get_response()
-                                if not setting.RESPONSE_CACHED_USED
-                                else request.get_response_from_cached(save_cached=False)
-                            )
-
-                        # validate the response
-                        if parser.validate(request, response) == False:
-                            break
-
-                    else:
-                        response = None
-
-                    if request.callback:  # if the request carries a callback, handle the response with it
-                        callback_parser = (
-                            request.callback
-                            if callable(request.callback)
-                            else tools.get_method(parser, request.callback)
-                        )
-                        results = callback_parser(request, response)
-                    else:  # otherwise fall back to the parser's default parse
-                        results = parser.parse(request, response)
-
-                    if results and not isinstance(results, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代"
-                            % (parser.name, request.callback or "parse")
-                        )
-
-                    # decide whether each result is a request or an item
-                    for result in results or []:
-                        if isinstance(result, Request):
-                            # assign parser_name to the request
-                            result.parser_name = result.parser_name or parser.name
-
-                            # synchronous or asynchronous callback?
-                            if result.request_sync:  # synchronous
-                                self.deal_request(result)
-                            else:  # asynchronous
-                                # enqueue the next request
-                                self._memory_db.add(result)
-
-                        elif isinstance(result, Item):
-                            # collection mode [True = mixed (list page + detail page); False = separate (list page / detail page)]
-                            result.is_mixed = False
-                            if "List" in parser.__business_type__ and hasattr(result, "contenthtml"):
-                                result.is_mixed = True
-
-                            counter["extract_count"] += 1  # count extracted list entries
-                            if not self.is_duplicate(result):
-                                counter["rel_count"] += 1  # count entries actually stored
-
-                            self._item_buffer.put_item(result)
-
-                        elif result is not None:
-                            function_name = "{}.{}".format(
-                                parser.name,
-                                (
-                                    request.callback
-                                    and callable(request.callback)
-                                    and getattr(request.callback, "__name__")
-                                    or request.callback
-                                ) or "parse",
-                            )
-                            raise TypeError(
-                                f"{function_name} result expect Request or Item, bug get type: {type(result)}"
-                            )
-
-                except (Exception, BaseException) as e:
-                    exception_type = (
-                        str(type(e)).replace("<class '", "").replace("'>", "")
-                    )
-
-                    if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印, 超时的异常篇幅太多
-                        log.exception(e)
-
-                    log.error(
-                        """
-                            -------------- %s.%s error -------------
-                            error          %s
-                            response       %s
-                            deal request   %s
-                        """
-                        % (
-                            parser.name,
-                            (
-                                request.callback
-                                and callable(request.callback)
-                                and getattr(request.callback, "__name__")
-                                or request.callback
-                            ) or "parse",
-                            str(e),
-                            response,
-                            tools.dumps_json(request.to_dict, indent=28)
-                            if setting.LOG_LEVEL == "DEBUG"
-                            else request,
-                        )
-                    )
-
-                    request.error_msg = "%s: %s" % (exception_type, e)
-                    request.response = str(response)
-
-                    if "Invalid URL" in str(e):
-                        request.is_abandoned = True
-
-                    if exception_type == "selenium.common.exceptions.InvalidSessionIdException":
-                        self._selenium_stop = True  # TODO: no fix yet; the only workaround is rebuilding the spider instance to open a new session. See: https://github.com/SeleniumHQ/docker-selenium/issues/2153
-                        _id = str(e.args[0]).split()[14] if len(str(e.args[0]).split()) > 14 else ""
-                        raise IOError(
-                            "%s 远程调用超时, session_id: %s"
-                            % ("selenium.driver.session", _id)
-                        )
-
-                    requests = parser.exception_request(request, response) or [
-                        request
-                    ]
-                    if not isinstance(requests, Iterable):
-                        raise Exception(
-                            "%s.%s返回值必须可迭代"
-                            % (parser.name, "exception_request")
-                        )
-
-                    for request in requests:
-                        if not isinstance(request, Request):
-                            raise Exception("exception_request 需 yield request")
-
-                        if (
-                            request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
-                            or request.is_abandoned
-                        ):
-                            self.__class__._failed_task_count += 1  # count a failed task
-
-                            # handle failed_request's return value: a request or a function
-                            results = parser.failed_request(request, response) or [
-                                request
-                            ]
-                            if not isinstance(results, Iterable):
-                                raise Exception(
-                                    "%s.%s返回值必须可迭代"
-                                    % (parser.name, "failed_request")
-                                )
-
-                            log.info(
-                                """
-                                任务超过最大重试次数,丢弃
-                                url     %s
-                                重试次数 %s
-                                最大允许重试次数 %s"""
-                                % (
-                                    request.url,
-                                    request.retry_times,
-                                    setting.SPIDER_MAX_RETRY_TIMES,
-                                )
-                            )
-
-                        else:
-                            # re-enqueue the request for another crawl attempt
-                            request.retry_times += 1
-                            request.filter_repeat = False
-                            log.info(
-                                """
-                                    入库 等待重试
-                                    url     %s
-                                    重试次数 %s
-                                    最大允许重试次数 %s"""
-                                % (
-                                    request.url,
-                                    request.retry_times,
-                                    setting.SPIDER_MAX_RETRY_TIMES,
-                                )
-                            )
-                            self._memory_db.add(request)
-
-                else:
-                    # count a successful task
-                    self.__class__._success_task_count += 1
-
-                    # cache the successfully downloaded document
-                    if setting.RESPONSE_CACHED_ENABLE:
-                        request.save_cached(
-                            response=response,
-                            expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
-                        )
-
-                finally:
-                    # release the browser back to the pool
-                    if response and getattr(response, "browser", None):
-                        request._webdriver_pool.put(response.browser)
-
-                    self.publish_heartbeat(parser, request, response, **counter)
-
-                break
-
-        if setting.SPIDER_SLEEP_TIME:
-            if (
-                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
-                and len(setting.SPIDER_SLEEP_TIME) == 2
-            ):
-                sleep_time = random.randint(
-                    int(setting.SPIDER_SLEEP_TIME[0]),
-                    int(setting.SPIDER_SLEEP_TIME[1])
-                )
-                time.sleep(sleep_time)
-            else:
-                time.sleep(setting.SPIDER_SLEEP_TIME)

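For reference: the download-middleware contract that all three parser controls above implement is that request.download_midware may be a callable or the name of a parser method, may be given as a list applied in order, and may return either a request or a (request, response) pair that skips the download step. Below is a reduced sketch of that resolution logic under toy stand-ins (FakeRequest, FakeParser and resolve_middleware are illustrative names, not framework APIs):

```python
class FakeRequest:
    def __init__(self, url):
        self.url = url

    def get_response(self):
        # stand-in for Request.get_response()
        return "<response of %s>" % self.url


class FakeParser:
    def add_token(self, request):
        # example middleware: mutate the request before download
        request.url += "?token=abc"
        return request


def resolve_middleware(parser, request, midware):
    """Apply one middleware or a list of them, mirroring the contract above."""
    if isinstance(midware, (list, tuple)):
        for mw in midware:
            mw = mw if callable(mw) else getattr(parser, mw)
            request = mw(request)
        return request, None
    mw = midware if callable(midware) else getattr(parser, midware)
    result = mw(request)
    if isinstance(result, tuple) and len(result) == 2:
        return result  # middleware already produced a response
    return result, None


request, response = resolve_middleware(FakeParser(), FakeRequest("https://example.com"), "add_token")
response = response or request.get_response()
print(request.url, "->", response)
```
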
+ 0 - 416
spider_frame/FworkSpider/feapder/core/scheduler.py

@@ -1,416 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2017-01-09 10:38
----------
-@summary: assembles parser, parser_control and collector
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import threading
-import time
-from collections.abc import Iterable
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.item_buffer import ItemBuffer
-from feapder.buffer.request_buffer import RequestBuffer
-from feapder.buffer.heartbeat_buffer import HeartBeatBuffer
-from feapder.core.base_parser import BaseParser
-from feapder.core.collector import Collector
-from feapder.core.handle_failed_items import HandleFailedItems
-from feapder.core.handle_failed_requests import HandleFailedRequests
-from feapder.core.parser_control import PaserControl
-from feapder.db.rabbitMq import RabbitMQ
-from feapder.network.item import Item
-from feapder.network.request import Request
-from feapder.utils import metrics
-from feapder.utils.log import log
-
-
-class Scheduler(threading.Thread):
-    __custom_setting__ = {}
-
-    def __init__(
-        self,
-        redis_key=None,
-        user=None,
-        thread_count=None,
-        begin_callback=None,
-        end_callback=None,
-        keep_alive=None,
-        auto_start_requests=None,
-        **kwargs
-    ):
-        """
-        @summary: 调度器
-        ---------
-        @param redis_key: 爬虫request及item存放redis中的文件夹
-        @param user: 指定mq特定的程序消费用户标识
-        @param thread_count: 线程数,默认为配置文件中的线程数
-        @param begin_callback: 爬虫开始回调函数
-        @param end_callback: 爬虫结束回调函数
-        @param keep_alive: 爬虫是否常驻,默认否
-        @param auto_start_requests: 爬虫是否自动添加任务
-        ---------
-        @result:
-        """
-
-        super(Scheduler, self).__init__()
-
-        for key, value in self.__class__.__custom_setting__.items():
-            if key == "AUTO_STOP_WHEN_SPIDER_DONE":  # 兼容老版本的配置
-                setattr(setting, "KEEP_ALIVE", not value)
-            else:
-                setattr(setting, key, value)
-
-        self._redis_key = redis_key or setting.REDIS_KEY
-        if not self._redis_key:
-            raise Exception(
-                """
-                redis_key 为redis中存放request与item的目录。不能为空,
-                可在setting中配置,如 REDIS_KEY = 'test'
-                或spider初始化时传参, 如 TestSpider(redis_key='test')
-                """
-            )
-
-        self._rabbitmq = RabbitMQ()
-        self._request_buffer = RequestBuffer(redis_key, user=user)
-        self._item_buffer = ItemBuffer(redis_key, user=user)
-        self._collector = Collector(redis_key, user=user)
-        self._heartbeat_buffer = HeartBeatBuffer(redis_key)
-
-        self._parsers = []
-        self._parser_controls = []
-        self._parser_control_obj = PaserControl
-
-        # backward compatibility with old-style arguments
-        if "auto_stop_when_spider_done" in kwargs:
-            self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
-        else:
-            self._keep_alive = (
-                keep_alive if keep_alive is not None else setting.KEEP_ALIVE
-            )
-
-        self._auto_start_requests = (
-            auto_start_requests
-            if auto_start_requests is not None
-            else setting.SPIDER_AUTO_START_REQUESTS
-        )
-
-        self._begin_callback = (
-            begin_callback
-            if begin_callback
-            else lambda: log.info("\n********** feapder begin **********")
-        )
-        self._end_callback = (
-            end_callback
-            if end_callback
-            else lambda: log.info("\n********** feapder end **********")
-        )
-
-        self._thread_count = (
-            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
-        )
-
-        self._spider_id = tools.get_uuid(redis_key, tools.get_current_date())
-        self._spider_name = redis_key
-
-        self._is_notify_end = False  # whether the end notification has been sent
-
-        # Request cache settings
-        Request.cached_redis_key = redis_key
-        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME
-
-        self._last_check_task_status_time = 0
-        self._user = user
-
-        self.init_metrics()
-
-    def init_metrics(self):
-        """
-        Initialize the metrics system
-        """
-        metrics.init(**setting.METRICS_OTHER_ARGS)
-
-    def add_parser(self, parser):
-        parser = parser()  # instantiate the parser
-        if isinstance(parser, BaseParser):
-            self._parsers.append(parser)
-        else:
-            raise ValueError("类型错误,爬虫需继承feapder.BaseParser")
-
-    def _start(self):
-        self.spider_begin()  # spider start -- start_callback
-
-        # re-store previously failed items
-        if setting.RETRY_FAILED_ITEMS:
-            handle_failed_items = HandleFailedItems(
-                redis_key=self._redis_key,
-                item_buffer=self._item_buffer,
-                rabbitmq=self._rabbitmq,
-                user=self._user
-            )
-            handle_failed_items.reput_failed_items_to_db()
-
-        self._heartbeat_buffer.start()  # heartbeat manager
-
-        # STEP 3.1 start request_buffer -- task manager that buffers requests being written to the database
-        self._request_buffer.start()
-        # STEP 3.2 start item_buffer -- pipeline manager that buffers collected data being written to the database
-        self._item_buffer.start()
-        # STEP 3.3 start collector -- task manager that distributes tasks
-        self._collector.start()
-
-        # start the parser control thread pool
-        for i in range(self._thread_count):
-            # STEP 3.4 create a task-execution thread
-            parser_control = self._parser_control_obj(
-                self._collector,
-                self._redis_key,
-                self._request_buffer,
-                self._item_buffer,
-                self._heartbeat_buffer
-            )
-
-            for parser in self._parsers:  # step 3.5 register every parser with the thread pool
-                parser_control.add_parser(parser)
-
-            parser_control.start()  # STEP 3.6 start the crawl thread
-            self._parser_controls.append(parser_control)
-
-        # STEP 3.7 dispatch tasks; start consuming once worker threads exist
-        if setting.RETRY_FAILED_REQUESTS:
-            # reset failed tasks
-            handle_failed_requests = HandleFailedRequests(
-                redis_key=self._redis_key,
-                rabbitmq=self._rabbitmq,
-                user=self._user
-            )
-            handle_failed_requests.reput_failed_requests_to_requests()
-
-        # STEP 3.8 dispatch and produce new tasks
-        if self._auto_start_requests:
-            self.__add_task()
-
-    def run(self):
-        self._start()
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    if not self._is_notify_end:
-                        self.spider_end()  # spider finished
-                        self._is_notify_end = True
-
-                    if not self._keep_alive:  # not a resident spider: stop all threads
-                        self._stop_all_thread()
-                        break
-
-                else:
-                    self._is_notify_end = False
-
-                self.check_task_status()
-            except (Exception, BaseException) as e:
-                log.exception(e)
-
-            tools.delay_time(1)
-
-    def __add_task(self):
-        # check whether the task pool still holds tasks: resume them if so, otherwise produce new ones
-        todo_task_count = self._collector.get_requests_count()
-        if todo_task_count:
-            log.info("检查到有待做任务 %s 条,不重下发新任务,将接着上回异常终止处继续抓取" % todo_task_count)
-        else:
-            for parser in self._parsers:
-                results = parser.start_requests()
-                # add the request to the request queue; the queue handles storage uniformly
-                if results and not isinstance(results, Iterable):
-                    raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
-
-                result_type = 1
-                for result in results or []:  # step: dispatch on the type of each yielded value
-                    if isinstance(result, Request):  # Request: add to the task queue
-                        result.parser_name = result.parser_name or parser.name
-                        self._request_buffer.put_request(result)
-                        result_type = 1
-
-                    elif isinstance(result, Item):  # Item: push to the data pipeline queue and wait for storage
-                        self._item_buffer.put_item(result)
-                        result_type = 2
-
-                    elif callable(result):  # a callable may be a function that updates the database
-                        if result_type == 1:
-                            self._request_buffer.put_request(result)
-                        else:
-                            self._item_buffer.put_item(result)
-                    else:
-                        raise TypeError(
-                            "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
-                                type(result)
-                            )
-                        )
-
-                self._request_buffer.flush()
-                self._item_buffer.flush()
-
-    def all_thread_is_done(self):
-        # repeat the check to reduce flakiness: the stages are not concurrent, so a state observed as idle may turn busy on the next check; a single pass can easily hit that race
-        for i in range(5):
-            # STEP 5.1 check collector state
-            if (
-                self._collector.is_collector_task()
-                or self._collector.get_requests_count() > 0
-            ):
-                return False
-
-            # STEP 5.2 check parser_control state
-            for parser_control in self._parser_controls:
-                if not parser_control.is_not_task():
-                    return False
-
-            # STEP 5.3 check item_buffer state
-            if (
-                self._item_buffer.get_items_count() > 0
-                or self._item_buffer.is_adding_to_db()
-            ):
-                return False
-
-            # STEP 5.4 check request_buffer state
-            if (
-                self._request_buffer.get_requests_count() > 0
-                or self._request_buffer.is_adding_to_db()
-            ):
-                return False
-
-            # check heartbeat_buffer state
-            if (
-                self._heartbeat_buffer.get_items_count() > 0
-                or self._heartbeat_buffer.is_adding_to_db()
-            ):
-                return False
-
-            tools.delay_time(1)  # sleep for 1 second
-
-        return True
-
-    @tools.run_safe_model("check_task_status")
-    def check_task_status(self):
-        """
-        Check task status and raise warnings
-        """
-        # step: run at most once per minute
-        now_time = time.time()
-        if now_time - self._last_check_task_status_time > 60:
-            self._last_check_task_status_time = now_time
-        else:
-            return
-
-        # check the number of failed tasks; alert when it exceeds the threshold (1000 by default)
-        failed_count = self._request_buffer.get_failed_requests_count()
-        if failed_count > setting.WARNING_FAILED_COUNT:
-            # send an alert
-            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
-            log.error(msg)
-            self.send_msg(
-                msg,
-                level="error",
-                message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
-            )
-
-        # parser_control tracks completed and failed task counts in real time; alert when the success rate drops below 0.5
-        failed_task_count, success_task_count = PaserControl.get_task_status_count()
-        total_count = success_task_count + failed_task_count
-        if total_count > 0:
-            task_success_rate = success_task_count / total_count
-            if task_success_rate < 0.5:
-                # send an alert
-                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
-                    self._spider_name,
-                    success_task_count,
-                    failed_task_count,
-                    task_success_rate,
-                )
-                log.error(msg)
-                self.send_msg(
-                    msg,
-                    level="error",
-                    message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
-                )
-
-        # check how many times data export has failed
-        if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
-            msg = "《{}》爬虫导出数据失败,失败次数:{}, 请检查爬虫是否正常".format(
-                self._spider_name, self._item_buffer.export_falied_times
-            )
-            log.error(msg)
-            self.send_msg(
-                msg,
-                level="error",
-                message_prefix="《%s》爬虫导出数据失败" % (self._spider_name)
-            )
-
-    def _stop_all_thread(self):
-        # stop the task (request) buffer
-        self._request_buffer.stop()
-        # stop the data pipeline
-        self._item_buffer.stop()
-        # stop the task collector
-        self._collector.stop()
-        # stop the parser_controls
-        for parser_control in self._parser_controls:
-            parser_control.stop()
-
-        # stop the heartbeat manager
-        self._heartbeat_buffer.stop()
-        # mark the spider as stopped
-        self._started.clear()
-
-    def send_msg(self, msg, level="debug", message_prefix=""):
-        # log.debug("发送报警 level:{} msg{}".format(level, msg))
-        tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
-
-    def spider_begin(self):
-        """
-        @summary: when started via start_monitor_task, this function runs in a different process from spider_end, so variables cannot be shared
-        ---------
-        ---------
-        @result:
-        """
-        if self._begin_callback:
-            self._begin_callback()
-
-        for parser in self._parsers:
-            parser.start_callback()  # task start callback
-
-    def spider_end(self):
-        if self._end_callback:  # task end callback
-            self._end_callback()
-
-        for parser in self._parsers:
-            if not self._keep_alive:
-                parser.close()  # 爬虫自定义 close
-
-            parser.end_callback()  # 调用结束回调函数
-
-        if not self._keep_alive:
-            if Request.webdriver_pool:
-                Request.webdriver_pool.close()  # 关闭 webdriver 管理池
-
-            metrics.close()  # 关闭打点
-        else:
-            metrics.flush()
-
-        if self._keep_alive:
-            log.info("爬虫不自动结束,等待下一轮任务...")
-        else:
-            log.info("《%s》爬虫结束" % (self._spider_name,))
-
-    def join(self, timeout=None):
-        """
-        重写线程的join
-        """
-        if not self._started.is_set():
-            return
-
-        super().join()
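
For reference, the success-rate alarm that check_task_status implemented reduces to a single threshold test. A minimal standalone sketch (the function name is illustrative, not part of feapder):

    def should_alarm(success_count, failed_count, min_rate=0.5):
        # Nothing finished yet -- nothing to evaluate
        total = success_count + failed_count
        if total == 0:
            return False
        return success_count / total < min_rate

    assert should_alarm(4, 6) is True    # 40% success rate -> alarm
    assert should_alarm(6, 4) is False   # 60% success rate -> healthy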

+ 0 - 27
spider_frame/FworkSpider/feapder/core/spiders/__init__.py

@@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/22 12:08 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-__all__ = [
-    "AirSpider",
-    "Spider",
-    "BiddingListSpider",
-    "BiddingDetailSpider",
-    "PlanToBuildListSpider",
-    "PlanToBuildDetailSpider",
-]
-
-from feapder.core.spiders.air_spider import AirSpider
-from feapder.core.spiders.spider import (
-    Spider,
-    BiddingListSpider,
-    BiddingDetailSpider,
-    PlanToBuildListSpider,
-    PlanToBuildDetailSpider
-)
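
This __init__ only re-exported the spider base classes; downstream code imported them via the package path, e.g. (a minimal sketch of the old import surface):

    from feapder.core.spiders import BiddingListSpider, BiddingDetailSpider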

+ 0 - 128
spider_frame/FworkSpider/feapder/core/spiders/air_spider.py

@@ -1,128 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/22 12:05 AM
----------
-@summary: 基于内存队列的爬虫,不支持分布式
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-from threading import Thread
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.item_buffer import ItemBuffer
-from feapder.core.base_parser import BaseParser
-from feapder.core.parser_control import AirSpiderParserControl
-from feapder.db.memory_db import MemoryDB
-from feapder.network.request import Request
-from feapder.utils import metrics
-from feapder.utils.log import log
-
-
-class AirSpider(BaseParser, Thread):
-    __custom_setting__ = {}
-
-    def __init__(self, thread_count=None):
-        """
-        基于内存队列的爬虫,不支持分布式
-        :param thread_count: 线程数
-        """
-        super(AirSpider, self).__init__()
-
-        for key, value in self.__class__.__custom_setting__.items():
-            setattr(setting, key, value)
-
-        self._thread_count = (
-            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
-        )
-
-        self._memory_db = MemoryDB()
-        self._parser_controls = []
-        self._item_buffer = ItemBuffer(redis_key="air_spider")
-
-        metrics.init(**setting.METRICS_OTHER_ARGS)
-
-    def distribute_task(self):
-        try:
-            for request in self.start_requests():
-                if not isinstance(request, Request):
-                    raise ValueError("仅支持 yield Request")
-
-                request.parser_name = request.parser_name or self.name
-                self._memory_db.add(request)
-        except IOError:
-            log.error("distribute task failed")
-
-    def all_thread_is_done(self):
-        for i in range(3):  # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
-            # 检测 parser_control 状态
-            for parser_control in self._parser_controls:
-                if not parser_control.is_not_task():
-                    return False
-
-            # 检测 任务队列 状态
-            if not self._memory_db.empty():
-                return False
-
-            # 检测 item_buffer 状态
-            if (
-                self._item_buffer.get_items_count() > 0
-                or self._item_buffer.is_adding_to_db()
-            ):
-                return False
-
-            tools.delay_time(1)
-
-        return True
-
-    def run(self):
-        self.start_callback()
-
-        for i in range(self._thread_count):
-            parser_control = AirSpiderParserControl(self._memory_db, self._item_buffer)
-            parser_control.add_parser(self)
-            parser_control.start()
-            self._parser_controls.append(parser_control)
-
-        self._item_buffer.start()
-
-        self.distribute_task()
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    # 停止 parser_controls
-                    for parser_control in self._parser_controls:
-                        parser_control.stop()
-
-                    # 关闭item_buffer
-                    self._item_buffer.stop()
-
-                    # 关闭webdirver
-                    if Request.webdriver_pool:
-                        Request.webdriver_pool.close()
-
-                    log.info("无任务,爬虫结束")
-                    break
-
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
-
-        self.end_callback()
-        # 为了线程可重复start
-        self._started.clear()
-        # 关闭打点
-        metrics.close()
-
-    def join(self, timeout=None):
-        """
-        重写线程的join
-        """
-        if not self._started.is_set():
-            return
-
-        super().join()
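
For context, an AirSpider was typically driven as below; a minimal sketch assuming the upstream feapder API, with a placeholder URL:

    import feapder

    class DemoSpider(feapder.AirSpider):
        def start_requests(self):
            yield feapder.Request("https://example.com")

        def parse(self, request, response):
            # response wraps the HTTP reply and exposes xpath/css helpers
            print(response.xpath("//title/text()").extract_first())

    if __name__ == "__main__":
        DemoSpider(thread_count=4).start()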

+ 0 - 274
spider_frame/FworkSpider/feapder/core/spiders/spider.py

@@ -1,274 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2024-08-19 
----------
-@summary:  
----------
-@author: Dzr
-"""
-
-from threading import Thread
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.buffer.heartbeat_buffer import HeartBeatBuffer
-from feapder.buffer.item_buffer import JyItemBuffer
-from feapder.core.base_parser import BaseParser
-from feapder.core.parser_control import JySpiderParserControl
-from feapder.db.memory_db import MemoryDB
-from feapder.network.item import FailedTaskItem
-from feapder.network.request import Request
-from feapder.utils.log import log
-
-
-class Spider(BaseParser, Thread):
-    __custom_setting__ = {}
-
-    __business_type__ = ""
-
-    def __init__(self, redis_key, thread_count=None, **kwargs):
-        """
-
-        ---------
-        @param redis_key:
-        @param thread_count: 线程数,默认为配置文件中的线程数
-        ---------
-        """
-        super(Spider, self).__init__()
-
-        for key, value in self.__class__.__custom_setting__.items():
-            setattr(setting, key, value)
-
-        self._thread_count = (
-            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
-        )
-
-        self._heartbeat_buffer = HeartBeatBuffer()
-        self._item_buffer = JyItemBuffer(redis_key=redis_key)
-
-        self._memory_db = MemoryDB()
-        self._parser_controls = []  # 爬虫实例列表
-
-        self.tasks_dict = {}
-        self.task_api_auth_token = None
-
-    def distribute_task(self):
-        try:
-            for request in self.start_requests():
-                if not isinstance(request, Request):
-                    raise ValueError("仅支持 yield Request")
-
-                request.parser_name = request.parser_name or self.name
-                self._memory_db.add(request)
-        except IOError:
-            log.error("distribute task failed")
-
-    def all_thread_is_done(self):
-        for i in range(3):
-            # 检测 heartbeat_buffer 状态
-            if (
-                self._heartbeat_buffer.get_items_count() > 0
-                or self._heartbeat_buffer.is_adding_to_db()
-            ):
-                return False
-
-            # 检测 parser_control 状态
-            for parser_control in self._parser_controls:
-                if not parser_control.is_not_task():
-                    return False
-
-            # 检测 任务队列 状态
-            if not self._memory_db.empty():
-                return False
-
-            # 检测 item_buffer 状态
-            if (
-                self._item_buffer.get_items_count() > 0
-                or self._item_buffer.is_adding_to_db()
-            ):
-                return False
-
-            tools.delay_time(1)
-
-        return True
-
-    def register_task_api_token(self):
-        if self.task_api_auth_token is None:
-            token_url = f"{setting.JY_TASK_URL}/tasks/token"
-            data = {"username": "spider@py", "password": "123@qweA!"}
-            auth_params = dict(url=token_url, timeout=10, data=data, proxies=False)
-            response = Request(method="GET", **auth_params).get_response(show_log=False)
-            token = response.json["token"]
-            self.task_api_auth_token = token
-
-        log.debug(f"register api token:{self.task_api_auth_token}")
-
-    def run(self):  # 调度控制流程起始
-        self.start_callback()
-        # 启动 heartbeat_buffer
-        self._heartbeat_buffer.start()
-        # 启动线程池
-        for i in range(self._thread_count):
-            parser_control = JySpiderParserControl(
-                memory_db=self._memory_db,
-                item_buffer=self._item_buffer,
-                heartbeat_buffer=self._heartbeat_buffer
-            )
-            parser_control.add_parser(self)
-            parser_control.start()
-            self._parser_controls.append(parser_control)
-        # 启动任务缓存模块
-        self._item_buffer.start()
-        # 注册任务中心token
-        self.register_task_api_token()
-        # 派发任务
-        self.distribute_task()
-        # 派发任务加入 item_buffer 缓存容器(派发任务通过任务中心领取)
-        self._item_buffer.tasks_dict.update(self.tasks_dict)
-
-        while True:
-            try:
-                if self.all_thread_is_done():
-                    # 停止 parser_controls
-                    for parser_control in self._parser_controls:
-                        parser_control.stop()
-
-                    self._item_buffer.stop()  # 关闭 item_buffer
-                    self._heartbeat_buffer.stop()  # 关闭 heartbeat_buffer
-
-                    # 关闭 webdriver
-                    if Request.webdriver_pool:
-                        Request.webdriver_pool.close()
-
-                    log.info("无任务,爬虫结束")
-                    break
-
-            except Exception as e:
-                log.exception(e)
-
-            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
-
-        # 释放剩余未完成的任务
-        self._item_buffer.release_tasks(self.tasks_dict, finished=False)
-
-        self.end_callback()
-        self._started.clear()  # 为了线程可重复start
-
-    def join(self, timeout=None):
-        """
-        重写线程的join
-        """
-        if not self._started.is_set():
-            return
-
-        super().join()
-
-
-class BaseBusinessListSpider(Spider):
-    """列表页采集基础爬虫"""
-
-    __business_type__ = "List"
-
-    def infinite_pages(self, request, response):
-        """无限翻页"""
-
-        def _page_increment():
-            if request.page is None:
-                raise ValueError("请设置 request.page 起始页码数")
-
-            if request.page < int(request.item["crawl_page"]):
-                request.page += 1  # 采集页码自增
-                yield request
-
-        return next(_page_increment(), None)
-
-
-class BaseBusinessDetailSpider(Spider):
-    """详情页采集基础爬虫"""
-
-    __business_type__ = "Detail"
-    __business_setting__ = dict(
-        ITEM_FILTER_ENABLE=False
-    )
-
-    def __init__(self, redis_key, thread_count=None, **kwargs):
-        self.__class__.__custom_setting__.update(
-            self.__class__.__business_setting__
-        )
-        super(BaseBusinessDetailSpider, self).__init__(
-            redis_key=redis_key,
-            thread_count=thread_count,
-            **kwargs
-        )
-
-        self._redis_key = redis_key
-
-    def failed_request(self, request, response):
-        """请求、解析错误次数超过上限后,记录错误详情信息"""
-        failed_item = FailedTaskItem(
-            reason=getattr(request, "error_msg", ""),
-            status_code=getattr(response, "status_code", -1),
-            **request.item,  # 请求失败的任务详情
-        )
-        failed_item.table_name = setting.TASK_REQUEST_FAILED
-        yield failed_item
-
-    def get_tasks(self, limit=None, **kwargs):
-        timeout = kwargs.pop("timeout", 10)
-        queue = setting.TAB_ITEMS.format(redis_key=self._redis_key.replace("_detailc", ""))
-
-        # 获取任务
-        url = f"{setting.JY_TASK_URL}/tasks/fd?qn={queue}&limit={limit}"
-        headers = {"Authorization": self.task_api_auth_token}
-        params = dict(headers=headers, timeout=timeout, proxies=False)
-        response = Request(method="GET", url=url, **params).get_response()
-        ret = response.json["task"]
-        self.tasks_dict = {
-            "token": self.task_api_auth_token,
-            "data": {t["pyuuid"]: {"tid": t["tid"], "queue": queue} for t in ret}
-        }
-        return ret
-
-    get_tasks_by_rabbitmq = get_tasks
-
-    def get_tasks_by_mongodb(self, table=None, query=None, limit=None):
-        pipeline_path = "feapder.pipelines.mongo_pipeline.TaskPipeline"
-        pipeline = tools.import_cls(pipeline_path)()
-        table = table or setting.TASK_REQUEST_PRODUCE
-        queue_name = setting.TAB_ITEMS.format(
-            redis_key=self._redis_key.replace('_detailc', '')
-        )
-        conditions = query or {
-            'state': {'$in': [1, 3]},
-            'queue_name': queue_name,
-            'update_at': {'$lt': tools.get_current_timestamp()}
-        }
-        limit = limit or setting.COLLECTOR_TASK_COUNT
-        results = pipeline.find_items(table, conditions, limit)
-        ignore = {'_id', 'state', 'update_at', 'queue_name'}
-        task_lst = [{k: v for k, v in items.items() if k not in ignore} for items in results]
-        return task_lst
-
-
-class BiddingListSpider(BaseBusinessListSpider):
-    """标讯列表页采集爬虫"""
-
-    __business_type__ = "BiddingList"
-
-
-class BiddingDetailSpider(BaseBusinessDetailSpider):
-    """标讯详情页采集爬虫"""
-
-    __business_type__ = "BiddingDetail"
-
-
-class PlanToBuildListSpider(BaseBusinessListSpider):
-    """拟建列表页采集爬虫"""
-
-    __business_type__ = "PlanToBuildList"
-
-
-class PlanToBuildDetailSpider(BaseBusinessDetailSpider):
-    """拟建详情页采集爬虫"""
-
-    __business_type__ = "PlanToBuildDetail"

+ 0 - 9
spider_frame/FworkSpider/feapder/db/__init__.py

@@ -1,9 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/23 12:09 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""

+ 0 - 40
spider_frame/FworkSpider/feapder/db/memory_db.py

@@ -1,40 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/4/21 11:42 PM
----------
-@summary: 基于内存的队列,代替redis
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-from queue import PriorityQueue
-
-
-class MemoryDB:
-    def __init__(self):
-        self.priority_queue = PriorityQueue()
-
-    def add(self, item):
-        """
-        添加任务
-        :param item: 数据: 支持小于号比较的类 或者 (priority, item)
-        :return:
-        """
-        self.priority_queue.put(item)
-
-    def get(self):
-        """
-        获取任务
-        :return:
-        """
-        try:
-            item = self.priority_queue.get_nowait()
-            return item
-        except:
-            return
-
-    def empty(self):
-        return self.priority_queue.empty()
-
-    def clear(self):
-        self.priority_queue = PriorityQueue()
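
Usage was a thin wrapper over queue.PriorityQueue; a minimal sketch using illustrative (priority, payload) tuples:

    from feapder.db.memory_db import MemoryDB

    db = MemoryDB()
    db.add((2, "low-priority job"))
    db.add((1, "high-priority job"))  # tuples compare on the first element
    print(db.get())    # -> (1, 'high-priority job')
    print(db.get())    # -> (2, 'low-priority job')
    print(db.get())    # -> None (queue drained)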

+ 0 - 427
spider_frame/FworkSpider/feapder/db/mongodb.py

@@ -1,427 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: 操作mongo数据库
----------
-@author: Mkdir700
-@email:  mkdir700@gmail.com
-"""
-import re
-from typing import List, Dict, Optional
-from urllib import parse
-
-import pymongo
-from pymongo import MongoClient
-from pymongo.collection import Collection
-from pymongo.database import Database
-from pymongo.errors import DuplicateKeyError, BulkWriteError
-
-import feapder.setting as setting
-from feapder.utils.log import log
-
-
-class MongoDB:
-    def __init__(
-        self,
-        ip=None,
-        port=None,
-        db=None,
-        user_name=None,
-        user_pass=None,
-        url=None,
-        max_pool_size=5,  # 默认 100
-        **kwargs,
-    ):
-        if url:
-            config = dict(host=url)
-        else:
-            if not ip:
-                ip = setting.MONGO_IP
-            if not port:
-                port = setting.MONGO_PORT
-            if not user_name:
-                user_name = setting.MONGO_USER_NAME
-            if not user_pass:
-                user_pass = setting.MONGO_USER_PASS
-
-            config = dict(host=ip, port=port, username=user_name, password=user_pass)
-
-        if "maxPoolSize" not in kwargs:
-            kwargs["maxPoolSize"] = max_pool_size
-
-        if "uuidRepresentation" not in kwargs:
-            kwargs["uuidRepresentation"] = "standard"  # 设置为 standard 以实现跨语言兼容性
-
-        self.client = MongoClient(**config, **kwargs)
-        self.db = self.get_database((db or setting.MONGO_DB))
-
-        # 缓存索引信息
-        self.__index__cached = {}
-
-    @classmethod
-    def from_url(cls, url, **kwargs):
-        """
-        Args:
-            url: mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
-                 参考:http://mongodb.github.io/mongo-java-driver/3.4/javadoc/com/mongodb/MongoClientURI.html
-            **kwargs:
-
-        Returns:
-
-        """
-        url_parsed = parse.urlparse(url)
-
-        db_type = url_parsed.scheme.strip()
-        if db_type != "mongodb":
-            raise Exception(
-                "url error, expect mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]], but get {}".format(
-                    url
-                )
-            )
-
-        return cls(url=url, **kwargs)
-
-    def get_database(self, database, **kwargs) -> Database:
-        """
-        获取数据库对象
-        @param database: 数据库名
-        @return:
-        """
-        return self.client.get_database(database, **kwargs)
-
-    def get_collection(self, coll_name, **kwargs) -> Collection:
-        """
-        根据集合名获取集合对象
-        @param coll_name: 集合名
-        @return:
-        """
-        return self.db.get_collection(coll_name, **kwargs)
-
-    def find(
-        self, coll_name: str, condition: Optional[Dict] = None, limit: int = 0, **kwargs
-    ) -> List[Dict]:
-        """
-        @summary:
-        无数据: 返回[]
-        有数据: [{'_id': 'xx', ...}, ...]
-        ---------
-        @param coll_name: 集合名(表名)
-        @param condition: 查询条件
-        @param limit: 结果数量
-        @param kwargs:
-            更多参数 https://docs.mongodb.com/manual/reference/command/find/#command-fields
-
-        ---------
-        @result:
-        """
-        condition = {} if condition is None else condition
-        command = {"find": coll_name, "filter": condition, "limit": limit}
-        command.update(kwargs)
-        result = self.run_command(command)
-        cursor = result["cursor"]
-        cursor_id = cursor["id"]
-        dataset = cursor["firstBatch"]
-        while True:
-            if cursor_id == 0:
-                break
-            result = self.run_command(
-                {
-                    "getMore": cursor_id,
-                    "collection": coll_name,
-                    "batchSize": kwargs.get("batchSize", 100),
-                }
-            )
-            cursor = result["cursor"]
-            cursor_id = cursor["id"]
-            dataset.extend(cursor["nextBatch"])
-        return dataset
-
-    def add(
-        self,
-        coll_name,
-        data: Dict,
-        replace=False,
-        update_columns=(),
-        update_columns_value=(),
-        insert_ignore=False,
-    ):
-        """
-        添加单条数据
-        Args:
-            coll_name: 集合名
-            data: 单条数据
-            replace: 唯一索引冲突时直接覆盖旧数据,默认为False
-            update_columns: 更新指定的列(如果数据唯一索引冲突,则更新指定字段,如 update_columns = ["name", "title"]
-            update_columns_value: 指定更新的字段对应的值, 不指定则用数据本身的值更新
-            insert_ignore: 索引冲突是否忽略 默认False
-
-        Returns: 插入成功的行数
-
-        """
-        affect_count = 1
-        collection = self.get_collection(coll_name)
-        try:
-            collection.insert_one(data)
-        except DuplicateKeyError as e:
-            # 存在则更新
-            if update_columns:
-                if not isinstance(update_columns, (tuple, list)):
-                    update_columns = [update_columns]
-
-                condition = self.__get_update_condition(
-                    coll_name, data, e.details.get("errmsg")
-                )
-
-                # 更新指定的列
-                if update_columns_value:
-                    # 使用指定的值更新
-                    doc = {
-                        key: value
-                        for key, value in zip(update_columns, update_columns_value)
-                    }
-                else:
-                    # 使用数据本身的值更新
-                    doc = {key: data[key] for key in update_columns}
-
-                collection.update_one(condition, {"$set": doc})
-
-            # 覆盖更新
-            elif replace:
-                condition = self.__get_update_condition(
-                    coll_name, data, e.details.get("errmsg")
-                )
-                # 替换已存在的数据
-                collection.replace_one(condition, data)
-
-            elif not insert_ignore:
-                raise e
-
-        return affect_count
-
-    def add_batch(
-        self,
-        coll_name: str,
-        datas: List[Dict],
-        replace=False,
-        update_columns=(),
-        update_columns_value=(),
-        condition_fields: dict = None,
-    ):
-        """
-        批量添加数据
-        Args:
-            coll_name: 集合名
-            datas: 数据 [{'_id': 'xx'}, ... ]
-            replace:  唯一索引冲突时直接覆盖旧数据,默认为False
-            update_columns: 更新指定的列(如果数据的唯一索引存在,则更新指定字段,如 update_columns = ["name", "title"]
-            update_columns_value: 指定更新的字段对应的值, 不指定则用数据本身的值更新
-            condition_fields: 用于条件查找的字段,不指定则用索引冲突中的字段查找
-
-        Returns: 添加行数,不包含更新
-
-        """
-        add_count = 0
-
-        if not datas:
-            return add_count
-
-        collection = self.get_collection(coll_name)
-        if not isinstance(update_columns, (tuple, list)):
-            update_columns = [update_columns]
-
-        try:
-            add_count = len(datas)
-            collection.insert_many(datas, ordered=False)
-        except BulkWriteError as e:
-            write_errors = e.details.get("writeErrors")
-            for error in write_errors:
-                if error.get("code") == 11000:
-                    # 数据重复
-                    # 获取重复的数据
-                    data = error.get("op")
-
-                    def get_condition():
-                        # 获取更新条件
-                        if condition_fields:
-                            condition = {
-                                condition_field: data[condition_field]
-                                for condition_field in condition_fields
-                            }
-                        else:
-                            # 根据重复的值获取更新条件
-                            condition = self.__get_update_condition(
-                                coll_name, data, error.get("errmsg")
-                            )
-
-                        return condition
-
-                    if update_columns:
-                        # 更新指定的列
-                        if update_columns_value:
-                            # 使用指定的值更新
-                            doc = {
-                                key: value
-                                for key, value in zip(
-                                    update_columns, update_columns_value
-                                )
-                            }
-                        else:
-                            # 使用数据本身的值更新
-                            doc = {key: data.get(key) for key in update_columns}
-
-                        collection.update_one(get_condition(), {"$set": doc})
-                        add_count -= 1
-
-                    elif replace:
-                        # 覆盖更新
-                        collection.replace_one(get_condition(), data)
-                        add_count -= 1
-
-                    else:
-                        # log.error(error)
-                        add_count -= 1
-
-        return add_count
-
-    def count(self, coll_name, condition: Optional[Dict], limit=0, **kwargs):
-        """
-        计数
-        @param coll_name: 集合名
-        @param condition: 查询条件
-        @param limit: 限制数量
-        @param kwargs:
-        ----
-        command = {
-          count: <collection or view>,
-          query: <document>,
-          limit: <integer>,
-          skip: <integer>,
-          hint: <hint>,
-          readConcern: <document>,
-          collation: <document>,
-          comment: <any>
-        }
-        https://docs.mongodb.com/manual/reference/command/count/#mongodb-dbcommand-dbcmd.count
-        @return: 数据数量
-        """
-        command = {"count": coll_name, "query": condition, "limit": limit, **kwargs}
-        result = self.run_command(command)
-        return result["n"]
-
-    def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
-        """
-        更新
-        Args:
-            coll_name: 集合名
-            data: 单条数据 {"xxx":"xxx"}
-            condition: 更新条件 {"_id": "xxxx"}
-            upsert: 数据不存在则插入,默认为 False
-
-        Returns: True / False
-        """
-        try:
-            collection = self.get_collection(coll_name)
-            collection.update_one(condition, {"$set": data}, upsert=upsert)
-        except Exception as e:
-            log.error(
-                """
-                error:{}
-                condition: {}
-            """.format(
-                    e, condition
-                )
-            )
-            return False
-        else:
-            return True
-
-    def delete(self, coll_name, condition: Dict) -> bool:
-        """
-        删除
-        Args:
-            coll_name: 集合名
-            condition: 查找条件
-        Returns: True / False
-
-        """
-        try:
-            collection = self.get_collection(coll_name)
-            collection.delete_one(condition)
-        except Exception as e:
-            log.error(
-                """
-                error:{}
-                condition: {}
-            """.format(
-                    e, condition
-                )
-            )
-            return False
-        else:
-            return True
-
-    def run_command(self, command: Dict):
-        """
-        运行指令
-        参考文档 https://www.geek-book.com/src/docs/mongodb/mongodb/docs.mongodb.com/manual/reference/command/index.html
-        @param command:
-        @return:
-        """
-        return self.db.command(command)
-
-    def create_index(self, coll_name, keys, unique=True):
-        collection = self.get_collection(coll_name)
-        _keys = [(key, pymongo.ASCENDING) for key in keys]
-        collection.create_index(_keys, unique=unique)
-
-    def get_index(self, coll_name):
-        return self.get_collection(coll_name).index_information()
-
-    def drop_collection(self, coll_name):
-        return self.db.drop_collection(coll_name)
-
-    def get_index_key(self, coll_name, index_name):
-        """
-        获取参与索引的key
-        Args:
-            coll_name: 集合名
-            index_name: 索引名
-
-        Returns:
-
-        """
-        cache_key = f"{coll_name}:{index_name}"
-
-        if cache_key in self.__index__cached:
-            return self.__index__cached.get(cache_key)
-
-        index = self.get_index(coll_name)
-        index_detail = index.get(index_name)
-        if not index_detail:
-            errmsg = f"not found index {index_name} in collection {coll_name}"
-            raise Exception(errmsg)
-
-        index_keys = [val[0] for val in index_detail.get("key")]
-        self.__index__cached[cache_key] = index_keys
-        return index_keys
-
-    def __get_update_condition(
-        self, coll_name: str, data: dict, duplicate_errmsg: str
-    ) -> dict:
-        """
-        根据索引冲突的报错信息 获取更新条件
-        Args:
-            duplicate_errmsg: E11000 duplicate key error collection: feapder.test index: a_1_b_1 dup key: { : 1, : "你好" }
-            data: {"a": 1, "b": "你好", "c": "嘻嘻"}
-
-        Returns: {"a": 1, "b": "你好"}
-
-        """
-        index_name = re.search(r"index: (\w+)", duplicate_errmsg).group(1)
-        index_keys = self.get_index_key(coll_name, index_name)
-
-        condition = {key: data.get(key) for key in index_keys}
-        return condition
-
-    def __getattr__(self, name):
-        return getattr(self.db, name)
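
A minimal usage sketch, assuming a reachable mongod and the connection defaults above (collection and field names are illustrative):

    from feapder.db.mongodb import MongoDB

    db = MongoDB(ip="127.0.0.1", port=27017, db="feapder")
    db.create_index("articles", keys=["url"])  # the unique index drives dedup
    db.add("articles", {"url": "https://example.com/1", "title": "v1"})
    # The same url hits the unique index; update_columns refreshes title in place
    db.add("articles", {"url": "https://example.com/1", "title": "v2"},
           update_columns=["title"])
    print(db.count("articles", {"url": "https://example.com/1"}))  # -> 1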

+ 0 - 381
spider_frame/FworkSpider/feapder/db/mysqldb.py

@@ -1,381 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2016-11-16 16:25
----------
-@summary: 操作mysql数据库
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import datetime
-import json
-from urllib import parse
-from typing import List, Dict
-
-import pymysql
-from dbutils.pooled_db import PooledDB
-from pymysql import cursors
-from pymysql import err
-
-import feapder.setting as setting
-from feapder.utils.log import log
-from feapder.utils.tools import make_insert_sql, make_batch_sql, make_update_sql
-
-
-def auto_retry(func):
-    def wrapper(*args, **kwargs):
-        for i in range(3):
-            try:
-                return func(*args, **kwargs)
-            except (err.InterfaceError, err.OperationalError) as e:
-                log.error(
-                    """
-                    error:%s
-                    sql:  %s
-                    """
-                    % (e, kwargs.get("sql") or args[1])
-                )
-
-    return wrapper
-
-
-class MysqlDB:
-    def __init__(
-        self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs
-    ):
-        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
-        if not ip:
-            ip = setting.MYSQL_IP
-        if not port:
-            port = setting.MYSQL_PORT
-        if not db:
-            db = setting.MYSQL_DB
-        if not user_name:
-            user_name = setting.MYSQL_USER_NAME
-        if not user_pass:
-            user_pass = setting.MYSQL_USER_PASS
-
-        try:
-
-            self.connect_pool = PooledDB(
-                creator=pymysql,
-                mincached=1,
-                maxcached=100,
-                maxconnections=100,
-                blocking=True,
-                ping=7,
-                host=ip,
-                port=port,
-                user=user_name,
-                passwd=user_pass,
-                db=db,
-                charset="utf8mb4",
-                cursorclass=cursors.SSCursor,
-            )  # cursorclass 使用服务的游标,默认的在多线程下大批量插入数据会使内存递增
-
-        except Exception as e:
-            log.error(
-                """
-            连接数据失败:
-            ip: {}
-            port: {}
-            db: {}
-            user_name: {}
-            user_pass: {}
-            exception: {}
-            """.format(
-                    ip, port, db, user_name, user_pass, e
-                )
-            )
-        else:
-            log.debug("连接到mysql数据库 %s : %s" % (ip, db))
-
-    @classmethod
-    def from_url(cls, url, **kwargs):
-        # mysql://username:password@ip:port/db?charset=utf8mb4
-        url_parsed = parse.urlparse(url)
-
-        db_type = url_parsed.scheme.strip()
-        if db_type != "mysql":
-            raise Exception(
-                "url error, expect mysql://username:ip:port/db?charset=utf8mb4, but get {}".format(
-                    url
-                )
-            )
-
-        connect_params = {}
-        connect_params["ip"] = url_parsed.hostname.strip()
-        connect_params["port"] = url_parsed.port
-        connect_params["user_name"] = url_parsed.username.strip()
-        connect_params["user_pass"] = url_parsed.password.strip()
-        connect_params["db"] = url_parsed.path.strip("/").strip()
-
-        connect_params.update(kwargs)
-
-        return cls(**connect_params)
-
-    @staticmethod
-    def unescape_string(value):
-        if not isinstance(value, str):
-            return value
-
-        value = value.replace("\\0", "\0")
-        value = value.replace("\\\\", "\\")
-        value = value.replace("\\n", "\n")
-        value = value.replace("\\r", "\r")
-        value = value.replace("\\Z", "\032")
-        value = value.replace('\\"', '"')
-        value = value.replace("\\'", "'")
-
-        return value
-
-    def get_connection(self):
-        conn = self.connect_pool.connection(shareable=False)
-        # cursor = conn.cursor(cursors.SSCursor)
-        cursor = conn.cursor()
-
-        return conn, cursor
-
-    def close_connection(self, conn, cursor):
-        cursor.close()
-        conn.close()
-
-    def size_of_connections(self):
-        """
-        当前活跃的连接数
-        @return:
-        """
-        return self.connect_pool._connections
-
-    def size_of_connect_pool(self):
-        """
-        池子里一共有多少连接
-        @return:
-        """
-        return len(self.connect_pool._idle_cache)
-
-    @auto_retry
-    def find(self, sql, limit=0, to_json=False):
-        """
-        @summary:
-        无数据: 返回()
-        有数据: 若limit == 1 则返回 (data1, data2)
-                否则返回 ((data1, data2),)
-        ---------
-        @param sql:
-        @param limit:
-        @param to_json 是否将查询结果转为json
-        ---------
-        @result:
-        """
-        conn, cursor = self.get_connection()
-
-        cursor.execute(sql)
-
-        if limit == 1:
-            result = cursor.fetchone()  # 全部查出来,截取 不推荐使用
-        elif limit > 1:
-            result = cursor.fetchmany(limit)  # 全部查出来,截取 不推荐使用
-        else:
-            result = cursor.fetchall()
-
-        if to_json:
-            columns = [i[0] for i in cursor.description]
-
-            # 处理数据
-            def convert(col):
-                if isinstance(col, (datetime.date, datetime.time)):
-                    return str(col)
-                elif isinstance(col, str) and (
-                    col.startswith("{") or col.startswith("[")
-                ):
-                    try:
-                        # col = self.unescape_string(col)
-                        return json.loads(col)
-                    except:
-                        return col
-                else:
-                    # col = self.unescape_string(col)
-                    return col
-
-            if limit == 1:
-                result = [convert(col) for col in result]
-                result = dict(zip(columns, result))
-            else:
-                result = [[convert(col) for col in row] for row in result]
-                result = [dict(zip(columns, r)) for r in result]
-
-        self.close_connection(conn, cursor)
-
-        return result
-
-    def add(self, sql, exception_callfunc=None):
-        """
-
-        Args:
-            sql:
-            exception_callfunc: 异常回调
-
-        Returns: 添加行数
-
-        """
-        affect_count = None
-
-        try:
-            conn, cursor = self.get_connection()
-            affect_count = cursor.execute(sql)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-            """
-                % (e, sql)
-            )
-            if exception_callfunc:
-                exception_callfunc(e)
-        finally:
-            self.close_connection(conn, cursor)
-
-        return affect_count
-
-    def add_smart(self, table, data: Dict, **kwargs):
-        """
-        添加数据, 直接传递json格式的数据,不用拼sql
-        Args:
-            table: 表名
-            data: 字典 {"xxx":"xxx"}
-            **kwargs:
-
-        Returns: 添加行数
-
-        """
-        sql = make_insert_sql(table, data, **kwargs)
-        return self.add(sql)
-
-    def add_batch(self, sql, datas: List[Dict]):
-        """
-        @summary: 批量添加数据
-        ---------
-        @ param sql: insert ignore into (xxx,xxx) values (%s, %s, %s)
-        # param datas: 列表 [{}, {}, {}]
-        ---------
-        @result: 添加行数
-        """
-        affect_count = None
-
-        try:
-            conn, cursor = self.get_connection()
-            affect_count = cursor.executemany(sql, datas)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-                """
-                % (e, sql)
-            )
-        finally:
-            self.close_connection(conn, cursor)
-
-        return affect_count
-
-    def add_batch_smart(self, table, datas: List[Dict], **kwargs):
-        """
-        批量添加数据, 直接传递list格式的数据,不用拼sql
-        Args:
-            table: 表名
-            datas: 列表 [{}, {}, {}]
-            **kwargs:
-
-        Returns: 添加行数
-
-        """
-        sql, datas = make_batch_sql(table, datas, **kwargs)
-        return self.add_batch(sql, datas)
-
-    def update(self, sql):
-        try:
-            conn, cursor = self.get_connection()
-            cursor.execute(sql)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-            """
-                % (e, sql)
-            )
-            return False
-        else:
-            return True
-        finally:
-            self.close_connection(conn, cursor)
-
-    def update_smart(self, table, data: Dict, condition):
-        """
-        更新, 不用拼sql
-        Args:
-            table: 表名
-            data: 数据 {"xxx":"xxx"}
-            condition: 更新条件 where后面的条件,如 condition='status=1'
-
-        Returns: True / False
-
-        """
-        sql = make_update_sql(table, data, condition)
-        return self.update(sql)
-
-    def delete(self, sql):
-        """
-        删除
-        Args:
-            sql:
-
-        Returns: True / False
-
-        """
-        try:
-            conn, cursor = self.get_connection()
-            cursor.execute(sql)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-            """
-                % (e, sql)
-            )
-            return False
-        else:
-            return True
-        finally:
-            self.close_connection(conn, cursor)
-
-    def execute(self, sql):
-        try:
-            conn, cursor = self.get_connection()
-            cursor.execute(sql)
-            conn.commit()
-
-        except Exception as e:
-            log.error(
-                """
-                error:%s
-                sql:  %s
-            """
-                % (e, sql)
-            )
-            return False
-        else:
-            return True
-        finally:
-            self.close_connection(conn, cursor)
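
A minimal usage sketch, assuming a reachable MySQL instance and an existing articles table (credentials and names are placeholders):

    from feapder.db.mysqldb import MysqlDB

    db = MysqlDB(ip="127.0.0.1", port=3306, db="feapder",
                 user_name="root", user_pass="secret")
    # add_smart builds the INSERT statement from the dict, no hand-written SQL
    db.add_smart("articles", {"url": "https://example.com/1", "title": "demo"})
    rows = db.find("select url, title from articles limit 10", to_json=True)
    print(rows)  # -> [{'url': ..., 'title': ...}, ...]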

+ 0 - 511
spider_frame/FworkSpider/feapder/db/rabbitMq.py

@@ -1,511 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-09-25
----------
-@summary:  rabbitMq消息队列(基于amqpstorm封装)
----------
-@author: Dzr
-"""
-import time
-
-import amqpstorm
-from amqpstorm.channel import Channel as AmqpStormChannel
-from amqpstorm.connection import Connection as AmqpStormConnection
-from amqpstorm.exception import AMQPChannelError, AMQPConnectionError
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.utils.log import log
-
-
-class RabbitMQMessage:
-
-    def __init__(self, delivery_tag, body):
-        self.delivery_tag = delivery_tag
-        self.body = body
-
-    def __str__(self):
-        return f"RabbitMQMessage(delivery_tag={self.delivery_tag}, body={self.body})"
-
-
-class RabbitMQ:
-    __RABBITMQ_ATTRS__ = {
-        "timeout",
-        "virtual_host",
-        "heartbeat",
-        "ssl",
-        "ssl_options",
-        "client_properties",
-    }
-
-    __cache = {}  # 使用缓存机制来实现仅在需要创建新队列或绑定新关系时进行相关操作
-
-    def __init__(
-        self,
-        user=None,
-        user_pass=None,
-        ip_port=None,
-        url=None,
-        exchange=None,
-        exchange_type=None,
-        durable=True,
-        **kwargs
-    ):
-
-        """
-        @param str user: 用户名
-        @param str user_pass: 密码
-        @param ip_port: ip:port
-        @param str url:
-        @param str exchange: 交换机名称
-        @param str exchange_type: 交换机类型
-            RabbitMQ支持以下几种exchange_type类型:
-                1. **direct(直连交换机)**:它将消息通过路由键直接发送到与之匹配的队列。使用direct交换机时,消息的路由键需要与绑定到队列上的绑定键完全匹配。
-                2. **topic(主题交换机)**:它将消息通过路由键的模式匹配发送到一个或多个队列,这是一种灵活的交换机类型。使用主题交换机时,可以使用通配符进行模糊匹配,例如使用*表示一个单词,#表示零个或多个单词。
-                3. **fanout(扇型交换机)**:它将消息广播到所有绑定到它的队列。它忽略了路由键的概念,只需简单地将消息发送给所有队列即可。
-                4. **headers(头交换机)**:该交换机根据消息的头部属性进行匹配,而不是路由键。它的匹配规则非常灵活,但在实际应用中使用较少。
-        @param durable: 是否定义队列或者交换机持久化(服务器重启后,队列是否能够恢复到原来的状态)
-        @param kwargs: 自定义键值参数
-        """
-
-        if ip_port is None:
-            ip_port = setting.RABBITMQ_IP_PORT
-        if user is None:
-            user = setting.RABBITMQ_USER
-        if user_pass is None:
-            user_pass = setting.RABBITMQ_USER_PASS
-        if exchange is None:
-            exchange = setting.RABBITMQ_EXCHANGE
-        if exchange_type is None:
-            exchange_type = setting.RABBITMQ_EXCHANGE_TYPE
-
-        self.__mq = None
-        self.__channel = None
-        self._url = url
-        self._ip_port = ip_port
-        self._user = user
-        self._user_pass = user_pass
-        self._durable = durable
-        self._exchange = exchange
-        self._exchange_type = exchange_type
-        self._stop_server = False
-
-        self.mq_kwargs = {
-            "virtual_host": setting.RABBITMQ_VIRTUAL_HOST,
-            "heartbeat": setting.RABBITMQ_HEARTBEAT,
-            "timeout": setting.RABBITMQ_SOCKET_TIMEOUT
-        }
-        for key, val in kwargs.copy().items():
-            if key in self.__RABBITMQ_ATTRS__:
-                self.mq_kwargs[key] = val
-
-        # 创建连接
-        self.get_connect()
-        # 创建信道
-        self.get_channel()
-
-    @property
-    def _mq(self) -> AmqpStormConnection:
-        try:
-            if not self.__mq.is_open:
-                raise ConnectionError("unable to connect to RabbitMQ")
-        except:
-            if not self._stop_server:
-                self._reconnect()
-
-        return self.__mq
-
-    @_mq.setter
-    def _mq(self, connection: AmqpStormConnection):
-        self.__mq = connection
-
-    def _reconnect(self):
-        # 检测连接状态,当RabbitMQ重启或者因网络波动导致断开连接时自动重连
-        retry_count = 0
-        while True:
-            try:
-                retry_count += 1
-                log.error(f"RabbitMQ 连接断开, 重新连接 {retry_count}")
-                if self.get_connect():
-                    log.info(f"RabbitMQ 连接成功")
-                    return True
-            except (ConnectionError,) as e:
-                log.error(f"连接失败 e: {e}")
-
-            time.sleep(1)
-
-    def get_connect(self, lazy=False):
-        try:
-            if not self._url:
-                if not self._ip_port:
-                    raise Exception("未设置 RabbitMQ 连接信息")
-
-                ip, port = self._ip_port.split(":")
-                node = {
-                    "hostname": ip,
-                    "port": int(port),
-                    **self.mq_kwargs
-                }
-                if self._user and self._user_pass:
-                    node["username"] = self._user
-                    node["password"] = self._user_pass
-                # 创建连接
-                self._mq = amqpstorm.Connection(**node, lazy=lazy)
-            else:
-                # 创建连接
-                self._mq = amqpstorm.UriConnection(self._url, lazy=lazy)
-        except Exception as e:
-            raise
-
-        return self.__mq.is_open
-
-    def get_channel(self):
-        try:
-            # 建立信道
-            self._channel = self._mq.channel()
-            # 队列重新绑定交换机
-            for binding_key in self.__cache.copy():
-                if isinstance(binding_key, tuple):
-                    queue, exchange, routing_key = binding_key
-                    # 清除缓存
-                    self.__cache.pop(queue, None)
-                    self.__cache.pop(binding_key, None)
-                    # 重新声明绑定
-                    self.declare_bind(queue, exchange, routing_key)
-        except Exception as e:
-            raise
-
-        return self.__channel.is_open
-
-    def _re0channel(self):
-        retry_count = 0
-        while True:
-            try:
-                retry_count += 1
-                log.error(f"Channel 连接断开, 重新连接 {retry_count}")
-                if self.get_channel():
-                    log.info(f"Channel 连接成功")
-                    return True
-            except (ConnectionError,) as e:
-                log.error(f"连接失败 e: {e}")
-
-            time.sleep(1)
-
-    @property
-    def _channel(self) -> AmqpStormChannel:
-        try:
-            if not self.__channel.is_open:
-                raise ConnectionError("unable to connect to Channel")
-        except:
-            if not self._stop_server:
-                self._re0channel()
-
-        return self.__channel
-
-    @_channel.setter
-    def _channel(self, channel: AmqpStormChannel):
-        self.__channel = channel
-
-    def add_batch(self, queue, datas, exchange="", routing_key="", **kwargs):
-        """
-        批量发布消息
-
-        @param str queue: 队列名称
-        @param datas: 消息内容
-        @param exchange: 交换机名称
-        @param routing_key: 路由键
-        """
-        data_lst = datas if isinstance(datas, list) else [datas]
-        for data in data_lst:
-            self.add(data, queue, exchange, routing_key, **kwargs)
-
-    def add(self, data, queue="", exchange="", routing_key="", properties=None):
-        """
-        发布消息
-
-        @param str queue: 队列名称
-        @param data: 消息内容
-        @param exchange: 交换机名称
-        @param routing_key: 路由键
-        @param properties: 消息属性
-        """
-        if not routing_key and not queue:
-            raise AttributeError("请设置 routing_key or queue")
-
-        # 不指定交换机发送消息,routing_key 表示消息队列名称
-        # 指定交换机发送消息,routing_key 表示路由键
-        if not exchange:
-            routing_key = queue
-
-        message_id = tools.get_uuid().replace("-", "")
-        if isinstance(data, dict):
-            message_id = data.get("pyuuid") or message_id
-
-        # RabbitMQ 的 delivery_mode 属性用于设置消息的持久性。它有两种取值:
-        #   delivery_mode=1:表示消息为非持久化(瞬态),服务器重启后消息可能丢失。
-        #   delivery_mode=2:表示消息被标记为持久化,并且会存储在磁盘上,确保消息不会丢失
-        properties = properties or {}
-        properties = dict(delivery_mode=2, **properties)
-        if "message_id" not in properties:
-            properties["message_id"] = message_id
-
-        self._channel.basic.publish(
-            body=tools.dumps_obj(data),  # 对象序列化
-            routing_key=routing_key,
-            exchange=exchange,
-            properties=properties  # specification.Basic.Properties
-        )
-
-    def add_dlx(self, exchange, routing_key, data, properties=None):
-        """
-        发布延时消息
-
-        @param exchange: 交换机名称
-        @param routing_key: 路由键
-        @param data: 消息内容
-        @param properties: 消息属性
-        """
-        queue = routing_key
-        self.add(data, queue, exchange, routing_key, properties)
-
-    def ack(self, delivery_tag=0, multiple=False):
-        """
-        手动回复队列消息确认
-
-        @param delivery_tag: 消息标签
-        @param bool multiple: 开启多个回复消息确认
-        """
-        self._channel.basic.ack(delivery_tag=delivery_tag, multiple=multiple)
-
-    def declare_exchange(self, exchange, exchange_type=None, auto_delete=False, arguments=None):
-        """声明交换机"""
-        shares = dict(
-            exchange_type=exchange_type or self._exchange_type,
-            auto_delete=auto_delete,
-            arguments=arguments
-        )
-        try:
-            # 检查交换机是否存在
-            params = dict(passive=True, **shares)
-            return self._channel.exchange.declare(exchange, **params)
-        except AMQPChannelError as why:
-            if why.error_code == 404:
-                self.get_channel()
-                # 创建一个直连交换机
-                params = dict(durable=True, **shares)
-                return self._channel.exchange.declare(exchange, **params)
-            else:
-                raise why
-
-    def declare_queue(self, queue, auto_delete=False, arguments=None):
-        """
-        声明队列
-
-        @param queue:
-        @param auto_delete:
-        @param arguments:
-        """
-        shares = dict(auto_delete=auto_delete, arguments=arguments)
-        try:
-            params = dict(passive=True, **shares)
-            return self._channel.queue.declare(queue, **params)
-        except AMQPChannelError as why:
-            if why.error_code == 404:
-                self.get_channel()
-                # 声明持久化队列
-                params = dict(durable=True, **shares)
-                return self._channel.queue.declare(queue, **params)
-            else:
-                raise why
-
-    def declare_bind(self, queue="", exchange="", routing_key=""):
-        """
-        声明队列和交换机,同时将队列绑定交换机
-
-        @param queue: 队列名称
-        @param exchange: 交换机名称
-        @param routing_key: 路由键
-        """
-        exchange = exchange or self._exchange
-        binding_key = (queue, exchange, routing_key)
-        if queue in self.__cache and binding_key in self.__cache:
-            return self.__cache[queue]
-
-        self.declare_exchange(exchange)
-        result = self.declare_queue(queue)
-        self.__cache[queue] = result
-        # 队列绑定一个交换机
-        self._channel.queue.bind(queue, exchange, routing_key)
-        self.__cache[binding_key] = True
-        return result
-
-    def start_consuming(
-        self,
-        limit=None,
-        to_tuple=False,
-        auto_decode=True,
-        correlation_id="",
-        use_filter=False
-    ):
-        """
-        @param int limit: 消费数据上限
-        @param to_tuple: 消息结果返回元组形式
-        @param auto_decode: 自动解码
-        @param correlation_id: 应用程序关联标识符
-        @param use_filter: 使用缓存过滤重复数据
-        """
-        if not self._channel._consumer_callbacks:
-            raise AMQPChannelError("no consumer callback defined")
-
-        cache = []  # 消息队列缓存,去除重复消息
-        recv_count = 0  # 接收消息计数
-        params = dict(break_on_empty=True, auto_decode=auto_decode)
-        for message in self._channel.build_inbound_messages(**params):
-            consumer_tag = message._method.get("consumer_tag")
-            if use_filter and message.message_id in cache:
-                continue
-
-            cache.append(message.message_id)  # 缓存消息id
-
-            # 指定应用程序标识,只会在特定的程序中被消费
-            if correlation_id:
-                if correlation_id == message.correlation_id:
-                    self._channel._consumer_callbacks[consumer_tag](message)
-                    recv_count += 1
-
-                if limit is not None and recv_count == limit:
-                    break
-                else:
-                    continue
-
-            recv_count += 1
-
-            if to_tuple:
-                self._channel._consumer_callbacks[consumer_tag](*message.to_tuple())
-                if limit is not None and recv_count == limit:
-                    break
-                else:
-                    continue
-
-            self._channel._consumer_callbacks[consumer_tag](message)
-            if limit is not None and recv_count == limit:
-                break
-
-    def stop_consuming(self, consumer_tag=None):
-        """
-        @param str consumer_tag: 消费者标签
-        """
-        self._channel.basic.cancel(consumer_tag)
-
-    def get(self, queue, limit, auto_ack=False, to_str=True, **kwargs):
-        """
-        获取rabbitmq消息队列中的信道数据
-
-        @param str queue: 队列名称
-        @param int limit: 获取消息数量
-        @param auto_ack: 自动回复消息确认
-        @param to_str: 消息是否转成字符串
-        """
-        message_lst = []
-
-        if "use_filter" not in kwargs:
-            kwargs["use_filter"] = True
-
-        def callback(message):
-            body = tools.loads_obj(message.body)  # 反序列化消息对象
-            delivery_tag = message.delivery_tag
-            if auto_ack:
-                self.ack(delivery_tag)
-                delivery_tag = 0
-
-            if to_str:
-                message_lst.append(str(RabbitMQMessage(delivery_tag, body)))
-            else:
-                message_lst.append(RabbitMQMessage(delivery_tag, body))
-
-        try:
-            # 设置预取上限数量
-            self._channel.basic.qos(prefetch_count=limit)
-            # 注册消费者并获取消费者标签
-            consumer_tag = self._channel.basic.consume(callback, queue=queue)
-            # 开始消费
-            self.start_consuming(limit, **kwargs)
-            # 停止消费并关闭消费者
-            self.stop_consuming(consumer_tag)
-        except (AMQPChannelError, AMQPConnectionError) as why:
-            log.exception(why)
-
-        return message_lst
-
-    @property
-    def is_open(self):
-        return self._mq.is_open
-
-    def __get_message_count(self, correlation_id, **kwargs):
-        channel = self._mq.channel()  # 启用检查数量的临时信道
-        recv_count = 0  # 接收消息计数
-
-        try:
-            channel.basic.consume(queue=kwargs["queue"])  # 指定查询的队列
-            for message in channel.build_inbound_messages(break_on_empty=True):
-                # 指定应用程序标识,只会在特定的程序中被消费
-                if correlation_id == message.correlation_id:
-                    recv_count += 1
-
-        except amqpstorm.exception.AMQPChannelError as why:
-            log.exception(why)
-        finally:
-            channel.close()  # 关闭查询的临时信道
-
-        result = kwargs.copy()
-        result["message_count"] = recv_count  # 重置消息数量
-        return result
-
-    def get_message_count(self, queue, user=None):
-        try:
-            message = self._channel.queue.declare(queue, passive=True)
-            if user is not None:
-                message = self.__get_message_count(user, **message)
-
-            # message_count 消息统计是消息发布确认之后的数量,未确认消息无法统计
-            return message.get("message_count")
-        except amqpstorm.exception.AMQPChannelError:
-            return 0
-
-    def get_mq_obj(self):
-        return self._mq
-
-    def close(self, n=-1):
-        log.debug(f"关闭 RabbitMQ {n}")
-        if self._channel.is_open:
-            self._channel.close()
-
-        if self._mq.is_open:
-            self.__mq.close()
-
-        self._stop_server = True
-
-    def __getattr__(self, name):
-        return getattr(self._mq, name)
-
-    def __repr__(self):
-        if self._url:
-            return "<RabbitMQ url:{}>".format(self._url)
-
-        return "<RabbitMQ ip_port: {} username:{} password:{}>".format(
-            self._ip_port, self._user, self._user_pass
-        )
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exception_type, exception_value, _):
-        if exception_type:
-            log.warning(
-                "Closing RabbitMQ to an unhandled exception: %s",
-                exception_value
-            )
-        if not self._mq.is_open:
-            return
-        self.close()
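
A minimal end-to-end sketch, assuming a local broker and the settings-based defaults above (queue and exchange names are placeholders):

    from feapder.db.rabbitMq import RabbitMQ

    mq = RabbitMQ(user="guest", user_pass="guest", ip_port="127.0.0.1:5672",
                  exchange="feapder", exchange_type="direct")
    mq.declare_bind(queue="demo:tasks", routing_key="demo:tasks")
    mq.add({"url": "https://example.com"}, queue="demo:tasks",
           exchange="feapder", routing_key="demo:tasks")
    for message in mq.get("demo:tasks", limit=1, auto_ack=True, to_str=False):
        print(message.body)  # -> {'url': 'https://example.com'}
    mq.close()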

+ 0 - 848
spider_frame/FworkSpider/feapder/db/redisdb.py

@@ -1,848 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2016-11-16 16:25
----------
-@summary: 操作redis数据库
----------
-@author: Boris
-"""
-
-import time
-
-import redis
-from redis._compat import unicode, long, basestring
-from redis.connection import Encoder as _Encoder
-from redis.exceptions import ConnectionError, TimeoutError
-from redis.exceptions import DataError
-from redis.sentinel import Sentinel
-from rediscluster import RedisCluster
-
-import feapder.setting as setting
-from feapder.utils.log import log
-
-
-class Encoder(_Encoder):
-    def encode(self, value):
-        "Return a bytestring or bytes-like representation of the value"
-        if isinstance(value, (bytes, memoryview)):
-            return value
-        # elif isinstance(value, bool):
-        #     # special case bool since it is a subclass of int
-        #     raise DataError(
-        #         "Invalid input of type: 'bool'. Convert to a "
-        #         "bytes, string, int or float first."
-        #     )
-        elif isinstance(value, float):
-            value = repr(value).encode()
-        elif isinstance(value, (int, long)):
-            # python 2 repr() on longs is '123L', so use str() instead
-            value = str(value).encode()
-        elif isinstance(value, (list, dict, tuple)):
-            value = unicode(value)
-        elif not isinstance(value, basestring):
-            # a value we don't know how to deal with. throw an error
-            typename = type(value).__name__
-            raise DataError(
-                "Invalid input of type: '%s'. Convert to a "
-                "bytes, string, int or float first." % typename
-            )
-        if isinstance(value, unicode):
-            value = value.encode(self.encoding, self.encoding_errors)
-        return value
-
-
-redis.connection.Encoder = Encoder
-
-
-class RedisDB:
-    def __init__(
-        self,
-        ip_ports=None,
-        db=None,
-        user_pass=None,
-        url=None,
-        decode_responses=True,
-        service_name=None,
-        max_connections=32,
-        **kwargs,
-    ):
-        """
-        redis的封装
-        Args:
-            ip_ports: ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
-            db:
-            user_pass:
-            url:
-            decode_responses:
-            service_name: 适用于redis哨兵模式
-        """
-
-        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
-        if ip_ports is None:
-            ip_ports = setting.REDISDB_IP_PORTS
-        if db is None:
-            db = setting.REDISDB_DB
-        if user_pass is None:
-            user_pass = setting.REDISDB_USER_PASS
-        if service_name is None:
-            service_name = setting.REDISDB_SERVICE_NAME
-
-        self._is_redis_cluster = False
-
-        self.__redis = None
-        self._url = url
-        self._ip_ports = ip_ports
-        self._db = db
-        self._user_pass = user_pass
-        self._decode_responses = decode_responses
-        self._service_name = service_name
-        self._max_connections = max_connections
-        self._kwargs = kwargs
-        self.get_connect()
-
-    def __repr__(self):
-        if self._url:
-            return "<Redisdb url:{}>".format(self._url)
-
-        return "<Redisdb ip_ports: {} db:{} user_pass:{}>".format(
-            self._ip_ports, self._db, self._user_pass
-        )
-
-    @property
-    def _redis(self):
-        try:
-            if not self.__redis.ping():
-                raise ConnectionError("unable to connect to redis")
-        except:
-            self._reconnect()
-
-        return self.__redis
-
-    @_redis.setter
-    def _redis(self, val):
-        self.__redis = val
-
-    def get_connect(self):
-        # 获取数据库连接
-        try:
-            if not self._url:
-                if not self._ip_ports:
-                    raise Exception("未设置 redis 连接信息")
-
-                ip_ports = (
-                    self._ip_ports
-                    if isinstance(self._ip_ports, list)
-                    else self._ip_ports.split(",")
-                )
-                if len(ip_ports) > 1:
-                    startup_nodes = []
-                    for ip_port in ip_ports:
-                        ip, port = ip_port.split(":")
-                        startup_nodes.append({"host": ip, "port": port})
-
-                    if self._service_name:
-                        # log.debug("使用redis哨兵模式")
-                        hosts = [(node["host"], node["port"]) for node in startup_nodes]
-                        sentinel = Sentinel(hosts, socket_timeout=3, **self._kwargs)
-                        self._redis = sentinel.master_for(
-                            self._service_name,
-                            password=self._user_pass,
-                            db=self._db,
-                            redis_class=redis.StrictRedis,
-                            decode_responses=self._decode_responses,
-                            max_connections=self._max_connections,
-                            **self._kwargs,
-                        )
-
-                    else:
-                        # log.debug("使用redis集群模式")
-                        self._redis = RedisCluster(
-                            startup_nodes=startup_nodes,
-                            decode_responses=self._decode_responses,
-                            password=self._user_pass,
-                            max_connections=self._max_connections,
-                            **self._kwargs,
-                        )
-
-                    self._is_redis_cluster = True
-                else:
-                    ip, port = ip_ports[0].split(":")
-                    self._redis = redis.StrictRedis(
-                        host=ip,
-                        port=port,
-                        db=self._db,
-                        password=self._user_pass,
-                        decode_responses=self._decode_responses,
-                        max_connections=self._max_connections,
-                        **self._kwargs,
-                    )
-                    self._is_redis_cluster = False
-            else:
-                self._redis = redis.StrictRedis.from_url(
-                    self._url, decode_responses=self._decode_responses
-                )
-                self._is_redis_cluster = False
-
-        except Exception:
-            raise
-
-        # 不要写成self._redis.ping() 否则循环调用了
-        return self.__redis.ping()
-
-    @classmethod
-    def from_url(cls, url):
-        """
-
-        Args:
-            url: redis://[[username]:[password]]@[host]:[port]/[db]
-
-        Returns:
-
-        """
-        return cls(url=url)
-
-    def sadd(self, table, values):
-        """
-        @summary: 使用无序set集合存储数据, 去重
-        ---------
-        @param table:
-        @param values: 值; 支持list 或 单个值
-        ---------
-        @result: 若库中存在 返回0,否则入库,返回1。 批量添加返回None
-        """
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.sadd(table, value)
-            pipe.execute()
-
-        else:
-            return self._redis.sadd(table, values)
-
-    def sget(self, table, count=1, is_pop=True):
-        """
-        返回 list 如 ['1'] 或 []
-        @param table:
-        @param count:
-        @param is_pop:
-        @return:
-        """
-
-        datas = []
-        if is_pop:
-            count = min(count, self.sget_count(table))
-            if count:
-                if count > 1:
-                    pipe = self._redis.pipeline()
-
-                    if not self._is_redis_cluster:
-                        pipe.multi()
-                    while count:
-                        pipe.spop(table)
-                        count -= 1
-                    datas = pipe.execute()
-
-                else:
-                    datas.append(self._redis.spop(table))
-
-        else:
-            datas = self._redis.srandmember(table, count)
-
-        return datas
-
-    def srem(self, table, values):
-        """
-        @summary: 移除集合中的指定元素
-        ---------
-        @param table:
-        @param values: 一个或者列表
-        ---------
-        @result:
-        """
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.srem(table, value)
-            pipe.execute()
-        else:
-            self._redis.srem(table, values)
-
-    def sget_count(self, table):
-        return self._redis.scard(table)
-
-    def sdelete(self, table):
-        """
-        @summary: 删除set集合的大键(数据量大的表)
-        删除大set键,使用sscan命令,每次扫描集合中500个元素,再用srem命令每次删除一个键
-        若直接用delete命令,会导致Redis阻塞,出现故障切换和应用程序崩溃的故障。
-        ---------
-        @param table:
-        ---------
-        @result:
-        """
-
-        # 当 SCAN 命令的游标参数被设置为 0 时, 服务器将开始一次新的迭代, 而当服务器向用户返回值为 0 的游标时, 表示迭代已结束
-        cursor = "0"
-        while cursor != 0:
-            cursor, data = self._redis.sscan(table, cursor=cursor, count=500)
-            for item in data:
-                self._redis.srem(table, item)
-
-    def sismember(self, table, key):
-        "Return a boolean indicating if ``value`` is a member of set ``name``"
-        return self._redis.sismember(table, key)
-
-    def zadd(self, table, values, prioritys=0):
-        """
-        @summary: 使用有序set集合存储数据, 去重(值存在更新)
-        ---------
-        @param table:
-        @param values: 值; 支持list 或 单个值
-        @param prioritys: 优先级; double类型,支持list 或 单个值。 根据此字段的值来排序, 值越小越优先。 可不传值,默认value的优先级为0
-        ---------
-        @result:若库中存在 返回0,否则入库,返回1。 批量添加返回 [0, 1 ...]
-        """
-        if isinstance(values, list):
-            if not isinstance(prioritys, list):
-                prioritys = [prioritys] * len(values)
-            else:
-                assert len(values) == len(prioritys), "values值要与prioritys值一一对应"
-
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value, priority in zip(values, prioritys):
-                pipe.execute_command(
-                    "ZADD", table, priority, value
-                )  # 为了兼容2.x与3.x版本的redis
-            return pipe.execute()
-
-        else:
-            return self._redis.execute_command(
-                "ZADD", table, prioritys, values
-            )  # 为了兼容2.x与3.x版本的redis
-
-    def zget(self, table, count=1, is_pop=True):
-        """
-        @summary: 从有序set集合中获取数据 优先返回分数小的(优先级高的)
-        ---------
-        @param table:
-        @param count: 数量 -1 返回全部数据
-        @param is_pop:获取数据后,是否在原set集合中删除,默认是
-        ---------
-        @result: 列表
-        """
-
-        start_pos = 0  # 包含
-        end_pos = count - 1 if count > 0 else count
-
-        pipe = self._redis.pipeline()
-
-        if not self._is_redis_cluster:
-            pipe.multi()  # 标记事务的开始 参考 http://www.runoob.com/redis/redis-transactions.html
-        pipe.zrange(table, start_pos, end_pos)  # 取值
-        if is_pop:
-            pipe.zremrangebyrank(table, start_pos, end_pos)  # 删除
-        results, *count = pipe.execute()
-        return results
-
-    def zremrangebyscore(self, table, priority_min, priority_max):
-        """
-        根据分数移除成员 闭区间
-        @param table:
-        @param priority_min:
-        @param priority_max:
-        @return: 被移除的成员个数
-        """
-        return self._redis.zremrangebyscore(table, priority_min, priority_max)
-
-    def zrangebyscore(self, table, priority_min, priority_max, count=None, is_pop=True):
-        """
-        @summary: 返回指定分数区间的数据 闭区间
-        ---------
-        @param table:
-        @param priority_min: 优先级越小越优先
-        @param priority_max:
-        @param count: 获取的数量,为空则表示分数区间内的全部数据
-        @param is_pop: 是否删除
-        ---------
-        @result:
-        """
-
-        # 使用lua脚本, 保证操作的原子性
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[2]
-            local max_score = ARGV[3]
-            local is_pop = ARGV[4]
-            local count = ARGV[5]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
-            end
-
-            -- 删除redis中刚取到的值
-            if (is_pop=='True' or is_pop=='1') then
-                for i=1, #datas do
-                    redis.call('zrem', KEYS[1], datas[i])
-                end
-            end
-
-
-            return datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(
-                keys=[table], args=[table, priority_min, priority_max, is_pop, count]
-            )
-        else:
-            res = cmd(keys=[table], args=[table, priority_min, priority_max, is_pop])
-
-        return res
-
-    def zrangebyscore_increase_score(
-        self, table, priority_min, priority_max, increase_score, count=None
-    ):
-        """
-        @summary: 返回指定分数区间的数据 闭区间, 同时修改分数
-        ---------
-        @param table:
-        @param priority_min: 最小分数
-        @param priority_max: 最大分数
-        @param increase_score: 分数值增量 正数则在原有的分数上叠加,负数则相减
-        @param count: 获取的数量,为空则表示分数区间内的全部数据
-        ---------
-        @result:
-        """
-
-        # 使用lua脚本, 保证操作的原子性
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[1]
-            local max_score = ARGV[2]
-            local increase_score = ARGV[3]
-            local count = ARGV[4]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
-            end
-
-            --修改优先级
-            for i=1, #datas do
-                redis.call('zincrby', KEYS[1], increase_score, datas[i])
-            end
-
-            return datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(
-                keys=[table], args=[priority_min, priority_max, increase_score, count]
-            )
-        else:
-            res = cmd(keys=[table], args=[priority_min, priority_max, increase_score])
-
-        return res
-
-    def zrangebyscore_set_score(
-        self, table, priority_min, priority_max, score, count=None
-    ):
-        """
-        @summary: 返回指定分数区间的数据 闭区间, 同时修改分数
-        ---------
-        @param table:
-        @param priority_min: 最小分数
-        @param priority_max: 最大分数
-        @param score: 分数值
-        @param count: 获取的数量,为空则表示分数区间内的全部数据
-        ---------
-        @result:
-        """
-
-        # 使用lua脚本, 保证操作的原子性
-        lua = """
-            -- local key = KEYS[1]
-            local min_score = ARGV[1]
-            local max_score = ARGV[2]
-            local set_score = ARGV[3]
-            local count = ARGV[4]
-
-            -- 取值
-            local datas = nil
-            if count then
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores','limit', 0, count)
-            else
-                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores')
-            end
-
-            local real_datas = {} -- 数据
-            --修改优先级
-            for i=1, #datas, 2 do
-               local data = datas[i]
-               local score = datas[i+1]
-
-               table.insert(real_datas, data) -- 添加数据
-
-               redis.call('zincrby', KEYS[1], set_score - score, datas[i])
-            end
-
-            return real_datas
-
-        """
-        cmd = self._redis.register_script(lua)
-        if count:
-            res = cmd(keys=[table], args=[priority_min, priority_max, score, count])
-        else:
-            res = cmd(keys=[table], args=[priority_min, priority_max, score])
-
-        return res
-
-    def zincrby(self, table, amount, value):
-        return self._redis.zincrby(table, amount, value)
-
-    def zget_count(self, table, priority_min=None, priority_max=None):
-        """
-        @summary: 获取表数据的数量
-        ---------
-        @param table:
-        @param priority_min:优先级范围 最小值(包含)
-        @param priority_max:优先级范围 最大值(包含)
-        ---------
-        @result:
-        """
-
-        if priority_min is not None and priority_max is not None:
-            return self._redis.zcount(table, priority_min, priority_max)
-        else:
-            return self._redis.zcard(table)
-
-    def zrem(self, table, values):
-        """
-        @summary: 移除集合中的指定元素
-        ---------
-        @param table:
-        @param values: 一个或者列表
-        ---------
-        @result:
-        """
-
-        if isinstance(values, list):
-            self._redis.zrem(table, *values)
-        else:
-            self._redis.zrem(table, values)
-
-    def zexists(self, table, values):
-        """
-        利用zscore判断某元素是否存在
-        @param values:
-        @return:
-        """
-
-        is_exists = []
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-            pipe.multi()
-            for value in values:
-                pipe.zscore(table, value)
-            is_exists_temp = pipe.execute()
-            for is_exist in is_exists_temp:
-                if is_exist is not None:
-                    is_exists.append(1)
-                else:
-                    is_exists.append(0)
-
-        else:
-            is_exists = self._redis.zscore(table, values)
-            is_exists = 1 if is_exists is not None else 0
-
-        return is_exists
-
-    def lpush(self, table, values):
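-        # 注意: 虽然方法名为 lpush,实际委托给 rpush(尾部插入),
-        # 与下方的 lpop 配合,队列表现为先进先出(FIFO)。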
-
-        if isinstance(values, list):
-            pipe = self._redis.pipeline()
-
-            if not self._is_redis_cluster:
-                pipe.multi()
-            for value in values:
-                pipe.rpush(table, value)
-            pipe.execute()
-
-        else:
-            return self._redis.rpush(table, values)
-
-    def lpop(self, table, count=1):
-        """
-        @summary:
-        ---------
-        @param table:
-        @param count:
-        ---------
-        @result: count>1时返回列表
-        """
-
-        datas = None
-
-        count = min(count, self.lget_count(table))
-
-        if count:
-            if count > 1:
-                pipe = self._redis.pipeline()
-
-                if not self._is_redis_cluster:
-                    pipe.multi()
-                while count:
-                    pipe.lpop(table)
-                    count -= 1
-                datas = pipe.execute()
-
-            else:
-                datas = self._redis.lpop(table)
-
-        return datas
-
-    def rpoplpush(self, from_table, to_table=None):
-        """
-        将列表 from_table 中的最后一个元素(尾元素)弹出,并返回给客户端。
-        将 from_table 弹出的元素插入到列表 to_table ,作为 to_table 列表的的头元素。
-        如果 from_table 和 to_table 相同,则列表中的表尾元素被移动到表头,并返回该元素,可以把这种特殊情况视作列表的旋转(rotation)操作
-        @param from_table:
-        @param to_table:
-        @return:
-        """
-
-        if not to_table:
-            to_table = from_table
-
-        return self._redis.rpoplpush(from_table, to_table)
-
-    def lget_count(self, table):
-        return self._redis.llen(table)
-
-    def lrem(self, table, value, num=0):
-        """
-        @summary:
-        删除value
-        ---------
-        @param table:
-        @param value:
-        @param num:
-        ---------
-        @result: 删除的条数
-        """
-        return self._redis.lrem(table, num, value)
-
-    def lrange(self, table, start=0, end=-1):
-        return self._redis.lrange(table, start, end)
-
-    def hset(self, table, key, value):
-        """
-        @summary:
-        如果 key 不存在,一个新的哈希表被创建并进行 HSET 操作。
-        如果域 field 已经存在于哈希表中,旧值将被覆盖
-        ---------
-        @param table:
-        @param key:
-        @param value:
-        ---------
-        @result: 1 新插入; 0 覆盖
-        """
-        return self._redis.hset(table, key, value)
-
-    def hset_batch(self, table, datas):
-        """
-        批量插入
-        Args:
-            datas:
-                [[key, value]]
-        Returns:
-
-        """
-        pipe = self._redis.pipeline()
-
-        if not self._is_redis_cluster:
-            pipe.multi()
-        for key, value in datas:
-            pipe.hset(table, key, value)
-        return pipe.execute()
-
-    def hincrby(self, table, key, increment):
-        return self._redis.hincrby(table, key, increment)
-
-    def hget(self, table, key, is_pop=False):
-        if not is_pop:
-            return self._redis.hget(table, key)
-        else:
-            lua = """
-                -- local key = KEYS[1]
-                local field = ARGV[1]
-
-                -- 取值
-                local datas = redis.call('hget', KEYS[1], field)
-                -- 删除值
-                redis.call('hdel', KEYS[1], field)
-
-                return datas
-
-                    """
-            cmd = self._redis.register_script(lua)
-            res = cmd(keys=[table], args=[key])
-
-            return res
-
-    def hgetall(self, table):
-        return self._redis.hgetall(table)
-
-    def hexists(self, table, key):
-        return self._redis.hexists(table, key)
-
-    def hdel(self, table, *keys):
-        """
-        @summary: 删除对应的key 可传多个
-        ---------
-        @param table:
-        @param *keys:
-        ---------
-        @result:
-        """
-        self._redis.hdel(table, *keys)
-
-    def hget_count(self, table):
-        return self._redis.hlen(table)
-
-    def setbit(self, table, offsets, values):
-        """
-        设置字符串数组某一位的值, 返回之前的值
-        @param table:
-        @param offsets: 支持列表或单个值
-        @param values: 支持列表或单个值
-        @return: list / 单个值
-        """
-        if isinstance(offsets, list):
-            if not isinstance(values, list):
-                values = [values] * len(offsets)
-            else:
-                assert len(offsets) == len(values), "offsets值要与values值一一对应"
-
-            pipe = self._redis.pipeline()
-            pipe.multi()
-
-            for offset, value in zip(offsets, values):
-                pipe.setbit(table, offset, value)
-
-            return pipe.execute()
-
-        else:
-            return self._redis.setbit(table, offsets, values)
-
-    def getbit(self, table, offsets):
-        """
-        取字符串数组某一位的值
-        @param table:
-        @param offsets: 支持列表
-        @return: list / 单个值
-        """
-        if isinstance(offsets, list):
-            pipe = self._redis.pipeline()
-            pipe.multi()
-            for offset in offsets:
-                pipe.getbit(table, offset)
-
-            return pipe.execute()
-
-        else:
-            return self._redis.getbit(table, offsets)
-
-    def bitcount(self, table):
-        return self._redis.bitcount(table)
-
-    def strset(self, table, value, **kwargs):
-        return self._redis.set(table, value, **kwargs)
-
-    def str_incrby(self, table, value):
-        return self._redis.incrby(table, value)
-
-    def strget(self, table):
-        return self._redis.get(table)
-
-    def strlen(self, table):
-        return self._redis.strlen(table)
-
-    def getkeys(self, regex):
-        return self._redis.keys(regex)
-
-    def exists_key(self, key):
-        return self._redis.exists(key)
-
-    def set_expire(self, key, seconds):
-        """
-        @summary: 设置过期时间
-        ---------
-        @param key:
-        @param seconds: 秒
-        ---------
-        @result:
-        """
-        self._redis.expire(key, seconds)
-
-    def get_expire(self, key):
-        """
-        @summary: 查询过期时间
-        ---------
-        @param key:
-        @param seconds: 秒
-        ---------
-        @result:
-        """
-        return self._redis.ttl(key)
-
-    def clear(self, table):
-        try:
-            self._redis.delete(table)
-        except Exception as e:
-            log.error(e)
-
-    def get_redis_obj(self):
-        return self._redis
-
-    def _reconnect(self):
-        # 检测连接状态, 当数据库重启或设置 timeout 导致断开连接时自动重连
-        retry_count = 0
-        while True:
-            try:
-                retry_count += 1
-                log.error(f"redis 连接断开, 重新连接 {retry_count}")
-                if self.get_connect():
-                    log.info(f"redis 连接成功")
-                    return True
-            except (ConnectionError, TimeoutError) as e:
-                log.error(f"连接失败 e: {e}")
-
-            time.sleep(2)
-
-    def __getattr__(self, name):
-        return getattr(self._redis, name)
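Taken together, `zadd`/`zget` above form a small priority queue. A sketch against a local Redis (the connection URL is an assumption):

```python
# Illustrative only -- adjust the URL to your environment.
from feapder.db.redisdb import RedisDB

db = RedisDB.from_url("redis://localhost:6379/0")

# Lower score = higher priority (see zget's docstring above).
db.zadd("spider:tasks", ["task_a", "task_b"], prioritys=[10, 1])

# is_pop=True fetches and deletes in one MULTI transaction
# (transactions are skipped on cluster deployments).
print(db.zget("spider:tasks", count=2))  # -> ['task_b', 'task_a']
```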

+ 0 - 140
spider_frame/FworkSpider/feapder/dedup/README.md

@@ -1,140 +0,0 @@
-# Dedup
-
-Dedup is feapder's big-data deduplication module. It ships with several interchangeable dedup backends behind one interface; how much data each can hold depends on available memory. Unlike a plain BloomFilter, whose accuracy is bounded by a fixed number of slots, Dedup uses an elastic mechanism that can deduplicate massive datasets.
-
-
-## Dedup modes
-
-### Temporary dedup
-
-> Redis-backed, batch-capable; entries expire after a configurable time. Deduplicating 10,000 records takes about 0.26 s, and 100 million records occupy roughly 1.43 GB of memory.
-
-```
-from feapder.dedup import Dedup
-
-data = {"xxx": 123, "xxxx": "xxxx"}
-datas = ["xxx", "bbb"]
-
-def test_ExpireFilter():
-    dedup = Dedup(
-        Dedup.ExpireFilter, expire_time=10, redis_url="redis://@localhost:6379/0"
-    )
-
-    # dedup one item at a time
-    assert dedup.add(data) == 1
-    assert dedup.get(data) == 1
-
-    # dedup in batch
-    assert dedup.add(datas) == [1, 1]
-    assert dedup.get(datas) == [1, 1]
-```
-
-
-### In-memory dedup
-
-> Purely in-memory, batch-capable. Deduplicating 10,000 records takes about 0.5 s, and 100 million records occupy roughly 285 MB of memory.
-
-```
-from feapder.dedup import Dedup
-
-data = {"xxx": 123, "xxxx": "xxxx"}
-datas = ["xxx", "bbb"]
-
-def test_MemoryFilter():
-    dedup = Dedup(Dedup.MemoryFilter)
-
-    # dedup one item at a time
-    assert dedup.add(data) == 1
-    assert dedup.get(data) == 1
-
-    # dedup in batch
-    assert dedup.add(datas) == [1, 1]
-    assert dedup.get(datas) == [1, 1]
-```
-
-### Permanent dedup
-
-> Redis-backed, batch-capable; entries never expire. Deduplicating 10,000 records takes about 3.5 s, and 100 million records occupy roughly 285 MB of memory.
-
-```python
-from feapder.dedup import Dedup
-
-datas = {
-    "xxx": 123,
-    "xxxx": "xxxx",
-}
-
-dedup = Dedup()
-
-print(dedup)             # <ScalableBloomFilter: RedisBitArray: dedup:bloomfilter:bloomfilter>
-print(dedup.add(datas))  # 1: added for the first time (was not present before)
-print(dedup.get(datas))  # 1: present now
-```
-## Filtering data
-
-Dedup can filter already-seen entries out of a list with the method below.
-
-
-```python
-from feapder.dedup import Dedup
-
-def test_filter():
-    dedup = Dedup(Dedup.BloomFilter, redis_url="redis://@localhost:6379/0")
-
-    # seed entries that already exist
-    datas = ["xxx", "bbb"]
-    dedup.add(datas)
-
-    # filter out the existing "xxx", "bbb"
-    datas = ["xxx", "bbb", "ccc"]
-    dedup.filter_exist_data(datas)
-    assert datas == ["ccc"]
-```
-
-```python
-# dedup backed by a redis cluster
-from feapder.dedup import Dedup
-
-def test_filter():
-    dedup = Dedup(Dedup.RedisFilter, to_md5=False, ip_ports=["192.168.3.207:2179", "192.168.3.166:2379"], expire_time=60)
-
-    # seed entries that already exist
-    datas = ["xxx", "bbb"]
-    dedup.add(datas)
-
-    # filter out the existing "xxx", "bbb"
-    datas = ["xxx", "bbb", "ccc"]
-    ss = dedup.filter_exist_data(datas)
-    print(ss)
-    assert datas == ["ccc"]
-```
-
-```python
-# dedup across multiple redis instances
-from feapder.dedup import Dedup
-
-def test_filter():
-    redisdb_conf = [
-        dict(
-            fingerprint_pref="pylist_",
-            ip_port="192.168.3.71:8371",
-            user_pass="top@123",
-            db=0
-        ),
-        dict(
-            fingerprint_pref="list_",
-            ip_port="192.168.3.165:8165",
-            user_pass="top@123",
-            db=0
-        )
-    ]
-    
-    dedup = Dedup(filter_type=Dedup.RedisMultiFilter, to_md5=False, redisdb_conf=redisdb_conf, expire_time=60)
-    datas = ["xxx", "bbb"]
-    dedup.add(datas)
-    
-    # filter out the existing "xxx", "bbb"
-    datas = ["xxx", "bbb", "ccc"]
-    dedup.filter_exist_data(datas)
-    print(datas)
-    assert datas == ["ccc"]
-```

+ 0 - 177
spider_frame/FworkSpider/feapder/dedup/__init__.py

@@ -1,177 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-12-13 21:08
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import copy
-from typing import Any, List, Union, Tuple, Callable, Optional
-
-from feapder.utils.tools import get_md5
-from .bloomfilter import BloomFilter, ScalableBloomFilter
-from .expirefilter import ExpireFilter
-from .litefilter import LiteFilter
-from .redisfilter import RedisFilter, RedisMultiFilter
-
-
-class Dedup:
-    BloomFilter = 1
-    MemoryFilter = 2
-    ExpireFilter = 3
-    LiteFilter = 4
-    RedisFilter = 5
-    RedisMultiFilter = 6
-
-    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
-        if filter_type == Dedup.ExpireFilter:
-            try:
-                expire_time = kwargs["expire_time"]
-            except:
-                raise ValueError("需传参数 expire_time")
-
-            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
-                "name", expire_time
-            )
-            expire_time_record_key = "dedup:expire_set:expire_time"
-
-            self.dedup = ExpireFilter(
-                name=name,
-                expire_time=expire_time,
-                expire_time_record_key=expire_time_record_key,
-                redis_url=kwargs.get("redis_url"),
-            )
-        elif filter_type == Dedup.RedisFilter:
-            self.dedup = RedisFilter(
-                ip_ports=kwargs.get("ip_ports"),
-                user_pass=kwargs.get("user_pass", ""),
-                redis_url=kwargs.get("redis_url"),
-                expire_time=kwargs.get("expire_time")
-            )
-        elif filter_type == Dedup.RedisMultiFilter:
-            self.dedup = RedisMultiFilter(
-                redisdb_conf=kwargs.get("redisdb_conf"),
-                expire_time=kwargs.get("expire_time")
-            )
-        else:
-            initial_capacity = kwargs.get("initial_capacity", 100000000)
-            error_rate = kwargs.get("error_rate", 0.00001)
-            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get("name", "bloomfilter")
-            if filter_type == Dedup.BloomFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
-                    redis_url=kwargs.get("redis_url"),
-                )
-            elif filter_type == Dedup.MemoryFilter:
-                self.dedup = ScalableBloomFilter(
-                    name=name,
-                    initial_capacity=initial_capacity,
-                    error_rate=error_rate,
-                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
-                )
-            elif filter_type == Dedup.LiteFilter:
-                self.dedup = LiteFilter()
-            else:
-                raise ValueError(
-                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、"
-                    "Dedup.ExpireFilter、Dedup.LiteFilter、Dedup.RedisFilter、Dedup.RedisMultiFilter"
-                )
-
-        self._to_md5 = to_md5
-
-    def __repr__(self):
-        return str(self.dedup)
-
-    def _deal_datas(self, datas):
-        if self._to_md5:
-            if isinstance(datas, list):
-                keys = [get_md5(data) for data in datas]
-            else:
-                keys = get_md5(datas)
-        else:
-            keys = copy.deepcopy(datas)
-
-        return keys
-
-    def add(
-        self, datas: Union[List[Any], Any], skip_check: bool = False
-    ) -> Union[List[Any], Any]:
-        """
-        添加数据
-        @param datas: list / 单个值
-        @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
-        @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
-        """
-
-        keys = self._deal_datas(datas)
-        is_added = self.dedup.add(keys, skip_check)
-
-        return is_added
-
-    def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
-        """
-        检查数据是否存在
-        @param datas: list / 单个值
-        @return: list / 单个值 (存在返回1 不存在返回0)
-        """
-        keys = self._deal_datas(datas)
-        is_exists = self.dedup.get(keys)
-
-        return is_exists
-
-    def filter_exist_data(
-        self,
-        datas: List[Any],
-        *,
-        datas_fingerprints: Optional[List] = None,
-        callback: Callable[[Any], None] = None
-    ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
-        """
-        过滤掉已存在的数据
-        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
-        @param datas_fingerprints: 数据的唯一指纹 列表
-        @param datas: 数据 列表
-        @param callback: 数据已存在时的回调 callback(data)
-        @return: 去重后的 datas;传入 datas_fingerprints 时返回 (datas, datas_fingerprints)
-        """
-
-        is_exists = self.get(datas_fingerprints or datas)
-
-        dedup_datas = []
-
-        if datas_fingerprints:
-            dedup_datas_fingerprints = []
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-                data_fingerprint = datas_fingerprints.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                    dedup_datas_fingerprints.append(data_fingerprint)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas_fingerprints.extend(dedup_datas_fingerprints)
-            datas.extend(dedup_datas)
-            return datas, datas_fingerprints
-
-        else:
-            while is_exists:
-                data = datas.pop(0)
-                is_exist = is_exists.pop(0)
-
-                if not is_exist:
-                    dedup_datas.append(data)
-                else:
-                    if callback:
-                        callback(data)
-
-            datas.extend(dedup_datas)
-            return datas
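A short sketch of `filter_exist_data` with the `callback` hook, using the process-local `LiteFilter` so no Redis is required (illustrative, not taken from the original docs):

```python
from feapder.dedup import Dedup

dedup = Dedup(Dedup.LiteFilter)
dedup.add(["a", "b"])

datas = ["a", "b", "c"]
# callback fires once per already-seen item; the list is modified in place.
dedup.filter_exist_data(datas, callback=lambda d: print("duplicate:", d))
assert datas == ["c"]
```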

+ 0 - 41
spider_frame/FworkSpider/feapder/dedup/basefilter.py

@@ -1,41 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/9/21 11:17 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import abc
-from typing import List, Union
-
-
-class BaseFilter:
-    @abc.abstractmethod
-    def add(
-        self, keys: Union[List[str], str], *args, **kwargs
-    ) -> Union[List[bool], bool]:
-        """
-
-        Args:
-            keys: list / 单个值
-            *args:
-            **kwargs:
-
-        Returns:
-            list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
-        """
-        pass
-
-    @abc.abstractmethod
-    def get(self, keys: Union[List[str], str]) -> Union[List[bool], bool]:
-        """
-        检查数据是否存在
-        Args:
-            keys: list / 单个值
-
-        Returns:
-            list / 单个值 (如果数据已存在 返回 1 否则返回 0)
-        """
-        pass

+ 0 - 143
spider_frame/FworkSpider/feapder/dedup/bitarray.py

@@ -1,143 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/14 1:05 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-from __future__ import absolute_import
-
-
-from feapder.db.redisdb import RedisDB
-
-
-class BitArray:
-    def setall(self, value):
-        pass
-
-    def __repr__(self):
-        raise NotImplementedError("this method must be implemented")
-
-    def set(self, offsets, values):
-        """
-        设置字符串数组某一位的值, 返回之前的值
-        @param offsets: 支持列表或单个值
-        @param values: 支持列表或单个值
-        @return: list / 单个值
-        """
-        raise NotImplementedError("this method must be implemented")
-
-    def get(self, offsets):
-        """
-        取字符串数组某一位的值
-        @param offsets: 支持列表或单个值
-        @return: list / 单个值
-        """
-        raise NotImplementedError("this method must be implemented")
-
-    def count(self, value=True):
-        raise NotImplementedError("this method must be implemented")
-
-
-class MemoryBitArray(BitArray):
-    def __init__(self, num_bits):
-        try:
-            import bitarray
-        except Exception as e:
-            raise Exception(
-                "需要安装feapder完整版\ncommand: pip install feapder[all]\n若安装出错,参考:https://boris.org.cn/feapder/#/question/%E5%AE%89%E8%A3%85%E9%97%AE%E9%A2%98"
-            )
-
-        self.num_bits = num_bits
-        self.bitarray = bitarray.bitarray(num_bits, endian="little")
-
-        self.setall(0)
-
-    def __repr__(self):
-        return "MemoryBitArray: {}".format(self.num_bits)
-
-    def setall(self, value):
-        self.bitarray.setall(value)
-
-    def set(self, offsets, values):
-        """
-        设置字符串数组某一位的值, 返回之前的值
-        @param offsets: 支持列表或单个值
-        @param values: 支持列表或单个值
-        @return: list / 单个值
-        """
-
-        old_values = []
-
-        if isinstance(offsets, list):
-            if not isinstance(values, list):
-                values = [values] * len(offsets)
-            else:
-                assert len(offsets) == len(values), "offsets值要与values值一一对应"
-
-            for offset, value in zip(offsets, values):
-                old_values.append(int(self.bitarray[offset]))
-                self.bitarray[offset] = value
-
-        else:
-            old_values = int(self.bitarray[offsets])
-            self.bitarray[offsets] = values
-
-        return old_values
-
-    def get(self, offsets):
-        """
-        取字符串数组某一位的值
-        @param offsets: 支持列表或单个值
-        @return: list / 单个值
-        """
-        if isinstance(offsets, list):
-            return [self.bitarray[offset] for offset in offsets]
-        else:
-            return self.bitarray[offsets]
-
-    def count(self, value=True):
-        return self.bitarray.count(value)
-
-
-class RedisBitArray(BitArray):
-    """
-    仿bitarray 基于redis
-    """
-
-    redis_db = None
-
-    def __init__(self, name, redis_url=None):
-        self.name = name
-        self.count_cached_name = name + "_count_cached"
-
-        if not self.__class__.redis_db:
-            self.__class__.redis_db = RedisDB(url=redis_url)
-
-    def __repr__(self):
-        return "RedisBitArray: {}".format(self.name)
-
-    def set(self, offsets, values):
-        """
-        设置字符串数组某一位的值, 返回之前的值
-        @param offsets: 支持列表或单个值
-        @param values: 支持列表或单个值
-        @return: list / 单个值
-        """
-        return self.redis_db.setbit(self.name, offsets, values)
-
-    def get(self, offsets):
-        return self.redis_db.getbit(self.name, offsets)
-
-    def count(self, value=True):
-        # 先查redis的缓存,若没有 在统计数量
-        count = self.redis_db.strget(self.count_cached_name)
-        if count:
-            return int(count)
-        else:
-            count = self.redis_db.bitcount(self.name)
-            self.redis_db.strset(self.count_cached_name, count, ex=1800)  # 半小时过期
-            return count
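Both backends expose the same three operations, so the BloomFilter below can swap them freely. A quick sketch (the Redis URL is an assumption; `MemoryBitArray` needs the optional `bitarray` dependency):

```python
from feapder.dedup.bitarray import MemoryBitArray, RedisBitArray

mem = MemoryBitArray(num_bits=1024)
print(mem.set([3, 7], 1))    # -> [0, 0], the previous bit values
print(mem.get([3, 7, 9]))    # -> [True, True, False]
print(mem.count())           # -> 2

# Same interface, but the bits live in a Redis string key. count() caches
# its result for 30 minutes because BITCOUNT over a huge key is expensive.
rbits = RedisBitArray("dedup:demo", redis_url="redis://localhost:6379/0")
rbits.set([3, 7], 1)
print(rbits.get(3))          # -> 1
```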

+ 0 - 373
spider_frame/FworkSpider/feapder/dedup/bloomfilter.py

@@ -1,373 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/13 4:11 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import hashlib
-import math
-import threading
-import time
-from struct import unpack, pack
-
-from feapder.dedup.basefilter import BaseFilter
-from feapder.utils.redis_lock import RedisLock
-from . import bitarray
-
-
-def make_hashfuncs(num_slices, num_bits):
-    if num_bits >= (1 << 31):
-        fmt_code, chunk_size = "Q", 8
-    elif num_bits >= (1 << 15):
-        fmt_code, chunk_size = "I", 4
-    else:
-        fmt_code, chunk_size = "H", 2
-    total_hash_bits = 8 * num_slices * chunk_size
-    if total_hash_bits > 384:
-        hashfn = hashlib.sha512
-    elif total_hash_bits > 256:
-        hashfn = hashlib.sha384
-    elif total_hash_bits > 160:
-        hashfn = hashlib.sha256
-    elif total_hash_bits > 128:
-        hashfn = hashlib.sha1
-    else:
-        hashfn = hashlib.md5
-    fmt = fmt_code * (hashfn().digest_size // chunk_size)
-    num_salts, extra = divmod(num_slices, len(fmt))
-    if extra:
-        num_salts += 1
-    salts = tuple(hashfn(hashfn(pack("I", i)).digest()) for i in range(num_salts))
-
-    def _make_hashfuncs(key):
-        if isinstance(key, str):
-            key = key.encode("utf-8")
-        else:
-            key = str(key).encode("utf-8")
-
-        i = 0
-        for salt in salts:
-            h = salt.copy()
-            h.update(key)
-            for uint in unpack(fmt, h.digest()):
-                yield uint % num_bits
-                i += 1
-                if i >= num_slices:
-                    return
-
-    return _make_hashfuncs
-
-
-class BloomFilter(object):
-    BASE_MEMORY = 1
-    BASE_REDIS = 2
-
-    def __init__(
-        self,
-        capacity: int,
-        error_rate: float = 0.00001,
-        bitarray_type=BASE_REDIS,
-        name=None,
-        redis_url=None,
-    ):
-        if not (0 < error_rate < 1):
-            raise ValueError("Error_Rate must be between 0 and 1.")
-        if not capacity > 0:
-            raise ValueError("Capacity must be > 0")
-
-        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
-        # k = log2(1/P)
-        # solving for m = bits_per_slice
-        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
-        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
-        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
-        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
-        bits_per_slice = int(
-            math.ceil(
-                (capacity * abs(math.log(error_rate)))
-                / (num_slices * (math.log(2) ** 2))
-            )
-        )
-        self._setup(error_rate, num_slices, bits_per_slice, capacity)
-
-        if bitarray_type == BloomFilter.BASE_MEMORY:
-            self.bitarray = bitarray.MemoryBitArray(self.num_bits)
-            self.bitarray.setall(False)
-        elif bitarray_type == BloomFilter.BASE_REDIS:
-            assert name, "name can't be None "
-            self.bitarray = bitarray.RedisBitArray(name, redis_url)
-        else:
-            raise ValueError("not support this bitarray type")
-
-    def _setup(self, error_rate, num_slices, bits_per_slice, capacity):
-        self.error_rate = error_rate
-        self.num_slices = num_slices
-        self.bits_per_slice = bits_per_slice
-        self.capacity = capacity
-        self.num_bits = num_slices * bits_per_slice
-        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
-
-        self._is_at_capacity = False
-        self._check_capacity_time = 0
-
-    def __repr__(self):
-        return "<BloomFilter: {}>".format(self.bitarray)
-
-    def get(self, keys, to_list=False):
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-        is_exists = []
-
-        offsets = []
-        for key in keys:
-            hashes = self.make_hashes(key)
-            offset = 0
-            for k in hashes:
-                offsets.append(offset + k)
-                offset += self.bits_per_slice
-
-        old_values = self.bitarray.get(offsets)
-        for i in range(0, len(old_values), self.num_slices):
-            is_exists.append(int(all(old_values[i : i + self.num_slices])))
-
-        if to_list:
-            return is_exists
-        else:
-            return is_exists if is_list else is_exists[0]
-
-    @property
-    def is_at_capacity(self):
-        """
-        是否容量已满, 1的个数满位数组的一半的时,则看做已满
-        比较耗时 半小时检查一次
-        @return:
-        """
-        if self._is_at_capacity:
-            return self._is_at_capacity
-
-        if (
-            not self._check_capacity_time
-            or time.time() - self._check_capacity_time > 1800
-        ):
-            bit_count = self.bitarray.count()
-            if bit_count and bit_count / self.num_bits > 0.5:
-                self._is_at_capacity = True
-
-            self._check_capacity_time = time.time()
-
-        return self._is_at_capacity
-
-    def add(self, keys):
-        """
-        Adds a key to this bloom filter. If the key already exists in this
-        filter it will return False. Otherwise True. keys support list
-        @param keys: list or one key
-        @return:
-        """
-        # if self.is_at_capacity:
-        #     raise IndexError("BloomFilter is at capacity")
-
-        is_list = isinstance(keys, list)
-
-        keys = keys if is_list else [keys]
-        is_added = []
-
-        offsets = []
-        for key in keys:
-            hashes = self.make_hashes(key)
-            offset = 0
-            for k in hashes:
-                offsets.append(offset + k)
-                offset += self.bits_per_slice
-
-        old_values = self.bitarray.set(offsets, 1)
-        for i in range(0, len(old_values), self.num_slices):
-            is_added.append(1 ^ int(all(old_values[i : i + self.num_slices])))
-
-        return is_added if is_list else is_added[0]
-
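To make the sizing comments in `__init__` concrete, here is the same arithmetic evaluated for the module defaults (capacity 1e8, error rate 1e-5); the result matches the ~285 MB figure quoted in the dedup README:

```python
import math

capacity, error_rate = 100_000_000, 0.00001

num_slices = math.ceil(math.log(1.0 / error_rate, 2))  # k = log2(1/P) = 17
bits_per_slice = math.ceil(
    capacity * abs(math.log(error_rate)) / (num_slices * math.log(2) ** 2)
)                                                       # m ~= 1.41e8
num_bits = num_slices * bits_per_slice                  # M ~= 2.4e9 bits

print(num_bits / 8 / 1024 ** 2)                         # ~285 MiB
```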
-
-class ScalableBloomFilter(BaseFilter):
-    """
-    自动扩展空间的bloomfilter, 当一个filter满一半的时候,创建下一个
-    """
-
-    BASE_MEMORY = BloomFilter.BASE_MEMORY
-    BASE_REDIS = BloomFilter.BASE_REDIS
-
-    def __init__(
-        self,
-        initial_capacity: int = 100000000,
-        error_rate: float = 0.00001,
-        bitarray_type=BASE_REDIS,
-        name=None,
-        redis_url=None,
-    ):
-
-        if not error_rate or not (0 < error_rate < 1):
-            raise ValueError("Error_Rate must be a decimal between 0 and 1.")
-
-        self._setup(
-            initial_capacity, error_rate, name, bitarray_type, redis_url=redis_url
-        )
-
-    def _setup(self, initial_capacity, error_rate, name, bitarray_type, redis_url):
-        self.initial_capacity = initial_capacity
-        self.error_rate = error_rate
-        self.name = name
-        self.bitarray_type = bitarray_type
-        self.redis_url = redis_url
-
-        self.filters = []
-
-        self.filters.append(self.create_filter())
-        self._thread_lock = threading.RLock()
-        self._check_capacity_time = 0
-
-    def __repr__(self):
-        return "<ScalableBloomFilter: {}>".format(self.filters[-1].bitarray)
-
-    def create_filter(self):
-        bloom_filter = BloomFilter(
-            capacity=self.initial_capacity,
-            error_rate=self.error_rate,
-            bitarray_type=self.bitarray_type,
-            name=self.name + str(len(self.filters)) if self.name else self.name,
-            redis_url=self.redis_url,
-        )
-        return bloom_filter
-
-    def check_filter_capacity(self):
-        """
-        检测filter状态,如果已满,加载新的filter
-        @return:
-        """
-        if (
-            not self._check_capacity_time
-            or time.time() - self._check_capacity_time > 1800
-        ):
-            if self.bitarray_type == ScalableBloomFilter.BASE_MEMORY:
-                with self._thread_lock:
-                    while True:
-                        if self.filters[-1].is_at_capacity:
-                            self.filters.append(self.create_filter())
-                        else:
-                            break
-
-                    self._check_capacity_time = time.time()
-            else:
-                # 全局锁 同一时间只有一个进程在真正的创建新的filter,等这个进程创建完,其他进程只是把刚创建的filter append进来
-                key = f"ScalableBloomFilter:{self.name}" if self.name else "ScalableBloomFilter"
-                with RedisLock(key=key, redis_url=self.redis_url) as lock:
-                    if lock.locked:
-                        while True:
-                            if self.filters[-1].is_at_capacity:
-                                self.filters.append(self.create_filter())
-                            else:
-                                break
-
-                        self._check_capacity_time = time.time()
-
-    def add(self, keys, skip_check=False, *args, **kwargs):
-        """
-        Adds a key to this bloom filter. If the key already exists in this
-        filter it will return False. Otherwise True. keys support list
-        @param keys: list or one key
-        @param skip_check: add directly,not check if is exist in bloomfilters
-        @return:
-        """
-
-        self.check_filter_capacity()
-
-        current_filter = self.filters[-1]
-
-        if skip_check:
-            return current_filter.add(keys)
-        else:
-            is_list = isinstance(keys, list)
-
-            keys = keys if is_list else [keys]
-            not_exist_keys = list(set(keys))
-
-            # 检查之前的bloomfilter是否存在
-            # 记录下每级filter存在的key,不存在的key继续向下检查
-            for filter_ in reversed(self.filters):
-                current_filter_is_exists = filter_.get(
-                    not_exist_keys, to_list=True
-                )  # 当前的filter是否存在
-
-                not_exist_keys_temp = []
-
-                for key, is_exist in zip(not_exist_keys, current_filter_is_exists):
-                    if not is_exist:  # 当前filter不存在的key 需要继续向下检查
-                        not_exist_keys_temp.append(key)
-
-                not_exist_keys = not_exist_keys_temp
-
-                if not not_exist_keys:
-                    break
-
-            # 仍有不存在的关键词,记录该关键词
-            if not_exist_keys:
-                current_filter.add(not_exist_keys)
-
-            # 比较key是否已存在, 内部重复的key 若不存在则只留其一算作不存在,其他看作已存在
-            for i, key in enumerate(keys):
-                for j, not_exist_key in enumerate(not_exist_keys):
-                    if key == not_exist_key:
-                        keys[i] = 1
-                        not_exist_keys.pop(j)
-                        break
-                else:
-                    keys[i] = 0
-
-            is_added = keys
-            return is_added if is_list else is_added[0]
-
-    def get(self, keys):
-        self.check_filter_capacity()
-
-        is_list = isinstance(keys, list)
-
-        keys = keys if is_list else [keys]  # 最终会修改为 [0, 1, ...] 0表示不存在 1 已存在
-        not_exist_keys = list(set(keys))
-
-        # 检查之前的bloomfilter是否存在
-        # 记录下每级filter存在的key,不存在的key继续向下检查
-        for filter_ in reversed(self.filters):
-            current_filter_is_exists = filter_.get(
-                not_exist_keys, to_list=True
-            )  # 当前的filter是否存在
-
-            not_exist_keys_temp = []
-
-            for checked_key, is_exist in zip(not_exist_keys, current_filter_is_exists):
-                if not is_exist:  # 当前filter不存在的key 需要继续向下检查
-                    not_exist_keys_temp.append(checked_key)
-
-            not_exist_keys = not_exist_keys_temp
-
-            if not not_exist_keys:
-                break
-
-        # 比较key是否已存在, 内部重复的key 若不存在则只留其一算作不存在,其他看作已存在
-        for i, key in enumerate(keys):
-            for j, not_exist_key in enumerate(not_exist_keys):
-                if key == not_exist_key:
-                    keys[i] = 0
-                    not_exist_keys.pop(j)
-                    break
-            else:
-                keys[i] = 1
-
-        is_exists = keys
-        return is_exists if is_list else is_exists[0]
-
-    @property
-    def capacity(self):
-        """Returns the total capacity for all filters in this SBF"""
-        return sum(f.capacity for f in self.filters)
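A minimal sketch of the scalable filter with the in-memory backend (tiny capacity chosen purely so the example is cheap to run; the memory backend needs the optional `bitarray` dependency):

```python
from feapder.dedup.bloomfilter import ScalableBloomFilter

sbf = ScalableBloomFilter(
    initial_capacity=10_000,
    error_rate=0.001,
    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
)

print(sbf.add(["a", "b", "a"]))  # -> [1, 1, 0]; an in-batch duplicate counts once
print(sbf.get(["a", "c"]))       # -> [1, 0]
# Once the newest filter is about half full, check_filter_capacity()
# chains a fresh BloomFilter behind it instead of letting accuracy degrade.
```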

+ 0 - 81
spider_frame/FworkSpider/feapder/dedup/expirefilter.py

@@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/13 9:44 PM
----------
-@summary: 带有有效期的去重集合
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import time
-
-from feapder.db.redisdb import RedisDB
-from feapder.dedup.basefilter import BaseFilter
-
-
-class ExpireFilter(BaseFilter):
-    redis_db = None
-
-    def __init__(
-        self, name: str, expire_time: int, expire_time_record_key=None, redis_url=None
-    ):
-        if not name:
-            raise ValueError("name can't be None")
-        if not expire_time:
-            raise ValueError("please set expire time, units is seconds")
-
-        if not self.__class__.redis_db:
-            self.__class__.redis_db = RedisDB(url=redis_url)
-
-        self.name = name
-        self.expire_time = expire_time
-        self.expire_time_record_key = expire_time_record_key
-        self.del_expire_key_time = None
-
-        self.record_expire_time()
-
-        self.del_expire_key()
-
-    def __repr__(self):
-        return "<ExpireSet: {}>".format(self.name)
-
-    @property
-    def current_timestamp(self):
-        return int(time.time())
-
-    def add(self, keys, *args, **kwargs):
-        """
-        @param keys: 检查关键词在zset中是否存在,支持列表批量
-        @return: list / 单个值
-        """
-        if self.current_timestamp - self.del_expire_key_time > self.expire_time:
-            self.del_expire_key()
-
-        is_added = self.redis_db.zadd(self.name, keys, self.current_timestamp)
-        return is_added
-
-    def get(self, keys):
-        is_exist = self.redis_db.zexists(self.name, keys)
-        if isinstance(keys, list):
-            # 判断数据本身是否重复
-            temp_set = set()
-            for i, key in enumerate(keys):
-                if key in temp_set:
-                    is_exist[i] = 1
-                else:
-                    temp_set.add(key)
-
-        return is_exist
-
-    def del_expire_key(self):
-        self.redis_db.zremrangebyscore(
-            self.name, "-inf", self.current_timestamp - self.expire_time
-        )
-        self.del_expire_key_time = self.current_timestamp
-
-    def record_expire_time(self):
-        if self.expire_time_record_key:
-            self.redis_db.hset(
-                self.expire_time_record_key, key=self.name, value=self.expire_time
-            )
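The filter stores fingerprints in a sorted set scored by insertion timestamp, so expiry is just a `ZREMRANGEBYSCORE` below `now - expire_time`. A usage sketch (a reachable local Redis is assumed):

```python
import time

from feapder.dedup.expirefilter import ExpireFilter

f = ExpireFilter(name="dedup:expire_set:demo", expire_time=5,
                 redis_url="redis://localhost:6379/0")

f.add("task-1")
print(f.get("task-1"))  # -> 1, still inside the 5-second window

time.sleep(6)
f.del_expire_key()      # prune members scored older than now - expire_time
print(f.get("task-1"))  # -> 0, the fingerprint has aged out
```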

+ 0 - 70
spider_frame/FworkSpider/feapder/dedup/litefilter.py

@@ -1,70 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2022/9/21 11:28 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-from typing import List, Union, Set
-
-from feapder.dedup.basefilter import BaseFilter
-
-
-class LiteFilter(BaseFilter):
-    def __init__(self):
-        self.datas: Set[str] = set()
-
-    def add(
-        self, keys: Union[List[str], str], *args, **kwargs
-    ) -> Union[List[int], int]:
-        """
-
-        Args:
-            keys: list / 单个值
-            *args:
-            **kwargs:
-
-        Returns:
-            list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
-        """
-        if isinstance(keys, list):
-            is_add = []
-            for key in keys:
-                if key not in self.datas:
-                    self.datas.add(key)
-                    is_add.append(1)
-                else:
-                    is_add.append(0)
-        else:
-            if keys not in self.datas:
-                is_add = 1
-                self.datas.add(keys)
-            else:
-                is_add = 0
-        return is_add
-
-    def get(self, keys: Union[List[str], str]) -> Union[List[int], int]:
-        """
-        检查数据是否存在
-        Args:
-            keys: list / 单个值
-
-        Returns:
-            list / 单个值 (如果数据已存在 返回 1 否则返回 0)
-        """
-        if isinstance(keys, list):
-            temp_set = set()
-            is_exist = []
-            for key in keys:
-                # 数据本身重复或者数据在去重库里
-                if key in temp_set or key in self.datas:
-                    is_exist.append(1)
-                else:
-                    is_exist.append(0)
-                    temp_set.add(key)
-
-            return is_exist
-        else:
-            return int(keys in self.datas)
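Since `LiteFilter` is just a process-local set, it suits single-run scripts where cross-process dedup is unnecessary:

```python
from feapder.dedup.litefilter import LiteFilter

f = LiteFilter()
print(f.add(["u1", "u2", "u1"]))  # -> [1, 1, 0]
print(f.get(["u1", "u3"]))        # -> [1, 0]
print(f.get("u3"))                # -> 0; state is lost when the process exits
```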

+ 0 - 138
spider_frame/FworkSpider/feapder/dedup/redisfilter.py

@@ -1,138 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-03-01
----------
-@summary: 集群/单机/多台单机 redis 过滤器
----------
-@author: dzr
-@email: dongzhaorui@topnet.net.cn
-"""
-
-from feapder.db.redisdb import RedisDB
-from feapder.dedup.basefilter import BaseFilter
-
-
-class RedisFilter(BaseFilter):
-    redis_db = None
-
-    def __init__(self, ip_ports=None, user_pass=None, redis_url=None, expire_time=None):
-        if redis_url:
-            self.__class__.redis_db = RedisDB.from_url(redis_url)  # 单机
-        else:
-            self.__class__.redis_db = RedisDB(
-                ip_ports=ip_ports,
-                user_pass=user_pass,
-                decode_responses=True,
-            )  # 集群/单机
-
-        self._ex = expire_time or 86400 * 365 * 2  # 2年 = 86400 * 365 * 2
-        self._fingerprint_pref = "pylist_"  # 数据指纹前缀标识
-
-    def __repr__(self):
-        return "<RedisFilter: {}>".format(self.redis_db)
-
-    def exists(self, key):
-        """全量检索"""
-        if self.redis_db.exists(self._fingerprint_pref + key) > 0:
-            return True
-        return False
-
-    def add(self, keys, *args, **kwargs):
-        """
-        添加数据
-        @param keys: 检查关键词在 redis 中是否存在,支持列表批量
-        @return: list / 单个值(如果数据已存在 返回 False 否则返回 True, 可以理解为是否添加成功)
-        """
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-
-        is_added = []
-        for key in keys:
-            if not self.exists(key):
-                key = self._fingerprint_pref + key
-                is_added.append(self.redis_db.set(key, 1, ex=self._ex))
-            else:
-                is_added.append(False)
-
-        return is_added if is_list else is_added[0]
-
-    def get(self, keys):
-        """
-        检查数据是否存在
-        @param keys: list / 单个值
-        @return: list / 单个值 (存在返回True 不存在返回False)
-        """
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-
-        is_exist = []
-        for key in keys:
-            is_exist.append(self.exists(key))
-
-        # 判断数据本身是否重复
-        temp_set = set()
-        for i, key in enumerate(keys):
-            if key in temp_set:
-                is_exist[i] = True
-            else:
-                temp_set.add(key)
-
-        return is_exist if is_list else is_exist[0]
-
-
-class RedisMultiFilter(RedisFilter):
-    redis_dbs = {}
-
-    def __init__(self, redisdb_conf=None, **kwargs):
-        super(RedisMultiFilter, self).__init__(**kwargs)
-
-        self._go_fingerprint_pref = "list_"  # lua 数据指纹前缀标识
-        self._py_fingerprint_pref = "pylist_"  # python 数据指纹前缀标识
-
-        if not redisdb_conf:
-            self.__class__.redis_dbs[self._py_fingerprint_pref] = RedisDB()
-        else:
-            if not isinstance(redisdb_conf, list):
-                raise ValueError("redisdb_conf 必须是一个 list")
-
-            # 检查指纹前缀
-            for conf in redisdb_conf:
-                fingerprint_pref = conf["fingerprint_pref"]
-                if fingerprint_pref not in [self._py_fingerprint_pref, self._go_fingerprint_pref]:
-                    raise AttributeError(f"unknown fingerprint pref '{fingerprint_pref}'")
-
-                self.__class__.redis_dbs[fingerprint_pref] = RedisDB(
-                    ip_ports=conf["ip_port"],
-                    user_pass=conf["user_pass"],
-                    db=conf["db"]
-                )
-
-    def __repr__(self):
-        return "<RedisMultiFilter: {}>".format(self.redis_dbs)
-
-    def exists(self, key):
-        """lua增量检索/python增量检索"""
-        for fingerprint_pref, redis_db in self.redis_dbs.items():
-            if redis_db.exists(fingerprint_pref + key) > 0:
-                return True
-        return False
-
-    def add(self, keys, *args, **kwargs):
-        """
-        Add fingerprints
-        @param keys: key(s) to register in redis; a list is processed as a batch
-        @return: list / single value (False if the key already existed, True otherwise, i.e. whether the add succeeded)
-        """
-        is_list = isinstance(keys, list)
-        keys = keys if is_list else [keys]
-        redis_db = self.redis_dbs[self._py_fingerprint_pref]
-
-        is_added = []
-        for key in keys:
-            if not self.exists(key):
-                key = self._py_fingerprint_pref + key
-                is_added.append(redis_db.set(key, 1, ex=self._ex))
-            else:
-                is_added.append(False)
-
-        return is_added if is_list else is_added[0]
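A minimal usage sketch of the RedisFilter removed above (illustrative only, not repository code; it assumes a reachable Redis at redis://127.0.0.1:6379/0 and the framework still on the import path):

    from feapder.dedup.redisfilter import RedisFilter

    dedup = RedisFilter(redis_url="redis://127.0.0.1:6379/0", expire_time=86400)

    # add() writes fingerprints and reports which keys were new
    print(dedup.add(["key1", "key2", "key1"]))  # -> [True, True, False]

    # get() reports existence and also flags in-batch duplicates
    print(dedup.get(["key1", "key3", "key3"]))  # -> [True, False, True]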

+ 0 - 0
spider_frame/FworkSpider/feapder/network/__init__.py


+ 0 - 786
spider_frame/FworkSpider/feapder/network/cookie_pool.py

@@ -1,786 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018/12/27 11:32 AM
----------
-@summary: cookie pool
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import abc
-import datetime
-import random
-import time
-import warnings
-from collections.abc import Iterable  # "collections.Iterable" was removed in Python 3.10
-from enum import Enum, unique
-
-import requests
-from func_timeout import func_set_timeout
-
-import feapder.utils.tools as tools
-from feapder import setting
-from feapder.db.mongodb import MongoDB
-from feapder.db.redisdb import RedisDB
-from feapder.network import user_agent
-from feapder.utils import metrics
-from feapder.utils.log import log
-from feapder.utils.redis_lock import RedisLock
-from feapder.utils.tools import send_msg
-
-
-class CookiePoolInterface(metaclass=abc.ABCMeta):
-    """
-    cookie pool interface
-    """
-
-    @abc.abstractmethod
-    def create_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def get_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def del_cookie(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def run(self):
-        raise NotImplementedError
-
-
-class PageCookiePool(CookiePoolInterface):
-    """
-    Cookies produced by plain page visits; no user login required
-    """
-
-    def __init__(
-        self,
-        redis_key,
-        page_url=None,
-        min_cookies=10000,
-        must_contained_keys=(),
-        keep_alive=False,
-        **kwargs,
-    ):
-        """
-        @param redis_key: project name
-        @param page_url: url that produces the cookies
-        @param min_cookies: minimum number of cookies to keep in the pool
-        @param must_contained_keys: keys every cookie must contain
-        @param keep_alive: whether to stay on standby producing cookies once the count is
-                           sufficient. False means exit as soon as the pool is full
-        ---
-        @param kwargs: WebDriver options
-            load_images: whether to load images
-            user_agent_pool: user-agent pool; None disables it
-            proxies_pool: proxy pool; None disables it
-            headless: whether to run headless
-            driver_type: web driver type
-            timeout: request timeout, 16s by default
-            window_size: screen resolution (width, height)
-
-        """
-
-        self._redisdb = RedisDB()
-
-        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
-        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
-            redis_key
-        )  # stores the time of the last cookie count, formatted as "timestamp:count"
-        self._page_url = page_url
-        self._min_cookies = min_cookies
-        self._must_contained_keys = must_contained_keys
-        self._keep_alive = keep_alive
-
-        self._kwargs = kwargs
-        self._kwargs.setdefault("load_images", False)
-        self._kwargs.setdefault("headless", True)
-
-    def create_cookie(self):
-        """
-        Meant to be overridden by subclasses
-        @return:
-        """
-        url = self._page_url
-        header = {
-            "Upgrade-Insecure-Requests": "1",
-            "User-Agent": user_agent.get()
-        }
-        res = requests.get(url, headers=header)
-        cookies = requests.utils.dict_from_cookiejar(res.cookies)
-        return cookies
-
-    def add_cookies(self, cookies):
-        log.info("添加cookie {}".format(cookies))
-        self._redisdb.lpush(self._tab_cookie_pool, cookies)
-
-    def run(self):
-        for i in range(5):
-            try:
-                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
-                need_cookie_count = self._min_cookies - now_cookie_count
-                if need_cookie_count > 0:
-                    log.info(
-                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
-                            now_cookie_count, self._min_cookies
-                        )
-                    )
-                    try:
-                        cookies = self.create_cookie()
-                        if cookies:
-                            self.add_cookies(cookies)
-                    except Exception as e:
-                        log.exception(e)
-                else:
-                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
-                    # if the pool size has not changed for a minute, assume no spider is consuming it and exit
-                    last_count_info = self._redisdb.strget(
-                        self._tab_cookie_pool_last_count
-                    )
-                    if not last_count_info:
-                        self._redisdb.strset(
-                            self._tab_cookie_pool_last_count,
-                            "{}:{}".format(time.time(), now_cookie_count),
-                        )
-                    else:
-                        last_time, last_count = last_count_info.split(":")
-                        last_time = float(last_time)
-                        last_count = int(last_count)
-
-                        if time.time() - last_time > 60:
-                            if now_cookie_count == last_count:
-                                log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
-                                break
-                            else:
-                                self._redisdb.strset(
-                                    self._tab_cookie_pool_last_count,
-                                    "{}:{}".format(time.time(), now_cookie_count),
-                                )
-
-                    if self._keep_alive:
-                        log.info("sleep 10")
-                        tools.delay_time(10)
-                    else:
-                        break
-
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    @func_set_timeout(120)
-    def get_cookie(self, wait_when_null=True):
-        for i in range(3):
-            try:
-                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
-                if not cookie_info and wait_when_null:
-                    log.info("暂无cookie 生产中...")
-                    self._keep_alive = False
-                    self._min_cookies = 1
-                    _lock = RedisLock(key=self._tab_cookie_pool,
-                                      lock_timeout=3600,
-                                      wait_timeout=5)
-                    with _lock:
-                        if _lock.locked:
-                            self.run()
-                    continue
-                return eval(cookie_info) if cookie_info else {}
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    def del_cookie(self, cookies):
-        self._redisdb.lrem(self._tab_cookie_pool, cookies)
-
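For reference, a sketch of how the PageCookiePool above was typically driven (hypothetical site values; the project's Redis settings must be configured for this to run):

    from feapder.network.cookie_pool import PageCookiePool

    pool = PageCookiePool(
        redis_key="demo_site",            # hypothetical project name
        page_url="https://example.com",   # page whose response cookies are harvested
        min_cookies=100,
        keep_alive=False,                 # top up once, then exit
    )
    pool.run()                   # produce cookies until min_cookies is reached
    cookies = pool.get_cookie()  # pop-and-rotate one cookie dict from redis
    pool.del_cookie(cookies)     # drop it once it stops working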
-
-class User:
-    def __init__(self, username, cookie):
-        self.username = username
-        self.cookie = cookie
-
-
-class LoginCookiePool(CookiePoolInterface):
-    """
-    Cookie pool that requires login; account credentials are kept in MongoDB (the user_login db)
-    """
-
-    def __init__(
-        self,
-        redis_key,
-        *,
-        table_userbase,
-        login_state_key="login_state",
-        lock_state_key="lock_state",
-        username_key="username",
-        password_key="password",
-        login_retry_times=10,
-    ):
-        """
-        @param redis_key: project name
-        @param table_userbase: user table / collection name
-        @param login_state_key: column holding the login state
-        @param lock_state_key: column holding the lock state
-        @param username_key: column holding the username
-        @param password_key: column holding the password
-        @param login_retry_times: number of retries after a failed login
-        """
-
-        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
-        self._login_retry_times = login_retry_times
-        self._table_userbase = table_userbase
-        self._login_state_key = login_state_key
-        self._lock_state_key = lock_state_key
-        self._username_key = username_key
-        self._password_key = password_key
-
-        self._redisdb = RedisDB()
-        self._mongo = MongoDB(db='user_login')
-
-    def create_cookie(self, username, password):
-
-        """
-        Create a cookie
-        @param username: username
-        @param password: password
-        @return: return cookie / None
-        """
-        raise NotImplementedError
-
-    def get_user_info(self):
-        """
-        Return user records pending login
-        @return: iterable of user documents (with username / password fields)
-        """
-
-        return self._mongo.find(self._table_userbase,{self._lock_state_key:0,self._login_state_key:0})
-
-    def handle_login_failed_user(self, username, password):
-        """
-        Handle a user whose login failed
-        @param username:
-        @param password:
-        @return:
-        """
-
-        pass
-
-    def handle_exception(self, e):
-        """
-        Handle an exception raised during login
-        @param e:
-        @return:
-        """
-        log.exception(e)
-
-    def save_cookie(self, username, cookie):
-        user_cookie = {"username": username, "cookie": cookie}
-
-        self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
-        self._mongo.add(
-                coll_name=self._table_userbase,
-                data={self._login_state_key:1},
-                update_columns=self._username_key,
-                update_columns_value=username)
-
-    @func_set_timeout(60)
-    def get_cookie(self, wait_when_null=True) -> User:
-        for i in range(3):
-            try:
-                user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
-                if not user_cookie and wait_when_null:
-                    log.info("暂无cookie 生产中...")
-                    self.login()
-                    continue
-
-                if user_cookie:
-                    user_cookie = eval(user_cookie)
-                    return User(**user_cookie)
-
-                return None
-            except Exception as e:
-                log.exception(e)
-                tools.delay_time(1)
-
-    def del_cookie(self, user: User):
-        """
-        Remove an invalidated cookie
-        @param user:
-        @return:
-        """
-        user_info = {"username": user.username, "cookie": user.cookie}
-        self._redisdb.lrem(self._tab_cookie_pool, user_info)
-
-        self._mongo.add(
-            coll_name=self._table_userbase,
-            data={self._login_state_key: 0},  # mark the user as logged-out so it becomes eligible for re-login
-            update_columns=self._username_key,
-            update_columns_value=user.username)
-
-    def user_is_locked(self, user: User):
-
-        self._mongo.add(
-            coll_name=self._table_userbase,
-            data={self._lock_state_key: 1},
-            update_columns=self._username_key,
-            update_columns_value=user.username)
-
-    def run(self):
-        with RedisLock(
-            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
-        ) as _lock:
-            if _lock.locked:
-                user_infos = self.get_user_info()
-                if not isinstance(user_infos, Iterable):
-                    raise ValueError("get_user_info 返回值必须可迭代")
-
-                if not user_infos:
-                    log.info("无可用用户")
-
-                for info in user_infos:
-                    username = info.get("username")
-                    password = info.get("password")
-                    for i in range(self._login_retry_times):
-                        try:
-                            cookie = self.create_cookie(username, password)
-                            if cookie:
-                                self.save_cookie(username, cookie)
-                            else:
-                                self.handle_login_failed_user(username, password)
-
-                            break
-                        except Exception as e:
-                            self.handle_exception(e)
-
-                    else:
-                        self.handle_login_failed_user(username, password)
-
-    login = run
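A hedged sketch of the intended extension point: subclasses supply create_cookie, while storage, rotation, and retries come from the base class. The login endpoint and field names below are made up:

    import requests
    from feapder.network.cookie_pool import LoginCookiePool

    class DemoLoginPool(LoginCookiePool):
        def create_cookie(self, username, password):
            session = requests.Session()
            session.post(
                "https://example.com/login",  # hypothetical endpoint
                data={"user": username, "pass": password},
            )
            return requests.utils.dict_from_cookiejar(session.cookies) or None

    pool = DemoLoginPool("demo_site", table_userbase="demo_users")
    pool.login()              # log in every unlocked, logged-out account
    user = pool.get_cookie()  # -> User(username=..., cookie=...)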
-
-
-@unique
-class LimitTimesUserStatus(Enum):
-    # usage states
-    USED = "used"
-    SUCCESS = "success"
-    OVERDUE = "overdue"  # cookie expired
-    SLEEP = "sleep"
-    EXCEPTION = "exception"
-    # login states
-    LOGIN_SUCCESS = "login_success"
-    LOGIN_FAILED = "login_failed"
-
-
-class LimitTimesUser:
-    """
-    An account with a limited number of uses
-    State is cached locally, so multi-process use is not supported
-    """
-
-    ACCOUNT_INFO_KEY = "accounts:h_account_info"  # redis key that stores the cookies
-    SITE_NAME = ""  # site name
-
-    redisdb = None
-
-    def __init__(
-        self,
-        username,
-        password,
-        max_search_times,
-        proxies=None,
-        search_interval=0,
-        **kwargs,
-    ):
-        """
-        @param username:
-        @param password:
-        @param max_search_times:
-        @param proxies:
-        @param search_interval: interval between uses, in seconds; either an int or a (min, max) tuple such as (5, 10)
-        """
-        self.__dict__.update(kwargs)
-        self.username = username
-        self.password = password
-        self.max_search_times = max_search_times
-        self.proxies = proxies
-        self.search_interval = search_interval
-        self.delay_use = 0  # delay before reuse, for accounts waiting to be unblocked
-
-        if isinstance(search_interval, (tuple, list)):
-            if len(search_interval) != 2:
-                raise ValueError("search_interval 需传递两个值的元组或列表。如(5,10)即5到10秒")
-
-            self.used_for_time_length = (
-                search_interval[1] * 5
-            )  # exclusive-use window: other spiders cannot grab the cookie during this time
-        else:
-            self.used_for_time_length = (
-                search_interval * 5
-            )  # exclusive-use window: other spiders cannot grab the cookie during this time
-
-        self.account_info = {
-            "login_time": 0,
-            "cookies": {},
-            "search_times": 0,
-            "last_search_time": 0,
-            "used_for_spider_name": None,  # reserved for one spider; others may not use it
-            "init_search_times_time": 0,  # time the search counter was last reset
-        }
-
-        if not self.__class__.redisdb:
-            self.__class__.redisdb = RedisDB()
-
-        self.sync_account_info_from_redis()
-
-        self.__init_metrics()
-
-    def __init_metrics(self):
-        """
-        Initialise the metrics system
-        @return:
-        """
-        metrics.init(**setting.METRICS_OTHER_ARGS)
-
-    def record_user_status(self, status: LimitTimesUserStatus):
-        metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
-
-    def __repr__(self):
-        return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
-
-    def __eq__(self, other):
-        return self.username == other.username
-
-    def sync_account_info_from_redis(self):
-        account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
-        if account_info:
-            account_info = eval(account_info)
-            self.account_info.update(account_info)
-
-    @property
-    def cookies(self):
-        cookies = self.account_info.get("cookies")
-        return cookies
-
-    def set_cookies(self, cookies):
-        self.account_info["cookies"] = cookies
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def set_login_time(self, login_time=None):
-        self.account_info["login_time"] = login_time or time.time()
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def get_login_time(self):
-        return self.account_info.get("login_time")
-
-    def is_time_to_login(self):
-        return time.time() - self.get_login_time() > 40 * 60
-
-    def get_last_search_time(self):
-        return self.account_info.get("last_search_time", 0)
-
-    def is_time_to_search(self):
-        if self.delay_use:
-            is_time = time.time() - self.get_last_search_time() > self.delay_use
-            if is_time:
-                self.delay_use = 0
-
-        else:
-            is_time = time.time() - self.get_last_search_time() > (
-                random.randint(*self.search_interval)
-                if isinstance(self.search_interval, (tuple, list))
-                else self.search_interval
-            )
-
-        return is_time
-
-    @property
-    def used_for_spider_name(self):
-        return self.account_info.get("used_for_spider_name")
-
-    @used_for_spider_name.setter
-    def used_for_spider_name(self, spider_name):
-        self.account_info["used_for_spider_name"] = spider_name
-
-    def update_status(self):
-        """
-        Update the search counters
-        @return:
-        """
-        self.account_info["search_times"] += 1
-        self.account_info["last_search_time"] = time.time()
-
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    @property
-    def search_times(self):
-        init_search_times_time = self.account_info.get("init_search_times_time")
-        current_time = time.time()
-        if (
-            current_time - init_search_times_time >= 86400
-        ):  # reset the search counter if more than a day has passed since it was last initialised
-            self.account_info["search_times"] = 0
-            self.account_info["init_search_times_time"] = current_time
-
-            self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
-
-        return self.account_info["search_times"]
-
-    def is_overwork(self):
-        if self.search_times > self.max_search_times:
-            log.warning("账号 {} 请求次数超限制".format(self.username))
-            return True
-
-        return False
-
-    def is_at_work_time(self):
-        if datetime.datetime.now().hour in list(range(7, 23)):
-            return True
-
-        log.warning("账号 {} 不再工作时间内".format(self.username))
-        return False
-
-    def del_cookie(self):
-        self.account_info["cookies"] = {}
-        return self.redisdb.hset(
-            self.ACCOUNT_INFO_KEY, self.username, self.account_info
-        )
-
-    def create_cookie(self):
-        """
-        Produce a cookie; raise on failure instead of swallowing exceptions
-        @return: cookie_dict
-        """
-
-        raise NotImplementedError
-
-    def login(self):
-        """
-        @return: 1 on success, 0 on failure
-        """
-
-        try:
-            # pre-check
-            if not self.is_time_to_login():
-                log.info("此账号尚未到登陆时间: {}".format(self.username))
-                time.sleep(5)
-                return 0
-
-            cookies = self.create_cookie()
-            if not cookies:
-                raise Exception("登陆失败 未获取到合法cookie")
-
-            if not isinstance(cookies, dict):
-                raise Exception("cookie 必须为字典格式")
-
-            # persist the cookie
-            self.set_login_time()
-            self.set_cookies(cookies)
-            log.info("登录成功 {}".format(self.username))
-            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
-            return 1
-
-        except Exception as e:
-            log.exception(e)
-            send_msg(
-                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
-                level="error",
-                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
-            )
-
-        log.info("登录失败 {}".format(self.username))
-        self.record_user_status(LimitTimesUserStatus.LOGIN_FAILED)
-        return 0
-
-
-class LimitTimesUserPool:
-    """
-    Pool of users whose query count is limited
-    State is cached locally, so multi-process use is not supported
-    """
-
-    LOAD_USER_INTERVAL = 60
-
-    def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
-        """
-        @param accounts_dict: account info dict
-            {
-                "15011300228": {
-                    "password": "300228",
-                    "proxies": {},
-                    "max_search_times": 500,
-                    "search_interval": 1, # interval between uses
-                    # any extra fields to carry along
-                }
-            }
-        @param limit_user_class: the user's LimitTimesUser subclass
-        @param support_more_client: whether to support multiple clients, i.e. multi-thread / multi-process mode (counts and use frequency may drift slightly)
-        """
-        self.accounts_dict = accounts_dict
-        self.limit_user_class = limit_user_class
-
-        self.limit_times_users = []
-        self.current_user_index = -1
-
-        self.support_more_client = support_more_client
-
-        self.last_load_user_time = 0
-
-    def __load_users(self, username=None):
-        # load users
-        log.info("更新可用用户")
-
-        for _username, detail in self.accounts_dict.items():
-            if username and username != _username:
-                continue
-
-            limit_times_users = self.limit_user_class(username=_username, **detail)
-            if limit_times_users in self.limit_times_users:
-                continue
-
-            if limit_times_users.is_overwork():
-                continue
-            else:
-                if (
-                    limit_times_users.cookies or limit_times_users.login()
-                ):  # queue the user when it has a cookie or logs in successfully
-                    self.limit_times_users.append(limit_times_users)
-
-        self.last_load_user_time = time.time()
-
-    def get_user(
-        self,
-        username=None,
-        used_for_spider_name=None,
-        wait_when_null=True,
-        not_limit_frequence=False,
-    ) -> LimitTimesUser:
-        """
-        @params username: fetch a specific user
-        @params used_for_spider_name: exclusive use; the owning spider's name. Other spiders may not grab the user
-        @params wait_when_null: whether to wait when no user is available
-        @params not_limit_frequence: ignore the use-frequency limit
-        @return: LimitTimesUser
-        """
-        if not self.support_more_client and not getattr(self, "_is_show_warning", False):
-            warnings.warn(
-                "LimitTimesUserPool 取查询次数等信息时基于本地做的缓存,不支持多进程或多线程",
-                category=Warning,
-            )
-            self._is_show_warning = True  # warn once only
-
-        while True:
-            if (
-                not self.limit_times_users
-                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
-            ):
-                self.__load_users(username)
-                if not self.limit_times_users:
-                    log.warning("无可用的用户")
-                    if wait_when_null:
-                        time.sleep(1)
-                        continue
-                    else:
-                        return None
-
-            self.current_user_index += 1
-            self.current_user_index = self.current_user_index % len(
-                self.limit_times_users
-            )
-
-            limit_times_user = self.limit_times_users[self.current_user_index]
-            if self.support_more_client:  # sync the latest state from redis first
-                limit_times_user.sync_account_info_from_redis()
-
-            if username and limit_times_user.username != username:
-                log.info(
-                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
-                )
-                time.sleep(1)
-                continue
-
-            # exclusive use: for other spiders, check whether the exclusive window has elapsed; if so, the user is free again
-            if (
-                limit_times_user.used_for_spider_name
-                and limit_times_user.used_for_spider_name != used_for_spider_name
-            ):
-                wait_time = time.time() - limit_times_user.get_last_search_time()
-                if wait_time < limit_times_user.used_for_time_length:
-                    log.info(
-                        "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
-                            limit_times_user.username,
-                            limit_times_user.used_for_spider_name,
-                            limit_times_user.used_for_time_length - wait_time,
-                        )
-                    )
-                    time.sleep(1)
-                    continue
-
-            if (
-                not limit_times_user.is_overwork()
-                and limit_times_user.is_at_work_time()
-            ):
-                if not limit_times_user.cookies:
-                    self.limit_times_users.remove(limit_times_user)
-                    continue
-
-                if not_limit_frequence or limit_times_user.is_time_to_search():
-                    limit_times_user.used_for_spider_name = used_for_spider_name
-
-                    limit_times_user.update_status()
-                    log.info("使用用户 {}".format(limit_times_user.username))
-                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
-                    return limit_times_user
-                else:
-                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
-                    time.sleep(1)
-                    continue
-            else:
-                self.limit_times_users.remove(limit_times_user)
-                self.current_user_index -= 1
-
-                if not limit_times_user.is_at_work_time():
-                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
-                    if wait_when_null:
-                        time.sleep(30)
-                        continue
-                    else:
-                        return None
-
-    def del_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.del_cookie()
-                self.limit_times_users.remove(limit_times_user)
-                limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
-                self.__load_users(username)
-                break
-
-    def update_cookies(self, username, cookies):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.set_cookies(cookies)
-                break
-
-    def delay_use(self, username, delay_seconds):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.delay_use = delay_seconds
-                limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
-                break
-
-    def record_success_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
-
-    def record_exception_user(self, username):
-        for limit_times_user in self.limit_times_users:
-            if limit_times_user.username == username:
-                limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)
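To make the moving parts above concrete, a sketch of wiring an account type into the pool (the account data and site are invented; the framework's metrics and Redis settings must be configured for this to run):

    from feapder.network.cookie_pool import LimitTimesUser, LimitTimesUserPool

    class DemoUser(LimitTimesUser):
        SITE_NAME = "demo"

        def create_cookie(self):
            return {"session": "..."}  # a real subclass would log in here

    accounts = {
        "15011300228": {
            "password": "300228",
            "max_search_times": 500,
            "search_interval": (5, 10),  # 5-10 s between uses
        }
    }
    pool = LimitTimesUserPool(accounts_dict=accounts, limit_user_class=DemoUser)
    user = pool.get_user(used_for_spider_name="demo_spider")
    # ... issue the request with user.cookies, then report the outcome:
    pool.record_success_user(user.username)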

+ 0 - 273
spider_frame/FworkSpider/feapder/network/item.py

@@ -1,273 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-26 22:28:10
----------
-@summary: item (entity) definitions
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import feapder.utils.tools as tools
-
-
-class ItemMetaclass(type):
-    def __new__(cls, name, bases, attr):
-        attr.setdefault("__name__", None)
-        attr.setdefault("__table_name__", None)
-        attr.setdefault("__name_underline__", None)
-        attr.setdefault("__update_key__", None)
-        attr.setdefault("__unique_key__", None)
-        attr.setdefault("__dont_save__", False)
-
-        return type.__new__(cls, name, bases, attr)
-
-
-class Item(metaclass=ItemMetaclass):
-    __unique_key__ = []
-
-    def __init__(self, **kwargs):
-        self.__dict__ = kwargs
-
-    def __repr__(self):
-        return "<{}: {}>".format(self.item_name, tools.dumps_json(self.to_dict))
-
-    def __getitem__(self, key):
-        return self.__dict__[key]
-
-    def __setitem__(self, key, value):
-        self.__dict__[key] = value
-
-    def pre_to_db(self):
-        """
-        Hook executed before the item is written to the database
-        """
-        pass
-
-    @property
-    def to_dict(self):
-        properties = {}
-        for key, value in self.__dict__.items():
-            if key not in (
-                "__name__",
-                "__table_name__",
-                "__name_underline__",
-                "__update_key__",
-                "__unique_key__",
-                "__dont_save__",
-            ):
-                if key.startswith(f"_{self.__class__.__name__}"):
-                    key = key.replace(f"_{self.__class__.__name__}", "")
-                properties[key] = value
-
-        return properties
-
-    def to_sql(self, auto_update=False, update_columns=()):
-        return tools.make_insert_sql(
-            self.table_name, self.to_dict, auto_update, update_columns
-        )
-
-    @property
-    def item_name(self):
-        return self.__name__ or self.__class__.__name__
-
-    @item_name.setter
-    def item_name(self, name):
-        self.__name__ = name
-        self.__table_name__ = self.name_underline.replace("_item", "")
-
-    @property
-    def table_name(self):
-        if not self.__table_name__:
-            self.__table_name__ = self.name_underline.replace("_item", "")
-        return self.__table_name__
-
-    @table_name.setter
-    def table_name(self, name):
-        self.__table_name__ = name
-        self.__name__ = tools.key2hump(name) + "Item"
-
-    @property
-    def name_underline(self):
-        if not self.__name_underline__:
-            self.__name_underline__ = tools.key2underline(self.item_name)
-
-        return self.__name_underline__
-
-    @name_underline.setter
-    def name_underline(self, name):
-        self.__name_underline__ = name
-
-    @property
-    def unique_key(self):
-        return self.__unique_key__ or self.__class__.__unique_key__
-
-    @unique_key.setter
-    def unique_key(self, keys):
-        if isinstance(keys, (tuple, list)):
-            self.__unique_key__ = keys
-        else:
-            self.__unique_key__ = (keys,)
-
-    @property
-    def fingerprint(self):
-        args = []
-        for key, value in self.to_dict.items():
-            if value:
-                if (self.unique_key and key in self.unique_key) or not self.unique_key:
-                    args.append(str(value))
-
-        if args:
-            args = sorted(args)
-            return tools.get_md5(*args)
-        else:
-            return None
-
-    @property
-    def dont_save(self):
-        return self.__dont_save__
-
-    @dont_save.setter
-    def dont_save(self, state: bool):
-        self.__dont_save__ = state
-
-    def to_UpdateItem(self):
-        update_item = UpdateItem(**self.__dict__)
-        update_item.item_name = self.item_name
-        return update_item
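The name/table plumbing above is easiest to see with a small example (illustrative only; the underscore conversion is done by the framework's tools.key2underline):

    from feapder.network.item import Item

    class SpiderDataItem(Item):
        pass

    item = SpiderDataItem()
    item.title = "demo"
    print(item.item_name)   # -> "SpiderDataItem"
    print(item.table_name)  # -> "spider_data" (underscored name minus "_item")
    print(item.to_dict)     # -> {"title": "demo"}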
-
-
-class UpdateItem(Item):
-    __update_key__ = []
-
-    def __init__(self, **kwargs):
-        super(UpdateItem, self).__init__(**kwargs)
-
-    @property
-    def update_key(self):
-        return self.__update_key__ or self.__class__.__update_key__
-
-    @update_key.setter
-    def update_key(self, keys):
-        if isinstance(keys, (tuple, list)):
-            self.__update_key__ = keys
-        else:
-            self.__update_key__ = (keys,)
-
-
-class BaseItem(Item):
-    """数据采集基础类"""
-
-    def __init__(self, site='', channel='', spidercode='',
-                 area='全国', city='', district='', href='', pyuuid=None, **kwargs):
-        """
-
-        @param pyuuid: unique id of the scraped record
-        @param site: site name (defined by the data source)
-        @param channel: channel name (defined by the data source)
-        @param spidercode: spider code (defined by the data source)
-        @param area: province; defaults to 全国 (nationwide)
-        @param city: city
-        @param district: district / county
-        @param href: url the data was scraped from
-        """
-        super(BaseItem, self).__init__()
-
-        self.pyuuid = pyuuid or tools.get_uuid().replace('-', '')
-        self.comeintime = tools.ensure_int64(
-            tools.get_current_timestamp()
-        )  # time the record entered the database
-
-        self.site = site
-        self.channel = channel
-        self.spidercode = spidercode
-
-        self.area = area
-        self.city = city
-        self.district = district
-
-        self.href = href
-
-        kwargs = {k: v for k, v in kwargs.items() if k not in self.__dict__}
-        self.__dict__.update(kwargs)
-
-    @property
-    def fingerprint(self):
-        args = []
-        # 1. dedup over the stored fields
-        for key, value in self.to_dict.items():
-            if value:
-                if (self.unique_key and key in self.unique_key) or not self.unique_key:
-                    args.append(str(value))
-
-        # 2. dedup over the stored bidding fields
-        bidding = self.to_dict.get("item", {})
-        for key, value in bidding.items():
-            if value:
-                if (self.unique_key and key in self.unique_key) or not self.unique_key:
-                    args.append(str(value))
-
-        # 3. dedup over non-stored fields
-        if self.unique_key:
-            for key in filter(lambda x: x not in bidding and x not in self.to_dict, self.unique_key):
-                if key not in args:
-                    args.append(str(key))
-
-        if args:
-            args = sorted(args)
-            return tools.get_sha256(*args)
-        else:
-            return None
-
-    def __getstate__(self):
-        state = self.__dict__.copy()  # copy the attribute dict (serialisation)
-        return state
-
-    def __setstate__(self, state):
-        self.__dict__ = state  # restore object state from the dict (deserialisation)
-
-
-class BaseListItem(BaseItem):
-    """列表数据采集基础类"""
-
-    def __init__(self):
-        super(BaseListItem, self).__init__()
-
-        self.request_params = {}  # extra request params used when fetching the detail page
-
-
-class BaseDetailItem(BaseItem):
-    """详情数据采集基础类"""
-
-    def __init__(self, title='', contenthtml='', detail='', sendflag='false',
-                 projectinfo=None, **kwargs):
-        """
-
-        @param title: detail-page title
-        @param contenthtml: raw detail-page html
-        @param detail: cleaned detail-page html
-        @param sendflag: whether the record is saved to the production database
-        @param projectinfo: attachment info; see the 剑鱼 data collection spec for the format
-        """
-        super(BaseDetailItem, self).__init__(**kwargs)
-
-        self.title = title
-        self.contenthtml = contenthtml
-        self.detail = detail
-        self.projectinfo = projectinfo
-
-        self.sendflag = sendflag
-
-
-class FailedTaskItem(Item):
-
-    def __init__(self, **kwargs):
-        super(FailedTaskItem, self).__init__(**kwargs)
-        self.pyuuid = self.__dict__['pyuuid']
-        self.failed_retries = self.__dict__.get('failed_retries', 0)  # number of failed retries
-
-
-class HeartBeatItem(Item):
-
-    def __init__(self, **kwargs):
-        super(HeartBeatItem, self).__init__(**kwargs)
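A sketch of constructing one of the removed item types (field values are invented; the uuid, timestamp, and hashing helpers come from the framework's tools module):

    from feapder.network.item import BaseDetailItem

    item = BaseDetailItem(
        site="demo-site",
        channel="announcements",
        spidercode="a_demo_site",
        href="https://example.com/notice/1",
        title="Demo notice",
        contenthtml="<div>...</div>",
    )
    print(item.pyuuid)       # auto-generated unique id
    print(item.fingerprint)  # sha256 over the populated, dedup-relevant fields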

+ 0 - 723
spider_frame/FworkSpider/feapder/network/proxy_pool.py

@@ -1,723 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Proxy pool
-"""
-import datetime
-import json
-import os
-import random
-import socket
-import time
-from urllib import parse
-
-import redis
-import requests
-
-from feapder import setting
-from feapder.utils import tools
-from feapder.utils.log import log as logger
-
-# create the local proxy cache directory
-proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
-if not os.path.exists(proxy_path):
-    os.mkdir(proxy_path)
-
-
-def get_proxy_from_jyapi(timeout=5, default=None, show_error_log=False):
-    """
-    剑鱼 proxy api
-
-    @param timeout: request timeout
-    @param default: default return value
-    @param show_error_log: log the full error traceback
-    """
-    request_params = dict(
-        headers=dict(Authorization=setting.JY_PROXY_AUTHOR),
-        timeout=timeout
-    )
-    try:
-        response = requests.get(setting.JY_PROXY_URL, **request_params)
-    except requests.exceptions.RequestException as why:
-        if show_error_log:
-            logger.exception(why)
-        return default
-
-    try:
-        proxies = response.json()["data"]
-        return proxies
-    except KeyError:
-        pass
-
-    return default
-
-
-def get_proxy_from_url(**kwargs):
-    """
-    Fetch proxies from the configured source urls
-    :param kwargs:
-    :return:
-    """
-    proxy_source_url = kwargs.get("proxy_source_url", [])
-    # proxy_source_url = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"
-
-    if not isinstance(proxy_source_url, list):
-        proxy_source_url = [proxy_source_url]
-    proxy_source_url = [x for x in proxy_source_url if x]  # filter empties regardless of input shape
-    if not proxy_source_url:
-        raise ValueError("no specify proxy_source_url: {}".format(proxy_source_url))
-    kwargs = kwargs.copy()
-    kwargs.pop("proxy_source_url")
-    proxies_list = []
-    for url in proxy_source_url:
-        if url.startswith("http"):
-            proxies_list.extend(get_proxy_from_http(url, **kwargs))
-        elif url.startswith("redis"):
-            proxies_list.extend(get_proxy_from_redis(url, **kwargs))
-
-    if proxies_list:
-        # shuffle the order
-        random.shuffle(proxies_list)
-    return proxies_list
-
-
-def get_proxy_from_http(proxy_source_url, **kwargs):
-    """
-    Fetch proxies from the given http endpoint
-    :param proxy_source_url:
-    :param kwargs:
-    :return:
-    """
-    filename = tools.get_md5(proxy_source_url) + ".txt"
-    abs_filename = os.path.join(proxy_path, filename)
-    update_interval = kwargs.get("local_proxy_file_cache_timeout", 30)
-    update_flag = 0
-    if not update_interval:
-        # force an update
-        update_flag = 1
-    elif not os.path.exists(abs_filename):
-        # update when the cache file is missing
-        update_flag = 1
-    elif time.time() - os.stat(abs_filename).st_mtime > update_interval:
-        # update when the refresh interval has passed
-        update_flag = 1
-    if update_flag:
-        pool = []
-        response = requests.get(proxy_source_url, timeout=20)
-        # modified: parse the socks-proxy response format
-        for proxy in response.json():
-            host = tools.decrypt(proxy["ip"])
-            port = proxy["ports"][0]
-            endTime = proxy["lifetime"]
-            pool.append(f"{host}:{port}&&{endTime}")
-
-        with open(os.path.join(proxy_path, filename), "w") as f:
-            f.write("\n".join(pool))
-    return get_proxy_from_file(filename)
-
-
-def get_proxy_from_file(filename):
-    """
-    Read proxies from the given local file
-        file format
-        ip:port:https
-        ip:port:http
-        ip:port
-    :param filename:
-    :return:
-    """
-    proxies_list = []
-    with open(os.path.join(proxy_path, filename), "r") as f:
-        lines = f.readlines()
-
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        # parse
-        auth = ""
-        if "@" in line:
-            auth, line = line.split("@")
-        # modified: split off the proxy expiry timestamp
-        line, end = line.split("&&")
-
-        items = line.split(":")
-        if len(items) < 2:
-            continue
-
-        ip, port, *protocol = items
-        if not all([port, ip]):
-            continue
-        if auth:
-            ip = "{}@{}".format(auth, ip)
-        if not protocol:
-            # modified: keep only proxies still within their lifetime, and emit socks-style urls instead of http
-            if time.time() < int(end):
-                proxies = {
-                    "https": "socks5://%s:%s" % (ip, port),
-                    "http": "socks5://%s:%s" % (ip, port),
-                    # "end":end
-                }
-            else:
-                continue
-        else:
-            proxies = {protocol[0]: "%s://%s:%s" % (protocol[0], ip, port)}
-        proxies_list.append(proxies)
-
-    return proxies_list
-
-
-def get_proxy_from_redis(proxy_source_url, **kwargs):
-    """
-    Fetch proxies from the given redis instance
-    @param proxy_source_url: redis://:passwd@host:port/db
-        stored in redis as a zset of
-        ip:port ts
-    @param kwargs:
-        {"redis_proxies_key": "xxx"}
-    @return: [{'http':'http://xxx.xxx.xxx:xxx', 'https':'https://xxx.xxx.xxx.xxx:xxx'}]
-    """
-
-    redis_conn = redis.StrictRedis.from_url(proxy_source_url)
-    key = kwargs.get("redis_proxies_key")
-    assert key, "从redis中获取代理 需要指定 redis_proxies_key"
-    proxies = redis_conn.zrange(key, 0, -1)
-    proxies_list = []
-    for proxy in proxies:
-        proxy = proxy.decode()
-        proxies_list.append(
-            {"https": "https://%s" % proxy, "http": "http://%s" % proxy}
-        )
-    return proxies_list
-
-
-def check_proxy(ip="", port="", proxies=None, type=0, timeout=5, show_error_log=True):
-    """
-    Check whether a proxy is alive
-
-    :param ip:
-    :param port:
-    :param proxies:
-    :param type: 0:socket  1:requests
-    :param timeout:
-    :param show_error_log:
-    :return:
-    """
-    ok = 0
-    if type == 0 and ip and port:
-        # a successful socket connect does not guarantee the proxy works; "Connection closed by foreign host" still fails
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
-            sk.settimeout(timeout)
-            try:
-                # must probe, otherwise the proxy list never refreshes
-                sk.connect((ip, int(port)))
-                ok = 1
-            except Exception as e:
-                if show_error_log:
-                    logger.debug("check proxy failed: {} {}:{}".format(e, ip, port))
-            sk.close()
-    else:
-        if not proxies:
-            proxies = {
-                "http": "socks5://{}:{}".format(ip, port),
-                "https": "socks5//{}:{}".format(ip, port),
-            }
-        try:
-            r = requests.get("https://myip.ipip.net",
-                             proxies=proxies,
-                             timeout=timeout,
-                             stream=True)
-            ok = 1
-            r.close()
-        except Exception as e:
-            if show_error_log:
-                args = (e, ip, port, proxies)
-                logger.debug("check proxy failed: {} {}:{} {}".format(*args))
-    return ok
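For illustration, both probe modes of check_proxy (the addresses are placeholders; the requests path needs a SOCKS-capable install, i.e. requests[socks]):

    # type=0: plain TCP connect to the proxy endpoint
    ok = check_proxy(ip="127.0.0.1", port="1080", type=0, timeout=3)

    # type=1: a real request routed through the proxy
    proxies = {
        "http": "socks5://127.0.0.1:1080",
        "https": "socks5://127.0.0.1:1080",
    }
    ok = check_proxy(proxies=proxies, type=1, timeout=5)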
-
-
-class ProxyItem(object):
-    """单个代理对象"""
-
-    # proxy flags
-    proxy_tag_list = (-1, 0, 1)
-
-    def __init__(
-            self,
-            proxies=None,
-            valid_timeout=20,
-            check_interval=180,
-            max_proxy_use_num=10000,
-            delay=30,
-            use_interval=None,
-            **kwargs,
-    ):
-        """
-        :param proxies:
-        :param valid_timeout: proxy check timeout; since 20181008 validity is no longer checked by default
-        :param check_interval:
-        :param max_proxy_use_num:
-        :param delay:
-        :param use_interval: interval between uses, in seconds; unlimited by default
-        :param kwargs:
-        """
-        # {"http": ..., "https": ...}
-        self.proxies = proxies
-        # check timeout, seconds
-        self.valid_timeout = valid_timeout
-        # check interval, seconds
-        self.check_interval = check_interval
-
-        # flag: 0 normal, -1 discard, 1 use later ...
-        self.flag = 0
-        # time of the last flag change
-        self.flag_ts = 0
-        # time of the last validity check
-        self.update_ts = 0
-        # maximum number of uses
-        self.max_proxy_use_num = max_proxy_use_num
-        # number of times used so far
-        self.use_num = 0
-        # delay before reuse
-        self.delay = delay
-        # interval between uses, seconds
-        self.use_interval = use_interval
-        # time of last use
-        self.use_ts = 0
-
-        self.proxy_args = self.parse_proxies(self.proxies)
-        self.proxy_ip = self.proxy_args["ip"]
-        self.proxy_port = self.proxy_args["port"]
-        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
-        if self.proxy_args["user"]:
-            self.proxy_id = "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
-        else:
-            self.proxy_id = self.proxy_ip_port
-
-    def get_proxies(self):
-        self.use_num += 1
-        return self.proxies
-
-    def is_delay(self):
-        return self.flag == 1
-
-    def is_valid(self, force=0, type=0):
-        """
-        Check whether the proxy is usable
-            1 valid
-            2 delayed, use later
-            0 invalid, remove from the pool immediately
-        :param force:
-        :param type:
-        :return:
-        """
-        if self.use_num > self.max_proxy_use_num > 0:
-            logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
-            return 0
-        if self.flag == -1:
-            logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
-            return 0
-        if self.delay > 0 and self.flag == 1:
-            if time.time() - self.flag_ts < self.delay:
-                logger.debug("代理被标记 1 延迟 %s" % self.proxies)
-                return 2
-            else:
-                self.flag = 0
-                logger.debug("延迟代理释放: {}".format(self.proxies))
-        if self.use_interval:
-            if time.time() - self.use_ts < self.use_interval:
-                return 2
-        if not force:
-            if time.time() - self.update_ts < self.check_interval:
-                return 1
-        if self.valid_timeout > 0:
-            ok = check_proxy(
-                proxies=self.proxies,
-                type=type,
-                timeout=self.valid_timeout,
-            )
-        else:
-            ok = 1
-        self.update_ts = time.time()
-        return ok
-
-    @staticmethod
-    def parse_proxies(proxies):
-        """
-        分解代理组成部分
-        :param proxies:
-        :return:
-        """
-        if not proxies:
-            return {}
-        if isinstance(proxies, (str, bytes)):
-            proxies = json.loads(proxies)
-        protocol = list(proxies.keys())
-        if not protocol:
-            return {}
-        _url = proxies.get(protocol[0])
-        # modified: the http:// prefixing is commented out so socks urls survive intact
-        # if not _url.startswith("http"):
-        #     _url = "http://" + _url
-        _url_parse = parse.urlparse(_url)
-        netloc = _url_parse.netloc
-        if "@" in netloc:
-            netloc_auth, netloc_host = netloc.split("@")
-        else:
-            netloc_auth, netloc_host = "", netloc
-        ip, *port = netloc_host.split(":")
-        port = port[0] if port else "80"
-        user, *password = netloc_auth.split(":")
-        password = password[0] if password else ""
-        return {
-            "protocol": protocol,
-            "ip": ip,
-            "port": port,
-            "user": user,
-            "password": password,
-            "ip_port": "{}:{}".format(ip, port),
-        }
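An example of what parse_proxies yields (values invented):

    print(ProxyItem.parse_proxies({"http": "socks5://user:pwd@1.2.3.4:1080"}))
    # -> {'protocol': ['http'], 'ip': '1.2.3.4', 'port': '1080',
    #     'user': 'user', 'password': 'pwd', 'ip_port': '1.2.3.4:1080'}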
-
-
-class ProxyPoolBase(object):
-    def __init__(self, *args, **kwargs):
-        pass
-
-    def get(self, *args, **kwargs):
-        raise NotImplementedError
-
-
-class ProxyPool(ProxyPoolBase):
-    """代理池"""
-
-    def __init__(self, **kwargs):
-        """
-        :param size: pool size; -1 means unlimited
-        :param proxy_source_url: proxy source address; a list is accepted
-        :param proxy_instance: an instance that supplies proxies
-        :param reset_interval: minimum interval between pool resets
-        :param reset_interval_max: maximum interval between pool resets, 180 s by default
-        :param check_valid: whether to check validity when handing out a proxy
-        :param local_proxy_file_cache_timeout: expiry of the locally cached proxy file
-        :param kwargs: other options
-        """
-        kwargs.setdefault("size", -1)
-        kwargs.setdefault("proxy_source_url", setting.PROXY_EXTRACT_API)
-
-        super(ProxyPool, self).__init__(**kwargs)
-        # maximum queue length
-        self.max_queue_size = kwargs.get("size", -1)
-        # actual number of proxies
-        self.real_max_proxy_count = 1000
-        # maximum uses per proxy
-        # proxy source address, e.g. http://localhost/proxy.txt
-        self.proxy_source_url = kwargs.get("proxy_source_url", [])
-        if not isinstance(self.proxy_source_url, list):
-            self.proxy_source_url = [self.proxy_source_url]
-        # filter and dedupe regardless of the input shape
-        self.proxy_source_url = [x for x in self.proxy_source_url if x]
-        self.proxy_source_url = list(set(self.proxy_source_url))
-        kwargs.update({"proxy_source_url": self.proxy_source_url})
-
-        if not self.proxy_source_url:
-            logger.warn("need set proxy_source_url or proxy_instance")
-
-        # minimum interval between pool resets
-        self.reset_interval = kwargs.get("reset_interval", 5)
-        # force a reset after this long, pulling fresh proxies in so stale banned ones are not reused forever
-        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
-        # whether to check proxy validity
-        self.check_valid = kwargs.get("check_valid", True)
-
-        # proxy queue
-        self.proxy_queue = None
-        # {proxy id: ProxyItem, ...}
-        self.proxy_dict = {}
-        # invalidated proxies
-        self.invalid_proxy_dict = {}
-
-        self.kwargs = kwargs
-
-        # lock guarding pool resets
-        self.reset_lock = None
-        # time of the last reset
-        self.last_reset_time = 0
-        # counter for resets that came too fast
-        self.reset_fast_count = 0
-        # counter for "3 retries and still no proxy" events
-        self.no_valid_proxy_times = 0
-
-        # time a proxy was last handed out
-        self.last_get_ts = time.time()
-
-        # remember each ProxyItem's update_ts so fast resets do not re-check validity
-        self.proxy_item_update_ts_dict = {}
-
-        # warning flag
-        self.warn_flag = False
-
-    def warn(self):
-        # NOTE: the loop below is effectively a no-op; only the flag assignment has any effect
-        if not self.warn_flag:
-            for url in self.proxy_source_url:
-                if "zhima" in url:
-                    continue
-            self.warn_flag = True
-        return
-
-    @property
-    def queue_size(self):
-        """
-        Number of proxies currently in the pool
-        :return:
-        """
-        return self.proxy_queue.qsize() if self.proxy_queue is not None else 0
-
-    def clear(self):
-        """
-        Reset internal state
-        :return:
-        """
-        self.proxy_queue = None
-        # {代理ip: ProxyItem, ...}
-        self.proxy_dict = {}
-        # 清理失效代理集合
-        _limit = datetime.datetime.now() - datetime.timedelta(minutes=10)
-        self.invalid_proxy_dict = {
-            k: v for k, v in self.invalid_proxy_dict.items() if v > _limit
-        }
-        # 清理超时的update_ts记录
-        _limit = time.time() - 600
-        self.proxy_item_update_ts_dict = {
-            k: v for k, v in self.proxy_item_update_ts_dict.items() if v > _limit
-        }
-        return
-
-    def get(self, retry: int = 0) -> dict:
-        """
-        Take a proxy from the pool
-        :param retry:
-        :return:
-        """
-        retry += 1
-        if retry > 3:
-            self.no_valid_proxy_times += 1
-            return None
-        # if time.time() - self.last_get_ts > 3 * 60:
-        #     # 3分钟没有获取过 重置一下
-        #     try:
-        #         self.reset_proxy_pool()
-        #     except Exception as e:
-        #         logger.exception(e)
-        # record the fetch time
-        self.last_get_ts = time.time()
-
-        self.warn()
-        proxy_item = self.get_random_proxy()
-        if proxy_item:
-            # no validity check
-            if not self.check_valid:
-                # push it back
-                proxies = proxy_item.get_proxies()
-                self.put_proxy_item(proxy_item)
-                return proxies
-            else:
-                is_valid = proxy_item.is_valid()
-                if is_valid:
-                    # record update_ts
-                    self.proxy_item_update_ts_dict[
-                        proxy_item.proxy_id
-                    ] = proxy_item.update_ts
-                    # push it back
-                    proxies = proxy_item.get_proxies()
-                    self.put_proxy_item(proxy_item)
-                    if is_valid == 1:
-                        if proxy_item.use_interval:
-                            proxy_item.use_ts = time.time()
-                        return proxies
-                else:
-                    # handle the invalid proxy
-                    self.proxy_dict.pop(proxy_item.proxy_id, "")
-                    self.invalid_proxy_dict[
-                        proxy_item.proxy_id
-                    ] = datetime.datetime.now()
-        else:
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                logger.exception(e)
-        if self.no_valid_proxy_times >= 5:
-            # bug fix: when only one task is left, a single checking thread plus many dead proxies
-            # (they accumulate over time) could mean no proxy is ever obtained, stalling the spider
-            try:
-                time.sleep(3)
-                self.reset_proxy_pool()
-            except Exception as e:
-                logger.exception(e)
-        return self.get(retry)
-
-    get_proxy = get
-
-    def get_random_proxy(self) -> ProxyItem:
-        """
-        Take a random proxy
-        :return:
-        """
-        if self.proxy_queue is not None:
-            if random.random() < 0.5:
-                # check with 50% probability; this is a hot path, so keep it cheap
-                if time.time() - self.last_reset_time > self.reset_interval_max:
-                    time.sleep(3)
-                    self.reset_proxy_pool(force=True)
-                else:
-                    min_q_size = (
-                        min(self.max_queue_size / 2, self.real_max_proxy_count / 2)
-                        if self.max_queue_size > 0
-                        else self.real_max_proxy_count / 2
-                    )
-                    if self.proxy_queue.qsize() < min_q_size:
-                        time.sleep(3)
-                        self.reset_proxy_pool()
-            try:
-                return self.proxy_queue.get_nowait()
-            except Exception:
-                pass
-        return None
-
-    def append_proxies(self, proxies_list: list) -> int:
-        """
-        Add proxies to the pool
-        :param proxies_list:
-        :return:
-        """
-        count = 0
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if proxies:
-                proxy_item = ProxyItem(proxies=proxies, **self.kwargs)
-                # skip proxies already marked invalid (added 2018/12/18)
-                if proxy_item.proxy_id in self.invalid_proxy_dict:
-                    continue
-                if proxy_item.proxy_id not in self.proxy_dict:
-                    # backfill update_ts
-                    if not proxy_item.update_ts:
-                        proxy_item.update_ts = self.proxy_item_update_ts_dict.get(
-                            proxy_item.proxy_id, 0
-                        )
-                    self.put_proxy_item(proxy_item)
-                    self.proxy_dict[proxy_item.proxy_id] = proxy_item
-                    count += 1
-        return count
-
-    def put_proxy_item(self, proxy_item: ProxyItem):
-        """
-        Put a ProxyItem back on the queue
-        :param proxy_item:
-        :return:
-        """
-        return self.proxy_queue.put_nowait(proxy_item)
-
-    def reset_proxy_pool(self, force: bool = False):
-        """
-        Reset the pool
-        :param force: force a reset regardless of the current state
-        :return:
-        """
-        if not self.reset_lock:
-            # created lazily: importing threading before the gevent patch would leave RLock unpatched
-            import threading
-
-            self.reset_lock = threading.RLock()
-        with self.reset_lock:
-            if (
-                    force
-                    or self.proxy_queue is None
-                    or (
-                    self.max_queue_size > 0
-                    and self.proxy_queue.qsize() < self.max_queue_size / 2
-            )
-                    or (
-                    self.max_queue_size < 0
-                    and self.proxy_queue.qsize() < self.real_max_proxy_count / 2
-            )
-                    or self.no_valid_proxy_times >= 5
-            ):
-                if time.time() - self.last_reset_time < self.reset_interval:
-                    self.reset_fast_count += 1
-                    if self.reset_fast_count % 10 == 0:
-                        logger.debug(
-                            "代理池重置的太快了:) {}".format(self.reset_fast_count)
-                        )
-                        time.sleep(1)
-                else:
-                    self.clear()
-                    if self.proxy_queue is None:
-                        import queue
-
-                        self.proxy_queue = queue.Queue()
-                    # TODO: the proxies fetched here may contain duplicates
-                    proxies_list = get_proxy_from_url(**self.kwargs)
-                    self.real_max_proxy_count = len(proxies_list)
-                    if 0 < self.max_queue_size < self.real_max_proxy_count:
-                        proxies_list = random.sample(proxies_list, self.max_queue_size)
-                    _valid_count = self.append_proxies(proxies_list)
-                    self.last_reset_time = time.time()
-                    self.no_valid_proxy_times = 0
-                    logger.debug(
-                        "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
-                            len(proxies_list),
-                            _valid_count,
-                            len(self.invalid_proxy_dict),
-                            len(self.proxy_dict),
-                        )
-                    )
-        return
-
-    def tag_proxy(self, proxies_list: list, flag: int, *, delay=30) -> bool:
-        """
-        Tag proxies
-        :param proxies_list:
-        :param flag:
-                    -1  discard
-                    1 delay before reuse
-        :param delay: delay in seconds
-        :return:
-        """
-        if int(flag) not in ProxyItem.proxy_tag_list or not proxies_list:
-            return False
-        if not isinstance(proxies_list, list):
-            proxies_list = [proxies_list]
-        for proxies in proxies_list:
-            if not proxies:
-                continue
-            proxy_id = ProxyItem(proxies).proxy_id
-            if proxy_id not in self.proxy_dict:
-                continue
-            self.proxy_dict[proxy_id].flag = flag
-            self.proxy_dict[proxy_id].flag_ts = time.time()
-            self.proxy_dict[proxy_id].delay = delay
-
-        return True
-
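
A hypothetical usage of tag_proxy, assuming a configured ProxyPool instance named pool: flag -1 retires a proxy for good, while flag 1 rests it for delay seconds before it is handed out again.

    bad = {"http": "http://1.2.3.4:8888", "https": "https://1.2.3.4:8888"}
    pool.tag_proxy([bad], flag=-1)           # discard the proxy permanently
    pool.tag_proxy([bad], flag=1, delay=60)  # or rest it for 60 seconds instead
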
-    def get_proxy_item(self, proxy_id="", proxies=None):
-        """
-        Get a proxy object by id or by its proxies dict
-        :param proxy_id:
-        :param proxies:
-        :return:
-        """
-        if proxy_id:
-            return self.proxy_dict.get(proxy_id)
-        if proxies:
-            proxy_id = ProxyItem(proxies).proxy_id
-            return self.proxy_dict.get(proxy_id)
-        return
-
-    def copy(self):
-        return ProxyPool(**self.kwargs)
-
-    def all(self) -> list:
-        """
-        Fetch all proxies available to the current proxy pool configuration
-        :return:
-        """
-        return get_proxy_from_url(**self.kwargs)

+ 0 - 557
spider_frame/FworkSpider/feapder/network/request.py

@@ -1,557 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-25 11:49:08
----------
-@summary: request structure
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-import copy
-import re
-
-import requests
-from requests.adapters import HTTPAdapter
-from requests.cookies import RequestsCookieJar
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
-from requests.packages.urllib3.util.ssl_ import create_urllib3_context
-
-import feapder.setting as setting
-import feapder.utils.tools as tools
-from feapder.db.redisdb import RedisDB
-from feapder.network import user_agent, proxy_pool
-from feapder.network.response import Response
-from feapder.utils.log import log
-from feapder.utils.webdriver import WebDriverPool
-
-# Suppress InsecureRequestWarning messages
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
-
-
-class DESAdapter(HTTPAdapter):
-
-    def __init__(self, *args, **kwargs):
-        """
-        A TransportAdapter that re-enables 3DES support in Requests.
-        """
-        ciphers = ":".join(setting.JA3_REQUEST_CIPHERS).split(":")
-        tools.random.shuffle(ciphers)
-        ciphers = ":".join(ciphers)
-        self.ciphers = ciphers + ":!aNULL:!eNULL:!MD5"
-        super().__init__(*args, **kwargs)
-
-    def init_poolmanager(self, *args, **kwargs):
-        context = create_urllib3_context(ciphers=self.ciphers)
-        kwargs["ssl_context"] = context
-        return super(DESAdapter, self).init_poolmanager(*args, **kwargs)
-
-    def proxy_manager_for(self, *args, **kwargs):
-        context = create_urllib3_context(ciphers=self.ciphers)
-        kwargs["ssl_context"] = context
-        return super(DESAdapter, self).proxy_manager_for(*args, **kwargs)
-
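
DESAdapter shuffles the configured cipher list on every instantiation, so each new session presents a different cipher order in its TLS ClientHello and therefore a different JA3 fingerprint. A standalone sketch of the technique (the cipher string below is illustrative):

    import random
    import requests
    from requests.adapters import HTTPAdapter
    from requests.packages.urllib3.util.ssl_ import create_urllib3_context

    class ShuffledCipherAdapter(HTTPAdapter):
        def __init__(self, ciphers, *args, **kwargs):
            parts = ciphers.split(":")
            random.shuffle(parts)  # randomized cipher order -> different JA3 hash
            self.ciphers = ":".join(parts) + ":!aNULL:!eNULL:!MD5"
            super().__init__(*args, **kwargs)

        def init_poolmanager(self, *args, **kwargs):
            kwargs["ssl_context"] = create_urllib3_context(ciphers=self.ciphers)
            return super().init_poolmanager(*args, **kwargs)

    session = requests.Session()
    session.mount(
        "https://",
        ShuffledCipherAdapter("ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384"),
    )
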
-
-class Request(object):
-    session = None
-    webdriver_pool: WebDriverPool = None
-    user_agent_pool = user_agent
-
-    cache_db = None  # redis / pika
-    cached_redis_key = None  # redis key prefix for cached responses: response_cached:cached_redis_key:md5
-    cached_expire_time = 1200  # cache expiry time (seconds)
-
-    local_filepath = None
-    oss_handler = None
-
-    __REQUEST_ATTRS__ = {
-        # "method", "url", 必须传递 不加入**kwargs中
-        "params",
-        "data",
-        "headers",
-        "cookies",
-        "files",
-        "auth",
-        "timeout",
-        "allow_redirects",
-        "proxies",
-        "hooks",
-        "stream",
-        "verify",
-        "cert",
-        "json",
-    }
-
-    DEFAULT_KEY_VALUE = dict(
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        use_ja3_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-    )
-
-    def __init__(
-        self,
-        url="",
-        retry_times=0,
-        priority=300,
-        parser_name=None,
-        callback=None,
-        filter_repeat=True,
-        auto_request=True,
-        request_sync=False,
-        use_session=None,
-        use_ja3_session=None,
-        random_user_agent=True,
-        download_midware=None,
-        is_abandoned=False,
-        render=False,
-        render_time=0,
-        splash=False,
-        iframes=0,
-        page=None,
-        **kwargs,
-    ):
-        """
-        @summary: Request parameters
-        ---------
-        Framework parameters
-        @param url: URL to crawl
-        @param retry_times: current retry count
-        @param priority: priority, lower runs first; default 300
-        @param parser_name: name of the class holding the callback; defaults to the current class
-        @param callback: callback, either a function or a function name (for cross-class callbacks, set parser_name to the target class and callback to the method name on that class)
-        @param filter_repeat: whether to deduplicate (True/False); only effective when REQUEST_FILTER_ENABLE is True in setting; default True
-        @param auto_request: whether to download the page automatically; default True. When False the returned response is empty and you must fetch the page yourself
-        @param request_sync: whether to download synchronously; default asynchronous. If the request URL expires quickly, set True so the yielded request responds immediately instead of queueing
-        @param use_session: whether to use a session
-        @param use_ja3_session: whether to use a ja3_session
-        @param random_user_agent: whether to randomize the User-Agent (True/False); only effective when RANDOM_HEADERS is True in setting; default True
-        @param download_midware: download middleware; defaults to the parser's download_midware
-        @param is_abandoned: whether to give up retrying on exception, True/False; default False
-        @param render: whether to render with a browser
-        @param render_time: render duration, i.e. how long to wait after opening the page before grabbing the source
-        --
-        The following parameters behave the same as in requests
-        @param method: request method, e.g. POST or GET; defaults based on whether data is empty
-        @param params: query parameters
-        @param data: request body
-        @param json: request json string, same as json.dumps(data)
-        @param headers:
-        @param cookies: dict or CookieJar object
-        @param files:
-        @param auth:
-        @param timeout: (float or tuple) timeout waiting for server data, either a float or a (connect timeout, read timeout) tuple
-        @param allow_redirects: Boolean. True allows following redirects for POST/PUT/DELETE
-        @param proxies: proxies {"http":"http://xxx", "https":"https://xxx"}
-        @param verify: when True the SSL certificate is verified
-        @param stream: when False the response body is downloaded immediately
-        @param cert:
-        --
-        Custom parameters
-        @param splash: whether to use the splash rendering service
-        @param iframes: fetch iframe content embedded in the page via splash; 0=skip, 1=fetch
-        @param page: page number of the list page being requested
-        --
-        @param **kwargs: other values, e.g. Request(item=item) lets you retrieve item directly as request.item
-        ---------
-        @result:
-        """
-
-        self.url = url
-        self.retry_times = retry_times
-        self.priority = priority
-        self.parser_name = parser_name
-        self.callback = callback
-        self.filter_repeat = filter_repeat
-        self.auto_request = auto_request
-        self.request_sync = request_sync
-        self.use_session = use_session
-        self.use_ja3_session = use_ja3_session
-        self.random_user_agent = random_user_agent
-        self.download_midware = download_midware
-        self.is_abandoned = is_abandoned
-        self.render = render
-        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
-
-        self.splash = splash
-        self.iframes = iframes
-        self.page = page
-
-        self.requests_kwargs = {}
-        for key, value in kwargs.items():
-            if key in self.__class__.__REQUEST_ATTRS__:  # 取requests参数
-                self.requests_kwargs[key] = value
-
-            self.__dict__[key] = value
-
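
Putting the parameter groups above together, a hypothetical construction (parse_detail and the item payload are illustrative): framework arguments become plain attributes, requests-compatible keywords are collected into requests_kwargs, and any extra keyword rides along on the instance.

    req = Request(
        "http://example.com/list?page=1",
        callback="parse_detail",   # resolved by name on the parser class
        priority=100,              # lower value = scheduled earlier
        headers={"Referer": "http://example.com/"},  # goes to requests_kwargs
        item={"channel": "news"},  # arbitrary kwarg, later available as req.item
    )
    print(req.requests_kwargs)     # only the requests-level kwargs land here
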
-    def __repr__(self):
-        try:
-            return "<Request {}>".format(self.url)
-        except Exception:
-            return "<Request {}>".format(str(self.to_dict)[:40])
-
-    def __setattr__(self, key, value):
-        """
-        For assignments of the form request.xxx = xxx, update the request and its internal parameters
-        @param key:
-        @param value:
-        @return:
-        """
-        self.__dict__[key] = value
-
-        if key in self.__class__.__REQUEST_ATTRS__:
-            self.requests_kwargs[key] = value
-
-    def __lt__(self, other):
-        return self.priority < other.priority
-
-    @property
-    def _session(self):
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )  # self.use_session takes precedence
-        use_ja3_session = (
-            setting.USE_JA3_SESSION if self.use_ja3_session is None else self.use_ja3_session
-        )  # self.use_ja3_session takes precedence
-        use_session = use_session or use_ja3_session
-        if use_session and not self.__class__.session:
-            self.__class__.session = requests.Session()
-            if use_ja3_session:
-                # pool_connections – number of urllib3 connection pools to cache; pool_maxsize – maximum number of connections kept per pool
-                des_adapter = DESAdapter(pool_connections=1000, pool_maxsize=1000)
-                # Any HTTP/HTTPS request made through this session whose URL starts with the given prefix uses this transport adapter.
-                self.__class__.session.mount("https://", des_adapter)
-                self.__class__.session.mount("http://", des_adapter)
-            else:
-                # pool_connections – number of urllib3 connection pools to cache; pool_maxsize – maximum number of connections kept per pool
-                http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
-                # Any HTTP request made through this session whose URL starts with the given prefix uses this transport adapter.
-                self.__class__.session.mount("http", http_adapter)
-
-        return self.__class__.session
-
-    @property
-    def _webdriver_pool(self):
-        if not self.__class__.webdriver_pool:
-            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
-
-        return self.__class__.webdriver_pool
-
-    @property
-    def to_dict(self):
-        request_dict = {}
-
-        self.callback = (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-        self.download_midware = (
-            getattr(self.download_midware, "__name__")
-            if callable(self.download_midware)
-            else self.download_midware
-        )
-
-        for key, value in self.__dict__.items():
-            if (
-                key in self.__class__.DEFAULT_KEY_VALUE
-                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
-                or key == "requests_kwargs"
-            ):
-                continue
-
-            if key in self.__class__.__REQUEST_ATTRS__:
-                if not isinstance(
-                    value, (bytes, bool, float, int, str, tuple, list, dict)
-                ):
-                    value = tools.dumps_obj(value)
-            else:
-                if not isinstance(value, (bytes, bool, float, int, str)):
-                    value = tools.dumps_obj(value)
-
-            request_dict[key] = value
-
-        return request_dict
-
-    @property
-    def callback_name(self):
-        return (
-            getattr(self.callback, "__name__")
-            if callable(self.callback)
-            else self.callback
-        )
-
-    def get_response(self, save_cached=False, show_log=True):
-        """
-        Get a response with selector support
-        @param save_cached: save to cache so debugging does not re-download every time
-        @param show_log: whether to log the request
-        @return:
-        """
-        # Set the default timeout
-        self.requests_kwargs.setdefault(
-            "timeout", setting.REQUEST_TIMEOUT
-        )  # connect=22 read=22
-
-        # Set stream
-        # By default the response body is downloaded as soon as the request is made.
-        # With stream=True the body is only downloaded when Response.content is accessed; by default only the headers are returned.
-        # Drawback: with stream=True, Requests cannot release the connection back to the pool unless all data is consumed or Response.close is called, which hurts connection efficiency.
-        self.requests_kwargs.setdefault("stream", True)
-
-        # Disable certificate verification
-        self.requests_kwargs.setdefault("verify", False)
-
-        # Determine the request method
-        method = self.__dict__.get("method")
-        if not method:
-            if "data" in self.requests_kwargs or "json" in self.requests_kwargs:
-                method = "POST"
-            else:
-                method = "GET"
-
-        # Random User-Agent
-        headers = self.requests_kwargs.get("headers", {})
-        if "user-agent" not in headers and "User-Agent" not in headers:
-            if self.render:  # when rendering, prefer the user agent configured in WEBDRIVER
-                ua = setting.WEBDRIVER.get(
-                    "user_agent"
-                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-            else:
-                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
-
-            if self.random_user_agent and setting.RANDOM_HEADERS:
-                headers.update({"User-Agent": ua})
-                self.requests_kwargs.update(headers=headers)
-        else:
-            self.requests_kwargs.setdefault(
-                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
-            )
-
-        # Proxy
-        proxies = self.requests_kwargs.get("proxies", -1)
-        if proxies == -1 and setting.PROXY_ENABLE and setting.JY_PROXY_URL:
-            while True:
-                proxies = proxy_pool.get_proxy_from_jyapi()
-                if proxies:
-                    self.requests_kwargs.update(proxies=proxies)
-                    break
-                else:
-                    log.debug("暂无可用代理 ...")
-
-        if show_log:
-            log.debug(
-                """
-                    -------------- %srequest for ----------------
-                    url  = %s
-                    method = %s
-                    body = %s
-                    """
-                % (
-                    ""
-                    if not self.parser_name
-                    else "%s.%s "
-                    % (
-                        self.parser_name,
-                        (
-                            self.callback
-                            and callable(self.callback)
-                            and getattr(self.callback, "__name__")
-                            or self.callback
-                        )
-                        or "parse",
-                    ),
-                    self.url,
-                    method,
-                    self.requests_kwargs,
-                )
-            )
-
-        use_session = (
-            setting.USE_SESSION if self.use_session is None else self.use_session
-        )  # self.use_session takes precedence
-        use_ja3_session = (
-            setting.USE_JA3_SESSION if self.use_ja3_session is None else self.use_ja3_session
-        )  # self.use_ja3_session takes precedence
-        use_session = use_session or use_ja3_session
-
-        if self.render:
-            # Use the request's user_agent, cookies and proxy
-            user_agent = headers.get("User-Agent") or headers.get("user-agent")
-            cookies = self.requests_kwargs.get("cookies")
-            if cookies and isinstance(cookies, RequestsCookieJar):
-                cookies = cookies.get_dict()
-
-            if not cookies:
-                cookie_str = headers.get("Cookie") or headers.get("cookie")
-                if cookie_str:
-                    cookies = tools.get_cookies_from_str(cookie_str)
-
-            browser_kwargs = dict(user_agent=user_agent, proxy=self.proxy())
-            browser = self._webdriver_pool.get(**browser_kwargs)
-
-            try:
-                browser.get(self.url)
-                if cookies:
-                    browser.cookies = cookies
-                if self.render_time:
-                    tools.delay_time(self.render_time)
-
-                html = browser.page_source
-                response = Response.from_dict({
-                    "url": browser.current_url,
-                    "cookies": browser.cookies,
-                    "_content": html.encode(),
-                    "status_code": 200,
-                    "elapsed": 666,
-                    "headers": {
-                        "User-Agent": browser.execute_script("return navigator.userAgent"),
-                        "Cookie": tools.cookies2str(browser.cookies),
-                    },
-                })
-                response.browser = browser
-            except Exception as e:
-                self._webdriver_pool.remove(browser)
-                raise e
-
-        elif use_session:
-            response = self._session.request(method, self.url, **self.requests_kwargs)
-            response = Response(response)
-
-        elif self.splash:
-            headers = self.requests_kwargs.get("headers")
-            if not headers:
-                headers = {"User-Agent": self.user_agent()}
-
-            params = {
-                "iframes": self.iframes,
-                "wait": self.render_time,
-                "html": 1,
-                "proxy": self.proxy(),
-                "url": self.url,
-            }
-            data = {"headers": [(key, val) for key, val in headers.items()]}
-            resp = requests.get(setting.SPLASH_API, params=params, json=data)
-            response = Response(resp)
-
-        else:
-            response = requests.request(method, self.url, **self.requests_kwargs)
-            response = Response(response)
-
-        if save_cached:
-            self.save_cached(response, expire_time=self.__class__.cached_expire_time)
-
-        return response
-
-    def proxies(self):
-        """
-        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
-        """
-        return self.requests_kwargs.get("proxies")
-
-    def proxy(self):
-        """
-
-        Returns: ip:port
-
-        """
-        proxies = self.requests_kwargs.get("proxies")
-        if proxies:
-            return re.sub(
-                "http.*?//", "", proxies.get("http", "") or proxies.get("https", "")
-            )
-
-    def user_agent(self):
-        headers = self.requests_kwargs.get("headers")
-        if headers:
-            return headers.get("user_agent") or headers.get("User-Agent")
-
-    @property
-    def fingerprint(self):
-        """
-        Unique request fingerprint
-        @return:
-        """
-        url = self.__dict__.get("url", "")
-        # Normalize the URL
-        url = tools.canonicalize_url(url)
-        args = [url]
-
-        for arg in ["params", "data", "files", "auth", "cert", "json"]:
-            if self.requests_kwargs.get(arg):
-                args.append(self.requests_kwargs.get(arg))
-
-        return tools.get_md5(*args)
-
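
The fingerprint combines the canonicalized URL with any body-affecting kwargs and hashes the lot, so two requests that differ only in query-parameter order dedupe to the same key. A self-contained approximation using only the standard library (canonicalization is simplified here to sorted query parameters, an assumption about what tools.canonicalize_url does):

    import hashlib
    from urllib.parse import urlparse, urlencode, parse_qsl, urlunparse

    def canonicalize(url):
        # Simplified: sort query parameters so ?a=1&b=2 and ?b=2&a=1 match
        parts = urlparse(url)
        query = urlencode(sorted(parse_qsl(parts.query)))
        return urlunparse(parts._replace(query=query))

    def fingerprint(url, **kwargs):
        args = [canonicalize(url)]
        for key in ("params", "data", "files", "auth", "cert", "json"):
            if kwargs.get(key):
                args.append(str(kwargs[key]))
        return hashlib.md5("".join(args).encode()).hexdigest()

    assert fingerprint("http://x.com/?a=1&b=2") == fingerprint("http://x.com/?b=2&a=1")
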
-    @property
-    def _cache_db(self):
-        if not self.__class__.cache_db:
-            self.__class__.cache_db = RedisDB()
-
-        return self.__class__.cache_db
-
-    @property
-    def _cached_redis_key(self):
-        if self.__class__.cached_redis_key:
-            return (
-                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
-            )
-        else:
-            return f"response_cached:test:{self.fingerprint}"
-
-    def save_cached(self, response, expire_time=1200):
-        """
-        Save the response in redis for debugging so it is not re-downloaded every time
-        @param response:
-        @param expire_time: expiry time (seconds)
-        @return:
-        """
-
-        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
-
-    def get_response_from_cached(self, save_cached=True):
-        """
-        Get the response from the cache
-        Note:
-            Attributes that will be empty:
-                - raw: urllib3.response.HTTPResponse
-                - connection: requests.adapters.HTTPAdapter
-                - history
-
-            Attributes whose meaning changes:
-                - request changes from requests' type to Request
-        @param save_cached: when there is no cache the page is downloaded directly; whether to cache it after downloading
-        @return:
-        """
-        response_dict = self._cache_db.strget(self._cached_redis_key)
-        if not response_dict:
-            log.info("无response缓存  重新下载")
-            response_obj = self.get_response(save_cached=save_cached)
-        else:
-            response_dict = eval(response_dict)
-            response_obj = Response.from_dict(response_dict)
-        return response_obj
-
-    def del_response_cached(self):
-        self._cache_db.clear(self._cached_redis_key)
-
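
A hypothetical debugging workflow for the cache helpers above, assuming a reachable Redis as configured in setting:

    Request.cached_redis_key = "demo_task"  # namespace: response_cached:demo_task:<md5>
    request = Request("http://example.com")
    response = request.get_response_from_cached(save_cached=True)
    # The first call downloads and caches; repeat calls within cached_expire_time hit Redis.
    request.del_response_cached()  # drop the cached copy when done
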
-    @classmethod
-    def from_dict(cls, request_dict):
-        for key, value in request_dict.items():
-            if isinstance(value, bytes):  # deserialize, e.g. item
-                request_dict[key] = tools.loads_obj(value)
-
-        return cls(**request_dict)
-
-    def copy(self):
-        return self.__class__.from_dict(copy.deepcopy(self.to_dict))

+ 0 - 356
spider_frame/FworkSpider/feapder/network/response.py

@@ -1,356 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-26 11:40:28
----------
-@summary:
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import datetime
-import os
-import re
-import time
-from urllib.parse import urlparse, urlunparse, urljoin
-
-from bs4 import UnicodeDammit, BeautifulSoup
-from requests.cookies import RequestsCookieJar
-from requests.models import Response as res
-from w3lib.encoding import http_content_type_encoding, html_body_declared_encoding
-
-from feapder.network.selector import Selector
-from feapder.utils.log import log
-
-FAIL_ENCODING = "ISO-8859-1"
-
-# Special characters in the html source that must be removed, otherwise they break etree construction
-SPECIAL_CHARACTERS = [
-    # Remove control characters; full list: https://zh.wikipedia.org/wiki/%E6%8E%A7%E5%88%B6%E5%AD%97%E7%AC%A6
-    "[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]"
-]
-
-SPECIAL_CHARACTER_PATTERNS = [
-    re.compile(special_character) for special_character in SPECIAL_CHARACTERS
-]
-
-
-class Response(res):
-    def __init__(self, response):
-        super(Response, self).__init__()
-        self.__dict__.update(response.__dict__)
-
-        self._cached_selector = None
-        self._cached_text = None
-        self._cached_json = None
-
-        self._encoding = None
-
-        self.encoding_errors = "strict"  # strict / replace / ignore
-
-    @classmethod
-    def from_dict(cls, response_dict):
-        """
-        Build a Response object from a dict
-        @param response_dict: the raw response.__dict__
-        @return:
-        """
-        cookie_jar = RequestsCookieJar()
-        cookie_jar.update(other=response_dict["cookies"])
-        response_dict["cookies"] = cookie_jar
-
-        response_dict["elapsed"] = datetime.timedelta(
-            0, 0, response_dict["elapsed"]
-        )  # elapsed time
-        response_dict["connection"] = None
-        response_dict["_content_consumed"] = True
-
-        response = res()
-        response.__dict__.update(response_dict)
-        return cls(response)
-
-    @property
-    def to_dict(self):
-        response_dict = {
-            "_content": self.content,
-            "cookies": self.cookies.get_dict(),
-            "encoding": self.encoding,
-            "headers": self.headers,
-            "status_code": self.status_code,
-            "elapsed": self.elapsed.microseconds,  # 耗时
-            "url": self.url,
-        }
-
-        return response_dict
-
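
to_dict and from_dict form a serialization round trip: elapsed is flattened to microseconds and cookies to a plain dict, and from_dict rebuilds both. A condensed sketch of the rebuild half using only requests:

    import datetime
    import requests
    from requests.cookies import RequestsCookieJar

    def response_from_dict(d):
        # Rebuild a bare requests.Response from the serializable dict shape above
        jar = RequestsCookieJar()
        jar.update(other=d["cookies"])
        d = dict(d, cookies=jar,
                 elapsed=datetime.timedelta(microseconds=d["elapsed"]),
                 connection=None, _content_consumed=True)
        resp = requests.models.Response()
        resp.__dict__.update(d)
        return resp
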
-    def __clear_cache(self):
-        self.__dict__["_cached_selector"] = None
-        self.__dict__["_cached_text"] = None
-        self.__dict__["_cached_json"] = None
-
-    @property
-    def encoding(self):
-        """
-        Encoding priority: custom encoding > header encoding > page-declared encoding > encoding guessed from content
-        """
-        self._encoding = (
-            self._encoding
-            or self._headers_encoding()
-            or self._body_declared_encoding()
-            or self.apparent_encoding
-        )
-        return self._encoding
-
-    @encoding.setter
-    def encoding(self, val):
-        self.__clear_cache()
-        self._encoding = val
-
-    code = encoding
-
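
The same fallback chain can be exercised directly with the w3lib helpers the class relies on; here the header charset wins over the page declaration:

    from w3lib.encoding import http_content_type_encoding, html_body_declared_encoding

    content_type = "text/html; charset=gbk"
    body = b'<html><head><meta charset="utf-8"></head></html>'

    encoding = (
        http_content_type_encoding(content_type)  # header encoding first
        or html_body_declared_encoding(body)      # then the <meta charset> declaration
        or "utf-8"                                # finally a guess / fallback
    )
    print(encoding)  # the header's gbk wins here
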
-    def _headers_encoding(self):
-        """
-        Get the charset encoding from the headers
-        """
-        content_type = self.headers.get("Content-Type") or self.headers.get(
-            "content-type"
-        )
-        if content_type:
-            return (
-                http_content_type_encoding(content_type) or "utf-8"
-                if "application/json" in content_type
-                else None
-            )
-
-    def _body_declared_encoding(self):
-        """
-        Get <meta charset="..."> from html, xml, etc.
-        """
-
-        return html_body_declared_encoding(self.content)
-
-    def _get_unicode_html(self, html):
-        if not html or not isinstance(html, bytes):
-            return html
-
-        converted = UnicodeDammit(html, is_html=True)
-        if not converted.unicode_markup:
-            raise Exception(
-                "Failed to detect encoding of article HTML, tried: %s"
-                % ", ".join(converted.tried_encodings)
-            )
-
-        html = converted.unicode_markup
-        return html
-
-    def _make_absolute(self, link):
-        """Makes a given link absolute."""
-        try:
-
-            link = link.strip()
-
-            # Parse the link with stdlib.
-            parsed = urlparse(link)._asdict()
-
-            # If link is relative, then join it with base_url.
-            if not parsed["netloc"]:
-                return urljoin(self.url, link)
-
-            # Link is absolute; if it lacks a scheme, add one from base_url.
-            if not parsed["scheme"]:
-                parsed["scheme"] = urlparse(self.url).scheme
-
-                # Reconstruct the URL to incorporate the new scheme.
-                parsed = (v for v in parsed.values())
-                return urlunparse(parsed)
-
-        except Exception as e:
-            log.error(
-                "Invalid URL <{}> can't make absolute_link. exception: {}".format(
-                    link, e
-                )
-            )
-
-        # Link is absolute and complete with scheme; nothing to be done here.
-        return link
-
-    def _absolute_links(self, text):
-        regexs = [
-            r'(<a.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # a
-            r'(<img.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # img
-            r'(<link.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # css
-            r'(<script.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # js
-        ]
-
-        for regex in regexs:
-
-            def replace_href(text):
-                # html = text.group(0)
-                link = text.group(2)
-                absolute_link = self._make_absolute(link)
-
-                # return re.sub(regex, r'\1{}\3'.format(absolute_link), html)  # regex replacement breaks on some characters, e.g. the source of http://permit.mep.gov.cn/permitExt/syssb/xxgk/xxgk!showImage.action?dataid=0b092f8115ff45c5a50947cdea537726
-                return text.group(1) + absolute_link + text.group(3)
-
-            text = re.sub(regex, replace_href, text, flags=re.S | re.I)
-
-        return text
-
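
_make_absolute distinguishes three cases: relative paths are joined onto the page URL, scheme-relative links borrow the page's scheme, and complete links pass through. The core of that logic with the standard library:

    from urllib.parse import urljoin, urlparse

    base = "https://example.com/news/list.html"
    for link in ["detail.html?id=1", "/static/a.css", "//cdn.example.com/x.js"]:
        parsed = urlparse(link)
        if not parsed.netloc:                   # relative path -> join with the page URL
            print(urljoin(base, link))
        elif not parsed.scheme:                 # scheme-relative -> borrow the page's scheme
            print(urlparse(base).scheme + ":" + link)
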
-    def _del_special_character(self, text):
-        """
-        Remove special characters
-        """
-        for special_character_pattern in SPECIAL_CHARACTER_PATTERNS:
-            text = special_character_pattern.sub("", text)
-
-        return text
-
-    @property
-    def __text(self):
-        """Content of the response, in unicode.
-
-        If Response.encoding is None, encoding will be guessed using
-        ``chardet``.
-
-        The encoding of the response content is determined based solely on HTTP
-        headers, following RFC 2616 to the letter. If you can take advantage of
-        non-HTTP knowledge to make a better guess at the encoding, you should
-        set ``r.encoding`` appropriately before accessing this property.
-        """
-
-        if not self.content:
-            return ""
-
-        # Decode unicode from given encoding.
-        try:
-            content = str(self.content, self.encoding, errors=self.encoding_errors)
-        except (LookupError, TypeError):
-            # A LookupError is raised if the encoding was not found which could
-            # indicate a misspelling or similar mistake.
-            #
-            # A TypeError can be raised if encoding is None
-            #
-            # So we try blindly encoding.
-            content = str(self.content, errors=self.encoding_errors)
-
-        return content
-
-    @property
-    def text(self):
-        if self._cached_text is None:
-            if self.encoding and self.encoding.upper() != FAIL_ENCODING:
-                try:
-                    self._cached_text = self.__text
-                except UnicodeDecodeError:
-                    self._cached_text = self._get_unicode_html(self.content)
-            else:
-                self._cached_text = self._get_unicode_html(self.content)
-
-            if self._cached_text:
-                self._cached_text = self._absolute_links(self._cached_text)
-                self._cached_text = self._del_special_character(self._cached_text)
-
-        return self._cached_text
-
-    @text.setter
-    def text(self, html):
-        self._cached_text = html
-        self._cached_text = self._absolute_links(self._cached_text)
-        self._cached_text = self._del_special_character(self._cached_text)
-        self._cached_selector = Selector(self.text)
-
-    @property
-    def json(self, **kwargs):
-        if self._cached_json is None:
-            self.encoding = self.encoding or "utf-8"
-            self._cached_json = super(Response, self).json(**kwargs)
-
-        return self._cached_json
-
-    @property
-    def content(self):
-        content = super(Response, self).content
-        return content
-
-    @property
-    def is_html(self):
-        content_type = self.headers.get("Content-Type", "")
-        if "text/html" in content_type:
-            return True
-        else:
-            return False
-
-    @property
-    def selector(self):
-        if self._cached_selector is None:
-            self._cached_selector = Selector(self.text)
-        return self._cached_selector
-
-    def bs4(self, features="html.parser"):
-        soup = BeautifulSoup(self.text, features)
-        return soup
-
-    def extract(self):
-        return self.selector.get()
-
-    def xpath(self, query, **kwargs):
-        return self.selector.xpath(query, **kwargs)
-
-    def css(self, query):
-        return self.selector.css(query)
-
-    def re(self, regex, replace_entities=False):
-        """
-        @summary: regex matching
-        Note: page source such as <a class='page-numbers'... is normalized to <a class="page-numbers", so write the regex as <a class="(.*?)". Quote style in non-html text is left unchanged.
-        For convenience, single and double quotes in the regex are automatically treated as interchangeable
-        ---------
-        @param regex: a regex string or an re.compile object
-        @param replace_entities: when True, characters such as &nbsp; are stripped and entities such as &quot; are unescaped to ", which changes the page structure. When extracting json from page source, False is recommended
-        ---------
-        @result: list
-        """
-
-        # Make single and double quotes interchangeable
-        if isinstance(regex, str):
-            regex = re.sub("['\"]", "['\"]", regex)
-
-        return self.selector.re(regex, replace_entities)
-
-    def re_first(self, regex, default=None, replace_entities=False):
-        """
-        @summary: regex matching
-        Note: page source such as <a class='page-numbers'... is normalized to <a class="page-numbers", so write the regex as <a class="(.*?)". Quote style in non-html text is left unchanged.
-        For convenience, single and double quotes in the regex are automatically treated as interchangeable
-        ---------
-        @param regex: a regex string or an re.compile object
-        @param default: default value when nothing matches
-        @param replace_entities: when True, characters such as &nbsp; are stripped and entities such as &quot; are unescaped to ", which changes the page structure. When extracting json from page source, False is recommended
-        ---------
-        @result: the first value or the default
-        """
-
-        # Make single and double quotes interchangeable
-        if isinstance(regex, str):
-            regex = re.sub("['\"]", "['\"]", regex)
-
-        return self.selector.re_first(regex, default, replace_entities)
-
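
The quote-insensitivity trick above is a single re.sub: every literal quote in the caller's pattern is rewritten to the character class ['"], so a pattern written with double quotes still matches single-quoted markup:

    import re

    pattern = '<a class="(.*?)">'
    insensitive = re.sub("['\"]", "['\"]", pattern)
    print(insensitive)                    # <a class=['"](.*?)['"]>
    html = "<a class='page-numbers'>"
    print(re.findall(insensitive, html))  # ['page-numbers']
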
-    def close_browser(self, request):
-        if hasattr(self, "browser"):
-            request._webdriver_pool.remove(self.browser)
-            del self.browser
-
-    def __del__(self):
-        self.close()
-
-    def open(self, delete_temp_file=False):
-        with open("temp.html", "w", encoding=self.encoding, errors="replace") as html:
-            self.encoding_errors = "replace"
-            html.write(self.text)
-
-        os.system("open temp.html")
-
-        if delete_temp_file:
-            time.sleep(1)
-            os.remove("temp.html")

+ 0 - 155
spider_frame/FworkSpider/feapder/network/selector.py

@@ -1,155 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-10-08 15:33:37
----------
-@summary: redefined selector
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-import re
-
-import six
-from lxml import etree
-from parsel import Selector as ParselSelector
-from parsel import SelectorList as ParselSelectorList
-from w3lib.html import replace_entities as w3lib_replace_entities
-
-
-def extract_regex(regex, text, replace_entities=True, flags=0):
-    """Extract a list of unicode strings from the given text/encoding using the following policies:
-    * if the regex contains a named group called "extract" that will be returned
-    * if the regex contains multiple numbered groups, all those will be returned (flattened)
-    * if the regex doesn't contain any group the entire regex matching is returned
-    """
-    if isinstance(regex, six.string_types):
-        regex = re.compile(regex, flags=flags)
-
-    if "extract" in regex.groupindex:
-        # named group
-        try:
-            extracted = regex.search(text).group("extract")
-        except AttributeError:
-            strings = []
-        else:
-            strings = [extracted] if extracted is not None else []
-    else:
-        # full regex or numbered groups
-        strings = regex.findall(text)
-
-    # strings = flatten(strings)  # this would flatten nested lists
-    if not replace_entities:
-        return strings
-
-    values = []
-    for value in strings:
-        if isinstance(value, (list, tuple)):  # w3lib_replace_entities cannot take a list or tuple
-            values.append(
-                [w3lib_replace_entities(v, keep=["lt", "amp"]) for v in value]
-            )
-        else:
-            values.append(w3lib_replace_entities(value, keep=["lt", "amp"]))
-
-    return values
-
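
The three extraction policies in extract_regex can be seen with plain re: a named group called "extract" takes priority, then numbered groups, then the whole match:

    import re

    text = "price: 12.50 USD"
    # Policy 1: a named group called "extract" returns just that group
    print(re.search(r"price: (?P<extract>[\d.]+)", text).group("extract"))  # 12.50
    # Policy 2: multiple numbered groups are all returned
    print(re.findall(r"(\d+)\.(\d+)", text))  # [('12', '50')]
    # Policy 3: no groups -> the entire match
    print(re.findall(r"[\d.]+", text))  # ['12.50']
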
-
-def create_root_node(text, parser_cls, base_url=None):
-    """Create root node for text using given parser class.
-    """
-    body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
-    parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
-    root = etree.fromstring(body, parser=parser, base_url=base_url)
-    if root is None:
-        root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
-    return root
-
-
-class SelectorList(ParselSelectorList):
-    """
-    The :class:`SelectorList` class is a subclass of the builtin ``list``
-    class, which provides a few additional methods.
-    """
-
-    def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
-        """
-        Call the ``.re()`` method for the first element in this list and
-        return the result as a unicode string. If the list is empty or the
-        regex doesn't match anything, return the default value (``None`` if
-        the argument is not provided).
-
-        By default, character entity references are replaced by their
-        corresponding character (except for ``&amp;`` and ``&lt;``).
-        Passing ``replace_entities`` as ``False`` switches off these
-        replacements.
-        """
-
-        datas = self.re(regex, replace_entities=replace_entities, flags=flags)
-        return datas[0] if datas else default
-
-    def re(self, regex, replace_entities=True, flags=re.S):
-        """
-        Call the ``.re()`` method for each element in this list and return
-        their results flattened, as a list of unicode strings.
-
-        By default, character entity references are replaced by their
-        corresponding character (except for ``&amp;`` and ``&lt;``).
-        Passing ``replace_entities`` as ``False`` switches off these
-        replacements.
-        """
-        datas = [
-            x.re(regex, replace_entities=replace_entities, flags=flags) for x in self
-        ]
-        return datas[0] if len(datas) == 1 else datas
-
-
-class Selector(ParselSelector):
-    selectorlist_cls = SelectorList
-
-    def __str__(self):
-        data = repr(self.get())
-        return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
-
-    __repr__ = __str__
-
-    def __init__(self, text=None, *args, **kwargs):
-        # Convert &nbsp; to a space first; otherwise the selector turns it into \xa0
-        if text:
-            text = re.sub("&nbsp;", "\x20", text)
-        super(Selector, self).__init__(text, *args, **kwargs)
-
-    def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
-        """
-        Apply the given regex and return the first unicode string which
-        matches. If there is no match, return the default value (``None`` if
-        the argument is not provided).
-
-        By default, character entity references are replaced by their
-        corresponding character (except for ``&amp;`` and ``&lt;``).
-        Passing ``replace_entities`` as ``False`` switches off these
-        replacements.
-        """
-
-        datas = self.re(regex, replace_entities=replace_entities, flags=flags)
-
-        return datas[0] if datas else default
-
-    def re(self, regex, replace_entities=True, flags=re.S):
-        """
-        Apply the given regex and return a list of unicode strings with the
-        matches.
-
-        ``regex`` can be either a compiled regular expression or a string which
-        will be compiled to a regular expression using ``re.compile(regex)``.
-
-        By default, character entity references are replaced by their
-        corresponding character (except for ``&amp;`` and ``&lt;``).
-        Passing ``replace_entities`` as ``False`` switches off these
-        replacements.
-        """
-
-        return extract_regex(
-            regex, self.get(), replace_entities=replace_entities, flags=flags
-        )
-
-    def _get_root(self, text, base_url=None):
-        return create_root_node(text, self._parser, base_url=base_url)

+ 0 - 389
spider_frame/FworkSpider/feapder/network/user_agent.py

@@ -1,389 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2016-12-28 17:55
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import random
-
-USER_AGENTS = {
-    "chrome": [
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
-        "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
-        "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
-        "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
-        "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
-    ],
-    "opera": [
-        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
-        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
-        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
-        "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
-        "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
-        "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
-        "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
-        "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
-        "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
-        "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
-        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
-        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
-        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
-        "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
-        "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
-        "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
-        "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
-        "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
-        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
-        "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
-        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
-        "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
-        "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
-        "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
-        "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
-        "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
-        "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
-        "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
-        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
-    ],
-    "firefox": [
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
-        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
-        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
-        "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
-        "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
-        "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101  Firefox/28.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
-        "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
-        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
-        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
-        "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
-        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
-        "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
-        "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
-        "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0)  Gecko/20100101 Firefox/18.0",
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
-    ],
-    "internetexplorer": [
-        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
-        "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0;  rv:11.0) like Gecko",
-        "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
-        "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
-        "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
-        "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
-        "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
-        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)",
-        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)",
-        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)",
-        "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)",
-        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)",
-    ],
-    "safari": [
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
-    ],
-    "mobile": [
-        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
-        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
-        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
-        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
-        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
-        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
-        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
-        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
-        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
-        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
-        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
-        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
-        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
-        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
-        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
-        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
-        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
-        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Safari/605.1.15",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
-    ],
-}
-
-
-def get(ua_type: str = None):
-    if not ua_type:
-        ua_type = random.choice(list(USER_AGENTS.keys()))
-    elif ua_type not in USER_AGENTS:
-        raise ValueError(
-            "ua_type error, expect one of {}".format(list(USER_AGENTS.keys()))
-        )
-
-    return random.choice(USER_AGENTS[ua_type])

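For reference, a minimal sketch of how this deleted helper was typically consumed. The import path is assumed from the file layout in this commit, and the target URL is a placeholder:

```python
import requests  # pinned in feapder/requirements.txt below

from feapder.utils import user_agents  # assumed module path

# get() returns a random UA from the named pool, or from a random pool
# when no type is given; unknown types raise ValueError.
headers = {"User-Agent": user_agents.get("chrome")}
response = requests.get("https://example.com", headers=headers, timeout=10)
print(response.status_code)
```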
+ 0 - 56
spider_frame/FworkSpider/feapder/pipelines/__init__.py

@@ -1,56 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/3/17 10:57 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import abc
-from typing import Dict, List, Tuple
-
-
-class BasePipeline(metaclass=abc.ABCMeta):
-    """
-    pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等
-    """
-
-    @abc.abstractmethod
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        保存数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-
-        Returns: 是否保存成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-
-        """
-
-        return True
-
-    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
-        """
-        更新数据, 与UpdateItem配合使用,若爬虫中没使用UpdateItem,则可不实现此接口
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-            update_keys: 更新的字段, 如 ("title", "publish_time")
-
-        Returns: 是否更新成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-
-        """
-
-        return True
-
-    def close(self):
-        """
-        关闭,爬虫结束时调用
-        Returns:
-
-        """
-        pass

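The BasePipeline contract above hinges on the return value: only a True from save_items lets the batch enter the dedup store. A minimal custom pipeline might look like the following sketch; the CSV target is hypothetical:

```python
import csv
from typing import Dict, List

from feapder.pipelines import BasePipeline


class CsvPipeline(BasePipeline):
    """Hypothetical pipeline that appends each batch to <table>.csv."""

    def save_items(self, table, items: List[Dict]) -> bool:
        try:
            with open(f"{table}.csv", "a", newline="", encoding="utf8") as f:
                writer = csv.DictWriter(f, fieldnames=list(items[0].keys()))
                if f.tell() == 0:  # empty file: write the header row first
                    writer.writeheader()
                writer.writerows(items)
            return True  # batch persisted; safe to mark as deduped
        except Exception:
            return False  # keep the batch out of the dedup store for a retry
```

It would be enabled by listing its dotted path in ITEM_PIPELINES (see feapder/setting.py below).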
+ 0 - 47
spider_frame/FworkSpider/feapder/pipelines/console_pipeline.py

@@ -1,47 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/3/18 12:39 AM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-from feapder.pipelines import BasePipeline
-from typing import Dict, List, Tuple
-
-
-class ConsolePipeline(BasePipeline):
-    """
-    pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等
-    """
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        保存数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-
-        Returns: 是否保存成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-
-        """
-
-        return True
-
-    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
-        """
-        更新数据
-        Args:
-            table: 表名
-            items: 数据,[{},{},...]
-            update_keys: 更新的字段, 如 ("title", "publish_time")
-
-        Returns: 是否更新成功 True / False
-                 若False,不会将本批数据入到去重库,以便再次入库
-
-        """
-
-        return True

+ 0 - 97
spider_frame/FworkSpider/feapder/pipelines/mongo_pipeline.py

@@ -1,97 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: Export data
----------
-@author: Mkdir700
-@email:  mkdir700@gmail.com
-"""
-from typing import Dict, List, Tuple
-
-from feapder.db.mongodb import MongoDB
-from feapder.pipelines import BasePipeline
-from feapder.utils.log import log
-import feapder.utils.tools as tools
-
-
-class MongoPipeline(BasePipeline):
-    def __init__(self):
-        self._to_db = None
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MongoDB()
-
-        return self._to_db
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        Save data
-        Args:
-            table: table name
-            items: data, [{}, {}, ...]
-
-        Returns: whether the save succeeded, True / False.
-                 On False the batch is not written to the dedup store,
-                 so it can be imported again later.
-        """
-        try:
-            add_count = self.to_db.add_batch(coll_name=table, datas=items)
-            datas_size = len(items)
-            log.info(
-                "Exported %s records to %s: %s new, %s duplicates"
-                % (datas_size, table, add_count, datas_size - add_count)
-            )
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False
-
-    def update_items(self, table, items: List[Dict], update_keys: Tuple = ()) -> bool:
-        """
-        Update data
-        Args:
-            table: table name
-            items: data, [{}, {}, ...]
-            update_keys: fields to update, e.g. ("title", "publish_time")
-
-        Returns: whether the update succeeded, True / False.
-                 On False the batch is not written to the dedup store,
-                 so it can be imported again later.
-        """
-        try:
-            add_count = self.to_db.add_batch(
-                coll_name=table,
-                datas=items,
-                update_columns=update_keys or list(items[0].keys()),
-            )
-            datas_size = len(items)
-            update_count = datas_size - add_count
-            msg = "Exported %s records to %s: %s new, %s updated" % (
-                datas_size,
-                table,
-                add_count,
-                update_count,
-            )
-            if update_keys:
-                msg += ", updated fields: {}".format(update_keys)
-            log.info(msg)
-
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False
-
-
-class TaskPipeline(MongoPipeline):
-
-    def find_items(self, table, condition=None, limit=10):
-        """
-        数据查询
-        @param str table: 表名
-        @param dict condition: 查询条件
-        @param limit: 查询数量
-        """
-        return self.to_db.find(table, condition, limit)

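A usage sketch for the pipeline above. It assumes the MONGO_* settings (see feapder/setting.py below) point at a reachable instance; the collection name and documents are placeholders. add_batch returns the number of documents actually inserted, so duplicates = len(items) - add_count, which is exactly what the log line reports:

```python
from feapder.pipelines.mongo_pipeline import MongoPipeline

pipeline = MongoPipeline()

# Plain save: documents judged duplicates by feapder.db.mongodb are skipped.
pipeline.save_items("data_bak", [{"href": "http://example.com/1", "title": "t1"}])

# Upsert-style save: existing documents get the listed fields refreshed.
pipeline.update_items(
    "data_bak",
    [{"href": "http://example.com/1", "title": "t1-revised"}],
    update_keys=("title",),
)
```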
+ 0 - 74
spider_frame/FworkSpider/feapder/pipelines/mysql_pipeline.py

@@ -1,74 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-07-29 22:48:30
----------
-@summary: Export data
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-from typing import Dict, List, Tuple
-
-import feapder.utils.tools as tools
-from feapder.db.mysqldb import MysqlDB
-from feapder.pipelines import BasePipeline
-from feapder.utils.log import log
-
-
-class MysqlPipeline(BasePipeline):
-    def __init__(self):
-        self._to_db = None
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = MysqlDB()
-
-        return self._to_db
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        Save data
-        Args:
-            table: table name
-            items: data, [{}, {}, ...]
-
-        Returns: whether the save succeeded, True / False.
-                 On False the batch is not written to the dedup store,
-                 so it can be imported again later.
-        """
-
-        sql, datas = tools.make_batch_sql(table, items)
-        add_count = self.to_db.add_batch(sql, datas)
-        datas_size = len(datas)
-        if add_count:
-            log.info(
-                "Exported %s records to %s: %s duplicates" % (datas_size, table, datas_size - add_count)
-            )
-
-        return add_count is not None
-
-    def update_items(self, table, items: List[Dict], update_keys: Tuple = ()) -> bool:
-        """
-        Update data
-        Args:
-            table: table name
-            items: data, [{}, {}, ...]
-            update_keys: fields to update, e.g. ("title", "publish_time")
-
-        Returns: whether the update succeeded, True / False.
-                 On False the batch is not written to the dedup store,
-                 so it can be imported again later.
-        """
-
-        sql, datas = tools.make_batch_sql(
-            table, items, update_columns=update_keys or list(items[0].keys())
-        )
-        update_count = self.to_db.add_batch(sql, datas)
-        if update_count:
-            msg = "Updated %s records in %s" % (update_count // 2, table)
-            if update_keys:
-                msg += ", updated fields: {}".format(update_keys)
-            log.info(msg)
-
-        return update_count is not None

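One detail worth noting in the deleted MysqlPipeline: the update_count // 2 halves MySQL's affected-row count because, under INSERT ... ON DUPLICATE KEY UPDATE, MySQL reports 2 affected rows for every row that is updated and 1 for every fresh insert. A sketch of what tools.make_batch_sql presumably assembles (its exact output format is assumed here; the table and columns are placeholders):

```python
# Hypothetical output of tools.make_batch_sql(table, items, update_columns=...):
sql = (
    "INSERT INTO data_bak (href, title) VALUES (%s, %s) "
    "ON DUPLICATE KEY UPDATE title = VALUES(title)"
)
datas = [
    ("http://example.com/1", "t1"),
    ("http://example.com/2", "t2"),
]
# MysqlDB.add_batch() would executemany() the statement; with both rows already
# present and their titles changed, MySQL reports 4 affected rows, which the
# pipeline logs as 4 // 2 = 2 updated records.
```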
+ 0 - 60
spider_frame/FworkSpider/feapder/pipelines/rabbitmq_pipeline.py

@@ -1,60 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-09-23
----------
-@summary: RabbitMQ data channel
----------
-@author: Dzr
-"""
-from typing import Dict, List
-
-from feapder.db.rabbitMq import RabbitMQ
-from feapder.pipelines import BasePipeline
-from feapder.utils.tools import log
-
-
-class RabbitMqPipeline(BasePipeline):
-    def __init__(self):
-        self._to_db = None
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = RabbitMQ()
-        return self._to_db
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        Save data
-        Args:
-            table: table name
-            items: data, [{}, {}, ...]
-
-        Returns: whether the save succeeded, True / False.
-                 On False the batch is not written to the dedup store,
-                 so it can be imported again later.
-        """
-        try:
-            self.to_db.add_batch(table, items)
-            datas_size = len(items)
-            log.info("Exported %s records to %s" % (datas_size, table))
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False
-
-    def update_items(self, table, items: List[Dict], **kwargs) -> bool:
-        """
-        Update data
-        Args:
-            table: table name
-            items: data, [{}, {}, ...]
-
-        Returns: whether the update succeeded, True / False.
-                 On False the batch is not written to the dedup store,
-                 so it can be imported again later.
-        """
-        items = [{'amq_update': items}]
-        return self.save_items(table, items)
-
-    def close(self):
-        self.to_db.close()

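Note that update_items above does not update anything in place; it republishes the batch wrapped in an {'amq_update': items} envelope, presumably so downstream consumers can tell updates apart from first-time inserts. A hypothetical consumer-side branch (the consumer contract is not part of this repo, so the handlers are placeholders):

```python
def upsert(item: dict):
    ...  # placeholder: apply as an update


def insert(item: dict):
    ...  # placeholder: apply as a fresh insert


def handle_payload(payload: dict):
    """Hypothetical consumer for messages produced by RabbitMqPipeline."""
    if "amq_update" in payload:
        for item in payload["amq_update"]:
            upsert(item)
    else:
        insert(payload)
```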
+ 0 - 45
spider_frame/FworkSpider/feapder/pipelines/redis_pipeline.py

@@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021-04-18 14:12:21
----------
-@summary: Export data (write to Redis instead of saving directly to MongoDB)
----------
-@author: 马国鹏
-@email:  305021384@qq.com
-"""
-from typing import Dict, List
-
-from feapder.db.redisdb import RedisDB
-from feapder.pipelines import BasePipeline
-from feapder.utils.tools import log
-
-
-class RedisPipeline(BasePipeline):
-    def __init__(self):
-        self._to_db = None
-
-    @property
-    def to_db(self):
-        if not self._to_db:
-            self._to_db = RedisDB()
-
-        return self._to_db
-
-    def save_items(self, table, items: List[Dict]) -> bool:
-        """
-        Save data
-        Args:
-            table: table name
-            items: data, [{}, {}, ...]
-
-        Returns: whether the save succeeded, True / False.
-                 On False the batch is not written to the dedup store,
-                 so it can be imported again later.
-        """
-        try:
-            self.to_db.lpush(table=table, values=items)
-            datas_size = len(items)
-            log.info("Exported %s records to %s" % (datas_size, table))
-            return True
-        except Exception as e:
-            log.exception(e)
-            return False

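RedisPipeline only buffers items in a Redis list via lpush; something else has to drain that list. A hypothetical drain loop with plain redis-py. It assumes items are JSON-serialized on push (the actual encoding lives in feapder/db/redisdb.py, deleted earlier in this commit), and the connection details and list name are placeholders:

```python
import json

import redis

r = redis.Redis(host="localhost", port=6379, db=0, decode_responses=True)

while True:
    raw = r.rpop("data_bak")  # FIFO relative to the pipeline's lpush
    if raw is None:
        break  # queue drained
    item = json.loads(raw)  # assumes JSON encoding on the producer side
    print(item)
```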
+ 0 - 17
spider_frame/FworkSpider/feapder/requirements.txt

@@ -1,17 +0,0 @@
-better-exceptions>=0.2.2
-DBUtils>=2.0
-parsel>=1.5.2
-PyExecJS>=1.5.1
-pymongo>=3.10.1
-PyMySQL>=0.9.3
-redis>=2.10.6
-requests>=2.22.0
-selenium>=3.141.0
-bs4>=0.0.1
-ipython>=7.14.0
-bitarray>=1.5.3
-redis-py-cluster>=2.1.0
-cryptography>=3.3.2
-urllib3>=1.25.8
-loguru>=0.5.3
-influxdb>=5.3.1

+ 0 - 204
spider_frame/FworkSpider/feapder/setting.py

@@ -1,204 +0,0 @@
-# -*- coding: utf-8 -*-
-"""爬虫配置文件"""
-import os
-
-# 列表任务表模版
-TAB_REQUESTS = "{redis_key}:z_requests"
-# 详情待处理任务表模版
-TAB_ITEMS = "{redis_key}:z_items"
-# 任务失败表
-TAB_FAILED_REQUESTS = os.getenv("TAB_FAILED_REQUESTS", "spider:z_failed_requests")
-# 数据保存失败表
-TAB_FAILED_ITEMS = os.getenv("TAB_FAILED_ITEMS", "spider:s_failed_items")
-# 采集任务生产表
-TASK_REQUEST_PRODUCE = os.getenv("TASK_REQUEST_PRODUCE", "spider_listdata")
-# 失败任务记录表
-TASK_REQUEST_FAILED = os.getenv("TASK_REQUEST_FAILED", "spider_listdata_err")
-
-# MYSQL
-MYSQL_IP = os.getenv("MYSQL_IP")
-MYSQL_PORT = int(os.getenv("MYSQL_PORT", 3306))
-MYSQL_DB = os.getenv("MYSQL_DB")
-MYSQL_USER_NAME = os.getenv("MYSQL_USER_NAME")
-MYSQL_USER_PASS = os.getenv("MYSQL_USER_PASS")
-
-# MONGODB
-MONGO_IP = os.getenv("MONGO_IP", "localhost")
-MONGO_PORT = int(os.getenv("MONGO_PORT", 27017))
-MONGO_DB = os.getenv("MONGO_DB")
-MONGO_USER_NAME = os.getenv("MONGO_USER_NAME")
-MONGO_USER_PASS = os.getenv("MONGO_USER_PASS")
-
-# REDIS
-# ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
-REDISDB_IP_PORTS = os.getenv("REDISDB_IP_PORTS")
-REDISDB_USER_PASS = os.getenv("REDISDB_USER_PASS")
-REDISDB_DB = int(os.getenv("REDISDB_DB", 0))
-# 适用于redis哨兵模式
-REDISDB_SERVICE_NAME = os.getenv("REDISDB_SERVICE_NAME")
-
-# rabbitMq
-RABBITMQ_IP_PORT = os.getenv("RABBITMQ_IP_PORT")
-RABBITMQ_USER = os.getenv("RABBITMQ_USER")
-RABBITMQ_USER_PASS = os.getenv("RABBITMQ_USER_PASS")
-RABBITMQ_VIRTUAL_HOST = os.getenv("RABBITMQ_VIRTUAL_HOST", "/")
-RABBITMQ_HEARTBEAT = int(os.getenv("RABBITMQ_HEARTBEAT", 1200))
-RABBITMQ_SOCKET_TIMEOUT = int(os.getenv("RABBITMQ_SOCKET_TIMEOUT", 10))
-RABBITMQ_EXCHANGE = os.getenv("RABBITMQ_EXCHANGE", "spider")
-RABBITMQ_EXCHANGE_TYPE = os.getenv("RABBITMQ_EXCHANGE_TYPE", "direct")
-
-# 数据入库的pipeline,可自定义,默认MongoPipeline
-ITEM_PIPELINES = [
-    # "feapder.pipelines.mysql_pipeline.MysqlPipeline",
-    "feapder.pipelines.mongo_pipeline.MongoPipeline",
-]
-EXPORT_DATA_MAX_FAILED_TIMES = 10  # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
-EXPORT_DATA_MAX_RETRY_TIMES = 10  # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
-
-# 爬虫相关
-# COLLECTOR
-COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
-COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
-
-# SPIDER
-SPIDER_THREAD_COUNT = 1  # 爬虫并发数
-SPIDER_SLEEP_TIME = 0  # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
-SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
-SPIDER_MAX_RETRY_TIMES = 100  # 每个请求最大重试次数
-SPIDER_AUTO_START_REQUESTS = True  # 是否主动执行添加 设置为False 需要手动调用start_monitor_task,适用于多进程情况下
-KEEP_ALIVE = False  # 爬虫是否常驻
-
-# 浏览器渲染
-WEBDRIVER = dict(
-    pool_size=1,  # 浏览器的数量
-    load_images=True,  # 是否加载图片
-    user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
-    proxy=None,  # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
-    headless=False,  # 是否为无头浏览器
-    driver_type="CHROME",  # CHROME、PHANTOMJS、FIREFOX
-    timeout=30,  # 请求超时时间
-    window_size=(1024, 800),  # 窗口大小
-    executable_path=None,  # 浏览器路径,默认为默认路径
-    render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
-    custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
-)
-
-# splash 渲染
-SPLASH_API = os.getenv("SPLASH_API")
-
-# 验证码
-CAPTCHA_URL = os.getenv("CAPTCHA_URL", "http://pycaptcha.spdata.jianyu360.com")
-
-# 爬虫启动时,重新抓取失败的requests
-RETRY_FAILED_REQUESTS = False
-# 爬虫启动时,重新入库失败的item
-RETRY_FAILED_ITEMS = False
-# 保存失败的request
-SAVE_FAILED_REQUEST = False
-# request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
-REQUEST_LOST_TIMEOUT = 600  # 10分钟
-# request网络请求超时时间
-REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组
-
-# 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求
-RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
-RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
-RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
-
-# redis 存放item与request的根目录
-REDIS_KEY = ""
-# 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬
-DELETE_KEYS = []
-
-# 设置代理
-PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
-PROXY_ENABLE = True
-# 剑鱼代理
-JY_PROXY_URL = None
-JY_PROXY_AUTHOR = os.getenv("JY_PROXY_AUTHOR")
-
-# 随机headers
-RANDOM_HEADERS = True
-# UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型
-USER_AGENT_TYPE = "chrome"
-# 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
-DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
-# requests 使用session
-USE_SESSION = False
-USE_JA3_SESSION = False
-# 遇见 ja3 反爬网站,修改 requests模块用的传输密码
-JA3_REQUEST_CIPHERS = ["DH+AES", "RSA+AES"]
-# 去重
-ITEM_FILTER_ENABLE = False  # item 去重
-ITEM_FILTER_SETTING = dict(
-    filter_type=1  # 永久去重(BloomFilter)= 1; 内存去重(MemoryFilter)= 2; 临时去重(ExpireFilter)= 3; 轻量去重(LiteFilter)= 4;
-)
-REQUEST_FILTER_ENABLE = False  # request 去重
-REQUEST_FILTER_SETTING = dict(
-    filter_type=3,  # 永久去重(BloomFilter)= 1; 内存去重(MemoryFilter)= 2; 临时去重(ExpireFilter)= 3; 轻量去重(LiteFilter)= 4;
-    expire_time=2592000,  # 过期时间1个月
-)
-TASK_FILTER_ENABLE = False  # task 去重
-TASK_FILTER_SETTING = dict(
-    filter_type=3,  # 永久去重(BloomFilter)= 1; 内存去重(MemoryFilter)= 2; 临时去重(ExpireFilter)= 3; 轻量去重(LiteFilter)= 4;
-    expire_time=2592000,  # 过期时间1个月
-)
-
-# 报警 支持钉钉、企业微信、邮件
-# 钉钉报警
-DINGDING_WARNING_URL = ""  # 钉钉机器人api
-DINGDING_WARNING_PHONE = ""  # 报警人 支持列表,可指定多个
-DINGDING_WARNING_ALL = False  # 是否提示所有人, 默认为False
-# 邮件报警
-EMAIL_SENDER = ""  # 发件人
-EMAIL_PASSWORD = ""  # 授权码
-EMAIL_RECEIVER = ""  # 收件人 支持列表,可指定多个
-EMAIL_SMTPSERVER = "smtp.163.com"  # 邮件服务器 默认为163邮箱
-# 企业微信报警
-WECHAT_WARNING_URL = ""  # 企业微信机器人api
-WECHAT_WARNING_PHONE = ""  # 报警人 将会在群内@此人, 支持列表,可指定多人
-WECHAT_WARNING_ALL = False  # 是否提示所有人, 默认为False
-# 时间间隔
-WARNING_INTERVAL = 3600  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
-WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
-WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
-
-LOG_NAME = os.path.basename(os.getcwd())
-LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
-LOG_LEVEL = "DEBUG"
-LOG_COLOR = True  # 是否带有颜色
-LOG_IS_WRITE_TO_CONSOLE = True  # 是否打印到控制台
-LOG_IS_WRITE_TO_FILE = False  # 是否写文件
-LOG_MODE = "w"  # 写文件的模式
-LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
-LOG_BACKUP_COUNT = 20  # 日志文件保留数量
-LOG_ENCODING = "utf8"  # 日志文件编码
-OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
-LOG_IS_SEND_TO_LOGSTASH = False  # 是否开启elk服务
-LOG_STASH_IP = os.getenv("STASH_IP", "localhost")  # elk服务地址
-LOG_STASH_PORT = os.getenv("STASH_IP", 5959)  # elk服务端口
-
-# 打点监控 influxdb 配置
-INFLUXDB_HOST = os.getenv("INFLUXDB_HOST", "localhost")
-INFLUXDB_PORT = int(os.getenv("INFLUXDB_PORT", 8086))
-INFLUXDB_UDP_PORT = int(os.getenv("INFLUXDB_UDP_PORT", 8089))
-INFLUXDB_USER = os.getenv("INFLUXDB_USER")
-INFLUXDB_PASSWORD = os.getenv("INFLUXDB_PASSWORD")
-INFLUXDB_DATABASE = os.getenv("INFLUXDB_DB")
-# 监控数据存储的表名,爬虫管理系统上会以task_id命名
-INFLUXDB_MEASUREMENT = "task_" + os.getenv("TASK_ID") if os.getenv("TASK_ID") else None
-# 打点监控其他参数,若这里也配置了influxdb的参数, 则会覆盖外面的配置
-METRICS_OTHER_ARGS = dict(retention_policy_duration="180d", emit_interval=60)
-
-############# 导入用户自定义的setting #############
-try:
-    from setting import *
-
-    # 兼容老版本的配置
-    KEEP_ALIVE = not AUTO_STOP_WHEN_SPIDER_DONE
-except Exception:
-    pass

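The try/except block at the bottom is the override hook: any project that puts a setting.py on the import path has its names win over the defaults above. A minimal project-level setting.py sketch, using only names defined in this file (all values are illustrative):

```python
# -*- coding: utf-8 -*-
"""Project-level setting.py; every name here overrides the framework default."""

REDISDB_IP_PORTS = "localhost:6379"
REDISDB_DB = 10
REDIS_KEY = "myspider"  # hypothetical redis namespace

ITEM_PIPELINES = ["feapder.pipelines.redis_pipeline.RedisPipeline"]

SPIDER_THREAD_COUNT = 5
SPIDER_SLEEP_TIME = [2, 5]  # random 2-5s delay between downloads
KEEP_ALIVE = True

LOG_LEVEL = "INFO"
LOG_IS_WRITE_TO_FILE = True
```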
+ 0 - 22
spider_frame/FworkSpider/feapder/templates/air_spider_template.tmpl

@@ -1,22 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary:
----------
-@author: {USER}
-"""
-
-import feapder
-
-
-class ${spider_name}(feapder.AirSpider):
-    def start_requests(self):
-        yield feapder.Request("https://www.baidu.com")
-
-    def parse(self, request, response):
-        print(response)
-
-
-if __name__ == "__main__":
-    ${spider_name}().start()

+ 0 - 121
spider_frame/FworkSpider/feapder/templates/detail_template.tmpl

@@ -1,121 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary:  ${spider_name}
----------
-@author: {USER}
-"""
-from urllib.parse import urljoin
-import feapder
-from items.spider_item import DataBakItem
-from untils.attachment import AttachmentDownloader
-from untils.tools import remove_htmldata, extract_file_type
-from feapder.utils.log import log
-import time
-import json
-import re
-
-
-class Details(feapder.BiddingDetailSpider):
-
-    def start_requests(self):
-        while True:
-            data_list = self.get_tasks_by_rabbitmq(limit=20)
-            for item in data_list:
-                # log.debug(item)
-                request_params = item.get("request_params")
-                timeout = request_params.get('timeout', 10)
-                request_params.pop('timeout', None)
-                if item.get("js"):
-                    eval(item.get("js"))
-                if item.get("ex_python"):
-                    exec(item.get("ex_python"))
-                if item.get("proxies"):
-                    yield feapder.Request(url=item.get("parse_url"), item=item, files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"), callback=eval(item.get("parse")),
-                                          **request_params, timeout=timeout)
-                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item, files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"), timeout=timeout,
-                                          callback=eval(item.get("parse")), proxies=False, **request_params)
-
-            break
-
-    def detail_get(self, request, response):
-
-        items = request.item
-        list_item = DataBakItem(**items)
-
-        html = ''
-        for xpath in request.deal_detail:
-            html = response.xpath(xpath).extract_first()  # 标书详细内容
-            if html is not None:
-                break
-
-        if request.to_dict.get('rm_list', None) and html:
-            rm_list = request.rm_list
-            html = remove_htmldata(rm_list, html, response)
-
-        if request.to_dict.get('title_xpath', None):
-            for sxpath in request.title_xpath:
-                title = response.xpath(sxpath).extract_first("").strip() # 三级页标题
-                if title:
-                    list_item.title = title
-                    break
-
-        list_item.contenthtml = html
-
-        if request.files_info:
-            files_info = request.files_info
-            files = response.xpath(files_info.get("list_xpath"))
-            if len(files) > 0:
-                attachments = {}
-                for index, info in enumerate(files):
-                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
-                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
-                    if not file_name:
-                        file_name = info.xpath(files_info.get("name_xpath")).extract()
-                    if file_name:
-                        file_name = "".join("".join(file_name).split()).strip()
-                        if files_info.get("host"):
-                            file_url = urljoin(files_info.get("host"), file_url)
-                        if not files_info.get("file_type"):
-                            file_type = extract_file_type(file_name, file_url)
-                        else:
-                            file_type = files_info.get("file_type")
-
-                        if request.proxies:
-                            fpx = request.proxies()
-                        else:
-                            fpx = False
-
-                        if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
-                            attachment = AttachmentDownloader().fetch_attachment(
-                                file_name=file_name, file_type=file_type, download_url=file_url,
-                                proxies=fpx, headers=request.to_dict.get('headers', None))
-                            attachments[str(len(attachments) + 1)] = attachment
-                if attachments:
-                    list_item.projectinfo = {"attachments": attachments}
-
-        yield list_item
-
-    def detail_json(self, request, response):
-        items = request.item
-        list_item = DataBakItem(**items)
-
-        exec(request.deal_detail)
-
-        yield list_item
-
-    def detail_post(self, request, response):
-        items = request.item
-        list_item = DataBakItem(**items)
-
-        exec(request.deal_detail)
-
-        yield list_item
-
-
-if __name__ == "__main__":
-    Details(redis_key="detail:normal_details").start()

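For context, the task documents that start_requests above pulls from RabbitMQ look roughly like this. The keys are the ones the template reads (parse_url, parse, deal_detail, request_params, files, plus optional rm_list/title_xpath); every value below, including the "self.detail_get" callback string, is an illustrative assumption:

```python
task = {
    "parse_url": "http://example.com/notice/1.html",
    "parse": "self.detail_get",  # eval'd to select the callback
    "deal_detail": ['//div[@class="content"]'],  # xpaths tried in order
    "request_params": {"timeout": 30},  # timeout is popped and passed separately
    "proxies": False,
    "rm_list": ['//div[@class="ad"]'],  # optional: nodes stripped from the html
    "title_xpath": ["//h1/text()"],  # optional: detail-page title
    "files": {  # becomes request.files_info
        "list_xpath": '//div[@class="content"]//a[@href]',
        "url_xpath": "./@href",
        "name_xpath": "./text()",
        "files_type": ("pdf", "doc", "docx", "xls", "zip"),
        "url_key": "http",
        "host": "http://example.com",
    },
}
```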
+ 0 - 22
spider_frame/FworkSpider/feapder/templates/item_template.tmpl

@@ -1,22 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary:
----------
-@author: {USER}
-"""
-
-from feapder import Item
-
-
-class ${item_name}Item(Item):
-    """
-    This class was generated by feapder.
-    command: feapder create -i ${table_name}.
-    """
-
-    __table_name__ = "${table_name}"
-
-    def __init__(self, *args, **kwargs):
-        ${propertys}

+ 0 - 146
spider_frame/FworkSpider/feapder/templates/njpc_detail_template.tmpl

@@ -1,146 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: 拟建爬虫通用快照页
----------
-@author: njpc_feapder
-"""
-import feapder
-import re
-import json
-import time, random
-from items.njpc_item import DataNjpcItem
-from untils.attachment import AttachmentDownloader as AD
-from untils.attachment_res import AttachmentDownloader as ADres
-from lxml.html import fromstring
-from untils.tools import remove_htmldata, extract_file_type
-from feapder.utils.log import log
-
-redis_key = "njpc_details"
-
-
-# 拟建爬虫下载附件
-def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
-    def parse_filetype(response, filetypes):
-        val = response.headers.get("content-disposition")
-        filetype = val.split('.')[-1].replace('"', '').replace("'", "")
-        filetypes.append(filetype)
-
-    root = fromstring(html)
-    file_info = root.xpath('//a[@href]')
-    if file_info:
-        attachments = {}
-        for info in file_info:
-            file_url = "".join(info.xpath('./@href'))
-            file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
-                          'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
-            file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
-            if file_type.lower() == "res":
-                if s_key in file_url and file_name:
-                    file_name = file_name.strip()
-                    attachment = ADres().fetch_attachment(
-                        file_name=file_name,
-                        download_url=file_url,
-                        callback=parse_filetype,
-                        proxies=proxies,
-                        headers=headers,
-                    )
-                    attachments[str(len(attachments) + 1)] = attachment
-            else:
-                if file_type.lower() in file_types:
-                    file_tp = file_type
-                else:
-                    file_tp = extract_file_type(file_name, file_url, [file_type])
-
-                if file_tp and s_key in file_url and file_name:
-                    file_name = file_name.strip()
-                    attachment = AD().fetch_attachment(
-                        file_name=file_name, file_type=file_tp, download_url=file_url,
-                        proxies=proxies, headers=headers,)
-                    attachments[str(len(attachments) + 1)] = attachment
-        return attachments
-
-
-class Details(feapder.PlanToBuildDetailSpider):
-
-    def start_requests(self):
-        data_list = self.get_tasks_by_rabbitmq(limit=100)
-        for item in data_list:
-            # log.debug(item)
-            request_params = item.get("request_params")
-            timeout = request_params.get('timeout', 10)
-            request_params.pop('timeout', None)
-            is_join_html = item.get("is_join_html")  # 正文是否根据xpath拼接
-            extra_html = item.get("extra_html")  # 过滤无效内容
-            title_xpath = item.get("title_xpath")  # 三级页标题
-            extra_activity = item.get("extra_activity")  # 额外的需求动作
-            file_params = item.get("file_params")  # 附件下载配置
-            if item.get("proxies"):
-                yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
-                                      callback=item.get("parser"), file_params=file_params,
-                                      extra_activity=extra_activity, timeout=timeout, **request_params)
-            else:
-                yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
-                                      is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
-                                      callback=item.get("parser"), file_params=file_params,
-                                      extra_activity=extra_activity, proxies=False, timeout=timeout, **request_params)
-
-    def detail_get(self, request, response):
-        items = request.item
-        data_item = DataNjpcItem(**items)
-
-        html = ''
-        for xpath in request.deal_detail:
-            htmls = response.xpath(xpath).extract_first()  # 标书详细内容
-            if request.is_join_html:
-                if htmls is not None:
-                    html += htmls
-            else:
-                if htmls is not None:
-                    html = htmls
-                    break
-
-        if request.title_xpath:
-            for sxpath in request.title_xpath:
-                title = response.xpath(sxpath).extract_first()  # 三级页标题
-                if title:
-                    data_item.title = title.strip()
-                    if "..." in data_item.projectname or "…" in data_item.projectname:
-                        data_item.projectname = title.strip()
-                    break
-
-        try:
-            if request.extra_activity:
-                from untils.tools import njpc_fields_extract, njpc_fields_extract_special
-                exec(request.extra_activity)
-        except:
-            pass
-
-        data_item.contenthtml = remove_htmldata(request.extra_html, html, response)
-
-        fp = request.file_params or {}
-        attachments = njpc_get_files(
-            html,
-            file_type=fp.get("file_type", ""),
-            s_key=fp.get("s_key", "http"),
-            proxies=fp.get("proxies", False),
-            headers=fp.get('headers', {}),
-        )
-        if attachments:
-            data_item.projectinfo = {"attachments": attachments}
-
-        yield data_item
-
-    def detail_json(self, request, response):
-        items = request.item
-        data_item = DataNjpcItem(**items)
-
-        exec(request.deal_detail)
-
-        yield data_item
-
-
-if __name__ == '__main__':
-    Details(redis_key="detail:njpc_details").start()

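The file_params dict threaded through the requests above configures njpc_get_files; the keys match its signature, and the values here are illustrative:

```python
file_params = {
    "file_type": "",  # force one type, or "" to sniff via extract_file_type
    "s_key": "http",  # substring a link must contain to be downloaded
    "proxies": False,
    "headers": {"Referer": "http://example.com/"},  # hypothetical headers
}
```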
+ 0 - 88
spider_frame/FworkSpider/feapder/templates/njpc_list_template.tmpl

@@ -1,88 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: ${spider_name}
----------
-@author: {USER}
-"""
-import feapder
-from items.njpc_item import NjpcListItem
-from collections import namedtuple
-import time, random
-
-
-class Njpc_Feapder(feapder.PlanToBuildListSpider):
-
-    def start_callback(self):
-
-        self.site = ""
-
-        #               --- --- crawl_page 必须存在,且为纯数字(int) --- ---
-        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-        self.menus = [
-            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-        ]
-
-        self.headers = {}
-
-    def start_requests(self):
-        start_url = ''
-        for menu in self.menus:
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
-
-    def download_midware(self, request):
-        page = request.page
-        request.headers = self.headers
-
-    def parse(self, request, response):
-        menu = request.item
-        info_list = response.xpath('')  # 数据结构为html
-        for info in info_list:
-            detail_href = info.xpath('').extract_first().strip()
-            projectname = info.xpath('').extract_first().strip()
-            publish_time = info.xpath('').extract_first().strip()
-
-            area = ""  # 省份
-            city = ""  # 城市
-            district = ""  # 区县
-
-            data_item = NjpcListItem()          # 存储数据的管道
-            data_item.unique_key = ("href", publish_time)  # 去重
-            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            data_item.projectname = projectname      # 项目名称
-            data_item.publishtime = publish_time     # 发布时间
-
-            data_item.site = self.site
-            data_item.area = area or "全国"                   # 城市默认:全国
-            data_item.city = city                            # 城市 默认为空
-            data_item.district = district                    # 城市 默认为空
-            data_item.parser_url = detail_href               # 详情页数据链接
-            data_item.href = detail_href                     # 详情链接
-            data_item.request_params = {"headers": self.headers}
-            data_item.parser = "detail_get"                  # 快照页爬虫调用的方法
-            data_item.deal_detail = ['//div[@class="***"]']  # 正文解析规则
-
-            # data_item.proxies = True               # 快照页是否开启代理
-            # data_item.is_join_html = True          # 正文是否根据xpath拼接
-            # data_item.extra_html = []              # 删除正文的无效数据(xpath列表 或 删除的内容)
-            # data_item.title_xpath = []             # 三级页标题 xpath列表
-            # data_item.file_params = {"file_type":"", "s_key":"http", "proxies":False}
-                                                     # 附件下载配置
-            # data_item.render = True                # 是否开启开启浏览器
-            # data_item.render_time = 3              # 渲染时间
-            # data_item.extra_activity = '''***'''   # 额外的需求动作(三引号内顶左边框写执行语句)
-
-            yield data_item
-
-        # 翻页
-        time.sleep(random.randint(2, 5))
-        request = self.infinite_pages(request, response)
-        yield request
-
-
-if __name__ == "__main__":
-    Njpc_Feapder(redis_key="detail:njpc_details").start()

+ 0 - 49
spider_frame/FworkSpider/feapder/templates/project_template/CHECK_DATA.md

@@ -1,49 +0,0 @@
-# Data review
-## Table legend:
-
-> table name, meaning (update strategy)
-
-## 1. Accuracy
-
-**Does the field design meet the requirements? Do the join fields between tables meet the requirements? (manual check needed)**
-
-> Note: is an auto-increment id designed, and is its type set to bigint?
-> Note: is a unique index needed?
-> Note: are join fields needed between the tables?
-
-* [ ] Yes
-* [ ] No
-
-**Do the collected content and storage format of each field meet the requirements? Do they match the web page? Is any information missing?**
-
-> Tip: try sorting each field in ascending/descending order, then spot-check samples;
-
-**Is the possibility of inconsistent formats within the same kind of data on the site accounted for?**
-
-> Suggestion: do not add per-field compatibility handling in code; when data is inconsistent, raise an exception and log it
-
-* [ ] Yes
-* [ ] No
-
-## 2. Completeness
-
-**For incremental crawls, were both the earliest and the latest records collected, and is the total count correct?**
-**For batch crawls, is every batch present?**
-
-> Note: estimate the total volume of a single batch on the web side;
-> Reference SQL: SELECT count(1), batch_date from [table_name] GROUP BY batch_date;
-
-**If there is a relation to another table, is the related information complete?**
-
-## 3. Stability
-
-* [ ] Can data be crawled stably over the long term?
-* [ ] Is an IP proxy in place?
-* [ ] Is resuming from a breakpoint supported?
-* [ ] Will it reliably start on time and crawl on schedule?
-* [ ] Are alerts enabled?
-
-## 4. Crawl frequency, type, and storage
-
-* [ ] Does the crawl frequency meet the requirements?
-* [ ] Does the crawl type meet the requirements: incremental or batch?

+ 0 - 8
spider_frame/FworkSpider/feapder/templates/project_template/README.md

@@ -1,8 +0,0 @@
-# xxx spider documentation
-## Research
-
-## Database design
-
-## Crawler logic
-
-## Project architecture

+ 0 - 0
spider_frame/FworkSpider/feapder/templates/project_template/items/__init__.py


+ 0 - 44
spider_frame/FworkSpider/feapder/templates/project_template/main.py

@@ -1,44 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: spider entry point
----------
-@author: {USER}
-"""
-
-from feapder import ArgumentParser
-
-from spiders import *
-
-def crawl_xxx():
-    """
-    AirSpider crawler
-    """
-    spider = xxx.XXXSpider()
-    spider.start()
-
-def crawl_xxx2():
-    """
-    Spider crawler
-    """
-    spider = xxx.XXXSpider(redis_key="xxx:xxx")
-    spider.start()
-
-
-if __name__ == "__main__":
-    parser = ArgumentParser(description="xxx spider")
-
-    parser.add_argument(
-        "--crawl_xxx", action="store_true", help="xxx spider", function=crawl_xxx
-    )
-    parser.add_argument(
-        "--crawl_xxx2", action="store_true", help="xxx spider", function=crawl_xxx2
-    )
-
-    parser.start()
-
-    # main.py is the unified command-line entry point for running multiple spiders; with only one spider, main.py is optional
-    # replace the xxx placeholders above with your actual spider names
-    # list the available commands: python main.py --help
-    # run an AirSpider / Spider: python main.py --crawl_xxx

+ 0 - 137
spider_frame/FworkSpider/feapder/templates/project_template/setting.py

@@ -1,137 +0,0 @@
-# -*- coding: utf-8 -*-
-"""爬虫配置文件"""
-# import os
-# import sys
-#
-# # MYSQL
-# MYSQL_IP = "localhost"
-# MYSQL_PORT = 3306
-# MYSQL_DB = ""
-# MYSQL_USER_NAME = ""
-# MYSQL_USER_PASS = ""
-#
-# # MONGODB
-# MONGO_IP = "localhost"
-# MONGO_PORT = 27017
-# MONGO_DB = ""
-# MONGO_USER_NAME = ""
-# MONGO_USER_PASS = ""
-#
-# # REDIS
-# # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
-# REDISDB_IP_PORTS = "localhost:6379"
-# REDISDB_USER_PASS = ""
-# REDISDB_DB = 0
-# # 适用于redis哨兵模式
-# REDISDB_SERVICE_NAME = ""
-#
-# # 数据入库的pipeline,可自定义,默认MysqlPipeline
-# ITEM_PIPELINES = [
-#     "feapder.pipelines.mysql_pipeline.MysqlPipeline",
-#     # "feapder.pipelines.mongo_pipeline.MongoPipeline",
-# ]
-# EXPORT_DATA_MAX_FAILED_TIMES = 10 # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
-# EXPORT_DATA_MAX_RETRY_TIMES = 10 # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
-#
-# # 爬虫相关
-# # COLLECTOR
-# COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
-# COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
-#
-# # SPIDER
-# SPIDER_THREAD_COUNT = 1  # 爬虫并发数
-# SPIDER_SLEEP_TIME = 0  # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
-# SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
-# SPIDER_MAX_RETRY_TIMES = 100  # 每个请求最大重试次数
-# KEEP_ALIVE = False  # 爬虫是否常驻
-#
-# # 浏览器渲染
-# WEBDRIVER = dict(
-#     pool_size=1,  # 浏览器的数量
-#     load_images=True,  # 是否加载图片
-#     user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
-#     proxy=None,  # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
-#     headless=False,  # 是否为无头浏览器
-#     driver_type="CHROME",  # CHROME、PHANTOMJS、FIREFOX
-#     timeout=30,  # 请求超时时间
-#     window_size=(1024, 800),  # 窗口大小
-#     executable_path=None,  # 浏览器路径,默认为默认路径
-#     render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
-#     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
-# )
-#
-# # 爬虫启动时,重新抓取失败的requests
-# RETRY_FAILED_REQUESTS = False
-# # 保存失败的request
-# SAVE_FAILED_REQUEST = True
-# # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
-# REQUEST_LOST_TIMEOUT = 600  # 10分钟
-# # request网络请求超时时间
-# REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组
-#
-# # 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求
-# RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
-# RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
-# RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
-#
-# # 设置代理
-# PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
-# PROXY_ENABLE = True
-#
-# # 随机headers
-# RANDOM_HEADERS = True
-# # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型
-# USER_AGENT_TYPE = "chrome"
-# # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
-# DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
-# # requests 使用session
-# USE_SESSION = False
-#
-# # 去重
-# ITEM_FILTER_ENABLE = False  # item deduplication
-# ITEM_FILTER_SETTING = dict(
-#     filter_type=1  # permanent dedup (BloomFilter) = 1, in-memory dedup (MemoryFilter) = 2, temporary dedup (ExpireFilter) = 3
-# )
-# REQUEST_FILTER_ENABLE = False  # request deduplication
-# REQUEST_FILTER_SETTING = dict(
-#     filter_type=3,  # permanent dedup (BloomFilter) = 1, in-memory dedup (MemoryFilter) = 2, temporary dedup (ExpireFilter) = 3
-#     expire_time=2592000,  # expires after one month
-# )
-#
-# # 报警 支持钉钉、企业微信、邮件
-# # 钉钉报警
-# DINGDING_WARNING_URL = ""  # 钉钉机器人api
-# DINGDING_WARNING_PHONE = ""  # 报警人 支持列表,可指定多个
-# DINGDING_WARNING_ALL = False # 是否提示所有人, 默认为False
-# # 邮件报警
-# EMAIL_SENDER = ""  # 发件人
-# EMAIL_PASSWORD = ""  # 授权码
-# EMAIL_RECEIVER = ""  # 收件人 支持列表,可指定多个
-# EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱
-# # 企业微信报警
-# WECHAT_WARNING_URL = ""  # 企业微信机器人api
-# WECHAT_WARNING_PHONE = ""  # 报警人 将会在群内@此人, 支持列表,可指定多人
-# WECHAT_WARNING_ALL = False  # 是否提示所有人, 默认为False
-# # 时间间隔
-# WARNING_INTERVAL = 3600  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
-# WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
-# WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
-#
-# LOG_NAME = os.path.basename(os.getcwd())
-# LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
-# LOG_LEVEL = "DEBUG"
-# LOG_COLOR = True  # 是否带有颜色
-# LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
-# LOG_IS_WRITE_TO_FILE = False  # 是否写文件
-# LOG_MODE = "w"  # 写文件的模式
-# LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
-# LOG_BACKUP_COUNT = 20  # 日志文件保留数量
-# LOG_ENCODING = "utf8"  # 日志文件编码
-# OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
-#
-# # 切换工作路径为当前项目路径
-# project_path = os.path.abspath(os.path.dirname(__file__))
-# os.chdir(project_path)  # 切换工作路经
-# sys.path.insert(0, project_path)
-# print('当前工作路径为 ' + os.getcwd())
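A minimal sketch of a concrete setting.py built from the commented defaults above, assuming a local MongoDB and the bundled Mongo pipeline; all values are placeholders:

    # minimal sketch; connection values are placeholders
    MONGO_IP = "localhost"
    MONGO_PORT = 27017
    MONGO_DB = "spider_data"

    ITEM_PIPELINES = ["feapder.pipelines.mongo_pipeline.MongoPipeline"]

    SPIDER_THREAD_COUNT = 4     # crawl concurrency
    SPIDER_SLEEP_TIME = [2, 5]  # random 2-5s delay between downloads
    LOG_LEVEL = "INFO"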

+ 0 - 0
spider_frame/FworkSpider/feapder/templates/project_template/spiders/__init__.py


+ 0 - 88
spider_frame/FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -1,88 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: ${spider_name}
----------
-@author: {USER}
-"""
-import feapder
-from items.spider_item import BidingListItem
-from collections import namedtuple
-
-
-class ${spider_name}(feapder.BiddingListSpider):
-
-    def start_callback(self):
-
-        self.site = ""
-
-        #               --- --- crawl_page 必须存在,且为纯数字(int) --- ---
-        Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
-
-        self.menus = [
-            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
-        ]
-
-        self.headers = {}
-
-    def start_requests(self):
-        for menu in self.menus:
-            start_url = ''
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
-
-    def download_midware(self, request):
-        page = request.page
-        request.headers = self.headers
-
-    def parse(self, request, response):
-
-        menu = request.item
-        info_list = response.xpath('')  # 数据结构为html
-        for info in info_list:
-            href = info.xpath('').extract_first().strip()
-            title = info.xpath('').extract_first().strip()
-            publish_time = info.xpath('').extract_first().strip()
-
-            area = ""  # 省份
-            city = ""  # 城市
-            district = ""  # 区县
-
-            list_item = BidingListItem()     # 存储数据的管道
-            list_item.href = href            # 标书链接
-            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
-            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
-            list_item.title = title                  # 标题
-            list_item.publishtime = publish_time     # 标书发布时间
-            list_item.site = self.site
-            list_item.area = area or "全国"  # 省份 默认:全国
-            list_item.city = city           # 城市 默认 为空
-            list_item.district = district   # 区县 默认 为空
-
-            list_item.unique_key = ('href',)
-            list_item.parse = "self.detail_get"        # 详情页回调方法
-            list_item.deal_detail = ['//div[@class="****"]']  # 抽取正文xpath
-            list_item.proxies = False
-            list_item.parse_url = href                 # 详情页请求地址
-            # list_item.is_delay = 1                   # 延时推送标识
-            # list_item.if_es = 1                      # 查询es标识
-
-            list_item.files = {                       # 附件采集规则
-                "list_xpath": '//div[@class="***"]//a[@href]',
-                "url_xpath": './@href',
-                "name_xpath": './text()',
-                # "file_type":'pdf',                  # 默认的附件类型,用于url中未带附件类型的
-                "url_key": 'http',    # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
-                "host": '',           # 需要拼接url的host
-            }
-
-            yield list_item
-
-        # 翻页
-        request = self.infinite_pages(request, response)
-        yield request
-
-
-if __name__ == "__main__":
-    ${spider_name}(redis_key="detail:normal_details").start()

+ 0 - 108
spider_frame/FworkSpider/feapder/templates/spider_template.tmpl

@@ -1,108 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: selenium抓取列表页无法获取href的信息
----------
-@author: {USER}
-"""
-import feapder
-from items.spider_item import DataBakItem
-from feapder.network.selector import Selector
-from collections import namedtuple
-import time
-
-
-class ${spider_name}(feapder.BiddingListSpider):
-
-    def start_callback(self):
-
-        self.site = ""
-
-        #   --- --- crawl_page 必须存在,且为纯数字(int) --- ---
-        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
-
-        self.menus = [
-            Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', 1),
-        ]
-
-        self.headers = {}
-
-    def start_requests(self):
-        for menu in self.menus:
-            start_url = ''
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
-                                  render=True, render_time=3, proxies=False)
-
-    def download_midware(self, request):
-        page = request.page
-        request.headers = self.headers
-
-    def parse(self, request, response):
-        driver = response.browser
-        menu = request.item
-        info_list = response.xpath('')
-        for info in info_list:
-            # href = info.xpath('').extract_first().strip()
-            title = info.xpath('').extract_first().strip()
-            publish_time = info.xpath('').extract_first().strip()
-
-            area = ""  # 省份
-            city = ""  # 城市
-            district = ""  # 区县
-
-            try:
-                next_page = driver.find_element_by_xpath(f'//a[contains(text(),"{title}")]')
-            except Exception:
-                try:
-                    next_page = driver.find_element_by_xpath(f'//a[contains(text(),"{title[:10]}")]')  # title too long; match on its first 10 characters
-                except Exception:
-                    continue
-
-            driver.execute_script("arguments[0].click();", next_page)  # js点击
-            time.sleep(3)
-
-            # 点击三级页标题后打开新窗口
-            # handles = driver.window_handles
-            # driver.switch_to.window(handles[-1])
-
-            href = driver.current_url
-
-            data_item = DataBakItem()         # 存储数据的管道
-            data_item.href = href             # 标书链接
-            data_item.unique_key = ('title', 'href')  # 去重
-            data_item.channel = menu.get("channel")   # 最上方定义的抓取栏目 (编辑器定的)
-            data_item.spidercode = menu.get("code")   # 最上方定义的爬虫code(编辑器定的)
-            data_item.title = title                   # 标题
-            data_item.publishtime = publish_time      # 标书发布时间
-            data_item.site = self.site
-            data_item.area = area or "全国"  # 省份 默认:全国
-            data_item.city = city           # 城市 默认 为空
-            data_item.district = district   # 区县 默认 为空
-
-            detail_html = Selector(text=driver.page_source)
-            html = ""
-            dx_list = ['//div[@class="***"]', ]
-            for dx in dx_list:
-                html = detail_html.xpath(dx).extract_first()
-                if html:
-                    break
-
-            data_item.contenthtml = html
-
-            # (不同窗口)切换回主窗口
-            # driver.close()
-            # driver.switch_to.window(handles[0])
-
-            driver.back()
-            time.sleep(3)
-
-            yield data_item
-
-        # 翻页
-        request = self.infinite_pages(request, response)
-        yield request
-
-
-if __name__ == "__main__":
-    ${spider_name}(redis_key="{USER}:${spider_name}").start()

+ 0 - 9
spider_frame/FworkSpider/feapder/utils/__init__.py

@@ -1,9 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-Created on 2019/11/5 4:41 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-'''

+ 0 - 63
spider_frame/FworkSpider/feapder/utils/custom_argparse.py

@@ -1,63 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-10-15 14:32:12
----------
-@summary: 封装ArgumentParser, 使其支持function, 调用start自动执行
----------
-@author: Boris
-@email:  boris_liu@foxmail.com
-"""
-
-import argparse
-
-
-class ArgumentParser(argparse.ArgumentParser):
-    def __init__(self, *args, **kwargs):
-        self.functions = {}
-
-        super(ArgumentParser, self).__init__(*args, **kwargs)
-
-    def add_argument(self, *args, **kwargs):
-        function = kwargs.pop("function") if "function" in kwargs else None
-        key = self._get_optional_kwargs(*args, **kwargs).get("dest")
-        self.functions[key] = function
-
-        return super(ArgumentParser, self).add_argument(*args, **kwargs)
-
-    def start(self, args=None, namespace=None):
-        args = self.parse_args(args=args, namespace=namespace)
-        for key, value in vars(args).items():  # vars() returns the object's attributes as a dict
-            if value not in (None, False):
-                if callable(self.functions[key]):
-                    if value is not True:
-                        if isinstance(value, list) and len(value) == 1:
-                            value = value[0]
-                        self.functions[key](value)
-                    else:
-                        self.functions[key]()
-
-    def run(self, args, values=None):
-        if args in self.functions:
-            if values:
-                self.functions[args](values)
-            else:
-                self.functions[args]()
-
-        else:
-            raise Exception(f"无此方法: {args}")
-
-
-if __name__ == "__main__":
-
-    def test():
-        print("test not args func")
-
-    def test2(args):
-        print("test args func", args)
-
-    parser = ArgumentParser(description="测试")
-
-    parser.add_argument("--test2", type=int, nargs=1, help="(1|2)", function=test2)
-    parser.add_argument("--test", action="store_true", help="", function=test)
-
-    parser.start()
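Besides parser.start(), the run() helper above invokes a registered function directly by its dest name, bypassing sys.argv parsing:

    parser.run("test")      # calls test() with no arguments
    parser.run("test2", 1)  # calls test2(1)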

+ 0 - 93
spider_frame/FworkSpider/feapder/utils/email_sender.py

@@ -1,93 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2020/2/19 12:57 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-import os
-import smtplib
-from email.header import Header
-from email.mime.multipart import MIMEMultipart
-from email.mime.text import MIMEText
-from email.utils import formataddr
-
-from feapder.utils.log import log
-
-
-class EmailSender(object):
-    SENDER = "feapder报警系统"
-
-    def __init__(self, username, password, smtpserver="smtp.163.com"):
-        self.username = username
-        self.password = password
-        self.smtpserver = smtpserver
-        self.smtp_client = smtplib.SMTP_SSL(smtpserver)
-        self.sender = EmailSender.SENDER
-
-    def __enter__(self):
-        self.login()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.quit()
-
-    def quit(self):
-        self.smtp_client.quit()
-
-    def login(self):
-        self.smtp_client.connect(self.smtpserver)
-        self.smtp_client.login(self.username, self.password)
-
-    def send(
-        self,
-        receivers: list,
-        title: str,
-        content: str,
-        content_type: str = "plain",
-        filepath: str = None,
-    ):
-        """
-
-        Args:
-            receivers:
-            title:
-            content:
-            content_type: html / plain
-            filepath:
-
-        Returns:
-
-        """
-        # 创建一个带附件的实例
-        message = MIMEMultipart()
-        message["From"] = formataddr(
-            (self.sender, self.username)
-        )  # 括号里的对应发件人邮箱昵称、发件人邮箱账号
-        message["To"] = formataddr((receivers[0], receivers[0]))  # ",".join(receivers)
-
-        message["Subject"] = Header(title, "utf-8")
-
-        content = MIMEText(content, content_type, "utf-8")
-        message.attach(content)
-
-        # 构造附件
-        if filepath:
-            attach = MIMEText(open(filepath, "rb").read(), "base64", "utf-8")
-            attach.add_header(
-                "content-disposition",
-                "attachment",
-                filename=("utf-8", "", os.path.basename(filepath)),
-            )
-            message.attach(attach)
-
-        msg = message.as_string()
-        # 此处直接发送多个邮箱有问题,改成一个个发送
-        for receiver in receivers:
-            log.debug("发送邮件到 {}".format(receiver))
-            self.smtp_client.sendmail(self.username, receiver, msg)
-        log.debug("邮件发送成功!!!")
-        return True
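A hedged usage sketch of the class above; the account, auth code, and recipient are placeholders (SMTP_SSL connect and login happen in __enter__):

    with EmailSender(username="alarm@163.com", password="smtp-auth-code") as sender:
        sender.send(
            receivers=["ops@example.com"],
            title="spider alert",
            content="<b>task failed</b>",
            content_type="html",
        )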

The diff is not shown because the file is too large
+ 0 - 6
spider_frame/FworkSpider/feapder/utils/js/stealth.min.js


+ 0 - 278
spider_frame/FworkSpider/feapder/utils/log.py

@@ -1,278 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-12-08 16:50
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import logging
-import os
-import sys
-from logging.handlers import BaseRotatingHandler
-
-import logstash
-import loguru
-from better_exceptions import format_exception
-
-import feapder.setting as setting
-
-LOG_FORMAT = "%(threadName)s|%(asctime)s|%(filename)s|%(funcName)s|line:%(lineno)d|%(levelname)s| %(message)s"
-PRINT_EXCEPTION_DETAILS = True
-
-
-class InterceptHandler(logging.Handler):
-    def emit(self, record):
-        # Retrieve context where the logging call occurred, this happens to be in the 6th frame upward
-        logger_opt = loguru.logger.opt(depth=6, exception=record.exc_info)
-        logger_opt.log(record.levelname, record.getMessage())
-
-
-# Override RotatingFileHandler to customize rotated log file names
-# before: xxx.log xxx.log.1 xxx.log.2 xxx.log.3 (newest to oldest)
-# now: xxx.log xxx1.log xxx2.log; a 2-digit backup_count zero-pads the index to 01 02 ..., a 3-digit one to 001 002 ... (newest to oldest)
-class RotatingFileHandler(BaseRotatingHandler):
-    def __init__(
-        self, filename, mode="a", max_bytes=0, backup_count=0, encoding=None, delay=0
-    ):
-        BaseRotatingHandler.__init__(self, filename, mode, encoding, delay)
-        self.max_bytes = max_bytes
-        self.backup_count = backup_count
-        self.placeholder = str(len(str(backup_count)))
-
-    def doRollover(self):
-        if self.stream:
-            self.stream.close()
-            self.stream = None
-        if self.backup_count > 0:
-            for i in range(self.backup_count - 1, 0, -1):
-                sfn = ("%0" + self.placeholder + "d.") % i  # '%2d.'%i -> 02
-                sfn = sfn.join(self.baseFilename.split("."))
-                # sfn = "%d_%s" % (i, self.baseFilename)
-                # dfn = "%d_%s" % (i + 1, self.baseFilename)
-                dfn = ("%0" + self.placeholder + "d.") % (i + 1)
-                dfn = dfn.join(self.baseFilename.split("."))
-                if os.path.exists(sfn):
-                    # print "%s -> %s" % (sfn, dfn)
-                    if os.path.exists(dfn):
-                        os.remove(dfn)
-                    os.rename(sfn, dfn)
-            dfn = (("%0" + self.placeholder + "d.") % 1).join(
-                self.baseFilename.split(".")
-            )
-            if os.path.exists(dfn):
-                os.remove(dfn)
-            # Issue 18940: A file may not have been created if delay is True.
-            if os.path.exists(self.baseFilename):
-                os.rename(self.baseFilename, dfn)
-        if not self.delay:
-            self.stream = self._open()
-
-    def shouldRollover(self, record):
-
-        if self.stream is None:  # delay was set...
-            self.stream = self._open()
-        if self.max_bytes > 0:  # are we rolling over?
-            # print('record >>>> ', record)
-            msg = "%s\n" % self.format(record)
-            self.stream.seek(0, 2)  # due to non-posix-compliant Windows feature
-            if self.stream.tell() + len(msg) >= self.max_bytes:
-                return 1
-        return 0
-
-
-def get_logger(
-    name=None,
-    path=None,
-    log_level=None,
-    is_write_to_console=None,
-    is_write_to_file=None,
-    is_send_to_logstash = None,
-    color=None,
-    mode=None,
-    max_bytes=None,
-    backup_count=None,
-    encoding=None,
-):
-    """
-    @summary: 获取log
-    ---------
-    @param name: log名
-    @param path: log文件存储路径 如 D://xxx.log
-    @param log_level: log等级 CRITICAL/ERROR/WARNING/INFO/DEBUG
-    @param is_write_to_console: 是否输出到控制台
-    @param is_write_to_file: 是否写入到文件 默认否
-    @param color:是否有颜色
-    @param mode:写文件模式
-    @param max_bytes: 每个日志文件的最大字节数
-    @param backup_count:日志文件保留数量
-    @param encoding:日志文件编码
-    ---------
-    @result:
-    """
-    # 加载setting里最新的值
-    name = name or setting.LOG_NAME
-    path = path or setting.LOG_PATH
-    log_level = log_level or setting.LOG_LEVEL
-    is_write_to_console = (
-        is_write_to_console
-        if is_write_to_console is not None
-        else setting.LOG_IS_WRITE_TO_CONSOLE
-    )
-    is_write_to_file = (
-        is_write_to_file
-        if is_write_to_file is not None
-        else setting.LOG_IS_WRITE_TO_FILE
-    )
-
-    is_send_to_logstash = (
-        is_send_to_logstash
-        if is_send_to_logstash is not None
-        else setting.LOG_IS_SEND_TO_LOGSTASH
-    )
-
-    color = color if color is not None else setting.LOG_COLOR
-    mode = mode or setting.LOG_MODE
-    max_bytes = max_bytes or setting.LOG_MAX_BYTES
-    backup_count = backup_count or setting.LOG_BACKUP_COUNT
-    encoding = encoding or setting.LOG_ENCODING
-
-    # logger 配置
-    name = name.split(os.sep)[-1].split(".")[0]  # 取文件名
-
-    logger = logging.getLogger(name)
-    logger.setLevel(log_level)
-
-    formatter = logging.Formatter(LOG_FORMAT)
-    if PRINT_EXCEPTION_DETAILS:
-        formatter.formatException = lambda exc_info: format_exception(*exc_info)
-
-    # 定义一个RotatingFileHandler,最多备份5个日志文件,每个日志文件最大10M
-    if is_write_to_file:
-        if path and not os.path.exists(os.path.dirname(path)):
-            os.makedirs(os.path.dirname(path), exist_ok=True)
-
-        rf_handler = RotatingFileHandler(
-            path,
-            mode=mode,
-            max_bytes=max_bytes,
-            backup_count=backup_count,
-            encoding=encoding,
-        )
-        rf_handler.setFormatter(formatter)
-        logger.addHandler(rf_handler)
-
-    if is_send_to_logstash:
-        stash_handler = logstash.TCPLogstashHandler(
-            setting.LOG_STASH_IP, setting.LOG_STASH_PORT, version=1)
-        logger.addHandler(stash_handler)
-
-    if color and is_write_to_console:
-        loguru_handler = InterceptHandler()
-        loguru_handler.setFormatter(formatter)
-        # logging.basicConfig(handlers=[loguru_handler], level=0)
-        logger.addHandler(loguru_handler)
-
-    elif is_write_to_console:
-        stream_handler = logging.StreamHandler()
-        stream_handler.stream = sys.stdout
-        stream_handler.setFormatter(formatter)
-        logger.addHandler(stream_handler)
-
-    _handler_list = []
-    _handler_name_list = []
-    # 检查是否存在重复handler
-    for _handler in logger.handlers:
-        if str(_handler) not in _handler_name_list:
-            _handler_name_list.append(str(_handler))
-            _handler_list.append(_handler)
-    logger.handlers = _handler_list
-    return logger
-
-
-# logging.disable(logging.DEBUG) # 关闭所有log
-
-# 不让打印log的配置
-STOP_LOGS = [
-    # ES
-    "urllib3.response",
-    "urllib3.connection",
-    "elasticsearch.trace",
-    "requests.packages.urllib3.util",
-    "requests.packages.urllib3.util.retry",
-    "urllib3.util",
-    "requests.packages.urllib3.response",
-    "requests.packages.urllib3.contrib.pyopenssl",
-    "requests.packages",
-    "urllib3.util.retry",
-    "requests.packages.urllib3.contrib",
-    "requests.packages.urllib3.connectionpool",
-    "requests.packages.urllib3.poolmanager",
-    "urllib3.connectionpool",
-    "requests.packages.urllib3.connection",
-    "elasticsearch",
-    "log_request_fail",
-    # requests
-    "requests",
-    "selenium.webdriver.remote.remote_connection",
-    "selenium.webdriver.remote",
-    "selenium.webdriver",
-    "selenium",
-    # markdown
-    "MARKDOWN",
-    "build_extension",
-    # newspaper
-    "calculate_area",
-    "largest_image_url",
-    "newspaper.images",
-    "newspaper",
-    "Importing",
-    "PIL",
-]
-
-# 关闭日志打印
-for STOP_LOG in STOP_LOGS:
-    log_level = eval("logging." + setting.OTHERS_LOG_LEVAL)
-    logging.getLogger(STOP_LOG).setLevel(log_level)
-
-# print(logging.Logger.manager.loggerDict) # 取使用debug模块的name
-
-# 日志级别大小关系为:CRITICAL > ERROR > WARNING > INFO > DEBUG
-
-
-class Log:
-    log = None
-
-    def __getattr__(self, name):
-        # 调用log时再初始化,为了加载最新的setting
-        if self.__class__.log is None:
-            self.__class__.log = get_logger()
-        return getattr(self.__class__.log, name)
-
-    @property
-    def debug(self):
-        return self.__class__.log.debug
-
-    @property
-    def info(self):
-        return self.__class__.log.info
-
-    @property
-    def warning(self):
-        return self.__class__.log.warning
-
-    @property
-    def exception(self):
-        return self.__class__.log.exception
-
-    @property
-    def error(self):
-        return self.__class__.log.error
-
-    @property
-    def critical(self):
-        return self.__class__.log.critical
-
-
-log = Log()
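Typical use of the module-level singleton defined above; the underlying logger is created lazily on the first call so it picks up the latest setting values:

    from feapder.utils.log import log

    log.debug("verbose detail")
    log.info("spider started")
    log.error("request failed")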

+ 0 - 539
spider_frame/FworkSpider/feapder/utils/metrics.py

@@ -1,539 +0,0 @@
-import concurrent.futures
-import json
-import os
-import queue
-import random
-import socket
-import threading
-import time
-from collections import Counter
-from typing import Any
-
-from influxdb import InfluxDBClient
-
-from feapder import setting
-from feapder.utils.log import log
-from feapder.utils.tools import aio_wrap, ensure_float, ensure_int
-
-_inited_pid = None
-# this thread should stop running in the forked process
-_executor = concurrent.futures.ThreadPoolExecutor(
-    max_workers=1, thread_name_prefix="metrics"
-)
-
-
-class MetricsEmitter:
-    def __init__(
-        self,
-        influxdb,
-        *,
-        batch_size=10,
-        max_timer_seq=0,
-        emit_interval=10,
-        retention_policy=None,
-        ratio=1.0,
-        debug=False,
-        add_hostname=False,
-        max_points=10240,
-        default_tags=None,
-        time_precision="s",
-    ):
-        """
-        Args:
-            influxdb: influxdb instance
-            batch_size: 打点的批次大小
-            max_timer_seq: 每个时间间隔内最多收集多少个 timer 类型点, 0 表示不限制
-            emit_interval: 最多等待多长时间必须打点
-            retention_policy: 对应的 retention policy
-            ratio: store 和 timer 类型采样率,比如 0.1 表示只有 10% 的点会留下
-            debug: 是否打印调试日志
-            add_hostname: 是否添加 hostname 作为 tag
-            max_points: 本地 buffer 最多累计多少个点
-            time_precision: 打点精度 默认 s
-        """
-        self.pending_points = queue.Queue()
-        self.batch_size = batch_size
-        self.influxdb: InfluxDBClient = influxdb
-        self.tagkv = {}
-        self.max_timer_seq = max_timer_seq
-        self.lock = threading.Lock()
-        self.hostname = socket.gethostname()
-        self.last_emit_ts = time.time()  # 上次提交时间
-        self.emit_interval = emit_interval  # 提交间隔
-        self.max_points = max_points
-        self.retention_policy = retention_policy  # 支持自定义保留策略
-        self.debug = debug
-        self.add_hostname = add_hostname
-        self.ratio = ratio
-        self.default_tags = default_tags or {}
-        self.time_precision = time_precision
-
-    def define_tagkv(self, tagk, tagvs):
-        self.tagkv[tagk] = set(tagvs)
-
-    def _point_tagset(self, p):
-        return f"{p['measurement']}-{sorted(p['tags'].items())}-{p['time']}"
-
-    def _accumulate_points(self, points):
-        """
-        对于处于同一个 key 的点做聚合
-
-          - 对于 counter 类型,同一个 key 的值(_count)可以累加
-          - 对于 store 类型,不做任何操作,influxdb 会自行覆盖
-          - 对于 timer 类型,通过添加一个 _seq 值来区分每个不同的点
-        """
-        counters = {}  # 临时保留 counter 类型的值
-        timer_seqs = Counter()  # 记录不同 key 的 timer 序列号
-        new_points = []
-
-        for point in points:
-            point_type = point["tags"].get("_type", None)
-            tagset = self._point_tagset(point)
-
-            # counter 类型全部聚合,不做丢弃
-            if point_type == "counter":
-                if tagset not in counters:
-                    counters[tagset] = point
-                else:
-                    counters[tagset]["fields"]["_count"] += point["fields"]["_count"]
-            elif point_type == "timer":
-                if self.max_timer_seq and timer_seqs[tagset] > self.max_timer_seq:
-                    continue
-                # 掷一把骰子,如果足够幸运才打点
-                if self.ratio < 1.0 and random.random() > self.ratio:
-                    continue
-                # 增加 _seq tag,以便区分不同的点
-                point["tags"]["_seq"] = timer_seqs[tagset]
-                timer_seqs[tagset] += 1
-                new_points.append(point)
-            else:
-                if self.ratio < 1.0 and random.random() > self.ratio:
-                    continue
-                new_points.append(point)
-
-        # 把累加得到的 counter 值添加进来
-        new_points.extend(counters.values())
-        return new_points
-
-    def _get_ready_emit(self, force=False):
-        """
-        把当前 pending 的值做聚合并返回
-        """
-        if self.debug:
-            log.info("got %s raw points", self.pending_points.qsize())
-
-        # 从 pending 中读取点, 设定一个最大值,避免一直打点,一直获取
-        points = []
-        while len(points) < self.max_points or force:
-            try:
-                points.append(self.pending_points.get_nowait())
-            except queue.Empty:
-                break
-
-        # 聚合点
-        points = self._accumulate_points(points)
-
-        if self.debug:
-            log.info("got %s point", len(points))
-            log.info(json.dumps(points, indent=4))
-
-        return points
-
-    def emit(self, point=None, force=False):
-        """
-        1. 添加新点到 pending
-        2. 如果符合条件,尝试聚合并打点
-        3. 更新打点时间
-
-        :param point:
-        :param force: 强制提交所有点 默认False
-        :return:
-        """
-        if point:
-            self.pending_points.put(point)
-
-        # 判断是否需要提交点 1、数量 2、间隔 3、强力打点
-        if not (
-            force
-            or self.pending_points.qsize() >= self.max_points  # noqa: W503
-            or time.time() - self.last_emit_ts > self.emit_interval  # noqa: W503
-        ):
-            return
-
-        # 需要打点,读取可以打点的值, 确保只有一个线程在做点的压缩
-        with self.lock:
-            points = self._get_ready_emit(force=force)
-
-            if not points:
-                return
-            try:
-                self.influxdb.write_points(
-                    points,
-                    batch_size=self.batch_size,
-                    time_precision=self.time_precision,
-                    retention_policy=self.retention_policy,
-                )
-            except Exception:
-                log.exception("error writing points")
-
-            self.last_emit_ts = time.time()
-
-    def flush(self):
-        if self.debug:
-            log.info("start draining points %s", self.pending_points.qsize())
-        self.emit(force=True)
-
-    def close(self):
-        self.flush()
-        try:
-            self.influxdb.close()
-        except Exception as e:
-            log.exception(e)
-
-    def make_point(self, measurement, tags: dict, fields: dict, timestamp=None):
-        """
-        默认的时间戳是"秒"级别的
-        """
-        assert measurement, "measurement can't be null"
-        tags = tags.copy() if tags else {}
-        tags.update(self.default_tags)
-        fields = fields.copy() if fields else {}
-        if timestamp is None:
-            timestamp = int(time.time())
-        # 支持自定义hostname
-        if self.add_hostname and "hostname" not in tags:
-            tags["hostname"] = self.hostname
-        point = dict(measurement=measurement, tags=tags, fields=fields, time=timestamp)
-        if self.tagkv:
-            for tagk, tagv in tags.items():
-                if tagv not in self.tagkv[tagk]:
-                    raise ValueError("tag value = %s not in %s", tagv, self.tagkv[tagk])
-        return point
-
-    def get_counter_point(
-        self,
-        measurement: str,
-        key: str = None,
-        count: int = 1,
-        tags: dict = None,
-        timestamp: int = None,
-    ):
-        """
-        counter 不能被覆盖
-        """
-        tags = tags.copy() if tags else {}
-        if key is not None:
-            tags["_key"] = key
-        tags["_type"] = "counter"
-        count = ensure_int(count)
-        fields = dict(_count=count)
-        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
-        return point
-
-    def get_store_point(
-        self,
-        measurement: str,
-        key: str = None,
-        value: Any = 0,
-        tags: dict = None,
-        timestamp=None,
-    ):
-        tags = tags.copy() if tags else {}
-        if key is not None:
-            tags["_key"] = key
-        tags["_type"] = "store"
-        fields = dict(_value=value)
-        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
-        return point
-
-    def get_timer_point(
-        self,
-        measurement: str,
-        key: str = None,
-        duration: float = 0,
-        tags: dict = None,
-        timestamp=None,
-    ):
-        tags = tags.copy() if tags else {}
-        if key is not None:
-            tags["_key"] = key
-        tags["_type"] = "timer"
-        fields = dict(_duration=ensure_float(duration))
-        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
-        return point
-
-    def emit_any(self, *args, **kwargs):
-        point = self.make_point(*args, **kwargs)
-        self.emit(point)
-
-    def emit_counter(self, *args, **kwargs):
-        point = self.get_counter_point(*args, **kwargs)
-        self.emit(point)
-
-    def emit_store(self, *args, **kwargs):
-        point = self.get_store_point(*args, **kwargs)
-        self.emit(point)
-
-    def emit_timer(self, *args, **kwargs):
-        point = self.get_timer_point(*args, **kwargs)
-        self.emit(point)
-
-
-_emitter: MetricsEmitter = None
-_measurement: str = None
-
-
-def init(
-    *,
-    influxdb_host=None,
-    influxdb_port=None,
-    influxdb_udp_port=None,
-    influxdb_database=None,
-    influxdb_user=None,
-    influxdb_password=None,
-    influxdb_measurement=None,
-    retention_policy=None,
-    retention_policy_duration="180d",
-    emit_interval=60,
-    batch_size=10,
-    debug=False,
-    use_udp=False,
-    timeout=10,
-    time_precision="s",
-    **kwargs,
-):
-    """
-    打点监控初始化
-    Args:
-        influxdb_host:
-        influxdb_port:
-        influxdb_udp_port:
-        influxdb_database:
-        influxdb_user:
-        influxdb_password:
-        influxdb_measurement: 存储的表,也可以在打点的时候指定
-        retention_policy: 保留策略
-        retention_policy_duration: 保留策略过期时间
-        emit_interval: 打点最大间隔
-        batch_size: 打点的批次大小
-        debug: 是否开启调试
-        use_udp: 是否使用udp协议打点
-        timeout: 与influxdb建立连接时的超时时间
-        time_precision: 打点精度 默认秒
-        **kwargs: 可传递MetricsEmitter类的参数
-
-    Returns:
-
-    """
-    global _inited_pid, _emitter, _measurement
-    if _inited_pid == os.getpid():
-        return
-
-    influxdb_host = influxdb_host or setting.INFLUXDB_HOST
-    influxdb_port = influxdb_port or setting.INFLUXDB_PORT
-    influxdb_udp_port = influxdb_udp_port or setting.INFLUXDB_UDP_PORT
-    influxdb_database = influxdb_database or setting.INFLUXDB_DATABASE
-    influxdb_user = influxdb_user or setting.INFLUXDB_USER
-    influxdb_password = influxdb_password or setting.INFLUXDB_PASSWORD
-    _measurement = influxdb_measurement or setting.INFLUXDB_MEASUREMENT
-    retention_policy = (
-        retention_policy or f"{influxdb_database}_{retention_policy_duration}"
-    )
-
-    if not all(
-        [
-            influxdb_host,
-            influxdb_port,
-            influxdb_udp_port,
-            influxdb_database,
-            influxdb_user,
-            influxdb_password,
-        ]
-    ):
-        return
-
-    influxdb_client = InfluxDBClient(
-        host=influxdb_host,
-        port=influxdb_port,
-        udp_port=influxdb_udp_port,
-        database=influxdb_database,
-        use_udp=use_udp,
-        timeout=timeout,
-        username=influxdb_user,
-        password=influxdb_password,
-    )
-    # 创建数据库
-    if influxdb_database:
-        try:
-            influxdb_client.create_database(influxdb_database)
-            influxdb_client.create_retention_policy(
-                retention_policy,
-                retention_policy_duration,
-                replication="1",
-                default=True,
-            )
-        except Exception as e:
-            log.error("metrics init falied: {}".format(e))
-            return
-
-    _emitter = MetricsEmitter(
-        influxdb_client,
-        debug=debug,
-        batch_size=batch_size,
-        time_precision=time_precision,
-        retention_policy=retention_policy,
-        emit_interval=emit_interval,
-        **kwargs,
-    )
-    _inited_pid = os.getpid()
-    log.info("metrics init successfully")
-
-
-def emit_any(
-    tags: dict,
-    fields: dict,
-    *,
-    classify: str = "",
-    measurement: str = None,
-    timestamp=None,
-):
-    """
-    原生的打点,不进行额外的处理
-    Args:
-        tags: influxdb的tag的字段和值
-        fields: influxdb的field的字段和值
-        classify: 点的类别
-        measurement: 存储的表
-        timestamp: 点的时间戳,默认为当前时间
-
-    Returns:
-
-    """
-    if not _emitter:
-        return
-
-    tags = tags or {}
-    tags["_classify"] = classify
-    measurement = measurement or _measurement
-    _emitter.emit_any(measurement, tags, fields, timestamp)
-
-
-def emit_counter(
-    key: str = None,
-    count: int = 1,
-    *,
-    classify: str = "",
-    tags: dict = None,
-    measurement: str = None,
-    timestamp: int = None,
-):
-    """
-    聚合打点,即会将一段时间内的点求和,然后打一个点数和
-    Args:
-        key: 与点绑定的key值
-        count: 点数
-        classify: 点的类别
-        tags: influxdb的tag的字段和值
-        measurement: 存储的表
-        timestamp: 点的时间戳,默认为当前时间
-
-    Returns:
-
-    """
-    if not _emitter:
-        return
-
-    tags = tags or {}
-    tags["_classify"] = classify
-    measurement = measurement or _measurement
-    _emitter.emit_counter(measurement, key, count, tags, timestamp)
-
-
-def emit_timer(
-    key: str = None,
-    duration: float = 0,
-    *,
-    classify: str = "",
-    tags: dict = None,
-    measurement: str = None,
-    timestamp=None,
-):
-    """
-    时间打点,用于监控程序的运行时长等,每个duration一个点,不会被覆盖
-    Args:
-        key: 与点绑定的key值
-        duration: 时长
-        classify: 点的类别
-        tags: influxdb的tag的字段和值
-        measurement: 存储的表
-        timestamp: 点的时间戳,默认为当前时间
-
-    Returns:
-
-    """
-    if not _emitter:
-        return
-
-    tags = tags or {}
-    tags["_classify"] = classify
-    measurement = measurement or _measurement
-    _emitter.emit_timer(measurement, key, duration, tags, timestamp)
-
-
-def emit_store(
-    key: str = None,
-    value: Any = 0,
-    *,
-    classify: str = "",
-    tags: dict = None,
-    measurement: str = None,
-    timestamp=None,
-):
-    """
-    直接打点,不进行额外的处理
-    Args:
-        key: 与点绑定的key值
-        value: 点的值
-        classify: 点的类别
-        tags: influxdb的tag的字段和值
-        measurement: 存储的表
-        timestamp: 点的时间戳,默认为当前时间
-
-    Returns:
-
-    """
-    if not _emitter:
-        return
-
-    tags = tags or {}
-    tags["_classify"] = classify
-    measurement = measurement or _measurement
-    _emitter.emit_store(measurement, key, value, tags, timestamp)
-
-
-def flush():
-    """
-    强刷点到influxdb
-    Returns:
-
-    """
-    if not _emitter:
-        return
-    _emitter.flush()
-
-
-def close():
-    """
-    关闭
-    Returns:
-
-    """
-    if not _emitter:
-        return
-    _emitter.close()
-
-
-# 协程打点
-aemit_counter = aio_wrap(executor=_executor)(emit_counter)
-aemit_store = aio_wrap(executor=_executor)(emit_store)
-aemit_timer = aio_wrap(executor=_executor)(emit_timer)
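A hedged usage sketch of the module above; the InfluxDB connection values are placeholders, and init() silently returns unless host, ports, database, user, and password are all configured (unset arguments fall back to feapder.setting.INFLUXDB_*):

    from feapder.utils import metrics

    metrics.init(influxdb_host="127.0.0.1", influxdb_database="spider_metrics")
    metrics.emit_counter("items_saved", count=10, classify="pipeline")  # summed per interval
    metrics.emit_timer("parse", duration=0.35, classify="spider")       # one point per duration
    metrics.flush()  # force-write pending points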

+ 0 - 94
spider_frame/FworkSpider/feapder/utils/perfect_dict.py

@@ -1,94 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2021/4/8 11:32 上午
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-
-
-def ensure_value(value):
-    if isinstance(value, (list, tuple)):
-        _value = []
-        for v in value:
-            _value.append(ensure_value(v))
-
-        if isinstance(value, tuple):
-            value = tuple(_value)
-        else:
-            value = _value
-
-    if isinstance(value, dict):
-        return PerfectDict(value)
-    else:
-        return value
-
-
-class PerfectDict(dict):
-    """
-    >>> data = PerfectDict({"id":1, "url":"xxx"})
-    >>> data
-    {'id': 1, 'url': 'xxx'}
-    >>> data = PerfectDict(id=1, url="xxx")
-    >>> data
-    {'id': 1, 'url': 'xxx'}
-    >>> data.id
-    1
-    >>> data.get("id")
-    1
-    >>> data["id"]
-    1
-    >>> id, url = data
-    >>> id
-    1
-    >>> url
-    'xxx'
-    >>> data[0]
-    1
-    >>> data[1]
-    'xxx'
-    >>> data = PerfectDict({"a": 1, "b": {"b1": 2}, "c": [{"c1": [{"d": 1}]}]})
-    >>> data.b.b1
-    2
-    >>> data[1].b1
-    2
-    >>> data.get("b").b1
-    2
-    >>> data.c[0].c1
-    [{'d': 1}]
-    >>> data.c[0].c1[0]
-    {'d': 1}
-    """
-
-    def __init__(self, _dict: dict = None, _values: list = None, **kwargs):
-        self.__dict__ = _dict or kwargs or {}
-        self.__dict__.pop("__values__", None)
-        super().__init__(self.__dict__, **kwargs)
-        self.__values__ = _values or list(self.__dict__.values())
-
-    def __getitem__(self, key):
-        if isinstance(key, int):
-            value = self.__values__[key]
-        else:
-            value = self.__dict__[key]
-
-        return ensure_value(value)
-
-    def __iter__(self, *args, **kwargs):
-        for value in self.__values__:
-            yield ensure_value(value)
-
-    def __getattribute__(self, item):
-        value = object.__getattribute__(self, item)
-        if item == "__dict__" or item == "__values__":
-            return value
-        return ensure_value(value)
-
-    def get(self, key, default=None):
-        if key in self.__dict__:
-            value = self.__dict__[key]
-            return ensure_value(value)
-
-        return default

+ 0 - 121
spider_frame/FworkSpider/feapder/utils/redis_lock.py

@@ -1,121 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2019/11/5 5:25 PM
----------
-@summary:
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import threading
-import time
-
-from feapder.db.redisdb import RedisDB
-from feapder.utils.log import log
-
-
-class RedisLock:
-    redis_cli = None
-
-    def __init__(
-        self, key, *, wait_timeout=0, lock_timeout=86400, redis_cli=None, redis_url=None
-    ):
-        """
-        redis超时锁
-        :param key: 存储锁的key redis_lock:[key]
-        :param wait_timeout: 等待加锁超时时间,为0时则不等待加锁,加锁失败
-        :param lock_timeout: 锁超时时间 为0时则不会超时,直到锁释放或意外退出,默认超时为1天
-        :param redis_cli: redis客户端对象
-        :param redis_url: redis连接地址,若redis_cli传值,则不使用redis_url
-
-        用法示例:
-        with RedisLock(key="test") as _lock:
-            if _lock.locked:
-                # 用来判断是否加上了锁
-                # do somethings
-        """
-        self.redis_conn = redis_cli
-        self.redis_url = redis_url
-        self.lock_key = "redis_lock:{}".format(key)
-        # 锁超时时间
-        self.lock_timeout = lock_timeout
-        # 等待加锁时间
-        self.wait_timeout = wait_timeout
-        self.locked = False
-        self.stop_prolong_life = False
-
-    @property
-    def redis_conn(self):
-        if not self.__class__.redis_cli:
-            self.__class__.redis_cli = RedisDB(url=self.redis_url).get_redis_obj()
-
-        return self.__class__.redis_cli
-
-    @redis_conn.setter
-    def redis_conn(self, cli):
-        if cli:
-            self.__class__.redis_cli = cli
-
-    def __enter__(self):
-        if not self.locked:
-            self.acquire()
-            if self.locked:
-                # 延长锁的时间
-                thread = threading.Thread(target=self.prolong_life)
-                thread.daemon = True
-                thread.start()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.stop_prolong_life = True
-        self.release()
-
-    def __repr__(self):
-        return "<RedisLock: {} >".format(self.lock_key)
-
-    def acquire(self):
-        start = time.time()
-        while True:
-            # 尝试加锁
-            if self.redis_conn.set(self.lock_key, time.time(), nx=True, ex=5):
-                self.locked = True
-                break
-
-            if self.wait_timeout > 0:
-                if time.time() - start > self.wait_timeout:
-                    log.info("加锁失败")
-                    break
-            else:
-                break
-            log.debug("等待加锁: {} wait:{}".format(self, time.time() - start))
-            if self.wait_timeout > 10:
-                time.sleep(5)
-            else:
-                time.sleep(1)
-        return
-
-    def release(self):
-        if self.locked:
-            self.redis_conn.delete(self.lock_key)
-            self.locked = False
-        return
-
-    def prolong_life(self):
-        """
-        延长锁的过期时间
-        :return:
-        """
-
-        spend_time = 0
-        while not self.stop_prolong_life:
-            expire = self.redis_conn.ttl(self.lock_key)
-            if expire < 0:  # key 不存在
-                time.sleep(1)
-                continue
-            self.redis_conn.expire(self.lock_key, expire + 5)  # 延长5秒
-            time.sleep(expire)  # 临过期5秒前,再次延长
-            spend_time += expire
-            if self.lock_timeout and spend_time > self.lock_timeout:
-                log.info("锁超时,释放")
-                self.redis_conn.delete(self.lock_key)
-                break

+ 0 - 2683
spider_frame/FworkSpider/feapder/utils/tools.py

@@ -1,2683 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2018-09-06 14:21
----------
-@summary: 工具
----------
-@author: Boris
-@email: boris_liu@foxmail.com
-"""
-import asyncio
-import calendar
-import codecs
-import configparser  # 读配置文件的
-import datetime
-import functools
-import hashlib
-import html
-import importlib
-import inspect
-import json
-import os
-import pickle
-import random
-import re
-import socket
-import ssl
-import string
-import sys
-import time
-import traceback
-import urllib
-import urllib.parse
-import uuid
-import weakref
-from functools import partial, wraps
-from hashlib import md5
-from pprint import pformat
-from pprint import pprint
-from urllib import request
-from urllib.parse import urljoin
-
-import bson
-import redis
-import requests
-import six
-from requests.cookies import RequestsCookieJar
-from w3lib.url import canonicalize_url as _canonicalize_url
-
-import feapder.setting as setting
-from feapder.db.redisdb import RedisDB
-from feapder.utils.email_sender import EmailSender
-from feapder.utils.log import log
-
-try:
-    import execjs  # pip install PyExecJS
-except Exception as e:
-    pass
-
-os.environ["EXECJS_RUNTIME"] = "Node"  # 设置使用node执行js
-
-# 全局取消ssl证书验证
-ssl._create_default_https_context = ssl._create_unverified_context
-
-TIME_OUT = 30
-TIMER_TIME = 5
-
-redisdb = None
-
-
-def get_redisdb():
-    global redisdb
-    if not redisdb:
-        redisdb = RedisDB()
-    return redisdb
-
-
-# 装饰器
-class Singleton(object):
-    def __init__(self, cls):
-        self._cls = cls
-        self._instance = {}
-
-    def __call__(self, *args, **kwargs):
-        if self._cls not in self._instance:
-            self._instance[self._cls] = self._cls(*args, **kwargs)
-        return self._instance[self._cls]
-
-
-def log_function_time(func):
-    try:
-
-        @functools.wraps(func)  # copy the original function's attributes onto the wrapper
-        def calculate_time(*args, **kw):
-            began_time = time.time()
-            callfunc = func(*args, **kw)
-            end_time = time.time()
-            log.debug(func.__name__ + " run time  = " + str(end_time - began_time))
-            return callfunc
-
-        return calculate_time
-    except:
-        log.debug("求取时间无效 因为函数参数不符")
-        return func
-
-
-def run_safe_model(module_name):
-    def inner_run_safe_model(func):
-        try:
-
-            @functools.wraps(func)  # copy the original function's attributes onto the wrapper
-            def run_func(*args, **kw):
-                callfunc = None
-                try:
-                    callfunc = func(*args, **kw)
-                except Exception as e:
-                    log.error(module_name + ": " + func.__name__ + " - " + str(e))
-                    traceback.print_exc()
-                return callfunc
-
-            return run_func
-        except Exception as e:
-            log.error(module_name + ": " + func.__name__ + " - " + str(e))
-            traceback.print_exc()
-            return func
-
-    return inner_run_safe_model
-
-
-def memoizemethod_noargs(method):
-    """Decorator to cache the result of a method (without arguments) using a
-    weak reference to its object
-    """
-    cache = weakref.WeakKeyDictionary()
-
-    @functools.wraps(method)
-    def new_method(self, *args, **kwargs):
-        if self not in cache:
-            cache[self] = method(self, *args, **kwargs)
-        return cache[self]
-
-    return new_method
-
-
-########################【网页解析相关】###############################
-
-
-# @log_function_time
-def get_html_by_requests(
-    url, headers=None, code="utf-8", data=None, proxies={}, with_response=False
-):
-    html = ""
-    r = None
-    try:
-        if data:
-            r = requests.post(
-                url, headers=headers, timeout=TIME_OUT, data=data, proxies=proxies
-            )
-        else:
-            r = requests.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies)
-
-        if code:
-            r.encoding = code
-        html = r.text
-
-    except Exception as e:
-        log.error(e)
-    finally:
-        r and r.close()
-
-    if with_response:
-        return html, r
-    else:
-        return html
-
-
-def get_json_by_requests(
-    url,
-    params=None,
-    headers=None,
-    data=None,
-    proxies={},
-    with_response=False,
-    cookies=None,
-):
-    json = {}
-    response = None
-    try:
-        # response = requests.get(url, params = params)
-        if data:
-            response = requests.post(
-                url,
-                headers=headers,
-                data=data,
-                params=params,
-                timeout=TIME_OUT,
-                proxies=proxies,
-                cookies=cookies,
-            )
-        else:
-            response = requests.get(
-                url,
-                headers=headers,
-                params=params,
-                timeout=TIME_OUT,
-                proxies=proxies,
-                cookies=cookies,
-            )
-        response.encoding = "utf-8"
-        json = response.json()
-    except Exception as e:
-        log.error(e)
-    finally:
-        response and response.close()
-
-    if with_response:
-        return json, response
-    else:
-        return json
-
-
-def get_cookies(response):
-    cookies = requests.utils.dict_from_cookiejar(response.cookies)
-    return cookies
-
-
-def get_cookies_from_str(cookie_str):
-    """
-    >>> get_cookies_from_str("key=value; key2=value2; key3=; key4=; ")
-    {'key': 'value', 'key2': 'value2', 'key3': '', 'key4': ''}
-
-    Args:
-        cookie_str: key=value; key2=value2; key3=; key4=
-
-    Returns:
-
-    """
-    cookies = {}
-    for cookie in cookie_str.split(";"):
-        cookie = cookie.strip()
-        if not cookie:
-            continue
-        key, value = cookie.split("=", 1)
-        key = key.strip()
-        value = value.strip()
-        cookies[key] = value
-
-    return cookies
-
-
-def get_cookies_jar(cookies):
-    """
-    @summary: 适用于selenium生成的cookies转requests的cookies
-    requests.get(xxx, cookies=jar)
-    参考:https://www.cnblogs.com/small-bud/p/9064674.html
-
-    ---------
-    @param cookies: [{},{}]
-    ---------
-    @result: cookie jar
-    """
-
-    cookie_jar = RequestsCookieJar()
-    for cookie in cookies:
-        cookie_jar.set(cookie["name"], cookie["value"])
-
-    return cookie_jar
-
-
-def get_cookies_from_selenium_cookie(cookies):
-    """
-    @summary: 适用于selenium生成的cookies转requests的cookies
-    requests.get(xxx, cookies=jar)
-    参考:https://www.cnblogs.com/small-bud/p/9064674.html
-
-    ---------
-    @param cookies: [{},{}]
-    ---------
-    @result: cookie jar
-    """
-
-    cookie_dict = {}
-    for cookie in cookies:
-        if cookie.get("name"):
-            cookie_dict[cookie["name"]] = cookie["value"]
-
-    return cookie_dict
-
-
-def cookiesjar2str(cookies):
-    str_cookie = ""
-    for k, v in requests.utils.dict_from_cookiejar(cookies).items():
-        str_cookie += k
-        str_cookie += "="
-        str_cookie += v
-        str_cookie += "; "
-    return str_cookie
-
-
-def cookies2str(cookies):
-    str_cookie = ""
-    for k, v in cookies.items():
-        str_cookie += k
-        str_cookie += "="
-        str_cookie += v
-        str_cookie += "; "
-    return str_cookie
-
-
-def get_urls(
-    html,
-    stop_urls=(
-        "javascript",
-        "+",
-        ".css",
-        ".js",
-        ".rar",
-        ".xls",
-        ".exe",
-        ".apk",
-        ".doc",
-        ".jpg",
-        ".png",
-        ".flv",
-        ".mp4",
-    ),
-):
-    # 不匹配javascript、 +、 # 这样的url
-    regex = r'<a.*?href.*?=.*?["|\'](.*?)["|\']'
-
-    urls = get_info(html, regex)
-    urls = sorted(set(urls), key=urls.index)
-    if stop_urls:
-        stop_urls = isinstance(stop_urls, str) and [stop_urls] or stop_urls
-        use_urls = []
-        for url in urls:
-            for stop_url in stop_urls:
-                if stop_url in url:
-                    break
-            else:
-                use_urls.append(url)
-
-        urls = use_urls
-    return urls
-
-
-def get_full_url(root_url, sub_url):
-    """
-    @summary: 得到完整的ur
-    ---------
-    @param root_url: 根url (网页的url)
-    @param sub_url:  子url (带有相对路径的 可以拼接成完整的)
-    ---------
-    @result: 返回完整的url
-    """
-
-    return urljoin(root_url, sub_url)
-
-
-def joint_url(url, params):
-    # param_str = "?"
-    # for key, value in params.items():
-    #     value = isinstance(value, str) and value or str(value)
-    #     param_str += key + "=" + value + "&"
-    #
-    # return url + param_str[:-1]
-
-    if not params:
-        return url
-
-    params = urlencode(params)
-    separator = "?" if "?" not in url else "&"
-    return url + separator + params
-
-
-def canonicalize_url(url):
-    """
-    url 归一化 会参数排序 及去掉锚点
-    """
-    return _canonicalize_url(url)
-
-
-def get_url_md5(url):
-    url = canonicalize_url(url)
-    url = re.sub("^http://", "https://", url)
-    return get_md5(url)
-
-
-def fit_url(urls, identis):
-    identis = isinstance(identis, str) and [identis] or identis
-    fit_urls = []
-    for link in urls:
-        for identi in identis:
-            if identi in link:
-                fit_urls.append(link)
-    return list(set(fit_urls))
-
-
-def get_param(url, key):
-    params = url.split("?")[-1].split("&")
-    for param in params:
-        key_value = param.split("=", 1)
-        if key == key_value[0]:
-            return key_value[1]
-    return None
-
-
-def urlencode(params):
-    """
-    Convert dict params to a query string
-    @param params:
-    {
-        'a': 1,
-        'b': 2
-    }
-    @return: a=1&b=2
-    """
-    return urllib.parse.urlencode(params)
-
-
-def urldecode(url):
-    """
-    Convert a query string into a dict
-    @param url: xxx?a=1&b=2
-    @return:
-    {
-        'a': 1,
-        'b': 2
-    }
-    """
-    params_json = {}
-    params = url.split("?")[-1].split("&")
-    for param in params:
-        key, value = param.split("=")
-        params_json[key] = unquote_url(value)
-
-    return params_json
-
-
-def unquote_url(url, encoding="utf-8"):
-    """
-    @summary: Decode a percent-encoded url
-    ---------
-    @param url:
-    ---------
-    @result:
-    """
-
-    return urllib.parse.unquote(url, encoding=encoding)
-
-
-def quote_url(url, encoding="utf-8"):
-    """
-    @summary: Percent-encode a url; see http://www.w3school.com.cn/tags/html_ref_urlencode.html
-    ---------
-    @param url:
-    ---------
-    @result:
-    """
-
-    return urllib.parse.quote(url, safe="%;/?:@&=+$,", encoding=encoding)
-
-
-def quote_chinese_word(text, encoding="utf-8"):
-    def quote_chinese_word_func(text):
-        chinese_word = text.group(0)
-        return urllib.parse.quote(chinese_word, encoding=encoding)
-
-    return re.sub("([\u4e00-\u9fa5]+)", quote_chinese_word_func, text, flags=re.S)
-
-
-def unescape(text):
-    """
-    Unescape HTML entities
-    """
-    return html.unescape(text)
-
-
-def excape(text):
-    """
-    Escape HTML entities (the name `excape` is kept for backward compatibility)
-    """
-    return html.escape(text)
-
-
-_regexs = {}
-
-
-# @log_function_time
-def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None):
-    regexs = isinstance(regexs, str) and [regexs] or regexs
-
-    infos = []
-    for regex in regexs:
-        if regex == "":
-            continue
-
-        if regex not in _regexs.keys():
-            _regexs[regex] = re.compile(regex, re.S)
-
-        if fetch_one:
-            infos = _regexs[regex].search(html)
-            if infos:
-                infos = infos.groups()
-            else:
-                continue
-        else:
-            infos = _regexs[regex].findall(str(html))
-
-        if len(infos) > 0:
-            break
-
-    if fetch_one:
-        infos = infos if infos else ("",)
-        return infos if len(infos) > 1 else infos[0]
-    else:
-        infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
-        infos = split.join(infos) if split else infos
-        return infos
-
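-
-# Usage sketch (illustrative): get_info tries each regex until one matches;
-# with fetch_one=True it returns the first captured group instead of a list.
-def _example_get_info():
-    page = '<span class="price">12.5</span>'
-    assert get_info(page, 'class="price">(.*?)<') == ["12.5"]
-    assert get_info(page, 'class="price">(.*?)<', fetch_one=True) == "12.5"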
-
-def table_json(table, save_one_blank=True):
-    """
-    Convert a table to json; suited to tables that keep key:value pairs in one row
-    @param table: a selector (wrapped with xpath support)
-    @param save_one_blank: keep a single whitespace
-    @return:
-    """
-    data = {}
-
-    trs = table.xpath(".//tr")
-    for tr in trs:
-        tds = tr.xpath("./td|./th")
-
-        for i in range(0, len(tds), 2):
-            if i + 1 > len(tds) - 1:
-                break
-
-            key = tds[i].xpath("string(.)").extract_first(default="").strip()
-            value = tds[i + 1].xpath("string(.)").extract_first(default="").strip()
-            value = replace_str(value, "[\f\n\r\t\v]", "")
-            value = replace_str(value, " +", " " if save_one_blank else "")
-
-            if key:
-                data[key] = value
-
-    return data
-
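-
-# Usage sketch (illustrative; assumes the parsel-style Selector used by the
-# framework): one key/value row becomes one dict entry.
-def _example_table_json():
-    from parsel import Selector
-    sel = Selector("<table><tr><td>name</td><td>demo</td></tr></table>")
-    return table_json(sel.xpath("//table")[0])  # {'name': 'demo'}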
-
-def get_table_row_data(table):
-    """
-    Get the data of every row in a table
-    @param table: a selector (wrapped with xpath support)
-    @return: [[],[]..]
-    """
-
-    datas = []
-    rows = table.xpath(".//tr")
-    for row in rows:
-        cols = row.xpath("./td|./th")
-        row_datas = []
-        for col in cols:
-            data = col.xpath("string(.)").extract_first(default="").strip()
-            row_datas.append(data)
-        datas.append(row_datas)
-
-    return datas
-
-
-def rows2json(rows, keys=None):
-    """
-    Convert row data to json
-    @param rows: data of each row
-    @param keys: json keys; when empty, the first row of rows is used as keys
-    @return:
-    """
-    data_start_pos = 0 if keys else 1
-    datas = []
-    keys = keys or rows[0]
-    for values in rows[data_start_pos:]:
-        datas.append(dict(zip(keys, values)))
-
-    return datas
-
-
-def get_form_data(form):
-    """
-    Extract the data a form would submit
-    :param form: a selector (wrapped with xpath support)
-    :return:
-    """
-    data = {}
-    inputs = form.xpath(".//input")
-    for input_tag in inputs:
-        name = input_tag.xpath("./@name").extract_first()
-        value = input_tag.xpath("./@value").extract_first()
-        if name:
-            data[name] = value
-
-    return data
-
-
-def get_domain(url):
-    return urllib.parse.urlparse(url).netloc
-
-
-def get_index_url(url):
-    return "/".join(url.split("/")[:3])
-
-
-def get_ip(domain):
-    ip = socket.getaddrinfo(domain, "http")[0][4][0]
-    return ip
-
-
-def get_localhost_ip():
-    """
-    Implemented via UDP: build a UDP socket and read the local IP back from it
-    after a connect. No packet is actually sent out, so packet-capture tools
-    will not see anything.
-    :return:
-    """
-    s = None
-    ip = ""
-    try:
-        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-        s.connect(("8.8.8.8", 80))
-        ip = s.getsockname()[0]
-    finally:
-        if s:
-            s.close()
-
-    return ip
-
-
-def ip_to_num(ip):
-    import struct
-
-    ip_num = socket.ntohl(struct.unpack("I", socket.inet_aton(str(ip)))[0])
-    return ip_num
-
-
-def is_valid_proxy(proxy, check_url=None):
-    """
-    Check whether a proxy works
-    @param proxy: xxx.xxx.xxx:xxx
-    @param check_url: url of a target site used for the check. Defaults to None,
-        in which case a socket check against the proxy server is used, which
-        cannot rule out "Connection closed by foreign host"
-    @return: True / False
-    """
-    is_valid = False
-
-    if check_url:
-        proxies = {"http": f"http://{proxy}", "https": f"https://{proxy}"}
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
-        }
-        response = None
-        try:
-            response = requests.get(
-                check_url, headers=headers, proxies=proxies, stream=True, timeout=20
-            )
-            is_valid = True
-
-        except Exception as e:
-            log.error("check proxy failed: {} {}".format(e, proxy))
-
-        finally:
-            if response:
-                response.close()
-
-    else:
-        ip, port = proxy.split(":")
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
-            sk.settimeout(7)
-            try:
-                sk.connect((ip, int(port)))  # check whether the proxy server is up
-                is_valid = True
-
-            except Exception as e:
-                log.error("check proxy failed: {} {}:{}".format(e, ip, port))
-
-    return is_valid
-
-
-def is_valid_url(url):
-    """
-    Check whether a url is well-formed
-    :param url:
-    :return:
-    """
-    if re.match(r"(^https?:/{2}\w.+$)|(ftp://)", url):
-        return True
-    else:
-        return False
-
-
-def get_text(soup, *args):
-    try:
-        return soup.get_text()
-    except Exception as e:
-        log.error(e)
-        return ""
-
-
-def del_html_tag(content, except_line_break=False, save_img=False, white_replaced=""):
-    """
-    Strip html tags
-    @param content: html content
-    @param except_line_break: keep p tags
-    @param save_img: keep images
-    @param white_replaced: replacement for whitespace
-    @return:
-    """
-    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?i) = case-insensitive
-    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
-    content = replace_str(content, "<!--(.|\n)*?-->")
-    content = replace_str(
-        content, "(?!&[a-z]+=)&[a-z]+;?"
-    )  # drop useless entities such as &nbsp, but keep &xxx= style parameters
-    if except_line_break:
-        content = content.replace("</p>", "/p")
-        content = replace_str(content, "<[^p].*?>")
-        content = content.replace("/p", "</p>")
-        content = replace_str(content, "[ \f\r\t\v]")
-
-    elif save_img:
-        content = replace_str(content, "(?!<img.+?>)<.+?>")  # strip every tag except images
-        content = replace_str(content, "(?! +)\s+", "\n")  # keep plain spaces
-        content = content.strip()
-
-    else:
-        content = replace_str(content, "<(.|\n)*?>")
-        content = replace_str(content, "\s", white_replaced)
-        content = content.strip()
-
-    return content
-
-
-def del_html_js_css(content):
-    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?i) = case-insensitive
-    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
-    content = replace_str(content, "<!--(.|\n)*?-->")
-
-    return content
-
-
-def is_have_chinese(content):
-    regex = "[\u4e00-\u9fa5]+"
-    chinese_word = get_info(content, regex)
-    return chinese_word and True or False
-
-
-def is_have_english(content):
-    regex = "[a-zA-Z]+"
-    english_words = get_info(content, regex)
-    return english_words and True or False
-
-
-def get_chinese_word(content):
-    regex = "[\u4e00-\u9fa5]+"
-    chinese_word = get_info(content, regex)
-    return chinese_word
-
-
-def get_english_words(content):
-    regex = "[a-zA-Z]+"
-    english_words = get_info(content, regex)
-    return english_words or ""
-
-
-##################################################
-def get_json(json_str):
-    """
-    @summary: Parse a json object
-    ---------
-    @param json_str: a string in json format
-    ---------
-    @result: the parsed json object
-    """
-
-    try:
-        return json.loads(json_str) if json_str else {}
-    except Exception as e1:
-        try:
-            json_str = json_str.strip()
-            json_str = json_str.replace("'", '"')
-            keys = get_info(json_str, "(\w+):")
-            for key in keys:
-                json_str = json_str.replace(key, '"%s"' % key)
-
-            return json.loads(json_str) if json_str else {}
-
-        except Exception as e2:
-            log.error(
-                """
-                e1: %s
-                format json_str: %s
-                e2: %s
-                """
-                % (e1, json_str, e2)
-            )
-
-        return {}
-
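-
-# Usage sketch (illustrative): get_json falls back to repairing single quotes
-# and unquoted keys before giving up.
-def _example_get_json():
-    assert get_json('{"a": 1}') == {"a": 1}
-    assert get_json("{'a': 1}") == {"a": 1}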
-
-def jsonp2json(jsonp):
-    """
-    Convert jsonp to json
-    @param jsonp: jQuery172013600082560040794_1553230569815({})
-    @return:
-    """
-    try:
-        return json.loads(re.match(".*?({.*}).*", jsonp, re.S).group(1))
-    except Exception:
-        raise ValueError("Invalid Input")
-
-
-def dumps_json(data, indent=4, sort_keys=False):
-    """
-    @summary: Pretty-format json, for printing
-    ---------
-    @param data: a json-formatted string or a json object
-    ---------
-    @result: the formatted string
-    """
-    try:
-        if isinstance(data, str):
-            data = get_json(data)
-
-        data = json.dumps(
-            data,
-            ensure_ascii=False,
-            indent=indent,
-            skipkeys=True,
-            sort_keys=sort_keys,
-            default=str,
-        )
-
-    except Exception as e:
-        data = pformat(data)
-
-    return data
-
-
-def get_json_value(json_object, key):
-    """
-    @summary:
-    ---------
-    @param json_object: json object or json-formatted string
-    @param key: the key; for nested levels write key1.key2, e.g. for {'key1':{'key2':3}}
-    ---------
-    @result: the matching value, or '' when absent
-    """
-    current_key = ""
-    value = ""
-    try:
-        json_object = (
-            isinstance(json_object, str) and get_json(json_object) or json_object
-        )
-
-        current_key = key.split(".")[0]
-        value = json_object[current_key]
-
-        key = key[key.find(".") + 1 :]
-    except Exception as e:
-        return value
-
-    if key == current_key:
-        return value
-    else:
-        return get_json_value(value, key)
-
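-
-# Usage sketch (illustrative): dotted keys walk nested dicts; missing keys
-# yield ''.
-def _example_get_json_value():
-    data = {"key1": {"key2": 3}}
-    assert get_json_value(data, "key1.key2") == 3
-    assert get_json_value(data, "key1.missing") == ""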
-
-def get_all_keys(datas, depth=None, current_depth=0):
-    """
-    @summary: Get all keys inside a json structure
-    ---------
-    @param datas: dict / list
-    @param depth: key depth; unlimited by default; depth starts at 1
-    @param current_depth: current key depth; do not pass this yourself
-    ---------
-    @result: all keys of the json structure
-    """
-
-    keys = []
-    if depth and current_depth >= depth:
-        return keys
-
-    if isinstance(datas, list):
-        for data in datas:
-            keys.extend(get_all_keys(data, depth, current_depth=current_depth + 1))
-    elif isinstance(datas, dict):
-        for key, value in datas.items():
-            keys.append(key)
-            if isinstance(value, dict):
-                keys.extend(get_all_keys(value, depth, current_depth=current_depth + 1))
-
-    return keys
-
-
-def to_chinese(unicode_str):
-    format_str = json.loads('{"chinese":"%s"}' % unicode_str)
-    return format_str["chinese"]
-
-
-##################################################
-def replace_str(source_str, regex, replace_str=""):
-    """
-    @summary: Replace by regex
-    ---------
-    @param source_str: the original string
-    @param regex: the regular expression
-    @param replace_str: the replacement, '' by default
-    ---------
-    @result: the string after replacement
-    """
-    str_info = re.compile(regex)
-    return str_info.sub(replace_str, source_str)
-
-
-def del_redundant_blank_character(text):
-    """
-    Collapse redundant whitespace, keeping a single space
-    :param text:
-    :return:
-    """
-    return re.sub("\s+", " ", text)
-
-
-##################################################
-def get_conf_value(config_file, section, key):
-    cp = configparser.ConfigParser(allow_no_value=True)
-    with codecs.open(config_file, "r", encoding="utf-8") as f:
-        cp.read_file(f)
-    return cp.get(section, key)
-
-
-def mkdir(path):
-    try:
-        if not os.path.exists(path):
-            os.makedirs(path)
-    except OSError as exc:  # Python >2.5
-        pass
-
-
-def write_file(filename, content, mode="w", encoding="utf-8"):
-    """
-    @summary: Write a file
-    ---------
-    @param filename: file name (with path)
-    @param content: the content
-    @param mode: file mode, e.g. w (overwrite) / a (append)
-    ---------
-    @result:
-    """
-
-    directory = os.path.dirname(filename)
-    mkdir(directory)
-    with open(filename, mode, encoding=encoding) as file:
-        file.writelines(content)
-
-
-def read_file(filename, readlines=False, encoding="utf-8"):
-    """
-    @summary: Read a file
-    ---------
-    @param filename: file name (with path)
-    @param readlines: read line by line (default False)
-    ---------
-    @result: a list when reading line by line, otherwise a string
-    """
-
-    content = None
-    try:
-        with open(filename, "r", encoding=encoding) as file:
-            content = file.readlines() if readlines else file.read()
-    except Exception as e:
-        log.error(e)
-
-    return content
-
-
-def get_oss_file_list(oss_handler, prefix, date_range_min, date_range_max=None):
-    """
-    List files
-    @param prefix: path prefix, e.g. data/car_service_line/yiche/yiche_serial_zongshu_info
-    @param date_range_min: lower bound of the date range, '/'-separated, e.g. 2019/03/01 or 2019/03/01/00/00/00
-    @param date_range_max: upper bound of the date range, '/'-separated, e.g. 2019/03/01 or 2019/03/01/00/00/00
-    @return: each file path, e.g. html/e_commerce_service_line/alibaba/alibaba_shop_info/2019/03/22/15/53/15/8ca8b9e4-4c77-11e9-9dee-acde48001122.json.snappy
-    """
-
-    # work out the date range
-    date_range_max = date_range_max or date_range_min
-    date_format = "/".join(
-        ["%Y", "%m", "%d", "%H", "%M", "%S"][: date_range_min.count("/") + 1]
-    )
-    time_interval = [
-        {"days": 365},
-        {"days": 31},
-        {"days": 1},
-        {"hours": 1},
-        {"minutes": 1},
-        {"seconds": 1},
-    ][date_range_min.count("/")]
-    date_range = get_between_date(
-        date_range_min, date_range_max, date_format=date_format, **time_interval
-    )
-
-    for date in date_range:
-        file_folder_path = os.path.join(prefix, date)
-        objs = oss_handler.list(prefix=file_folder_path)
-        for obj in objs:
-            filename = obj.key
-            yield filename
-
-
-def is_html(url):
-    if not url:
-        return False
-
-    try:
-        content_type = request.urlopen(url).info().get("Content-Type", "")
-
-        if "text/html" in content_type:
-            return True
-        else:
-            return False
-    except Exception as e:
-        log.error(e)
-        return False
-
-
-def is_exist(file_path):
-    """
-    @summary: Whether a file exists
-    ---------
-    @param file_path:
-    ---------
-    @result:
-    """
-
-    return os.path.exists(file_path)
-
-
-def download_file(url, file_path, *, call_func=None, proxies=None, data=None):
-    """
-    Download a file, creating the target directory automatically
-    Args:
-        url: the url
-        file_path: where to store the file
-        call_func: callback invoked on success
-        proxies: proxies
-        data: request body
-
-    Returns:
-
-    """
-    directory = os.path.dirname(file_path)
-    mkdir(directory)
-
-    # progress bar
-    def progress_callfunc(blocknum, blocksize, totalsize):
-        """progress callback
-        @blocknum : number of blocks downloaded so far
-        @blocksize : size of one block
-        @totalsize: size of the remote file
-        """
-        percent = 100.0 * blocknum * blocksize / totalsize
-        if percent > 100:
-            percent = 100
-        # print ('progress %.2f%%' % percent, end = '\r')
-        sys.stdout.write("progress %.2f%%" % percent + "\r")
-        sys.stdout.flush()
-
-    if url:
-        try:
-            if proxies:
-                # create the object, assign it to a variable
-                proxy = request.ProxyHandler(proxies)
-                # construct a new opener using your proxy settings
-                opener = request.build_opener(proxy)
-                # install the opener on the module level
-                request.install_opener(opener)
-
-            request.urlretrieve(url, file_path, progress_callfunc, data)
-
-            if callable(call_func):
-                call_func()
-            return 1
-        except Exception as e:
-            log.error(e)
-            return 0
-    else:
-        return 0
-
-
-def get_file_list(path, ignore=[]):
-    templist = path.split("*")
-    path = templist[0]
-    file_type = templist[1] if len(templist) >= 2 else ""
-
-    # walk files recursively
-    def get_file_list_(path, file_type, ignore, all_file=[]):
-        file_list = os.listdir(path)
-
-        for file_name in file_list:
-            if file_name in ignore:
-                continue
-
-            file_path = os.path.join(path, file_name)
-            if os.path.isdir(file_path):
-                get_file_list_(file_path, file_type, ignore, all_file)
-            else:
-                if not file_type or file_name.endswith(file_type):
-                    all_file.append(file_path)
-
-        return all_file
-
-    return get_file_list_(path, file_type, ignore) if os.path.isdir(path) else [path]
-
-
-def rename_file(old_name, new_name):
-    os.rename(old_name, new_name)
-
-
-def del_file(path, ignore=()):
-    files = get_file_list(path, ignore)
-    for file in files:
-        try:
-            os.remove(file)
-        except Exception as e:
-            log.error(
-                """
-                failed to delete: %s
-                Exception : %s
-                """
-                % (file, str(e))
-            )
-
-
-def get_file_type(file_name):
-    """
-    @summary: Get the file extension
-    ---------
-    @param file_name:
-    ---------
-    @result:
-    """
-    try:
-        return os.path.splitext(file_name)[1]
-    except Exception as e:
-        log.exception(e)
-
-
-def get_file_path(file_path):
-    """
-    @summary: Get the directory part of a path
-    ---------
-    @param file_path: /root/a.py
-    ---------
-    @result: /root
-    """
-    try:
-        return os.path.split(file_path)[0]
-    except Exception as e:
-        log.exception(e)
-
-
-#############################################
-
-
-def exec_js(js_code):
-    """
-    @summary: Execute js code
-    ---------
-    @param js_code: the js code
-    ---------
-    @result: the execution result
-    """
-
-    return execjs.eval(js_code)
-
-
-def compile_js(js_func):
-    """
-    @summary: Compile a js function
-    ---------
-    @param js_func: the js function
-    ---------
-    @result: a callable; invoke as fun('js_funName', param1, param2)
-    """
-
-    ctx = execjs.compile(js_func)
-    return ctx.call
-
-
-###############################################
-
-#############################################
-
-
-def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
-    """
-    @summary: Convert a date such as "2011-09-28 10:00:00" to a timestamp
-    ---------
-    @param date: the date string
-    @param time_format: the date format
-    ---------
-    @result: the timestamp
-    """
-
-    timestamp = time.mktime(time.strptime(date, time_format))
-    return int(timestamp)
-
-
-def timestamp_to_date(timestamp, time_format="%Y-%m-%d %H:%M:%S"):
-    """
-    @summary: Convert a timestamp to a date
-    ---------
-    @param timestamp: the timestamp
-    @param time_format: the date format
-    ---------
-    @result: the date string
-    """
-    if timestamp is None:
-        raise ValueError("timestamp is null")
-
-    date = time.localtime(timestamp)
-    return time.strftime(time_format, date)
-
-
-def get_current_timestamp():
-    return int(time.time())
-
-
-def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
-    return datetime.datetime.now().strftime(date_format)
-    # return time.strftime(date_format, time.localtime(time.time()))
-
-
-def get_date_number(year=None, month=None, day=None):
-    """
-    @summary: Get the calendar numbers of a given date
-    Defaults to the current week
-    ---------
-    @param year: 2010
-    @param month: 6
-    @param day: 16
-    ---------
-    @result: (year, week number, weekday), e.g. (2010, 24, 3)
-    """
-    if year and month and day:
-        return datetime.date(year, month, day).isocalendar()
-    elif not any([year, month, day]):
-        return datetime.datetime.now().isocalendar()
-    else:
-        assert year, "year must not be empty"
-        assert month, "month must not be empty"
-        assert day, "day must not be empty"
-
-
-def get_between_date(
-    begin_date, end_date=None, date_format="%Y-%m-%d", **time_interval
-):
-    """
-    @summary: Get the dates within an interval, one per day by default
-    ---------
-    @param begin_date: start date, str, e.g. 2018-10-01
-    @param end_date: defaults to today
-    @param date_format: date format, matching the format of begin_date
-    @param time_interval: step, one day by default; supports days, seconds, microseconds, milliseconds, minutes, hours, weeks
-    ---------
-    @result: list of strings
-    """
-
-    date_list = []
-
-    begin_date = datetime.datetime.strptime(begin_date, date_format)
-    end_date = (
-        datetime.datetime.strptime(end_date, date_format)
-        if end_date
-        else datetime.datetime.strptime(
-            time.strftime(date_format, time.localtime(time.time())), date_format
-        )
-    )
-    time_interval = time_interval or dict(days=1)
-
-    while begin_date <= end_date:
-        date_str = begin_date.strftime(date_format)
-        date_list.append(date_str)
-
-        begin_date += datetime.timedelta(**time_interval)
-
-    if end_date.strftime(date_format) not in date_list:
-        date_list.append(end_date.strftime(date_format))
-
-    return date_list
-
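-
-# Usage sketch (illustrative): day-level and hour-level ranges.
-def _example_get_between_date():
-    days = get_between_date("2018-10-01", "2018-10-03")
-    assert days == ["2018-10-01", "2018-10-02", "2018-10-03"]
-    hours = get_between_date(
-        "2018/10/01/00", "2018/10/01/02", date_format="%Y/%m/%d/%H", hours=1
-    )
-    assert hours == ["2018/10/01/00", "2018/10/01/01", "2018/10/01/02"]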
-
-def get_between_months(begin_date, end_date=None):
-    """
-    @summary: Get the months within an interval
-    Only whole months are counted
-    ---------
-    @param begin_date: start date, e.g. 2018-01-01
-    @param end_date: defaults to now
-    ---------
-    @result: a list such as ['2018-01', '2018-02']
-    """
-
-    def add_months(dt, months):
-        month = dt.month - 1 + months
-        year = dt.year + month // 12
-        month = month % 12 + 1
-        day = min(dt.day, calendar.monthrange(year, month)[1])
-        return dt.replace(year=year, month=month, day=day)
-
-    date_list = []
-    begin_date = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
-    end_date = (
-        datetime.datetime.strptime(end_date, "%Y-%m-%d")
-        if end_date
-        else datetime.datetime.strptime(
-            time.strftime("%Y-%m-%d", time.localtime(time.time())), "%Y-%m-%d"
-        )
-    )
-    while begin_date <= end_date:
-        date_str = begin_date.strftime("%Y-%m")
-        date_list.append(date_str)
-        begin_date = add_months(begin_date, 1)
-    return date_list
-
-
-def get_today_of_day(day_offset=0):
-    return str(datetime.date.today() + datetime.timedelta(days=day_offset))
-
-
-def get_days_of_month(year, month):
-    """
-    Return the number of days in the month
-    """
-
-    return calendar.monthrange(year, month)[1]
-
-
-def get_firstday_of_month(date):
-    """
-    date format = "YYYY-MM-DD"
-    """
-
-    year, month, day = date.split("-")
-    year, month, day = int(year), int(month), int(day)
-
-    days = "01"
-    if int(month) < 10:
-        month = "0" + str(int(month))
-    arr = (year, month, days)
-    return "-".join("%s" % i for i in arr)
-
-
-def get_lastday_of_month(date):
-    """
-    get the last day of month
-    date format = "YYYY-MM-DD"
-    """
-    year, month, day = date.split("-")
-    year, month, day = int(year), int(month), int(day)
-
-    days = calendar.monthrange(year, month)[1]
-    month = add_zero(month)
-    arr = (year, month, days)
-    return "-".join("%s" % i for i in arr)
-
-
-def get_firstday_month(month_offset=0):
-    """
-    get the first day of month from today
-    month_offset is how many months
-    """
-    (y, m, d) = get_year_month_and_days(month_offset)
-    d = "01"
-    arr = (y, m, d)
-    return "-".join("%s" % i for i in arr)
-
-
-def get_lastday_month(month_offset=0):
-    """
-    get the last day of month from today
-    month_offset is how many months
-    """
-    return "-".join("%s" % i for i in get_year_month_and_days(month_offset))
-
-
-def get_last_month(month_offset=0):
-    """
-    get the year-month string (e.g. '2019-04') offset from today
-    month_offset is how many months
-    """
-    return "-".join("%s" % i for i in get_year_month_and_days(month_offset)[:2])
-
-
-def get_year_month_and_days(month_offset=0):
-    """
-    @summary:
-    ---------
-    @param month_offset: month offset
-    ---------
-    @result: ('2019', '04', '30')
-    """
-
-    today = datetime.datetime.now()
-    year, month = today.year, today.month
-
-    this_year = int(year)
-    this_month = int(month)
-    total_month = this_month + month_offset
-    if month_offset >= 0:
-        if total_month <= 12:
-            days = str(get_days_of_month(this_year, total_month))
-            total_month = add_zero(total_month)
-            return (year, total_month, days)
-        else:
-            i = total_month // 12
-            j = total_month % 12
-            if j == 0:
-                i -= 1
-                j = 12
-            this_year += i
-            days = str(get_days_of_month(this_year, j))
-            j = add_zero(j)
-            return (str(this_year), str(j), days)
-    else:
-        if (total_month > 0) and (total_month < 12):
-            days = str(get_days_of_month(this_year, total_month))
-            total_month = add_zero(total_month)
-            return (year, total_month, days)
-        else:
-            i = total_month // 12
-            j = total_month % 12
-            if j == 0:
-                i -= 1
-                j = 12
-            this_year += i
-            days = str(get_days_of_month(this_year, j))
-            j = add_zero(j)
-            return (str(this_year), str(j), days)
-
-
-def add_zero(n):
-    return "%02d" % n
-
-
-def get_month(month_offset=0):
-    """
-    Get the date N months away from today
-    month_offset > 0 moves forward, month_offset < 0 moves backward
-    date format = "YYYY-MM-DD"
-    """
-    today = datetime.datetime.now()
-    day = add_zero(today.day)
-
-    (y, m, d) = get_year_month_and_days(month_offset)
-    arr = (y, m, d)
-    if int(day) < int(d):
-        arr = (y, m, day)
-    return "-".join("%s" % i for i in arr)
-
-
-@run_safe_model("format_date")
-def format_date(date, old_format="", new_format="%Y-%m-%d %H:%M:%S"):
-    """
-    @summary: Reformat a date string
-    ---------
-    @param date: the date, e.g. 2017年4月17日 3时27分12秒
-    @param old_format: the original format, e.g. '%Y年%m月%d日 %H时%M分%S秒'
-        %y two-digit year (00-99)
-        %Y four-digit year (000-9999)
-        %m month (01-12)
-        %d day of month (0-31)
-        %H hour, 24-hour clock (0-23)
-        %I hour, 12-hour clock (01-12)
-        %M minutes (00-59)
-        %S seconds (00-59)
-    @param new_format: the output format
-    ---------
-    @result: the reformatted date as a string, e.g. 2017-4-17 03:27:12
-    """
-    if not date:
-        return ""
-
-    if not old_format:
-        regex = "(\d+)"
-        numbers = get_info(date, regex, allow_repeat=True)
-        formats = ["%Y", "%m", "%d", "%H", "%M", "%S"]
-        old_format = date
-        for i, number in enumerate(numbers[:6]):
-            if i == 0 and len(number) == 2:  # the year may be two digits, use lowercase %y
-                old_format = old_format.replace(
-                    number, formats[i].lower(), 1
-                )  # replace once: in '2017年11月30日 11:49' this keeps the 11 o'clock from being hit while replacing the 11th month
-            else:
-                old_format = old_format.replace(number, formats[i], 1)  # replace once
-
-    try:
-        date_obj = datetime.datetime.strptime(date, old_format)
-        if "T" in date and "Z" in date:
-            date_obj += datetime.timedelta(hours=8)
-            date_str = date_obj.strftime("%Y-%m-%d %H:%M:%S")
-        else:
-            date_str = datetime.datetime.strftime(date_obj, new_format)
-
-    except Exception as e:
-        log.error("date format error: old_format = %s does not match date %s" % (old_format, date))
-        date_str = date
-
-    return date_str
-
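-
-# Usage sketch (illustrative): with old_format omitted, format_date guesses it
-# by replacing each run of digits with %Y/%m/%d/%H/%M/%S in order.
-def _example_format_date():
-    assert format_date("2017年4月17日 3时27分12秒") == "2017-04-17 03:27:12"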
-
-def transform_lower_num(data_str: str):
-    num_map = {
-        "一": "1",
-        "二": "2",
-        "三": "3",
-        "四": "4",
-        "五": "5",
-        "六": "6",
-        "七": "7",
-        "八": "8",
-        "九": "9",
-        "十": "0",
-    }
-    pattern = f'[{"|".join(num_map.keys())}|零]'
-    res = re.search(pattern, data_str)
-    if not res:
-        #  the string contains no Chinese numerals; return it untouched
-        return data_str
-
-    data_str = data_str.replace("0", "零")
-    for n in num_map:
-        data_str = data_str.replace(n, num_map[n])
-
-    re_data_str = re.findall("\d+", data_str)
-    for i in re_data_str:
-        if len(i) == 3:
-            new_i = i.replace("0", "")
-            data_str = data_str.replace(i, new_i, 1)
-        elif len(i) == 4:
-            new_i = i.replace("10", "")
-            data_str = data_str.replace(i, new_i, 1)
-        elif len(i) == 2 and int(i) < 10:
-            new_i = int(i) + 10
-            data_str = data_str.replace(i, str(new_i), 1)
-        elif len(i) == 1 and int(i) == 0:
-            new_i = int(i) + 10
-            data_str = data_str.replace(i, str(new_i), 1)
-
-    return data_str.replace("零", "0")
-
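-
-# Usage sketch (illustrative): Chinese numerals below 100 become digits, so
-# the relative-time parser in format_time below can read them.
-def _example_transform_lower_num():
-    assert transform_lower_num("三天前") == "3天前"
-    assert transform_lower_num("十二小时前") == "12小时前"
-    assert transform_lower_num("二十分钟前") == "20分钟前"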
-
-@run_safe_model("format_time")
-def format_time(release_time, date_format="%Y-%m-%d %H:%M:%S"):
-    """
-    >>> format_time("2个月前")
-    '2021-08-15 16:24:21'
-    >>> format_time("2月前")
-    '2021-08-15 16:24:36'
-    """
-    release_time = transform_lower_num(release_time)
-    release_time = release_time.replace("日", "天").replace("/", "-")
-
-    if "年前" in release_time:
-        years = re.compile("(\d+)\s*年前").findall(release_time)
-        years_ago = datetime.datetime.now() - datetime.timedelta(
-            days=int(years[0]) * 365
-        )
-        release_time = years_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "月前" in release_time:
-        months = re.compile("(\d+)[\s个]*月前").findall(release_time)
-        months_ago = datetime.datetime.now() - datetime.timedelta(
-            days=int(months[0]) * 30
-        )
-        release_time = months_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "周前" in release_time:
-        weeks = re.compile("(\d+)\s*周前").findall(release_time)
-        weeks_ago = datetime.datetime.now() - datetime.timedelta(days=int(weeks[0]) * 7)
-        release_time = weeks_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "天前" in release_time:
-        ndays = re.compile("(\d+)\s*天前").findall(release_time)
-        days_ago = datetime.datetime.now() - datetime.timedelta(days=int(ndays[0]))
-        release_time = days_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "小时前" in release_time:
-        nhours = re.compile("(\d+)\s*小时前").findall(release_time)
-        hours_ago = datetime.datetime.now() - datetime.timedelta(hours=int(nhours[0]))
-        release_time = hours_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "分钟前" in release_time:
-        nminutes = re.compile("(\d+)\s*分钟前").findall(release_time)
-        minutes_ago = datetime.datetime.now() - datetime.timedelta(
-            minutes=int(nminutes[0])
-        )
-        release_time = minutes_ago.strftime("%Y-%m-%d %H:%M:%S")
-
-    elif "前天" in release_time:
-        today = datetime.date.today()
-        yesterday = today - datetime.timedelta(days=2)
-        release_time = release_time.replace("前天", str(yesterday))
-
-    elif "昨天" in release_time:
-        today = datetime.date.today()
-        yesterday = today - datetime.timedelta(days=1)
-        release_time = release_time.replace("昨天", str(yesterday))
-
-    elif "今天" in release_time:
-        release_time = release_time.replace("今天", get_current_date("%Y-%m-%d"))
-
-    elif "刚刚" in release_time:
-        release_time = get_current_date()
-
-    elif re.search("^\d\d:\d\d", release_time):
-        release_time = get_current_date("%Y-%m-%d") + " " + release_time
-
-    elif not re.compile("\d{4}").findall(release_time):
-        month = re.compile("\d{1,2}").findall(release_time)
-        if month and int(month[0]) <= int(get_current_date("%m")):
-            release_time = get_current_date("%Y") + "-" + release_time
-        else:
-            release_time = str(int(get_current_date("%Y")) - 1) + "-" + release_time
-
-    # split a day and an hour that got glued together
-    template = re.compile("(\d{4}-\d{1,2}-\d{2})(\d{1,2})")
-    release_time = re.sub(template, r"\1 \2", release_time)
-    release_time = format_date(release_time, new_format=date_format)
-
-    return release_time
-
-
-def to_date(date_str, date_format="%Y-%m-%d %H:%M:%S"):
-    return datetime.datetime.strptime(date_str, date_format)
-
-
-def get_before_date(
-    current_date,
-    days,
-    current_date_format="%Y-%m-%d %H:%M:%S",
-    return_date_format="%Y-%m-%d %H:%M:%S",
-):
-    """
-    @summary: Get a date offset from a given one
-    ---------
-    @param current_date: the current date, str
-    @param days: the offset; -1 means one day earlier, 1 one day later
-    @param current_date_format: format of current_date
-    @param return_date_format: format of the returned date
-    ---------
-    @result: a string
-    """
-
-    current_date = to_date(current_date, current_date_format)
-    date_obj = current_date + datetime.timedelta(days=days)
-    return datetime.datetime.strftime(date_obj, return_date_format)
-
-
-def get_utcnow():
-    """UTC time"""
-    return datetime.datetime.utcnow()
-
-
-def delay_time(sleep_time=60):
-    """
-    @summary: Sleep, one minute by default
-    ---------
-    @param sleep_time: in seconds
-    ---------
-    @result:
-    """
-
-    time.sleep(sleep_time)
-
-
-def format_seconds(seconds):
-    """
-    @summary: Convert seconds to days/hours/minutes/seconds
-    ---------
-    @param seconds:
-    ---------
-    @result: e.g. 2天3小时2分49秒
-    """
-
-    seconds = int(seconds + 0.5)  # round to the nearest second
-
-    m, s = divmod(seconds, 60)
-    h, m = divmod(m, 60)
-    d, h = divmod(h, 24)
-
-    times = ""
-    if d:
-        times += "{}天".format(d)
-    if h:
-        times += "{}小时".format(h)
-    if m:
-        times += "{}分".format(m)
-    if s:
-        times += "{}秒".format(s)
-
-    return times
-
-
-################################################
-def get_md5(*args):
-    """
-    @summary: Get a unique 32-character md5
-    ---------
-    @param *args: the values joined for deduplication
-    ---------
-    @result: 7c8684bcbdfcea6697650aa53d7b1405
-    """
-
-    m = hashlib.md5()
-    for arg in args:
-        m.update(str(arg).encode())
-
-    return m.hexdigest()
-
-
-def get_sha1(*args):
-    """
-    @summary: Get a unique 40-character value, usable as a unique id
-    ---------
-    @param *args: the values joined for deduplication
-    ---------
-    @result: ba4868b3f277c8e387b55d9e3d0be7c045cdd89e
-    """
-
-    sha1 = hashlib.sha1()
-    for arg in args:
-        sha1.update(str(arg).encode())
-    return sha1.hexdigest()  # 40 hex chars
-
-
-def get_sha256(*args):
-    """
-    @summary: Get a unique 64-character value, usable as a unique id
-    ---------
-    @param *args: the values joined for deduplication
-    ---------
-    @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
-    """
-
-    sha256 = hashlib.sha256()
-    for arg in args:
-        sha256.update(str(arg).encode())
-    return sha256.hexdigest()  # 64 hex chars
-
-
-def get_base64(secret, message):
-    """
-    @summary: Sign with "HMAC-SHA256"
-              Reference: https://www.jokecamp.com/blog/examples-of-creating-base64-hashes-using-hmac-sha256-in-different-languages/
-    ---------
-    @param secret: the secret key
-    @param message: the message
-    ---------
-    @result: the signature, base64-encoded
-    """
-
-    import hashlib
-    import hmac
-    import base64
-
-    message = bytes(message, "utf-8")
-    secret = bytes(secret, "utf-8")
-
-    signature = base64.b64encode(
-        hmac.new(secret, message, digestmod=hashlib.sha256).digest()
-    ).decode("utf8")
-    return signature
-
-
-def get_uuid(key1="", key2=""):
-    """
-    @summary: Compute a uuid
-    Useful for combining two strings into one unique value, e.g. domain plus
-    news title as a joint index
-    ---------
-    @param key1: str
-    @param key2: str
-    ---------
-    @result:
-    """
-
-    if not key1 and not key2:
-        uuid_object = uuid.uuid1()
-    else:
-        hash_ = md5(bytes(key1, "utf-8") + bytes(key2, "utf-8")).digest()
-        uuid_object = uuid.UUID(bytes=hash_[:16], version=3)
-
-    return str(uuid_object)
-
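-
-# Usage sketch (illustrative): the same key pair always yields the same uuid,
-# which makes it usable as a joint index of, say, domain + title.
-def _example_get_uuid():
-    a = get_uuid("example.com", "hello world")
-    b = get_uuid("example.com", "hello world")
-    assert a == b and len(a) == 36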
-
-def get_hash(text):
-    return hash(text)
-
-
-def decrypt(input_str: str) -> str:
-    """
-    A base64-style decoder that uses a shuffled alphabet
-
-    :param input_str:
-    :return:
-    """
-    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
-    # for every byte that is not "=", take its index in the key and render it
-    # as a 6-bit binary string
-    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
-    output_str = ''
-    # count the "=" padding characters
-    equal_num = input_str.count('=')
-    while ascii_list:
-        temp_list = ascii_list[:4]
-        # join into one binary string
-        temp_str = ''.join(temp_list)
-        # trim when the string is not a multiple of 8 bits
-        if len(temp_str) % 8 != 0:
-            temp_str = temp_str[0:-1 * equal_num * 2]
-        # four 6-bit groups become three 8-bit bytes
-        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
-        # binary to decimal
-        temp_str_list = [int(x, 2) for x in temp_str_list if x]
-        # join into the output string
-        output_str += ''.join([chr(x) for x in temp_str_list])
-        ascii_list = ascii_list[4:]
-    return output_str
-
-##################################################
-
-
-def cut_string(text, length):
-    """
-    @summary: Split text into chunks of a given length
-    ---------
-    @param text: the text
-    @param length: chunk length
-    ---------
-    @result: a list of chunks of the given length
-    """
-
-    text_list = re.findall(".{%d}" % length, text, re.S)
-    leave_text = text[len(text_list) * length :]
-    if leave_text:
-        text_list.append(leave_text)
-
-    return text_list
-
-
-def get_random_string(length=1):
-    random_string = "".join(random.sample(string.ascii_letters + string.digits, length))
-    return random_string
-
-
-def get_random_password(length=8, special_characters=""):
-    """
-    @summary: Create a random password; 8 characters by default, containing
-              uppercase letters, lowercase letters and digits
-    ---------
-    @param length: password length, default 8
-    @param special_characters: special characters
-    ---------
-    @result: a password of the given length
-    """
-
-    while True:
-        random_password = "".join(
-            random.sample(
-                string.ascii_letters + string.digits + special_characters, length
-            )
-        )
-        if (
-            re.search("[0-9]", random_password)
-            and re.search("[A-Z]", random_password)
-            and re.search("[a-z]", random_password)
-        ):
-            if not special_characters:
-                break
-            elif set(random_password).intersection(special_characters):
-                break
-
-    return random_password
-
-
-def get_random_email(length=None, email_types: list = None, special_characters=""):
-    """
-    Generate a random email address
-    :param length: length of the mailbox name
-    :param email_types: email domains
-    :param special_characters: special characters
-    :return:
-    """
-    if not length:
-        length = random.randint(4, 12)
-    if not email_types:
-        email_types = [
-            "qq.com",
-            "163.com",
-            "gmail.com",
-            "yahoo.com",
-            "hotmail.com",
-            "yeah.net",
-            "126.com",
-            "139.com",
-            "sohu.com",
-        ]
-
-    email_body = get_random_password(length, special_characters)
-    email_type = random.choice(email_types)
-
-    email = email_body + "@" + email_type
-    return email
-
-
-#################################
-
-
-def dumps_obj(obj):
-    return pickle.dumps(obj)
-
-
-def loads_obj(obj_str):
-    return pickle.loads(obj_str)
-
-
-def get_method(obj, name):
-    name = str(name)
-    try:
-        return getattr(obj, name)
-    except AttributeError:
-        log.error("Method %r not found in: %s" % (name, obj))
-        return None
-
-
-def resolve_method(context, target):
-    """
-    Parse a target string and return a callable method.
-
-    :param context: the context, either a single object or a context dict.
-                   - a class, function or module is used directly.
-                   - a class instance is used directly to resolve the method.
-                   - a dict is resolved by key, e.g. {'self': instance, 'other': other_instance}
-    :param target: the target string, e.g. 'self.detail_get' or 'other.some_method'
-    :return: the callable method object
-    """
-    target = str(target)
-    if "." not in target or target.count(".") != 1:
-        raise ValueError(
-            f"Invalid target format: {target}. "
-            f"Expected format: 'object.method'."
-        )
-
-    obj_name, method_name = target.split(".", 1)
-
-    # resolve the context
-    if isinstance(context, dict):
-        # a dict: look the object up by name
-        obj = context.get(obj_name)
-        if obj is None:
-            raise ValueError(f"Object '{obj_name}' not found in context.")
-    elif inspect.isclass(context) or inspect.isroutine(context) or inspect.ismodule(context):
-        # a class, function or module: use it directly
-        obj = context
-        if obj_name != getattr(obj, "__name__", None) and obj_name != "self":
-            raise ValueError(
-                f"Unsupported object name: {obj_name}. "
-                f"Expected '{getattr(obj, '__name__', None)}' or 'self'."
-            )
-    elif isinstance(context, object):
-        # a class instance: use it directly
-        obj = context
-        if obj_name != getattr(obj.__class__, "__name__", None) and obj_name != "self":
-            raise ValueError(
-                f"Unsupported object name: {obj_name}. "
-                f"Expected '{getattr(obj.__class__, '__name__', None)}' or 'self'."
-            )
-    else:
-        raise TypeError("Context must be either class, function, module or instance.")
-
-    method = getattr(obj, method_name, None)
-    if method is None or not callable(method):
-        raise AttributeError(
-            f"Method '{method_name}' not found or not callable on object '{obj_name}'."
-        )
-
-    return method
-
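-
-# Usage sketch (illustrative): resolving "self.parse" against an instance and
-# "other.parse" against a context dict.
-def _example_resolve_method():
-    class Spider:
-        def parse(self):
-            return "ok"
-
-    spider = Spider()
-    assert resolve_method(spider, "self.parse")() == "ok"
-    assert resolve_method({"other": spider}, "other.parse")() == "ok"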
-
-def witch_workspace(project_path):
-    """
-    @summary: Switch the working directory
-    ---------
-    @param project_path:
-    ---------
-    @result:
-    """
-
-    os.chdir(project_path)  # switch the working directory
-
-
-############### database helpers #######################
-def format_sql_value(value):
-    if isinstance(value, str):
-        value = value.strip()
-
-    elif isinstance(value, (list, dict)):
-        value = dumps_json(value, indent=None)
-
-    elif isinstance(value, (datetime.date, datetime.time)):
-        value = str(value)
-
-    elif isinstance(value, bool):
-        value = int(value)
-
-    return value
-
-
-def list2str(datas):
-    """
-    Convert a list to a tuple-style string
-    :param datas: [1, 2]
-    :return: (1, 2)
-    """
-    data_str = str(tuple(datas))
-    data_str = re.sub(",\)$", ")", data_str)
-    return data_str
-
-
-def make_insert_sql(
-    table, data, auto_update=False, update_columns=(), insert_ignore=False
-):
-    """
-    @summary: For mysql; oracle datetimes would need to_date handling (TODO)
-    ---------
-    @param table:
-    @param data: row data in json format
-    @param auto_update: use replace into, fully overwriting existing rows
-    @param update_columns: columns to update, all by default; when set, auto_update is ignored and the given columns are updated on duplicate key
-    @param insert_ignore: ignore rows that already exist
-    ---------
-    @result:
-    """
-
-    keys = ["`{}`".format(key) for key in data.keys()]
-    keys = list2str(keys).replace("'", "")
-
-    values = [format_sql_value(value) for value in data.values()]
-    values = list2str(values)
-
-    if update_columns:
-        if not isinstance(update_columns, (tuple, list)):
-            update_columns = [update_columns]
-        update_columns_ = ", ".join(
-            ["{key}=values({key})".format(key=key) for key in update_columns]
-        )
-        sql = (
-            "insert%s into `{table}` {keys} values {values} on duplicate key update %s"
-            % (" ignore" if insert_ignore else "", update_columns_)
-        )
-
-    elif auto_update:
-        sql = "replace into `{table}` {keys} values {values}"
-    else:
-        sql = "insert%s into `{table}` {keys} values {values}" % (
-            " ignore" if insert_ignore else ""
-        )
-
-    sql = sql.format(table=table, keys=keys, values=values).replace("None", "null")
-    return sql
-
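-
-# Usage sketch (illustrative) of the three insert variants.
-def _example_make_insert_sql():
-    data = {"id": 1, "name": "test"}
-    sql = make_insert_sql("user", data)
-    # insert into `user` (`id`, `name`) values (1, 'test')
-    sql_replace = make_insert_sql("user", data, auto_update=True)
-    # replace into `user` (`id`, `name`) values (1, 'test')
-    sql_upsert = make_insert_sql("user", data, update_columns=["name"])
-    # insert into `user` (`id`, `name`) values (1, 'test') on duplicate key update name=values(name)
-    return sql, sql_replace, sql_upsert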
-
-def make_update_sql(table, data, condition):
-    """
-    @summary: For mysql; oracle datetimes would need to_date handling (TODO)
-    ---------
-    @param table:
-    @param data: row data in json format
-    @param condition: the where condition
-    ---------
-    @result:
-    """
-    key_values = []
-
-    for key, value in data.items():
-        value = format_sql_value(value)
-        if isinstance(value, str):
-            key_values.append("`{}`={}".format(key, repr(value)))
-        elif value is None:
-            key_values.append("`{}`={}".format(key, "null"))
-        else:
-            key_values.append("`{}`={}".format(key, value))
-
-    key_values = ", ".join(key_values)
-
-    sql = "update `{table}` set {key_values} where {condition}"
-    sql = sql.format(table=table, key_values=key_values, condition=condition)
-    return sql
-
-
-def make_batch_sql(
-    table, datas, auto_update=False, update_columns=(), update_columns_value=()
-):
-    """
-    @summary: Generate a batch sql
-    ---------
-    @param table:
-    @param datas: row data, [{...}]
-    @param auto_update: use replace into, fully overwriting existing rows
-    @param update_columns: columns to update, all by default; when set, auto_update is ignored and the given columns are updated on duplicate key
-    @param update_columns_value: values for the updated columns, defaulting to the matching values in datas; note that string values need explicit single quotes, e.g. update_columns_value=("'test'",)
-    ---------
-    @result:
-    """
-    if not datas:
-        return
-
-    keys = list(datas[0].keys())
-    values_placeholder = ["%s"] * len(keys)
-
-    values = []
-    for data in datas:
-        value = []
-        for key in keys:
-            current_data = data.get(key)
-            current_data = format_sql_value(current_data)
-
-            value.append(current_data)
-
-        values.append(value)
-
-    keys = ["`{}`".format(key) for key in keys]
-    keys = list2str(keys).replace("'", "")
-
-    values_placeholder = list2str(values_placeholder).replace("'", "")
-
-    if update_columns:
-        if not isinstance(update_columns, (tuple, list)):
-            update_columns = [update_columns]
-        if update_columns_value:
-            update_columns_ = ", ".join(
-                [
-                    "`{key}`={value}".format(key=key, value=value)
-                    for key, value in zip(update_columns, update_columns_value)
-                ]
-            )
-        else:
-            update_columns_ = ", ".join(
-                ["`{key}`=values(`{key}`)".format(key=key) for key in update_columns]
-            )
-        sql = "insert into `{table}` {keys} values {values_placeholder} on duplicate key update {update_columns}".format(
-            table=table,
-            keys=keys,
-            values_placeholder=values_placeholder,
-            update_columns=update_columns_,
-        )
-    elif auto_update:
-        sql = "replace into `{table}` {keys} values {values_placeholder}".format(
-            table=table, keys=keys, values_placeholder=values_placeholder
-        )
-    else:
-        sql = "insert ignore into `{table}` {keys} values {values_placeholder}".format(
-            table=table, keys=keys, values_placeholder=values_placeholder
-        )
-
-    return sql, values
-
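-
-# Usage sketch (illustrative): make_batch_sql returns a parameterized
-# statement plus the value rows, ready for cursor.executemany(sql, values).
-def _example_make_batch_sql():
-    datas = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
-    sql, values = make_batch_sql("user", datas)
-    # sql    == "insert ignore into `user` (`id`, `name`) values (%s, %s)"
-    # values == [[1, 'a'], [2, 'b']]
-    return sql, values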
-
-############### json helpers #######################
-
-
-def key2underline(key: str, strict=True):
-    """
-    >>> key2underline("HelloWord")
-    'hello_word'
-    >>> key2underline("SHData", strict=True)
-    's_h_data'
-    >>> key2underline("SHData", strict=False)
-    'sh_data'
-    >>> key2underline("SHDataHi", strict=False)
-    'sh_data_hi'
-    >>> key2underline("SHDataHi", strict=True)
-    's_h_data_hi'
-    >>> key2underline("dataHi", strict=True)
-    'data_hi'
-    """
-    regex = "[A-Z]*" if not strict else "[A-Z]"
-    capitals = re.findall(regex, key)
-
-    if capitals:
-        for capital in capitals:
-            if not capital:
-                continue
-            if key.startswith(capital):
-                if len(capital) > 1:
-                    key = key.replace(
-                        capital, capital[:-1].lower() + "_" + capital[-1].lower(), 1
-                    )
-                else:
-                    key = key.replace(capital, capital.lower(), 1)
-            else:
-                if len(capital) > 1:
-                    key = key.replace(capital, "_" + capital.lower() + "_", 1)
-                else:
-                    key = key.replace(capital, "_" + capital.lower(), 1)
-
-    return key.strip("_")
-
-
-def key2hump(key):
-    """
-    Convert underscore style to camel case
-    """
-    return key.title().replace("_", "")
-
-
-def format_json_key(json_data):
-    json_data_correct = {}
-    for key, value in json_data.items():
-        key = key2underline(key)
-        json_data_correct[key] = value
-
-    return json_data_correct
-
-
-def quick_to_json(text):
-    """
-    @summary: Quickly convert headers copied from the browser to json
-    ---------
-    @param text:
-    ---------
-    @result:
-    """
-
-    contents = text.split("\n")
-    data = {}
-    for content in contents:
-        if content == "\n":
-            continue
-
-        content = content.strip()
-        regex = ["(:?.*?):(.*)", "(.*?):? +(.*)", "([^:]*)"]
-
-        result = get_info(content, regex)
-        result = result[0] if isinstance(result[0], tuple) else result
-        try:
-            data[result[0]] = eval(result[1].strip())
-        except Exception:
-            data[result[0]] = result[1].strip()
-
-    return data
-
-
-##############################
-
-
-def print_pretty(object):
-    pprint(object)
-
-
-def print_params2json(url):
-    params_json = {}
-    params = url.split("?")[-1].split("&")
-    for param in params:
-        key_value = param.split("=", 1)
-        params_json[key_value[0]] = key_value[1]
-
-    print(dumps_json(params_json))
-
-
-def print_cookie2json(cookie_str_or_list):
-    if isinstance(cookie_str_or_list, str):
-        cookie_json = {}
-        cookies = cookie_str_or_list.split("; ")
-        for cookie in cookies:
-            name, value = cookie.split("=")
-            cookie_json[name] = value
-    else:
-        cookie_json = get_cookies_from_selenium_cookie(cookie_str_or_list)
-
-    print(dumps_json(cookie_json))
-
-
-###############################
-
-
-def flatten(x):
-    """flatten(sequence) -> list
-    Returns a single, flat list which contains all elements retrieved
-    from the sequence and all recursively contained sub-sequences
-    (iterables).
-    Examples:
-    >>> flatten([1, 2, [3,4], (5,6)])
-    [1, 2, 3, 4, 5, 6]
-    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
-    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
-    >>> flatten(["foo", "bar"])
-    ['foo', 'bar']
-    >>> flatten(["foo", ["baz", 42], "bar"])
-    ['foo', 'baz', 42, 'bar']
-    """
-    return list(iflatten(x))
-
-
-def iflatten(x):
-    """iflatten(sequence) -> iterator
-    Similar to ``.flatten()``, but returns iterator instead"""
-    for el in x:
-        if _is_listlike(el):
-            for el_ in flatten(el):
-                yield el_
-        else:
-            yield el
-
-
-def _is_listlike(x):
-    """
-    >>> _is_listlike("foo")
-    False
-    >>> _is_listlike(5)
-    False
-    >>> _is_listlike(b"foo")
-    False
-    >>> _is_listlike([b"foo"])
-    True
-    >>> _is_listlike((b"foo",))
-    True
-    >>> _is_listlike({})
-    True
-    >>> _is_listlike(set())
-    True
-    >>> _is_listlike((x for x in range(3)))
-    True
-    >>> _is_listlike(six.moves.xrange(5))
-    True
-    """
-    return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
-
-
-###################
-
-
-def re_def_supper_class(obj, supper_class):
-    """
-    Redefine the parent class
-    @param obj: a class, e.g. for class A pass A, or an instance's a.__class__
-    @param supper_class: the parent class
-    @return:
-    """
-    obj.__bases__ = (supper_class,)
-
-
-###################
-freq_limit_record = {}
-
-
-def reach_freq_limit(rate_limit, *key):
-    """
-    Frequency limiting
-    :param rate_limit: limit window in seconds
-    :param key: the key the limit applies to
-    :return: True / False
-    """
-    if rate_limit == 0:
-        return False
-
-    msg_md5 = get_md5(*key)
-    key = "rate_limit:{}".format(msg_md5)
-    try:
-        if get_redisdb().get(key):
-            return True
-
-        get_redisdb().set(key, time.time(), ex=rate_limit)
-    except redis.exceptions.ConnectionError as e:
-        # fall back to an in-memory frequency limit
-        global freq_limit_record
-
-        if key not in freq_limit_record:
-            freq_limit_record[key] = time.time()
-            return False
-
-        if time.time() - freq_limit_record.get(key) < rate_limit:
-            return True
-        else:
-            freq_limit_record[key] = time.time()
-
-    return False
-
-
-def dingding_warning(
-    message, message_prefix=None, rate_limit=None, url=None, user_phone=None
-):
-    # reload the latest settings
-    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
-    url = url or setting.DINGDING_WARNING_URL
-    user_phone = user_phone or setting.DINGDING_WARNING_PHONE
-
-    if not all([url, message]):
-        return
-
-    if reach_freq_limit(rate_limit, url, user_phone, message_prefix or message):
-        log.info("Warning interval too short; skipping this warning. Content: {}".format(message))
-        return
-
-    if isinstance(user_phone, str):
-        user_phone = [user_phone] if user_phone else []
-
-    data = {
-        "msgtype": "text",
-        "text": {"content": message},
-        "at": {"atMobiles": user_phone, "isAtAll": setting.DINGDING_WARNING_ALL},
-    }
-
-    headers = {"Content-Type": "application/json"}
-
-    try:
-        response = requests.post(
-            url, headers=headers, data=json.dumps(data).encode("utf8")
-        )
-        result = response.json()
-        response.close()
-        if result.get("errcode") == 0:
-            return True
-        else:
-            raise Exception(result.get("errmsg"))
-    except Exception as e:
-        log.error("Failed to send the warning. Content: {}, error: {}".format(message, e))
-        return False
-
-
-def email_warning(
-    message,
-    title,
-    message_prefix=None,
-    email_sender=None,
-    email_password=None,
-    email_receiver=None,
-    email_smtpserver=None,
-    rate_limit=None,
-):
-    # reload the latest settings
-    email_sender = email_sender or setting.EMAIL_SENDER
-    email_password = email_password or setting.EMAIL_PASSWORD
-    email_receiver = email_receiver or setting.EMAIL_RECEIVER
-    email_smtpserver = email_smtpserver or setting.EMAIL_SMTPSERVER
-    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
-
-    if not all([message, email_sender, email_password, email_receiver]):
-        return
-
-    if reach_freq_limit(
-        rate_limit, email_receiver, email_sender, message_prefix or message
-    ):
-        log.info("Warning interval too short; skipping this warning. Content: {}".format(message))
-        return
-
-    if isinstance(email_receiver, str):
-        email_receiver = [email_receiver]
-
-    with EmailSender(
-        username=email_sender, password=email_password, smtpserver=email_smtpserver
-    ) as email:
-        return email.send(receivers=email_receiver, title=title, content=message)
-
-
-def linkedsee_warning(message, rate_limit=3600, message_prefix=None, token=None):
-    """
-    Linkedsee phone alarm
-    Args:
-        message:
-        rate_limit:
-        message_prefix:
-        token:
-
-    Returns:
-
-    """
-    if not token:
-        log.info("Linkedsee token not set; warnings are not supported")
-        return
-
-    if reach_freq_limit(rate_limit, token, message_prefix or message):
-        log.info("Warning interval too short; skipping this warning. Content: {}".format(message))
-        return
-
-    headers = {"servicetoken": token, "Content-Type": "application/json"}
-
-    url = "http://www.linkedsee.com/alarm/zabbix"
-
-    data = {"content": message}
-    response = requests.post(url, data=json.dumps(data), headers=headers)
-    return response
-
-
-def wechat_warning(
-    message,
-    message_prefix=None,
-    rate_limit=None,
-    url=None,
-    user_phone=None,
-    all_users: bool = None,
-):
-    """企业微信报警"""
-
-    # reload the latest settings
-    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
-    url = url or setting.WECHAT_WARNING_URL
-    user_phone = user_phone or setting.WECHAT_WARNING_PHONE
-    all_users = all_users if all_users is not None else setting.WECHAT_WARNING_ALL
-
-    if isinstance(user_phone, str):
-        user_phone = [user_phone] if user_phone else []
-
-    if all_users is True or not user_phone:
-        user_phone = ["@all"]
-
-    if not all([url, message]):
-        return
-
-    if reach_freq_limit(rate_limit, url, user_phone, message_prefix or message):
-        log.info("Warning interval too short; skipping this warning. Content: {}".format(message))
-        return
-
-    data = {
-        "msgtype": "text",
-        "text": {"content": message, "mentioned_mobile_list": user_phone},
-    }
-
-    headers = {"Content-Type": "application/json"}
-
-    try:
-        response = requests.post(
-            url, headers=headers, data=json.dumps(data).encode("utf8")
-        )
-        result = response.json()
-        response.close()
-        if result.get("errcode") == 0:
-            return True
-        else:
-            raise Exception(result.get("errmsg"))
-    except Exception as e:
-        log.error("Failed to send the alert. Content: {}, error: {}".format(message, e))
-        return False
-
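
Taken on its own, wechat_warning is a thin wrapper over the WeCom group-robot webhook. A hedged usage sketch (the webhook key and phone number below are placeholders, and the import path assumes these helpers live in feapder.utils.tools):

from feapder.utils.tools import wechat_warning  # assumed module path

ok = wechat_warning(
    "spider a_example: request queue empty for 30 minutes",
    url="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=<key>",  # placeholder
    user_phone="13800000000",  # placeholder; omit it to @all the group
    rate_limit=3600,  # collapse duplicate alerts for an hour
)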
-
-def send_msg(msg, level="debug", message_prefix=""):
-    if setting.WARNING_LEVEL == "ERROR":
-        if level != "error":
-            return
-
-    if setting.DINGDING_WARNING_URL:
-        keyword = "feapder报警系统\n"
-        dingding_warning(keyword + msg, message_prefix=message_prefix)
-
-    if setting.EMAIL_RECEIVER:
-        title = message_prefix or msg
-        if len(title) > 50:
-            title = title[:50] + "..."
-        email_warning(msg, message_prefix=message_prefix, title=title)
-
-    if setting.WECHAT_WARNING_URL:
-        keyword = "feapder报警系统\n"
-        wechat_warning(keyword + msg, message_prefix=message_prefix)
-
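
send_msg is the single entry point the framework's other modules call; it fans out to DingTalk, email, and WeCom depending on which settings are configured. A minimal sketch (module path assumed as above):

from feapder.utils.tools import send_msg  # assumed module path

# level="error" passes the WARNING_LEVEL == "ERROR" filter; message_prefix
# keys the rate limiter so identical alerts are collapsed
send_msg("mongo export failed 5 times in a row", level="error",
         message_prefix="mongo-export-failure")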
-
-###################
-
-
-def make_item(cls, data: dict):
-    """提供Item类与原数据,快速构建Item实例
-    :param cls: Item类
-    :param data: 字典格式的数据
-    """
-    item = cls()
-    for key, val in data.items():
-        setattr(item, key, val)
-    return item
-
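make_item simply setattr()s every key of the dict onto a fresh instance, so the two forms in this sketch are equivalent (DataBakItem is one of the item classes removed later in this commit):

data = {"title": "demo notice", "href": "https://example.com/1.html"}

item = make_item(DataBakItem, data)

# ...which is shorthand for:
item = DataBakItem()
item.title = data["title"]
item.href = data["href"]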
-
-###################
-
-
-def aio_wrap(loop=None, executor=None):
-    """
-    Wrap a normal synchronous function into an async version.
-    """
-    outer_loop = loop
-    outer_executor = executor
-
-    def wrap(fn):
-        @wraps(fn)
-        async def run(*args, loop=None, executor=None, **kwargs):
-            if loop is None:
-                if outer_loop is None:
-                    loop = asyncio.get_event_loop()
-                else:
-                    loop = outer_loop
-            if executor is None:
-                executor = outer_executor
-            pfunc = partial(fn, *args, **kwargs)
-            return await loop.run_in_executor(executor, pfunc)
-
-        return run
-
-    return wrap
-
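The wrapper pushes the blocking call onto an executor thread so the event loop stays responsive. A self-contained sketch, with requests standing in for any blocking function:

import asyncio
import requests

@aio_wrap()
def fetch_status(url):
    # blocking I/O; aio_wrap runs it in the default ThreadPoolExecutor
    return requests.get(url, timeout=10).status_code

async def main():
    print(await fetch_status("https://example.com"))

asyncio.run(main())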
-
-######### number ##########
-
-
-def ensure_int(n):
-    """
-    >>> ensure_int(None)
-    0
-    >>> ensure_int(False)
-    0
-    >>> ensure_int(12)
-    12
-    >>> ensure_int("72")
-    72
-    >>> ensure_int('')
-    0
-    >>> ensure_int('1')
-    1
-    """
-    if not n:
-        return 0
-    return int(n)
-
-
-def ensure_float(n):
-    """
-    >>> ensure_float(None)
-    0.0
-    >>> ensure_float(False)
-    0.0
-    >>> ensure_float(12)
-    12.0
-    >>> ensure_float("72")
-    72.0
-    """
-    if not n:
-        return 0.0
-    return float(n)
-
-
-def ensure_int64(n):
-    """
-    >>> ensure_int64(None)
-    0
-    >>> ensure_int64(False)
-    0
-    >>> ensure_int64(12)
-    12
-    >>> ensure_int64("72")
-    72
-    """
-    if not n:
-        return bson.int64.Int64(0)
-    return bson.int64.Int64(n)
-
-
-def import_cls(cls_info):
-    module, class_name = cls_info.rsplit(".", 1)
-    cls = importlib.import_module(module).__getattribute__(class_name)
-    return cls
-
-
-def load_globals(*modules):
-    """Copy the public attributes of the given modules into this module's globals."""
-    global_dict = globals()
-    for module in modules:
-        for var_name in dir(module):
-            if not var_name.startswith('__'):
-                global_dict[var_name] = getattr(module, var_name)
-
-    return global_dict
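
import_cls resolves a dotted-path string into a class at runtime; this is presumably how strings such as the ITEM_PIPELINES entries in setting.py (removed further down in this commit) become pipeline classes. A small sketch:

# dotted-path string -> class object
pipeline_cls = import_cls("feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline")
pipeline = pipeline_cls()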

+ 0 - 442
spider_frame/FworkSpider/feapder/utils/webdriver.py

@@ -1,442 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on 2023-03-01
----------
-@summary: Remote selenium service
----------
-@author: dzr
-@email: dongzhaorui@topnet.net.cn
-"""
-
-import os
-import queue
-import threading
-
-from selenium import webdriver
-from selenium.webdriver.chrome.remote_connection import ChromeRemoteConnection
-from selenium.webdriver.firefox.remote_connection import FirefoxRemoteConnection
-from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
-
-from feapder.setting import WEBDRIVER
-from feapder.utils.log import log
-from feapder.utils.tools import Singleton
-
-DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
-
-
-class WebDriver(RemoteWebDriver):
-    """浏览器采集 - selenium"""
-    CHROME = "CHROME"
-    FIREFOX = "FIREFOX"
-
-    def __init__(
-        self,
-        load_images=True,
-        user_agent=None,
-        proxy=None,
-        driver_type=CHROME,
-        timeout=20,
-        headless=False,
-        usages_local_driver=False,
-        window_size=(1024, 800),
-        server_addr=None,
-        version=None,
-        custom_argument=None,
-        executable_path=None,
-        service_log_path=None,
-        **kwargs
-    ):
-        """
-        WebDriver wrapper supporting Chrome and Firefox
-        Args:
-            load_images: whether to load images
-            user_agent: a string, or a zero-arg function returning the user_agent
-            proxy: xxx.xxx.xxx.xxx:xxxx, or a zero-arg function returning a proxy address
-            headless: whether to run headless
-            driver_type: CHROME or FIREFOX...
-            timeout: request timeout
-            window_size: window size
-            executable_path: browser path; defaults to the standard location
-            server_addr: remote service address
-            usages_local_driver: whether to use a local driver
-            service_log_path: selenium service log path
-            version: browser version
-            **kwargs:
-        """
-        self._load_images = load_images
-        self._user_agent = user_agent or DEFAULT_USERAGENT
-        self._proxy = proxy
-        self._headless = headless
-        self._usages_local_driver = usages_local_driver
-        self._timeout = timeout
-        self._window_size = window_size
-        self._executable_path = executable_path
-        self._custom_argument = custom_argument
-        self._server_addr = server_addr or WEBDRIVER["server_addr"]
-        self._version = version or WEBDRIVER["version"]
-        self._service_log_path = service_log_path or WEBDRIVER["service_log_path"]
-
-        if driver_type == WebDriver.CHROME:
-            self.driver = self.chrome_driver()
-
-        elif driver_type == WebDriver.FIREFOX:
-            self.driver = self.firefox_driver()
-
-        else:
-            raise TypeError(
-                "driver_type must be one of CHROME or FIREFOX, but received {}".format(
-                    driver_type
-                )
-            )
-
-        # driver.get(url) can hang forever without raising, leaving the program
-        # stuck; setting a page-load timeout resolves this.
-        self.driver.set_page_load_timeout(self._timeout)
-        # Set the script execution timeout (same value as the page-load timeout)
-        self.driver.set_script_timeout(self._timeout)
-
-        self._is_remote = not self._usages_local_driver
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_val:
-            log.error(exc_val)
-
-        self.quit()
-        return False
-
-    def __getattr__(self, name):
-        if self.driver:
-            return getattr(self.driver, name)
-        else:
-            raise AttributeError
-
-    def get_driver(self):
-        return self.driver
-
-    def local_firefox_driver(self):
-        firefox_profile = webdriver.FirefoxProfile()
-        firefox_options = webdriver.FirefoxOptions()
-        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
-        firefox_profile.set_preference("dom.webdriver.enabled", False)
-
-        if self._proxy:
-            proxy = self._proxy() if callable(self._proxy) else self._proxy
-            proxy = proxy.replace("socks5://", "")
-            # Use a socks5 proxy
-            ip, port = proxy.split(":")
-            firefox_profile.set_preference('network.proxy.type', 1)  # 0: no proxy, 1: manual proxy
-            firefox_profile.set_preference('network.proxy.socks', ip)
-            firefox_profile.set_preference('network.proxy.socks_port', int(port))
-
-        if self._user_agent:
-            firefox_profile.set_preference(
-                "general.useragent.override",
-                self._user_agent() if callable(
-                    self._user_agent) else self._user_agent,
-            )
-
-        if not self._load_images:
-            firefox_profile.set_preference("permissions.default.image", 2)
-
-        if self._headless:
-            firefox_options.add_argument("--headless")
-            firefox_options.add_argument("--disable-gpu")
-
-        # Add custom browser arguments
-        if self._custom_argument:
-            for arg in self._custom_argument:
-                firefox_options.add_argument(arg)
-
-        if self._executable_path:
-            driver = webdriver.Firefox(
-                capabilities=firefox_capabilities,
-                options=firefox_options,
-                firefox_profile=firefox_profile,
-                executable_path=self._executable_path,
-                service_log_path=self._service_log_path
-            )
-        else:
-            driver = webdriver.Firefox(
-                capabilities=firefox_capabilities,
-                options=firefox_options,
-                firefox_profile=firefox_profile,
-                service_log_path=self._service_log_path
-            )
-
-        if self._window_size:
-            driver.set_window_size(*self._window_size)
-
-        return driver
-
-    def remote_firefox_driver(self):
-        firefox_options = webdriver.FirefoxOptions()
-        desired_capabilities = firefox_options.to_capabilities()
-        firefox_options.set_preference("dom.webdriver.enabled", False)
-
-        if self._version:
-            desired_capabilities['version'] = self._version
-
-        if self._proxy:
-            proxy = self._proxy() if callable(self._proxy) else self._proxy
-            proxy = proxy.replace("socks5://", "")
-            # Use a socks5 proxy
-            ip, port = proxy.split(":")
-            firefox_options.set_preference('network.proxy.type', 1)  # 0: no proxy, 1: manual proxy
-            firefox_options.set_preference('network.proxy.socks', ip)
-            firefox_options.set_preference('network.proxy.socks_port', int(port))
-
-        if self._user_agent:
-            firefox_options.set_preference(
-                "general.useragent.override",
-                self._user_agent() if callable(self._user_agent) else self._user_agent,
-            )
-
-        if not self._load_images:
-            firefox_options.set_preference("permissions.default.image", 2)
-
-        if self._headless:
-            firefox_options.add_argument("--headless")
-            firefox_options.add_argument("--disable-gpu")
-
-        if self._custom_argument:
-            for arg in self._custom_argument:
-                firefox_options.add_argument(arg)
-
-        executor = FirefoxRemoteConnection(remote_server_addr=self._server_addr)
-        browser = webdriver.Remote(
-            command_executor=executor,
-            desired_capabilities=desired_capabilities,
-            options=firefox_options
-        )
-
-        if self._window_size:
-            browser.set_window_size(*self._window_size)
-
-        return browser
-
-    def firefox_driver(self):
-        if self._usages_local_driver:
-            return self.local_firefox_driver()
-        return self.remote_firefox_driver()
-
-    def remote_chrome_driver(self):
-        chrome_options = webdriver.ChromeOptions()
-        desired_capabilities = chrome_options.to_capabilities()
-        # Important: run in developer mode so sites cannot detect Selenium automation
-        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-        chrome_options.add_experimental_option("useAutomationExtension", False)
-        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
-        # Required when running inside docker
-        chrome_options.add_argument('--no-sandbox')
-        chrome_options.add_argument('--disable-extensions')
-        chrome_options.add_argument('--disable-dev-shm-usage')
-
-        if self._version:
-            desired_capabilities['version'] = self._version
-
-        if self._proxy:
-            chrome_options.add_argument(
-                "--proxy-server={}".format(
-                    self._proxy() if callable(self._proxy) else self._proxy
-                )
-            )
-
-        if self._user_agent:
-            chrome_options.add_argument(
-                "user-agent={}".format(
-                    self._user_agent()
-                    if callable(self._user_agent)
-                    else self._user_agent
-                )
-            )
-
-        if not self._load_images:
-            chrome_options.add_experimental_option(
-                "prefs", {"profile.managed_default_content_settings.images": 2}
-            )
-
-        if self._headless:
-            chrome_options.add_argument("--headless")
-            chrome_options.add_argument("--disable-gpu")
-
-        if self._window_size:
-            chrome_options.add_argument(
-                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
-            )
-
-        # Add custom browser arguments
-        if self._custom_argument:
-            for arg in self._custom_argument:
-                chrome_options.add_argument(arg)
-
-        browser = webdriver.Remote(
-            command_executor=ChromeRemoteConnection(
-                remote_server_addr=self._server_addr,
-                keep_alive=True),
-            desired_capabilities=desired_capabilities,
-            options=chrome_options
-        )
-
-        # Hide browser automation fingerprints
-        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
-            js = f.read()
-            params = {
-                'cmd': 'Page.addScriptToEvaluateOnNewDocument',
-                'params': {'source': js}
-            }
-            browser.execute("executeCdpCommand", params)
-        return browser
-
-    def local_chrome_driver(self):
-        chrome_options = webdriver.ChromeOptions()
-        # Important: run in developer mode so sites cannot detect Selenium automation
-        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-        chrome_options.add_experimental_option("useAutomationExtension", False)
-        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
-        # Required when running inside docker
-        chrome_options.add_argument("--no-sandbox")
-        chrome_options.add_argument('--disable-extensions')
-        chrome_options.add_argument('--disable-dev-shm-usage')
-
-        if self._proxy:
-            chrome_options.add_argument(
-                "--proxy-server={}".format(
-                    self._proxy() if callable(self._proxy) else self._proxy
-                )
-            )
-
-        if self._user_agent:
-            chrome_options.add_argument(
-                "user-agent={}".format(
-                    self._user_agent()
-                    if callable(self._user_agent)
-                    else self._user_agent
-                )
-            )
-
-        if not self._load_images:
-            chrome_options.add_experimental_option(
-                "prefs", {"profile.managed_default_content_settings.images": 2}
-            )
-
-        if self._headless:
-            chrome_options.add_argument("--headless")
-            chrome_options.add_argument("--disable-gpu")
-
-        if self._window_size:
-            chrome_options.add_argument(
-                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
-            )
-
-        # Add custom browser arguments
-        if self._custom_argument:
-            for arg in self._custom_argument:
-                chrome_options.add_argument(arg)
-
-        if self._executable_path:
-            driver = webdriver.Chrome(
-                chrome_options=chrome_options,
-                executable_path=self._executable_path,
-                service_log_path=self._service_log_path
-            )
-        else:
-            driver = webdriver.Chrome(
-                chrome_options=chrome_options,
-                service_log_path=self._service_log_path
-            )
-
-        # Hide browser automation fingerprints
-        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
-            js = f.read()
-            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
-
-        return driver
-
-    def chrome_driver(self):
-        if self._usages_local_driver:
-            return self.local_chrome_driver()
-        return self.remote_chrome_driver()
-
-    @property
-    def cookies(self):
-        cookies_json = {}
-        for cookie in self.driver.get_cookies():
-            cookies_json[cookie["name"]] = cookie["value"]
-        return cookies_json
-
-    @cookies.setter
-    def cookies(self, val: dict):
-        """
-        Set cookies
-        Args:
-            val: {"key":"value", "key2":"value2"}
-
-        Returns:
-
-        """
-        for key, value in val.items():
-            self.driver.add_cookie({"name": key, "value": value})
-
-    def quit(self):
-        try:
-            self.get_driver().quit()
-        except Exception:
-            # We don't care about the message because something probably has gone wrong
-            pass
-
-    # def __del__(self):
-    #     if self.driver:
-    #         self.driver.quit()
-
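A hedged usage sketch of the wrapper above: it works as a context manager, and unknown attributes are proxied to the wrapped driver via __getattr__ (the hub address below is a placeholder):

with WebDriver(
    load_images=False,
    headless=True,
    usages_local_driver=False,
    server_addr="http://selenium-hub:4444/wd/hub",  # placeholder remote hub
) as driver:
    driver.get("https://example.com")
    html = driver.page_source  # resolved on the underlying RemoteWebDriver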
-
-@Singleton
-class WebDriverPool:
-    def __init__(self, pool_size=5, **kwargs):
-        self.queue = queue.Queue(maxsize=pool_size)
-        self.kwargs = kwargs
-        self.lock = threading.RLock()
-        self.driver_count = 0
-
-    @property
-    def is_full(self):
-        return self.driver_count >= self.queue.maxsize
-
-    def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
-        """
-        Get a webdriver from the pool.
-        When a new instance is created, the user_agent and proxy arguments are applied.
-        Args:
-            user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
-            proxy: xxx.xxx.xxx.xxx
-        Returns:
-
-        """
-        if not self.is_full:
-            with self.lock:
-                if not self.is_full:
-                    kwargs = self.kwargs.copy()
-                    if user_agent:
-                        kwargs["user_agent"] = user_agent
-                    if proxy:
-                        kwargs["proxy"] = proxy
-                    driver = WebDriver(**kwargs)
-                    self.queue.put(driver)
-                    self.driver_count += 1
-
-        driver = self.queue.get()
-        return driver
-
-    def put(self, driver):
-        self.queue.put(driver)
-
-    def remove(self, driver):
-        driver.quit()
-        self.driver_count -= 1
-
-    def close(self):
-        while not self.queue.empty():
-            driver = self.queue.get()
-            driver.quit()
-            self.driver_count -= 1
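
Because WebDriverPool is wrapped in @Singleton, every caller shares one pool; the usual lifecycle is get -> use -> put, with remove for a broken driver. A hedged sketch:

pool = WebDriverPool(pool_size=2, headless=True)  # same instance everywhere
driver = pool.get(proxy="127.0.0.1:1080")  # placeholder proxy
try:
    driver.get("https://example.com")
finally:
    pool.put(driver)  # return it for reuse; use pool.remove(driver) if it crashed
pool.close()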

+ 0 - 0
spider_frame/FworkSpider/items/__init__.py


+ 0 - 196
spider_frame/FworkSpider/items/njpc_item.py

@@ -1,196 +0,0 @@
-# -*- coding: utf-8 -*-
-import feapder.utils.tools as tools
-from feapder import BaseListItem, BaseDetailItem
-from feapder.utils.log import log
-from untils.check_data import CheckData
-from untils.tools import substitute, text_search
-
-
-class DataNjpcItem(BaseDetailItem):
-    """拟建类"""
-
-    __attr__ = {
-        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area',
-        'city', 'district', 'href', 'title', 'contenthtml', 'detail',
-        'sendflag', 'projectinfo', 'phone', 'startdate',
-        'constructionunit', 'projecttype', 'ownertel', 'designunittel',
-        'scale', 'designunit', 'owneraddr', 'structure',
-        'house', 'building', 'investment', 'approvestatus', 'person',
-        'floors', 'materials', 'designunitaddr', 'approvecode', 'other',
-        'completedate', 'ownerperson', 'approvenumber',
-        'constructionunittel', 'heating', 'constructionunitaddr',
-        'approvecontent', 'construction', 'parking', 'floor', 'wall',
-        'designunitperson', 'constructionunitperson', 'steel', 'info',
-        'total', 'freshair', 'air', 'projectperiod', 'elevator',
-        'funds', 'pace', 'owner', 'projectaddr', 'system', 'exterior',
-        'method', 'passive', 'conditioner', 'approvedept', 'project',
-        'prefabricated', 'approvetime', 'total_investment', 'project_startdate',
-        'project_completedate', 'project_person', 'project_phone', 'project_scale_info',
-        'project_scale', 'construction_area', 'floor_area', 'building_floors', 'steel_structure',
-        'exterior_wall_materials', 'parking_pace', 'air_conditioner', 'freshair_system', 'heating_method',
-        'prefabricated_building', 'passive_house', 'other_project_scale', 'owner_info', 'designunit_info',
-        'constructionunit_info',
-    }
-
-    # The fields below are tier-2 fields: when absent they are simply not stored, and they are not part of the storage structure
-    # Attachments, Null by default. The correct format is projectinfo.attachments = [{
-    #                       "fid": "attachment id"
-    #                       "filename": "attachment name"
-    #                       "ftype": "file type"
-    #                       "org_url": "original attachment url"
-    #                       "size": "attachment size"
-    #                       "url": "attachment url"}]
-    # Approval item name: approvecontent
-    # Project code (approval code): approvecode
-    # Approval document number: approvenumber
-    # Total investment: total_investment
-    # Funding source: funds
-    # Owner unit: owner
-    # Declaration method (project type): projecttype
-    # Construction site: projectaddr
-    # Construction period: projectperiod
-    # Start date: project_startdate
-    # Completion date: project_completedate
-    # Approval department: approvedept
-    # Approval result: approvestatus
-    # Project contact: project_person
-    # Project contact phone: project_phone
-
-    # Construction scale and main content: project_scale_info
-    # Construction scale: project_scale
-    # Building area: construction_area
-    # Land area: floor_area
-    # Number of floors: building_floors
-    # Steel structure: steel_structure
-    # Exterior wall materials: exterior_wall_materials
-    # Garage parking spaces: parking_pace
-    # Elevators: elevator
-    # Air conditioning: air_conditioner
-    # Fresh-air system: freshair_system
-    # Heating method: heating_method
-    # Prefabricated building: prefabricated_building
-    # Passive house: passive_house
-    # Other construction content description: other_project_scale
-
-    # Tier-3 fields: may be left unprocessed when they are hard to extract
-    # Owner and contact info: owner_info
-    # Owner unit / construction client: owner
-    # Owner contact person: ownerperson
-    # Owner contact phone: ownertel
-    # Owner address: owneraddr
-    # Design institute and contact info: designunit_info
-    # Design unit: designunit
-    # Design unit contact person: designunitperson
-    # Design unit contact phone: designunittel
-    # Design unit address: designunitaddr
-    # Construction unit and contact info: constructionunit_info
-    # Construction unit: constructionunit
-    # Construction unit contact person: constructionunitperson
-    # Construction unit contact phone: constructionunittel
-    # Construction unit address: constructionunitaddr
-
-    def __init__(self, projectname='', publishtime='', **kwargs):
-        """
-
-        @param projectname: project name
-        @param publishtime: article publish time (timestamp), in seconds
-        """
-        kwargs = {k: v for k, v in kwargs.items() if k in self.__attr__}
-        super(DataNjpcItem, self).__init__(**kwargs)
-
-        self.table_name = "data_bak"  # 拟建数据存储表名
-
-        self.projectname = projectname
-        self.publishtime = publishtime
-
-        # Defaults
-        self.T = "bidding"
-        self.infoformat = 2
-        self.is_check_text = True
-
-    def handle_publish_time(self):
-        # Normalize the publish time format
-        publishtime = str(self.publishtime)
-        time_str = tools.get_current_date().split(' ')[-1]
-        if "-" in publishtime and ":" not in publishtime:
-            # date-only string, e.g. "2023-03-01": append the current clock time
-            publishtime = publishtime + " " + time_str
-        elif "-" not in publishtime:
-            # bare timestamp: convert it to a date string
-            publishtime = tools.timestamp_to_date(int(publishtime[:10]))
-            if "00:00:00" in publishtime:
-                publishtime = publishtime.split(' ')[0] + " " + time_str
-        # otherwise both "-" and ":" are present and the value already
-        # looks like "YYYY-MM-DD HH:MM:SS", so no normalization is needed
-
-        # Convert the time string to a timestamp
-        self.publishtime = tools.ensure_int64(tools.date_to_timestamp(publishtime))
-
-    def handle_publish_time_overdue(self):
-        """Handle publish times that lie in the future"""
-        if not isinstance(self.publishtime, int):
-            raise TypeError("Invalid publish time type -> %s " % type(self.publishtime))
-
-        if self.publishtime > tools.get_current_timestamp():
-            log.warning("Publish time is later than now; using the current time instead!")
-            self.publishtime = tools.ensure_int64(tools.get_current_timestamp())
-
-    def handle_page_html(self):
-        if not self.contenthtml:
-            log.warning(f"页面源码不能为空!\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
-            self.dont_save = True
-        else:
-            if not self.detail:
-                self.detail = substitute(self.contenthtml)
-
-            if self.is_check_text and text_search(self.detail).total == 0:
-                self.sendflag = "true"
-
-    def check_data_validity(self):
-        if not self.dont_save:
-            if not self.projectname or not self.publishtime or not self.href:
-                log.warning(f"基础数据不能为空!\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.projectname}")
-                self.dont_save = True
-
-    def cleanup(self):
-        # Drop the flag that marks whether to check the body for Chinese text
-        del self.is_check_text
-
-        if not self.projectinfo:
-            del self.projectinfo
-
-    def pre_to_db(self):
-        if not self.title:
-            self.title = self.projectname
-            log.debug("请检测 < title > 是否正确!")
-
-        self.handle_publish_time()
-        self.handle_publish_time_overdue()
-        self.handle_page_html()
-        self.check_data_validity()
-        self.cleanup()
-
-
-class NjpcListItem(BaseListItem):
-
-    def __init__(self):
-        super(NjpcListItem, self).__init__()
-        self.table_name = 'pyspider_listdata'  # table name
-
-        self.parser_name = ""  # unique key the detail spider uses to pull tasks from MongoDB (naming it after spidercode is recommended)
-        self.parser_url = ""  # detail page URL
-
-        self.projectname = ""  # project name
-        self.publishtime = ""  # article publish time
-        self.is_check_spider = True
-
-    def pre_to_db(self):
-        if self.is_check_spider and CheckData.channel(self.channel, self.site, group="njpc"):
-            code, reason = CheckData.title(self.projectname, group="njpc")
-            if code == 10106:
-                log.warning(f"{self.projectname}--不可入库,原因:{reason}")
-                self.dont_save = True
-
-        # Drop the per-spider flag that controls title validation
-        del self.is_check_spider
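
For reference, a detail parser would populate the item roughly as in this hedged sketch before handing it to the item buffer, which calls pre_to_db() to normalize publishtime and validate the record (all field values are placeholders):

item = DataNjpcItem(
    projectname="xx road reconstruction project",  # placeholder
    publishtime="2023-03-01",  # date-only strings get the current clock time appended
    href="https://example.com/njpc/1.html",
    contenthtml="<div>approval details ...</div>",
    total_investment="120 million",
)
item.pre_to_db()  # normally invoked by the framework before persisting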

+ 0 - 160
spider_frame/FworkSpider/items/spider_item.py

@@ -1,160 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import feapder.utils.tools as tools
-from feapder import BaseListItem, BaseDetailItem
-from feapder.utils.log import log
-from untils.check_data import CheckData
-from untils.tools import (
-    substitute,
-    text_search,
-)
-
-
-class DataBakItem(BaseDetailItem):
-    """招投标(标讯)类"""
-
-    __attr__ = {
-        'pyuuid', 'save', 'site', 'channel', 'spidercode', 'area',
-        'city', 'district', 'href', 'title', 'contenthtml', 'detail',
-        'sendflag', 'projectinfo', 'infoformat'
-    }
-
-    def __init__(self, s_title='', publishtime='', **kwargs):
-        """
-
-        @param s_title: detail page title (required when available); defaults to the list page title
-        @param publishtime: article publish time (from the list page or the detail page)
-        @param kwargs:
-        """
-        kwargs = {k: v for k, v in kwargs.items() if k in self.__attr__}
-        super(DataBakItem, self).__init__(**kwargs)
-        self.s_title = s_title
-        self.publishtime = publishtime
-        self.l_np_publishtime = 0  # publish time as a seconds-level timestamp; must be stored as a long
-
-        self.competehref = None  # competitor-site detail page URL
-
-        self.T = "bidding"
-        self.infoformat = kwargs.get('infoformat', 1)
-
-        '''Default bidding attributes'''
-        self.iscompete = True  # new-spider flag
-        self._d = "comeintime"
-        self.publishdept = ""
-        self.type = ""
-        self.is_check_text = True
-
-    def cleanup(self):
-        # Drop the flag that marks whether to check the body for Chinese text
-        del self.is_check_text
-
-        # competitor-site detail page marker field
-        if not self.competehref:
-            del self.competehref
-
-        # the detail page has no attachments, so the projectinfo field is not needed
-        if not self.projectinfo:
-            del self.projectinfo
-
-    def handle_publish_time(self):
-        time_str = tools.get_current_date().split(' ')[-1]
-        if ':' not in self.publishtime:
-            self.publishtime = self.publishtime + ' ' + time_str
-        elif '00:00:00' in self.publishtime:
-            self.publishtime = self.publishtime.split(' ')[0] + ' ' + time_str
-
-        self.l_np_publishtime = tools.ensure_int64(tools.date_to_timestamp(self.publishtime))
-
-    def handle_publish_time_overdue(self):
-        """处理超期发布时间"""
-        if self.l_np_publishtime and self.l_np_publishtime > tools.get_current_timestamp():
-            log.warning("发布时间大于当前时间,已设置当前时间为发布时间!")
-            self.publishtime = tools.get_current_date()
-            self.l_np_publishtime = tools.ensure_int64(tools.date_to_timestamp(self.publishtime))
-
-    def handle_page_html(self):
-        if not self.contenthtml:
-            log.warning(f"页面源码不能为空!\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
-            self.dont_save = True
-        else:
-            if not self.detail:
-                self.detail = substitute(self.contenthtml)
-
-            if self.is_check_text and text_search(self.detail).total == 0:
-                self.sendflag = "true"  # 无内容数据,数据不入保存服务
-
-    def check_data_validity(self):
-        if not self.dont_save:
-            if not self.title or not self.publishtime or not self.href:
-                log.warning(f"基础数据不能为空!\n 发布地址:{self.href}\n 发布时间:{self.publishtime}\n 标题:{self.title}")
-                self.dont_save = True
-
-    def pre_to_db(self):
-        if not self.s_title:
-            self.s_title = self.title
-            log.debug("请检测 < s_title > 是否正确!")
-
-        self.handle_publish_time()
-        self.handle_publish_time_overdue()
-        self.handle_page_html()
-        self.check_data_validity()
-        self.cleanup()
-
-
-class ExamineAndApproveItem(DataBakItem):
-    """审批类"""
-    def __init__(self, **kwargs):
-        super(ExamineAndApproveItem, self).__init__(**kwargs)
-
-        self.table_name = "data_bak"
-
-        self.T = "bidding"
-        self.infoformat = 2
-
-
-class PropertyRightItem(DataBakItem):
-    """产权类"""
-    def __init__(self, **kwargs):
-        super(PropertyRightItem, self).__init__(**kwargs)
-
-        self.table_name = "data_bak"
-
-        self.T = "bidding"
-        self.infoformat = 3
-
-
-class BidingListItem(BaseListItem):
-
-    def __init__(self):
-        super(BidingListItem, self).__init__()
-        self.table_name = 'pyspider_listdata'
-
-        self.title = ""  # 标题
-        self.publishtime = ""  # 列表页文章发布时间
-
-        self.parse_url = ""  # 详情爬虫访问地址
-        self.parser_name = ""  # 详情爬虫从MongoDB拉取任务的唯一标识(建议使用 spidercode 命名)
-        self.parse = ""  # 详情爬虫解析回调方法名
-
-        self.proxies = False  # 代理
-
-        self.deal_detail = []  # 定义解析详情页主页内容的xpath列表
-        self.ex_js = ""  # 定义需要执行的js代码,包括但不限于script、文件路径等
-        self.ex_python = None  # 定义需要执行的python代码,生成params/date,如headers和cookies特殊,最好使用特殊定义法
-
-        self.files = False  # 采集附件配置
-        self.is_check_spider = True
-
-    def pre_to_db(self):
-        if self.is_check_spider and CheckData.channel(self.channel, self.site):
-            code, reason = CheckData.title(self.title)
-            if code == 10106:
-                log.warning(f"{self.title}--不可入库,原因:{reason}")
-                self.dont_save = True
-
-        # Drop the per-spider flag that controls title validation
-        del self.is_check_spider
-
-
-MgpListItem = BidingListItem
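
A rough sketch of how the two item types cooperated: the list spider emits a BidingListItem describing the detail task, and the detail spider later yields a DataBakItem (every value below is a placeholder):

list_item = BidingListItem()
list_item.title = "procurement announcement"
list_item.publishtime = "2023-03-01 09:00:00"
list_item.parse_url = "https://example.com/detail/1.html"
list_item.parser_name = "a_example_spidercode"  # placeholder spidercode
list_item.parse = "detail_get"  # placeholder callback name
list_item.deal_detail = ['//div[@class="content"]']

detail_item = DataBakItem(s_title=list_item.title,
                          publishtime=list_item.publishtime)
detail_item.contenthtml = "<div>notice body ...</div>"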

+ 0 - 25
spider_frame/FworkSpider/requirements.txt

@@ -1,25 +0,0 @@
-better-exceptions==0.3.3
-AMQPStorm==2.10.6
-beautifulsoup4==4.9.3
-bs4==0.0.1
-DBUtils==3.0.0
-fire==0.4.0
-influxdb==5.3.1
-ipython==7.30.0
-loguru==0.5.3
-lxml==4.6.2
-oss2==2.15.0
-parsel==1.6.0
-pymongo==3.10.1
-PyMySQL==0.9.3
-python-logstash==0.4.8
-redis==3.3.6
-requests==2.24.0
-PySocks==1.7.1
-selenium==3.141.0
-six==1.15.0
-tqdm==4.64.0
-urllib3==1.25.11
-w3lib==1.22.0
-PyExecJS>=1.5.1
-func-timeout==4.3.5

+ 0 - 129
spider_frame/FworkSpider/setting.py

@@ -1,129 +0,0 @@
-# -*- coding: utf-8 -*-
-"""爬虫配置文件"""
-import datetime
-import os
-import sys
-
-# table for items that failed to save
-TAB_FAILED_ITEMS = "pyspider:s_failed_items"
-# table for failed requests
-TAB_FAILED_REQUESTS = "pyspider:z_failed_requests"
-# table where crawl tasks are produced
-TASK_REQUEST_PRODUCE = "pyspider_listdata"
-# table recording failed tasks
-TASK_REQUEST_FAILED = "pyspider_listdata_err"
-# table for spider heartbeats and aggregate crawl metrics
-SPIDER_HEARTBEAT_RECORD = "pyspider_heartbeat"
-
-# MONGODB
-MONGO_IP = "172.17.4.87"
-MONGO_PORT = 27080
-MONGO_DB = "py_spider"
-
-# REDIS
-REDISDB_IP_PORTS = "172.17.162.28:7361"
-REDISDB_USER_PASS = "k5ZJR5KV4q7DRZ92DQ"
-REDISDB_DB = 10
-
-# rabbitMq
-RABBITMQ_IP_PORT = "172.17.162.28:5672"
-RABBITMQ_USER = "root"
-RABBITMQ_USER_PASS = "V0O0049qBI2rV1554jLZPiBZ8H3Bo4"
-RABBITMQ_EXCHANGE = "pyspider.data.spider"
-RABBITMQ_EXCHANGE_TYPE = "direct"
-RABBITMQ_VIRTUAL_HOST = "/"
-RABBITMQ_SOCKET_TIMEOUT = 60
-RABBITMQ_HEARTBEAT = 600
-
-# pipelines that persist the data
-ITEM_PIPELINES = [
-    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
-    "feapder.pipelines.rabbitmq_pipeline.RabbitMqPipeline",
-]
-# max number of export failures (saves and updates); an alert fires beyond this
-EXPORT_DATA_MAX_FAILED_TIMES = 5
-# max number of export retries (saves and updates); give up beyond this
-EXPORT_DATA_MAX_RETRY_TIMES = 5
-
-COLLECTOR_TASK_COUNT = 100  # number of tasks fetched per batch
-
-# Spider
-SPIDER_THREAD_COUNT = 1  # spider concurrency; 32 recommended when speed matters
-SPIDER_MAX_RETRY_TIMES = 3  # max retries per request
-
-# Browser rendering
-WEBDRIVER = dict(
-    pool_size=1,  # number of browsers
-    load_images=False,  # whether to load images
-    user_agent=None,  # a string, or a zero-arg function returning the user_agent
-    proxy=None,  # xxx.xxx.xx.xxx:xxxx, or a zero-arg function returning a proxy address
-    headless=False,  # headless mode
-    driver_type="CHROME",  # CHROME or FIREFOX
-    timeout=30,  # request timeout
-    window_size=(1280, 800),  # window size
-    executable_path=None,  # browser path; defaults to the standard location
-    render_time=0,  # render time: wait this long after opening the page before grabbing the source
-    custom_argument=["--ignore-certificate-errors"],  # custom browser arguments
-    usages_local_driver=False,  # whether to use a local driver
-    server_addr="http://172.17.162.28:6666/wd/hub",  # selenium remote server address
-    version="",  # remote browser version
-    service_log_path=os.devnull  # log path
-)
-
-# Splash rendering
-SPLASH_API = "http://splash.spdata.jianyu360.com/render.json"
-
-# network request timeout for requests
-REQUEST_TIMEOUT = 60
-
-# Proxy settings; the extraction API returns proxies separated by \r\n
-PROXY_ENABLE = True
-JY_PROXY_AUTHOR = "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
-PROXY_EXTRACT_API = "http://proxy.spdata.jianyu360.com/proxy/getallip"
-JY_PROXY_URL = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
-
-# Task center
-JY_TASK_URL = "http://pytask.spdata.jianyu360.com"
-
-# item deduplication
-ITEM_FILTER_ENABLE = True
-ITEM_FILTER_SETTING = dict(
-    filter_type=6,
-    redisdb_conf=[
-        dict(
-            fingerprint_pref="pylist_",
-            ip_port="172.17.162.34:8361",
-            user_pass="k5ZJR5KV4q7DRZ92DQ",
-            db=0
-        ),
-        dict(
-            fingerprint_pref="list_",
-            ip_port="172.17.4.84:4679",
-            user_pass="jytopnet123",
-            db=0
-        )
-    ],
-    expire_time=1 * 365 * 24 * 3600,  # expiry time
-)
-
-# Logging
-DTIME = datetime.datetime.now().strftime("%Y-%m-%d")
-LOG_NAME = os.path.split(sys.argv[0])[-1].split(".")[0]
-LOG_PATH = "log/%s/%s.log" % (DTIME, LOG_NAME)  # log file path
-LOG_LEVEL = "ERROR"
-LOG_COLOR = True  # colored output
-LOG_IS_WRITE_TO_CONSOLE = True  # print to console
-LOG_IS_WRITE_TO_FILE = True  # write to file
-LOG_MODE = "w"  # file write mode
-LOG_MAX_BYTES = 10 * 1024 * 1024  # max bytes per log file
-LOG_BACKUP_COUNT = 20  # number of log files to keep
-LOG_ENCODING = "utf8"  # log file encoding
-OTHERS_LOG_LEVAL = "ERROR"  # log level for third-party libraries; rarely needed
-
-# Remote bucket configuration
-ALI_BUCKET_CONFIG = {
-    "key_id": "LTAI4G5x9aoZx8dDamQ7vfZi",
-    "key_secret": "Bk98FsbPYXcJe72n1bG3Ssf73acuNh",
-    "endpoint": "oss-cn-beijing-internal.aliyuncs.com",
-    "bucket_name": "jy-datafile"
-}
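
feapder also lets an individual spider override these module-level defaults through a __custom_setting__ dict, so per-spider tuning does not require editing this file; a hedged sketch:

import feapder

class DemoSpider(feapder.AirSpider):
    __custom_setting__ = dict(
        SPIDER_THREAD_COUNT=8,  # raise concurrency for this spider only
        LOG_LEVEL="INFO",
        PROXY_ENABLE=False,
    )

    def start_requests(self):
        yield feapder.Request("https://example.com")

    def parse(self, request, response):
        print(response.text[:100])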

+ 0 - 2
spider_frame/FworkSpider/setup.cfg

@@ -1,2 +0,0 @@
-[easy_install]
-index_url = https://mirrors.aliyun.com/pypi/simple

Not all modified files are shown because too many files were changed