maguopeng 3 years ago
parent
commit
8ba6e05988
100 changed files with 18732 additions and 0 deletions
  1. Crawlb/docker-compose.yml (+47, -0)
  2. Crawlb/docker-compose_work.yml (+54, -0)
  3. FworkSpider/details/__init__.py (+15, -0)
  4. FworkSpider/details/detail_ztlbw.py (+134, -0)
  5. FworkSpider/details/details.py (+170, -0)
  6. FworkSpider/details/details_cookie.py (+165, -0)
  7. FworkSpider/details/details_firefox.py (+115, -0)
  8. FworkSpider/details/details_login.py (+150, -0)
  9. FworkSpider/details/dtcookie_pool.py (+88, -0)
  10. FworkSpider/details/file/sj.js (+1, -0)
  11. FworkSpider/feapder/VERSION (+1, -0)
  12. FworkSpider/feapder/__init__.py (+33, -0)
  13. FworkSpider/feapder/buffer/__init__.py (+9, -0)
  14. FworkSpider/feapder/buffer/item_buffer.py (+426, -0)
  15. FworkSpider/feapder/buffer/request_buffer.py (+151, -0)
  16. FworkSpider/feapder/commands/__init__.py (+0, -0)
  17. FworkSpider/feapder/commands/cmdline.py (+45, -0)
  18. FworkSpider/feapder/commands/create/__init__.py (+21, -0)
  19. FworkSpider/feapder/commands/create/create_cookies.py (+48, -0)
  20. FworkSpider/feapder/commands/create/create_init.py (+30, -0)
  21. FworkSpider/feapder/commands/create/create_item.py (+165, -0)
  22. FworkSpider/feapder/commands/create/create_json.py (+52, -0)
  23. FworkSpider/feapder/commands/create/create_params.py (+51, -0)
  24. FworkSpider/feapder/commands/create/create_project.py (+52, -0)
  25. FworkSpider/feapder/commands/create/create_setting.py (+27, -0)
  26. FworkSpider/feapder/commands/create/create_spider.py (+102, -0)
  27. FworkSpider/feapder/commands/create/create_table.py (+135, -0)
  28. FworkSpider/feapder/commands/create_builder.py (+118, -0)
  29. FworkSpider/feapder/commands/shell.py (+93, -0)
  30. FworkSpider/feapder/core/__init__.py (+9, -0)
  31. FworkSpider/feapder/core/base_parser.py (+252, -0)
  32. FworkSpider/feapder/core/collector.py (+176, -0)
  33. FworkSpider/feapder/core/handle_failed_requests.py (+56, -0)
  34. FworkSpider/feapder/core/parser_control.py (+724, -0)
  35. FworkSpider/feapder/core/scheduler.py (+579, -0)
  36. FworkSpider/feapder/core/spiders/__init__.py (+15, -0)
  37. FworkSpider/feapder/core/spiders/air_spider.py (+125, -0)
  38. FworkSpider/feapder/core/spiders/batch_spider.py (+1273, -0)
  39. FworkSpider/feapder/core/spiders/spider.py (+437, -0)
  40. FworkSpider/feapder/db/__init__.py (+9, -0)
  41. FworkSpider/feapder/db/memory_db.py (+37, -0)
  42. FworkSpider/feapder/db/mongodb.py (+426, -0)
  43. FworkSpider/feapder/db/mysqldb.py (+381, -0)
  44. FworkSpider/feapder/db/redisdb.py (+848, -0)
  45. FworkSpider/feapder/dedup/__init__.py (+178, -0)
  46. FworkSpider/feapder/dedup/bitarray.py (+143, -0)
  47. FworkSpider/feapder/dedup/bloomfilter.py (+385, -0)
  48. FworkSpider/feapder/dedup/expirefilter.py (+70, -0)
  49. FworkSpider/feapder/network/__init__.py (+0, -0)
  50. FworkSpider/feapder/network/cookie_pool.py (+821, -0)
  51. FworkSpider/feapder/network/item.py (+145, -0)
  52. FworkSpider/feapder/network/proxy_file/1c718b9e5cc682d4ca7154958d0919c0.txt (+20, -0)
  53. FworkSpider/feapder/network/proxy_file/a62f3217a0981b7b2117d9d0af64c2db.txt (+20, -0)
  54. FworkSpider/feapder/network/proxy_pool.py (+763, -0)
  55. FworkSpider/feapder/network/request.py (+506, -0)
  56. FworkSpider/feapder/network/response.py (+356, -0)
  57. FworkSpider/feapder/network/selector.py (+155, -0)
  58. FworkSpider/feapder/network/user_agent.py (+389, -0)
  59. FworkSpider/feapder/pipelines/__init__.py (+56, -0)
  60. FworkSpider/feapder/pipelines/console_pipeline.py (+47, -0)
  61. FworkSpider/feapder/pipelines/mongo_pipeline.py (+84, -0)
  62. FworkSpider/feapder/pipelines/mysql_pipeline.py (+74, -0)
  63. FworkSpider/feapder/requirements.txt (+17, -0)
  64. FworkSpider/feapder/setting.py (+172, -0)
  65. FworkSpider/feapder/templates/air_spider_template.tmpl (+22, -0)
  66. FworkSpider/feapder/templates/batch_spider_template.tmpl (+45, -0)
  67. FworkSpider/feapder/templates/detail_template.tmpl (+105, -0)
  68. FworkSpider/feapder/templates/item_template.tmpl (+22, -0)
  69. FworkSpider/feapder/templates/project_template/CHECK_DATA.md (+49, -0)
  70. FworkSpider/feapder/templates/project_template/README.md (+8, -0)
  71. FworkSpider/feapder/templates/project_template/items/__init__.py (+0, -0)
  72. FworkSpider/feapder/templates/project_template/main.py (+79, -0)
  73. FworkSpider/feapder/templates/project_template/setting.py (+137, -0)
  74. FworkSpider/feapder/templates/project_template/spiders/__init__.py (+0, -0)
  75. FworkSpider/feapder/templates/spider_list_template.tmpl (+88, -0)
  76. FworkSpider/feapder/templates/spider_template.tmpl (+67, -0)
  77. FworkSpider/feapder/utils/__init__.py (+9, -0)
  78. FworkSpider/feapder/utils/aliyun.py (+168, -0)
  79. FworkSpider/feapder/utils/custom_argparse.py (+63, -0)
  80. FworkSpider/feapder/utils/email_sender.py (+93, -0)
  81. FworkSpider/feapder/utils/js/stealth.min.js (+6, -0)
  82. FworkSpider/feapder/utils/log.py (+265, -0)
  83. FworkSpider/feapder/utils/metrics.py (+539, -0)
  84. FworkSpider/feapder/utils/perfect_dict.py (+94, -0)
  85. FworkSpider/feapder/utils/redis_lock.py (+115, -0)
  86. FworkSpider/feapder/utils/tools.py (+2554, -0)
  87. FworkSpider/feapder/utils/webdriver.py (+334, -0)
  88. FworkSpider/items/__init__.py (+0, -0)
  89. FworkSpider/items/spider_item.py (+125, -0)
  90. FworkSpider/login_pool/__init__.py (+0, -0)
  91. FworkSpider/login_pool/zglbw.py (+95, -0)
  92. FworkSpider/mongo_pipeline.py (+96, -0)
  93. FworkSpider/setting.py (+163, -0)
  94. FworkSpider/untils/__init__.py (+22, -0)
  95. FworkSpider/untils/aliyun.py (+24, -0)
  96. FworkSpider/untils/attachment.py (+198, -0)
  97. FworkSpider/untils/chaojiying.py (+61, -0)
  98. FworkSpider/untils/cookie_pool.py (+788, -0)
  99. FworkSpider/untils/create_menus.py (+33, -0)
  100. FworkSpider/untils/execptions.py (+19, -0)

+ 47 - 0
Crawlb/docker-compose.yml

@@ -0,0 +1,47 @@
+version: '3.3'
+services:
+  master:
+    image: swordfish:v1
+    container_name: master_new
+    environment:
+       CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
+       CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
+       CRAWLAB_SERVER_MASTER: "Y"
+       CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
+       CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
+       CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
+       CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
+       CRAWLAB_REDIS_ADDRESS: "redis"  #
+#       CRAWLAB_REDIS_ADDRESS: "172.19.0.2"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
+       CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
+       CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
+       CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
+       CRAWLAB_SERVER_REGISTER_TYPE: "mac"
+    volumes: # 目录挂载,宿主机在前,容器在后
+      - /mnt/magp:/magp
+    ports:
+        - "8998:8080"
+
+
+#    depends_on:
+#          - redis
+
+#    deploy:
+#      resources:
+#        limits:
+#          memory: 15G
+#        reservations:
+#          memory: 1G
+
+#  mongo:
+#    image: mongo:latest
+#    restart: always
+#    ports:
+#      - "27027:27017"
+#  redis:
+#    image: redis:latest
+#    container_name: master_redis
+#    restart: always
+#    ports:
+#      - "6379:6379"
+#  wget http://download.firefox.com.cn/releases/firefox/78.14/zh-CN/Firefox-latest-x86_64.tar.bz2

+ 54 - 0
Crawlb/docker-compose_work.yml

@@ -0,0 +1,54 @@
+version: '3.3'
+services:
+  worker01:
+    image: swordfish:v1
+    container_name: crawlab_worker01
+    environment:
+      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
+      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
+      CRAWLAB_SERVER_MASTER: "N"
+      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
+      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
+      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
+      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
+      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
+      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
+      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
+      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
+      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
+      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
+
+  worker02:
+    image: swordfish:v1
+    container_name: crawlab_worker02
+    environment:
+      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
+      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
+      CRAWLAB_SERVER_MASTER: "N"
+      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
+      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
+      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
+      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
+      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
+      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
+      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
+      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
+      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
+      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
+  worker03:
+    image: swordfish:v1
+    container_name: crawlab_worker03
+    environment:
+      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址,在 docker compose 网络中,直接引用服务名称
+      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
+      CRAWLAB_SERVER_MASTER: "N"
+      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
+      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
+      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
+      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
+      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址,在 docker compose 网络中,直接引用服务名称
+      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
+      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
+      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
+      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
+      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
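
Both compose files point the Crawlab master and the three workers at the same external MongoDB (172.17.4.87:27080, database jianyu_manage) and at a redis service on database 8. A quick connectivity check from Python is sketched below, assuming the pymongo and redis packages are installed and the hosts are reachable (the "redis" hostname only resolves inside the compose network):

    import pymongo
    import redis

    # Values copied from the CRAWLAB_* environment variables above.
    mongo = pymongo.MongoClient(host="172.17.4.87", port=27080)
    print(mongo["jianyu_manage"].list_collection_names()[:5])

    r = redis.Redis(host="redis", port=6379, db=8)
    print(r.ping())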

+ 15 - 0
FworkSpider/details/__init__.py

@@ -0,0 +1,15 @@
+import requests
+
+
+headers = {
+
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
+}
+cookies = {
+    "__jsluid_h": "018c23a4fee58c26aa118512640f8022"
+}
+url = "http://www.snszgh.gov.cn/gsgg/index.html"
+response = requests.get(url, headers=headers, cookies=cookies, verify=False)
+
+print(response.text)
+print(response)

+ 134 - 0
FworkSpider/details/detail_ztlbw.py

@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-12-13 13:25:15
+---------
+@summary:
+---------
+@author: 马国鹏
+"""
+
+import feapder
+from feapder.utils.log import Log
+from feapder.utils.tools import wechat_warning
+from items.spider_item import DataBakItem, MgpListItem
+from feapder.db.mongodb import MongoDB
+from login_pool.zglbw import ZglbwPool
+from untils.attachment import AttachmentDownloader
+
+Log().info("")
+
+
+class FirefoxDetails(feapder.Spider):
+    _to_db = None
+    db_name = 'mgp_list'
+    send_list = []
+
+    # define the MongoDB connection
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+    def start_requests(self):
+        while True:
+            data_list = self.to_db.find(self.db_name, {"parser_name": "details_ztlbw", "item.spidercode": "a_ztlbsww_jzxtp"},
+                                        sort={"date": -1}, limit=1)
+            print(data_list)
+            for item in data_list:
+                url = item.get("parse_url")
+                url = "https://eproport.crecgec.com/#/notice/notice-detail?projectId=1484412339522916354&tenantId=1&indexnumber=0"
+                cookie = ZglbwPool(table_userbase='zglbw', redis_key='zglbw')
+                cookie = cookie.get_cookie().cookie
+                yield feapder.Request(url=url, item=item.get("item"),
+                                      callback=self.detail_get, base_info=item, render=True,
+                                      render_time=3, proxies=False, cookies=cookie)
+                self.to_db.delete(self.db_name, item)
+            break
+
+    def detail_get(self, request, response):
+        items = request.item
+        # print(items)
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key, items[key])
+        html = ''
+        xpath_list = ['//div[@class="ant-col ant-col-xs-6 ant-col-sm-6 ant-col-lg-12"][1]',
+                      '//div[@class="luban-bid-details ant-row ng-star-inserted"][2]',
+                      '//div[@class="login ng-star-inserted"]']
+        for xpath in xpath_list:
+            # import pdb
+            # pdb.set_trace()
+            html_one = response.xpath(xpath).extract_first()
+            if html_one is not None:
+                html += '\n'  # 标书详细内容
+                html += html_one  # 拼接html
+        print(html)
+        list_item.contenthtml = html
+        files_list = response.xpath("//iframe/@src").extract_first()
+        file_url = files_list.split("file=")[-1]
+        file_url = file_url.replace("%3A", ":").replace("%2F", "/").replace("%3F", "?").replace("%3D", "=")
+        attachments = {}
+        file_name = list_item.title
+
+        attachment = AttachmentDownloader().fetch_attachment(
+            file_name=file_name, file_type='pdf', download_url=file_url,
+            enable_proxy=False)
+        attachments["0"] = attachment
+        list_item.projectinfo = {"attachments": attachments}
+        yield list_item
+
+    def failed_request(self, request, response):
+        '''After request/parse retries exceed the limit, save the original task info back to mongo and update the failed field'''
+        if response is None:
+            code = 0
+        else:
+            code = response.status_code
+        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
+        if 200 <= code < 300:
+            err = 'analysis'
+        elif 300 <= code < 400:
+            err = 'download'
+        elif 400 <= code < 500:
+            err = 'download'
+        elif 500 <= code:
+            err = "servers"
+        else:
+            err = "timeout"
+        mgp = MgpListItem()
+        mgp.code = code
+        mgp.error = err
+        items = request.base_info
+        for key in items:
+            mgp.__setitem__(key, items[key])
+        mgp.failed += 1
+        if mgp.pri is None:
+            mgp.pri = 0
+
+        if mgp.pri > 5:
+            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
+                    '''
+                    根据爬虫优先级报警'''
+                    info = f'''`
+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
+        > **爬虫名称:** {mgp.item.get("site")}
+        > **栏目名称:** {mgp.item.get("channel")}
+        > **爬虫代码:** {mgp.item.get("spidercode")}
+        > **爬虫等级:** {mgp.pri}
+        > **所属管理人员:** {mgp.author}
+        请登录剑鱼爬虫管理平台查看详情。
+        `'''
+                    wechat_warning(info)
+                    self.send_list.append(mgp.item.get("site"))
+        yield mgp
+
+    def end_callback(self):
+        print("爬虫结束")
+        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
+    # def download_midware(self, request):
+    #     request.proxies = self.prox_pool.get()
+    #     return request
+
+
+if __name__ == "__main__":
+    FirefoxDetails(redis_key="magp:details:ztlbw").start()
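
failed_request above (the same handler is repeated in the other detail spiders in this commit) classifies a task that has exhausted its retries by HTTP status code before writing it back as an MgpListItem. A minimal standalone sketch of that mapping follows; classify_failure is an illustrative name, not part of the codebase:

    from typing import Optional

    def classify_failure(status_code: Optional[int]) -> str:
        # Mirrors the if/elif chain in failed_request: no response -> "timeout",
        # 2xx -> "analysis" (page fetched but parsing failed),
        # 3xx/4xx -> "download", 5xx -> "servers".
        code = status_code or 0
        if 200 <= code < 300:
            return "analysis"
        if 300 <= code < 500:
            return "download"
        if code >= 500:
            return "servers"
        return "timeout"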

+ 170 - 0
FworkSpider/details/details.py

@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-12-13 13:25:15
+---------
+@summary:
+---------
+@author: 马国鹏
+"""
+import json
+import sys
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
+import time
+from urllib.parse import urljoin
+
+import feapder
+from feapder.utils.tools import wechat_warning
+import execjs
+from items.spider_item import DataBakItem, MgpListItem
+from feapder.db.mongodb import MongoDB
+from untils.attachment import AttachmentDownloader
+
+
+class Details(feapder.Spider):
+    _to_db = None
+    db_name = 'mgp_list'
+    send_list = []
+    # define the MongoDB connection
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+    def start_requests(self):
+        while True:
+            data_list = self.to_db.find(self.db_name,{"parser_name":"details"},sort={"item.publishtime":-1},limit=50)
+            for item in data_list:
+                print(11111)
+                request_params = item.get("request_params")
+                if item.get("js"):
+                    eval(item.get("js"))
+                if item.get("ex_python"):
+                    exec(item.get("ex_python"))
+                if item.get("proxies"):
+
+                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files=item.get("files"),
+                                          deal_detail=item.get("deal_detail"),
+                                          callback=eval(item.get("parse")),base_info=item,**request_params)
+                else:
+                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
+                                          deal_detail=item.get("deal_detail"),
+                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
+
+                self.to_db.delete(self.db_name,item)
+            break
+
+    def detail_get(self,request,response):
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        html = ''
+        for xpath in request.deal_detail:
+            html = response.xpath(xpath).extract_first()  # 标书详细内容
+            if html is not None:
+                break
+
+        list_item.contenthtml = html
+        if request.files:
+            files_info = request.files
+            files = response.xpath(files_info.get("list_xpath"))
+            if len(files) > 0:
+                attachments = {}
+                for index, info in enumerate(files):
+                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
+                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
+                    if files_info.get("host"):
+                        file_url = urljoin(files_info.get("host"), file_url)
+                    if not files_info.get("file_type"):
+                        file_type = file_url.split("?")[0].split(".")[-1].lower()
+                    else:
+                        file_type = files_info.get("file_type")
+                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
+                        attachment = AttachmentDownloader().fetch_attachment(
+                            file_name=file_name, file_type=file_type, download_url=file_url,
+                            enable_proxy=False)
+                        attachments[len(attachments) + 1] = attachment
+                if attachments:
+                    list_item.projectinfo = {"attachment": attachments}
+
+        yield list_item
+
+    def detail_json(self,request,response):
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        exec(request.deal_detail)
+
+        yield list_item
+    def detail_post(self,request,response):
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        exec(request.deal_detail)
+
+        yield list_item
+
+    def failed_request(self, request, response):
+        '''After request/parse retries exceed the limit, save the original task info back to mongo and update the failed field'''
+        if response is None:
+            code = 0
+        else:
+            code = response.status_code
+        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
+        if 200<=code<300:
+            err = 'analysis'
+        elif 300<=code<400:
+            err = 'download'
+        elif 400<=code<500:
+            err = 'download'
+        elif 500<=code:
+            err = "servers"
+        else:
+            err = "timeout"
+        mgp = MgpListItem()
+        mgp.code=code
+        mgp.error=err
+        items = request.base_info
+        for key in items:
+            mgp.__setitem__(key,items[key])
+        mgp.failed +=1
+        if mgp.pri is None:
+            mgp.pri = 0
+
+        if mgp.pri > 5:
+            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
+                    '''
+                    根据爬虫优先级报警'''
+                    info= f'''`
+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
+        > **爬虫名称:** {mgp.item.get("site")}
+        > **栏目名称:** {mgp.item.get("channel")}
+        > **爬虫代码:** {mgp.item.get("spidercode")}
+        > **爬虫等级:** {mgp.pri}
+        > **所属管理人员:** {mgp.author}
+        请登录剑鱼爬虫管理平台查看详情。
+        `'''
+                    wechat_warning(info)
+                    self.send_list.append(mgp.item.get("site"))
+        yield mgp
+
+    def end_callback(self):
+        print("爬虫结束")
+
+
+
+if __name__ == "__main__":
+    Details(redis_key="magp:details1").start()
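
Details.start_requests drains task documents from the mgp_list collection and rebuilds a feapder.Request from the stored fields, eval-ing the stored callback name. An illustrative task document shape this consumer could handle is sketched below; the field names come from the code above, the values are invented:

    # Hypothetical task document for the generic Details spider.
    example_task = {
        "parser_name": "details",                      # routes the task to this spider
        "parse_url": "http://example.com/notice/1.html",
        "parse": "self.detail_get",                    # eval'd to choose the callback
        "item": {"title": "...", "publishtime": "2021-12-13"},  # copied field-by-field onto DataBakItem
        "deal_detail": ['//div[@class="content"]'],    # xpaths tried in order for contenthtml
        "request_params": {},                          # extra kwargs forwarded to feapder.Request
        "proxies": False,                              # falsy -> request is built with proxies=False
        "files": None,                                 # optional attachment extraction rules
    }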

+ 165 - 0
FworkSpider/details/details_cookie.py

@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-12-13 13:25:15
+---------
+@summary: detail-page handler that generates cookies with a limited validity period and reuses them; IPs are not restricted by default
+---------
+@author: 马国鹏
+"""
+import sys
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
+import feapder
+from feapder.utils.tools import wechat_warning
+import execjs
+from items.spider_item import DataBakItem, MgpListItem
+from feapder.db.mongodb import MongoDB
+
+from untils.cookie_pool import PageCookiePool
+import copy
+
+class Details(feapder.Spider):
+    _to_db = None
+    db_name = 'mgp_list'
+    send_list = []
+    # define the MongoDB connection
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+    def start_requests(self):
+        while True:
+            data_list = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
+            for item in data_list:
+                request_params = item.get("request_params")
+
+                if item.get("ex_python"):
+                    exec(item.get("ex_python"))
+
+                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
+                                      deal_detail=item.get("deal_detail"),**request_params,
+                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
+                self.to_db.delete(self.db_name,item)
+            break
+
+
+
+    def detail_get(self,request,response):
+        '''handle responses returned as html'''
+        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
+            '''failure handling: when "text" is set and appears in response.text, drop the current cookie and regenerate it'''
+            down_mid = copy.copy(request.down_mid)
+            key = down_mid.get("key")
+            page_url = down_mid.get("page_url")
+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+            cookie_pool.del_cookie(request.cookies)
+            yield request
+            return
+        if response.code in request.down_mid.get("code"):
+            '''failure handling: when response.code is not an accepted status code, drop the current cookie and regenerate it'''
+            down_mid = copy.copy(request.down_mid)
+            key = down_mid.get("key")
+            page_url = down_mid.get("page_url")
+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+            cookie_pool.del_cookie(request.cookies)
+            yield request
+            return
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        html = ''
+        for xpath in request.deal_detail:
+            html = response.xpath(xpath).extract_first()  # 标书详细内容
+            if html is not None:
+                break
+
+        list_item.contenthtml = html
+        yield list_item
+
+    def detail_json(self,request,response):
+        '''handle responses returned as json or other formats'''
+        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
+            '''failure handling: when "text" is set and appears in response.text, drop the current cookie and regenerate it'''
+            down_mid = copy.copy(request.down_mid)
+            key = down_mid.get("key")
+            page_url = down_mid.get("page_url")
+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+            cookie_pool.del_cookie(request.cookies)
+            yield request
+            return
+        if response.code in request.down_mid.get("code"):
+            '''failure handling: when response.code is not an accepted status code, drop the current cookie and regenerate it'''
+            down_mid = copy.copy(request.down_mid)
+            key = down_mid.get("key")
+            page_url = down_mid.get("page_url")
+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+            cookie_pool.del_cookie(request.cookies)
+            yield request
+            return
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        html = ''
+        exec(request.deal_detail)
+
+        list_item.contenthtml = html
+        yield list_item
+
+    def failed_request(self, request, response):
+        '''After request/parse retries exceed the limit, save the original task info back to mongo and update the failed field'''
+        if response is None:
+            code = 0
+        else:
+            code = response.status_code
+        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
+        if 200 <= code < 300:
+            err = 'analysis'
+        elif 300 <= code < 400:
+            err = 'download'
+        elif 400 <= code < 500:
+            err = 'download'
+        elif 500 <= code:
+            err = "servers"
+        else:
+            err = "timeout"
+        mgp = MgpListItem()
+        mgp.code = code
+        mgp.error = err
+        items = request.base_info
+        for key in items:
+            mgp.__setitem__(key, items[key])
+        mgp.failed += 1
+        if mgp.pri is None:
+            mgp.pri = 0
+
+        if mgp.pri > 5:
+            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
+                    '''
+                    根据爬虫优先级报警'''
+                    info = f'''`
+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
+        > **爬虫名称:** {mgp.item.get("site")}
+        > **栏目名称:** {mgp.item.get("channel")}
+        > **爬虫代码:** {mgp.item.get("spidercode")}
+        > **爬虫等级:** {mgp.pri}
+        > **所属管理人员:** {mgp.author}
+        请登录剑鱼爬虫管理平台查看详情。
+        `'''
+                    wechat_warning(info)
+                    self.send_list.append(mgp.item.get("site"))
+        yield mgp
+
+
+    def end_callback(self):
+        print("爬虫结束")
+    def download_midware(self, request):
+        down_mid = request.down_mid
+        key = down_mid.get("key")
+        page_url = down_mid.get("page_url")
+        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+        request.cookies = cookie_pool.get_cookie()
+        return request
+
+
+if __name__ == "__main__":
+    Details(redis_key="magp:details1").start()
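
The cookie-based variant threads a down_mid dict through every request: download_midware pulls a cookie from a PageCookiePool keyed by down_mid["key"], and detail_get / detail_json drop that cookie and requeue the request when the configured ban text or one of the rejected status codes shows up. An illustrative down_mid is sketched below; the keys match what the spider reads, the values are placeholders:

    down_mid = {
        "key": "example_site_cookie",            # redis_key of the PageCookiePool
        "page_url": "http://example.com/list",   # page used to (re)generate cookies
        "text": "Access verification required",  # if this text appears in response.text, the cookie is dropped
        "code": [403, 521],                      # status codes treated as "cookie rejected"
    }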

+ 115 - 0
FworkSpider/details/details_firefox.py

@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-12-13 13:25:15
+---------
+@summary:
+---------
+@author: 马国鹏
+"""
+
+import feapder
+from feapder.utils.tools import wechat_warning
+import execjs
+from items.spider_item import DataBakItem, MgpListItem
+from feapder.db.mongodb import MongoDB
+
+
+
+class FirefoxDetails(feapder.Spider):
+    _to_db = None
+    db_name = 'mgp_list'
+    send_list = []
+    # define the MongoDB connection
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+    def start_requests(self):
+        while True:
+            data_list = self.to_db.find(self.db_name,{"parser_name":"details_firefox"},sort={"date":-1})
+            print(data_list)
+            for item in data_list:
+                print(item)
+                request_params = item.get("request_params")
+                if item.get("ex_python"):
+                    exec(item.get("ex_python"))
+
+                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
+                                      deal_detail=item.get("deal_detail"),**request_params,
+                                      callback=eval(item.get("parse")),base_info=item,render=True,
+                                      render_time=item.get("render_time"))
+                self.to_db.delete(self.db_name,item)
+            break
+
+    def detail_get(self,request,response):
+        print(response.text)
+        items = request.item
+        # print(items)
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        html = ''
+        for xpath in request.deal_detail:
+            html = response.xpath(xpath).extract_first()  # 标书详细内容
+            if html is not None:
+                break
+        list_item.contenthtml = html
+        yield list_item
+
+    def failed_request(self, request, response):
+        '''After request/parse retries exceed the limit, save the original task info back to mongo and update the failed field'''
+        if response is None:
+            code = 0
+        else:
+            code = response.status_code
+        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
+        if 200 <= code < 300:
+            err = 'analysis'
+        elif 300 <= code < 400:
+            err = 'download'
+        elif 400 <= code < 500:
+            err = 'download'
+        elif 500 <= code:
+            err = "servers"
+        else:
+            err = "timeout"
+        mgp = MgpListItem()
+        mgp.code = code
+        mgp.error = err
+        items = request.base_info
+        for key in items:
+            mgp.__setitem__(key, items[key])
+        mgp.failed += 1
+        if mgp.pri is None:
+            mgp.pri = 0
+
+        if mgp.pri > 5:
+            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
+                    '''
+                    根据爬虫优先级报警'''
+                    info = f'''`
+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
+        > **爬虫名称:** {mgp.item.get("site")}
+        > **栏目名称:** {mgp.item.get("channel")}
+        > **爬虫代码:** {mgp.item.get("spidercode")}
+        > **爬虫等级:** {mgp.pri}
+        > **所属管理人员:** {mgp.author}
+        请登录剑鱼爬虫管理平台查看详情。
+        `'''
+                    wechat_warning(info)
+                    self.send_list.append(mgp.item.get("site"))
+        yield mgp
+
+
+    def end_callback(self):
+        print("爬虫结束")
+        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
+    # def download_midware(self, request):
+    #     request.proxies = self.prox_pool.get()
+    #     return request
+
+
+if __name__ == "__main__":
+    FirefoxDetails(redis_key="magp:details:firefox").start()

+ 150 - 0
FworkSpider/details/details_login.py

@@ -0,0 +1,150 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-12-13 13:25:15
+---------
+@summary: detail-page handler that generates cookies with a limited validity period and reuses them; IPs are not restricted by default
+---------
+@author: 马国鹏
+"""
+
+import feapder
+from feapder.utils.tools import wechat_warning
+import execjs
+from items.spider_item import DataBakItem, MgpListItem
+from feapder.db.mongodb import MongoDB
+
+from untils.cookie_pool import LoginCookiePool, PageCookiePool
+import copy
+
+class Details(feapder.Spider):
+    _to_db = None
+    db_name = 'mgp_list'
+    send_list = []
+    # define the MongoDB connection
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+    def start_requests(self):
+        while True:
+            data_list = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
+            for item in data_list:
+                request_params = item.get("request_params")
+                down_mid = copy.copy(item.get("down_mid"))
+                key = down_mid.get("key")
+                page_url = down_mid.get("page_url")
+                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+                down_mid["cookie_pool"] = cookie_pool
+                print(down_mid)
+
+                if item.get("ex_python"):
+                    exec(item.get("ex_python"))
+
+                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
+                                      deal_detail=item.get("deal_detail"),**request_params,
+                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
+                self.to_db.delete(self.db_name,item)
+            break
+
+
+
+    def detail_get(self,request,response):
+        '''handle responses returned as html'''
+        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
+            '''failure handling: when "text" is set and appears in response.text, drop the current cookie and regenerate it'''
+            down_mid = copy.copy(request.down_mid)
+            key = down_mid.get("key")
+            page_url = down_mid.get("page_url")
+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+            cookie_pool.del_cookie(request.cookies)
+            yield request
+            return
+        if response.code in request.down_mid.get("code"):
+            '''failure handling: when response.code is not an accepted status code, drop the current cookie and regenerate it'''
+            down_mid = copy.copy(request.down_mid)
+            key = down_mid.get("key")
+            page_url = down_mid.get("page_url")
+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+            cookie_pool.del_cookie(request.cookies)
+            yield request
+            return
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        html = ''
+        for xpath in request.deal_detail:
+            html = response.xpath(xpath).extract_first()  # 标书详细内容
+            if html is not None:
+                break
+
+        list_item.contenthtml = html
+        yield list_item
+
+    def detail_json(self,request,response):
+        '''handle responses returned as json or other formats'''
+        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
+            '''failure handling: when "text" is set and appears in response.text, drop the current cookie and regenerate it'''
+            down_mid = copy.copy(request.down_mid)
+            key = down_mid.get("key")
+            page_url = down_mid.get("page_url")
+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+            cookie_pool.del_cookie(request.cookies)
+            yield request
+            return
+        if response.code in request.down_mid.get("code"):
+            '''failure handling: when response.code is not an accepted status code, drop the current cookie and regenerate it'''
+            down_mid = copy.copy(request.down_mid)
+            key = down_mid.get("key")
+            page_url = down_mid.get("page_url")
+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+            cookie_pool.del_cookie(request.cookies)
+            yield request
+            return
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        html = ''
+        exec(request.deal_detail)
+
+        list_item.contenthtml = html
+        yield list_item
+
+    def failed_request(self, request, response):
+        '''After request/parse retries exceed the limit, save the original task info back to mongo and update the failed field'''
+        mgp = MgpListItem()
+        items = request.base_info
+        for key in items:
+            mgp.__setitem__(key,items[key])
+        mgp.failed +=1
+        print(f'......{mgp.failed}')
+        if mgp.pri > 5:
+            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
+                    '''
+                    根据爬虫优先级报警'''
+                    info= f'''`
+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
+        > **爬虫名称:** {mgp.item.get("site")}
+        > **栏目名称:** {mgp.item.get("channel")}
+        > **爬虫代码:** {mgp.item.get("spidercode")}
+        > **所属管理人员:** {mgp.author}
+        请登录剑鱼爬虫管理平台查看详情。
+        `'''
+                    wechat_warning(info)
+                    self.send_list.append(mgp.item.get("site"))
+        yield mgp
+
+
+    def end_callback(self):
+        print("爬虫结束")
+    def download_midware(self, request):
+        down_mid = request.down_mid
+        key = down_mid.get("key")
+        page_url = down_mid.get("page_url")
+        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
+        request.cookies = cookie_pool.get_cookie()
+        return request
+
+
+if __name__ == "__main__":
+    Details(redis_key="magp:details1").start()

+ 88 - 0
FworkSpider/details/dtcookie_pool.py

@@ -0,0 +1,88 @@
+import json
+import re
+import sys
+
+import execjs
+
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
+from untils.cookie_pool import PageCookiePool
+import requests
+
+
+class DTCookiePool(PageCookiePool):
+    def __init__(self, redis_key, header, page_url=None,
+                 min_cookies=10000, must_contained_keys=(), keep_alive=False, **kwargs):
+        # forward the caller's arguments instead of re-hard-coding the defaults
+        super(DTCookiePool, self).__init__(redis_key, page_url=page_url,
+                                           min_cookies=min_cookies, must_contained_keys=must_contained_keys,
+                                           keep_alive=keep_alive, **kwargs)
+        self.headers=header
+        self.page_url = page_url
+
+    def create_cookie(self,):
+        session = requests.Session()
+        start_url = self.page_url
+        print(self.headers)
+        res = session.get(start_url, headers=self.headers,verify=False)
+        js_func = re.findall("document.cookie=(.*?)location.href", res.text)[0]
+        js_func = 'function sd() { return ' + js_func + "}"
+        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
+        ss = ctx.call("sd")
+        cookies = {}
+
+        for item in ss.split(";"):
+            if '=' in item:
+                cookies[item.split("=")[0]] = item.split("=")[-1]
+        res = session.get(start_url, cookies=cookies, headers=self.headers)
+        js_do_data = re.findall('};go\((.*?)\)', res.text)[0]
+        js_func = re.sub("<(/*?)script>", "", res.text)
+        location = re.compile('location(.*?)}else')
+        setTimeout = re.compile('_(.{37})setTimeout(.*?)document')
+        setTimeout2 = re.compile('setTimeout(.*?)document')
+        gox = re.compile('};go(.*?)\)')
+        js_func = re.sub(location, "}else", js_func)
+        js_func = re.sub(setTimeout, "       document", js_func)
+        js_func = re.sub(setTimeout2, "       document", js_func)
+        js_func = re.sub(gox, "   return document['cookie']\n};", js_func)
+        js_func = '''const jsdom = require("jsdom");
+        const {JSDOM} = jsdom;
+        const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
+        window = dom.window;
+        document = window.document;''' + js_func
+        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
+        with open('ex_js.js', 'w+', encoding='utf-8') as f:
+            f.write(js_func)
+        print(js_do_data)
+        ss = ctx.call("go", json.loads(js_do_data))
+
+        for item in ss.split(";"):
+            if '=' in item:
+                cookies[item.split("=")[0]] = item.split("=")[-1]
+                session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
+        res = session.get(start_url, headers=self.headers, cookies=cookies)
+        cookies = requests.utils.dict_from_cookiejar(session.cookies)
+        return cookies
+
+if __name__ == '__main__':
+    headers = {
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "Accept-Encoding": "gzip, deflate, br",
+    "Accept-Language": "zh-CN,zh;q=0.9",
+    "Cache-Control": "max-age=0",
+    "Connection": "keep-alive",
+    "Host": "www.hefei.gov.cn",
+    "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"Windows\"",
+    "Sec-Fetch-Dest": "document",
+    "Sec-Fetch-Mode": "navigate",
+    "Sec-Fetch-Site": "none",
+    "Sec-Fetch-User": "?1",
+    "Upgrade-Insecure-Requests": "1",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
+}
+
+    cookie_pool = DTCookiePool(
+        page_url='https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2',
+        header=headers, redis_key="dongtaices")
+    cookie = cookie_pool.get_cookie()
+    print(cookie)
+    # cookie_pool.del_cookie(cookie)

File diff suppressed because it is too large
+ 1 - 0
FworkSpider/details/file/sj.js


+ 1 - 0
FworkSpider/feapder/VERSION

@@ -0,0 +1 @@
+1.6.9

+ 33 - 0
FworkSpider/feapder/__init__.py

@@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/21 10:41 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import os, sys
+import re
+
+sys.path.insert(0, re.sub(r"([\\/]items$)|([\\/]spiders$)", "", os.getcwd()))
+
+__all__ = [
+    "AirSpider",
+    "Spider",
+    "BatchSpider",
+    "BaseParser",
+    "BatchParser",
+    "Request",
+    "Response",
+    "Item",
+    "UpdateItem",
+    "ArgumentParser",
+]
+
+from feapder.core.spiders import Spider, BatchSpider, AirSpider
+from feapder.core.base_parser import BaseParser, BatchParser
+from feapder.network.request import Request
+from feapder.network.response import Response
+from feapder.network.item import Item, UpdateItem
+from feapder.utils.custom_argparse import ArgumentParser
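
The package exports the three spider classes plus Request / Response / Item and the argument parser. For reference, a minimal AirSpider built only from these exports (standard feapder usage, not code from this commit; the URL is a placeholder):

    import feapder


    class DemoSpider(feapder.AirSpider):
        def start_requests(self):
            yield feapder.Request("https://www.example.com")

        def parse(self, request, response):
            # parse is the default callback when a Request names no other one
            print(response.xpath("//title/text()").extract_first())


    if __name__ == "__main__":
        DemoSpider().start()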

+ 9 - 0
FworkSpider/feapder/buffer/__init__.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+'''
+Created on 2020/4/23 12:09 AM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+'''

+ 426 - 0
FworkSpider/feapder/buffer/item_buffer.py

@@ -0,0 +1,426 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-06-19 17:17
+---------
+@summary: item manager; buffers items destined for the database and writes them through this single manager, so multiple threads do not hit the database concurrently
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import importlib
+import threading
+from queue import Queue
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.db.redisdb import RedisDB
+from feapder.dedup import Dedup
+from feapder.network.item import Item, UpdateItem
+from feapder.pipelines import BasePipeline
+from feapder.pipelines.mysql_pipeline import MysqlPipeline
+from feapder.utils import metrics
+from feapder.utils.log import log
+
+MAX_ITEM_COUNT = 5000  # 缓存中最大item数
+UPLOAD_BATCH_MAX_SIZE = 1000
+
+MYSQL_PIPELINE_PATH = "feapder.pipelines.mysql_pipeline.MysqlPipeline"
+
+
+class ItemBuffer(threading.Thread):
+    dedup = None
+    __redis_db = None
+
+    def __init__(self, redis_key, task_table=None):
+        if not hasattr(self, "_table_item"):
+            super(ItemBuffer, self).__init__()
+
+            self._thread_stop = False
+            self._is_adding_to_db = False
+            self._redis_key = redis_key
+            self._task_table = task_table
+
+            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
+
+            self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
+            self._table_failed_items = setting.TAB_FAILED_ITEMS.format(
+                redis_key=redis_key
+            )
+
+            self._item_tables = {
+                # 'item_name': 'table_name' # 缓存item名与表名对应关系
+            }
+
+            self._item_update_keys = {
+                # 'table_name': ['id', 'name'...] # 缓存table_name与__update_key__的关系
+            }
+
+            self._pipelines = self.load_pipelines()
+
+            self._have_mysql_pipeline = MYSQL_PIPELINE_PATH in setting.ITEM_PIPELINES
+            self._mysql_pipeline = None
+
+            if setting.ITEM_FILTER_ENABLE and not self.__class__.dedup:
+                self.__class__.dedup = Dedup(
+                    to_md5=False, **setting.ITEM_FILTER_SETTING
+                )
+
+            # 导出重试的次数
+            self.export_retry_times = 0
+            # 导出失败的次数 TODO 非air爬虫使用redis统计
+            self.export_falied_times = 0
+
+    @property
+    def redis_db(self):
+        if self.__class__.__redis_db is None:
+            self.__class__.__redis_db = RedisDB()
+
+        return self.__class__.__redis_db
+
+    def load_pipelines(self):
+        pipelines = []
+        for pipeline_path in setting.ITEM_PIPELINES:
+            module, class_name = pipeline_path.rsplit(".", 1)
+            pipeline_cls = importlib.import_module(module).__getattribute__(class_name)
+            pipeline = pipeline_cls()
+            if not isinstance(pipeline, BasePipeline):
+                raise ValueError(f"{pipeline_path} 需继承 feapder.pipelines.BasePipeline")
+            pipelines.append(pipeline)
+
+        return pipelines
+
+    @property
+    def mysql_pipeline(self):
+        if not self._mysql_pipeline:
+            module, class_name = MYSQL_PIPELINE_PATH.rsplit(".", 1)
+            pipeline_cls = importlib.import_module(module).__getattribute__(class_name)
+            self._mysql_pipeline = pipeline_cls()
+
+        return self._mysql_pipeline
+
+    def run(self):
+        self._thread_stop = False
+        while not self._thread_stop:
+            self.flush()
+            tools.delay_time(1)
+
+        self.close()
+
+    def stop(self):
+        self._thread_stop = True
+        self._started.clear()
+
+    def put_item(self, item):
+        if isinstance(item, Item):
+            # 入库前的回调
+            item.pre_to_db()
+
+        self._items_queue.put(item)
+
+    def flush(self):
+        try:
+            items = []
+            update_items = []
+            requests = []
+            callbacks = []
+            items_fingerprints = []
+            data_count = 0
+
+            while not self._items_queue.empty():
+                data = self._items_queue.get_nowait()
+                data_count += 1
+
+                # data 分类
+                if callable(data):
+                    callbacks.append(data)
+
+                elif isinstance(data, UpdateItem):
+                    update_items.append(data)
+
+                elif isinstance(data, Item):
+                    items.append(data)
+                    if setting.ITEM_FILTER_ENABLE:
+                        items_fingerprints.append(data.fingerprint)
+
+                else:  # request-redis
+                    requests.append(data)
+
+                if data_count >= UPLOAD_BATCH_MAX_SIZE:
+                    self.__add_item_to_db(
+                        items, update_items, requests, callbacks, items_fingerprints
+                    )
+
+                    items = []
+                    update_items = []
+                    requests = []
+                    callbacks = []
+                    items_fingerprints = []
+                    data_count = 0
+
+            if data_count:
+                self.__add_item_to_db(
+                    items, update_items, requests, callbacks, items_fingerprints
+                )
+
+        except Exception as e:
+            log.exception(e)
+
+    def get_items_count(self):
+        return self._items_queue.qsize()
+
+    def is_adding_to_db(self):
+        return self._is_adding_to_db
+
+    def __dedup_items(self, items, items_fingerprints):
+        """
+        去重
+        @param items:
+        @param items_fingerprints:
+        @return: 返回去重后的items, items_fingerprints
+        """
+        if not items:
+            return items, items_fingerprints
+
+        is_exists = self.__class__.dedup.get(items_fingerprints)
+        is_exists = is_exists if isinstance(is_exists, list) else [is_exists]
+
+        dedup_items = []
+        dedup_items_fingerprints = []
+        items_count = dedup_items_count = dup_items_count = 0
+
+        while is_exists:
+            item = items.pop(0)
+            items_fingerprint = items_fingerprints.pop(0)
+            is_exist = is_exists.pop(0)
+
+            items_count += 1
+
+            if not is_exist:
+                dedup_items.append(item)
+                dedup_items_fingerprints.append(items_fingerprint)
+                dedup_items_count += 1
+            else:
+                dup_items_count += 1
+
+        log.info(
+            "待入库数据 {} 条, 重复 {} 条,实际待入库数据 {} 条".format(
+                items_count, dup_items_count, dedup_items_count
+            )
+        )
+
+        return dedup_items, dedup_items_fingerprints
+
+    def __pick_items(self, items, is_update_item=False):
+        """
+        将每个表之间的数据分开 拆分后 原items为空
+        @param items:
+        @param is_update_item:
+        @return:
+        """
+        datas_dict = {
+            # 'table_name': [{}, {}]
+        }
+
+        while items:
+            item = items.pop(0)
+            # 取item下划线格式的名
+            # 下划线类的名先从dict中取,没有则现取,然后存入dict。加快下次取的速度
+            item_name = item.item_name
+            table_name = self._item_tables.get(item_name)
+            if not table_name:
+                table_name = item.table_name
+                self._item_tables[item_name] = table_name
+
+            if table_name not in datas_dict:
+                datas_dict[table_name] = []
+
+            datas_dict[table_name].append(item.to_dict)
+
+            if is_update_item and table_name not in self._item_update_keys:
+                self._item_update_keys[table_name] = item.update_key
+
+        return datas_dict
+
+    def __export_to_db(self, table, datas, is_update=False, update_keys=()):
+        # 打点 校验
+        self.check_datas(table=table, datas=datas)
+
+        for pipeline in self._pipelines:
+            if is_update:
+                if table == self._task_table and not isinstance(
+                    pipeline, MysqlPipeline
+                ):
+                    continue
+
+                if not pipeline.update_items(table, datas, update_keys=update_keys):
+                    log.error(
+                        f"{pipeline.__class__.__name__} 更新数据失败. table: {table}  items: {datas}"
+                    )
+                    return False
+
+            else:
+                if not pipeline.save_items(table, datas):
+                    log.error(
+                        f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
+                    )
+                    return False
+
+        # 若是任务表, 且上面的pipeline里没mysql,则需调用mysql更新任务
+        if not self._have_mysql_pipeline and is_update and table == self._task_table:
+            if not self.mysql_pipeline.update_items(
+                table, datas, update_keys=update_keys
+            ):
+                log.error(
+                    f"{self.mysql_pipeline.__class__.__name__} 更新数据失败. table: {table}  items: {datas}"
+                )
+                return False
+
+        return True
+
+    def __add_item_to_db(
+        self, items, update_items, requests, callbacks, items_fingerprints
+    ):
+        export_success = True
+        self._is_adding_to_db = True
+
+        # 去重
+        if setting.ITEM_FILTER_ENABLE:
+            items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
+
+        # 分捡
+        items_dict = self.__pick_items(items)
+        update_items_dict = self.__pick_items(update_items, is_update_item=True)
+
+        # item批量入库
+        failed_items = {"add": [], "update": [], "requests": []}
+        while items_dict:
+            table, datas = items_dict.popitem()
+
+            log.debug(
+                """
+                -------------- item 批量入库 --------------
+                表名: %s
+                datas: %s
+                    """
+                % (table, tools.dumps_json(datas, indent=16))
+            )
+
+            if not self.__export_to_db(table, datas):
+                export_success = False
+                failed_items["add"].append({"table": table, "datas": datas})
+
+        # 执行批量update
+        while update_items_dict:
+            table, datas = update_items_dict.popitem()
+
+            log.debug(
+                """
+                -------------- item 批量更新 --------------
+                表名: %s
+                datas: %s
+                    """
+                % (table, tools.dumps_json(datas, indent=16))
+            )
+
+            update_keys = self._item_update_keys.get(table)
+            if not self.__export_to_db(
+                table, datas, is_update=True, update_keys=update_keys
+            ):
+                export_success = False
+                failed_items["update"].append({"table": table, "datas": datas})
+
+        if export_success:
+            # 执行回调
+            while callbacks:
+                try:
+                    callback = callbacks.pop(0)
+                    callback()
+                except Exception as e:
+                    log.exception(e)
+
+            # 删除做过的request
+            if requests:
+                self.redis_db.zrem(self._table_request, requests)
+
+            # 去重入库
+            if setting.ITEM_FILTER_ENABLE:
+                if items_fingerprints:
+                    self.__class__.dedup.add(items_fingerprints, skip_check=True)
+        else:
+            failed_items["requests"] = requests
+
+            if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
+                if self._redis_key != "air_spider":
+                    # 失败的item记录到redis
+                    self.redis_db.sadd(self._table_failed_items, failed_items)
+
+                    # 删除做过的request
+                    if requests:
+                        self.redis_db.zrem(self._table_request, requests)
+
+                    log.error(
+                        "入库超过最大重试次数,不再重试,数据记录到redis,items:\n {}".format(
+                            tools.dumps_json(failed_items)
+                        )
+                    )
+                self.export_retry_times = 0
+
+            else:
+                tip = ["入库不成功"]
+                if callbacks:
+                    tip.append("不执行回调")
+                if requests:
+                    tip.append("不删除任务")
+                    exists = self.redis_db.zexists(self._table_request, requests)
+                    for exist, request in zip(exists, requests):
+                        if exist:
+                            self.redis_db.zadd(self._table_request, requests, 300)
+
+                if setting.ITEM_FILTER_ENABLE:
+                    tip.append("数据不入去重库")
+
+                if self._redis_key != "air_spider":
+                    tip.append("将自动重试")
+
+                tip.append("失败items:\n {}".format(tools.dumps_json(failed_items)))
+                log.error(",".join(tip))
+
+                self.export_falied_times += 1
+
+                if self._redis_key != "air_spider":
+                    self.export_retry_times += 1
+
+            if self.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
+                # 报警
+                msg = "《{}》爬虫导出数据失败,失败次数:{},请检查爬虫是否正常".format(
+                    self._redis_key, self.export_falied_times
+                )
+                log.error(msg)
+                tools.send_msg(
+                    msg=msg,
+                    level="error",
+                    message_prefix="《%s》爬虫导出数据失败" % (self._redis_key),
+                )
+
+        self._is_adding_to_db = False
+
+    def check_datas(self, table, datas):
+        """
+        打点 记录总条数及每个key情况
+        @param table: 表名
+        @param datas: 数据 列表
+        @return:
+        """
+        metrics.emit_counter("total count", len(datas), classify=table)
+        for data in datas:
+            for k, v in data.items():
+                metrics.emit_counter(k, int(bool(v)), classify=table)
+
+    def close(self):
+        # 调用pipeline的close方法
+        for pipeline in self._pipelines:
+            try:
+                pipeline.close()
+            except:
+                pass
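
A note on check_datas above: it only records whether each field was populated, not its value, so the emitted counters read as per-field fill rates. An illustrative sketch (table name and fields are made up):

    datas = [
        {"title": "a", "publish_time": ""},
        {"title": "b", "publish_time": "2022-01-01"},
    ]
    # check_datas("news_list", datas) would emit, classified under "news_list":
    #   "total count"  -> 2
    #   "title"        -> 2   (both rows have a non-empty title)
    #   "publish_time" -> 1   (only the second row has a value)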

+ 151 - 0
FworkSpider/feapder/buffer/request_buffer.py

@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-06-19 17:17
+---------
+@summary: request 管理器, 负责缓冲添加到数据库中的request
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import collections
+import threading
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.db.redisdb import RedisDB
+from feapder.dedup import Dedup
+from feapder.utils.log import log
+
+MAX_URL_COUNT = 1000  # 缓存中最大request数
+
+
+class RequestBuffer(threading.Thread):
+    dedup = None
+
+    def __init__(self, redis_key):
+        if not hasattr(self, "_requests_deque"):
+            super(RequestBuffer, self).__init__()
+
+            self._thread_stop = False
+            self._is_adding_to_db = False
+
+            self._requests_deque = collections.deque()
+            self._del_requests_deque = collections.deque()
+            self._db = RedisDB()
+
+            self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
+            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
+                redis_key=redis_key
+            )
+
+            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
+                self.__class__.dedup = Dedup(
+                    name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
+                )  # 默认过期时间为一个月
+
+    def run(self):
+        self._thread_stop = False
+        while not self._thread_stop:
+            try:
+                self.__add_request_to_db()
+            except Exception as e:
+                log.exception(e)
+
+            tools.delay_time(1)
+
+    def stop(self):
+        self._thread_stop = True
+        self._started.clear()
+
+    def put_request(self, request):
+        self._requests_deque.append(request)
+
+        if self.get_requests_count() > MAX_URL_COUNT:  # 超过最大缓存,主动调用
+            self.flush()
+
+    def put_del_request(self, request):
+        self._del_requests_deque.append(request)
+
+    def put_failed_request(self, request, table=None):
+        try:
+            request_dict = request.to_dict
+            self._db.zadd(
+                table or self._table_failed_request, request_dict, request.priority
+            )
+        except Exception as e:
+            log.exception(e)
+
+    def flush(self):
+        try:
+            self.__add_request_to_db()
+        except Exception as e:
+            log.exception(e)
+
+    def get_requests_count(self):
+        return len(self._requests_deque)
+
+    def is_adding_to_db(self):
+        return self._is_adding_to_db
+
+    def __add_request_to_db(self):
+        request_list = []
+        prioritys = []
+        callbacks = []
+
+        while self._requests_deque:
+            request = self._requests_deque.popleft()
+            self._is_adding_to_db = True
+
+            if callable(request):
+                # 函数
+                # 注意:应该考虑闭包情况。闭包情况可写成
+                # def test(xxx = xxx):
+                #     # TODO 业务逻辑 使用 xxx
+                # 这么写不会导致xxx为循环结束后的最后一个值
+                callbacks.append(request)
+                continue
+
+            priority = request.priority
+
+            # 如果需要去重并且库中已重复 则continue
+            if (
+                request.filter_repeat
+                and setting.REQUEST_FILTER_ENABLE
+                and not self.__class__.dedup.add(request.fingerprint)
+            ):
+                log.debug("request已存在  url = %s" % request.url)
+                continue
+            else:
+                request_list.append(str(request.to_dict))
+                prioritys.append(priority)
+
+            if len(request_list) > MAX_URL_COUNT:
+                self._db.zadd(self._table_request, request_list, prioritys)
+                request_list = []
+                prioritys = []
+
+        # 入库
+        if request_list:
+            self._db.zadd(self._table_request, request_list, prioritys)
+
+        # 执行回调
+        for callback in callbacks:
+            try:
+                callback()
+            except Exception as e:
+                log.exception(e)
+
+        # 删除已做任务
+        if self._del_requests_deque:
+            request_done_list = []
+            while self._del_requests_deque:
+                request_done_list.append(self._del_requests_deque.popleft())
+
+            # 去掉request_list中的requests, 否则可能会将刚添加的request删除
+            request_done_list = list(set(request_done_list) - set(request_list))
+
+            if request_done_list:
+                self._db.zrem(self._table_request, request_done_list)
+
+        self._is_adding_to_db = False
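
A minimal usage sketch of RequestBuffer (the redis_key is illustrative; it assumes the Redis connection configured in setting.py is reachable):

    from feapder import Request
    from feapder.buffer.request_buffer import RequestBuffer

    buffer = RequestBuffer(redis_key="example_spider")
    buffer.start()                                       # background thread flushes the deque every second
    buffer.put_request(Request("https://example.com"))   # queued in memory first
    buffer.flush()                                        # force an immediate write instead of waiting
    buffer.stop()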

+ 0 - 0
FworkSpider/feapder/commands/__init__.py


+ 45 - 0
FworkSpider/feapder/commands/cmdline.py

@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/5/8 2:24 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import sys
+from os.path import dirname, join
+
+from feapder.commands import create_builder
+from feapder.commands import shell
+
+
+def _print_commands():
+    with open(join(dirname(dirname(__file__)), "VERSION"), "rb") as f:
+        version = f.read().decode("ascii").strip()
+
+    print("feapder {}".format(version))
+    print("\nUsage:")
+    print("  feapder <command> [options] [args]\n")
+    print("Available commands:")
+    cmds = {"create": "create project、spider、item and so on", "shell": "debug response"}
+    for cmdname, cmdclass in sorted(cmds.items()):
+        print("  %-13s %s" % (cmdname, cmdclass))
+
+    print('\nUse "feapder <command> -h" to see more info about a command')
+
+
+def execute():
+    args = sys.argv
+    if len(args) < 2:
+        _print_commands()
+        return
+
+    command = args.pop(1)
+    if command == "create":
+        create_builder.main()
+    elif command == "shell":
+        shell.main()
+    else:
+        _print_commands()
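
execute() only dispatches on the second argv element; everything after it is parsed by the sub-command itself. A sketch of driving it programmatically, equivalent to running "feapder create --setting" in a shell (importing cmdline also pulls in the shell module, so IPython must be installed):

    import sys
    from feapder.commands import cmdline

    sys.argv = ["feapder", "create", "--setting"]   # simulated command line
    cmdline.execute()   # pops "create", then create_builder.main() parses the remaining "--setting"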

+ 21 - 0
FworkSpider/feapder/commands/create/__init__.py

@@ -0,0 +1,21 @@
+__all__ = [
+    "CreateProject",
+    "CreateSpider",
+    "CreateItem",
+    "CreateInit",
+    "CreateJson",
+    "CreateTable",
+    "CreateCookies",
+    "CreateSetting",
+    "CreateParams",
+]
+
+from .create_table import CreateTable
+from .create_json import CreateJson
+from .create_spider import CreateSpider
+from .create_init import CreateInit
+from .create_item import CreateItem
+from .create_project import CreateProject
+from .create_cookies import CreateCookies
+from .create_setting import CreateSetting
+from .create_params import CreateParams

+ 48 - 0
FworkSpider/feapder/commands/create/create_cookies.py

@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/4/25 10:22 上午
+---------
+@summary: 将浏览器的cookie转为request的cookie
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import json
+import sys
+
+from feapder.utils.tools import get_cookies_from_str, print_pretty
+
+
+class CreateCookies:
+    def get_data(self):
+        """
+        @summary: 从控制台读取多行
+        ---------
+        ---------
+        @result:
+        """
+        print("请输入浏览器cookie (列表或字符串格式)")
+        data = []
+        while True:
+            line = sys.stdin.readline().strip()
+            if not line:
+                break
+
+            data.append(line)
+
+        return "".join(data)
+
+    def create(self):
+        data = self.get_data()
+        cookies = {}
+        try:
+            data_json = json.loads(data)
+
+            for data in data_json:
+                cookies[data.get("name")] = data.get("value")
+
+        except:
+            cookies = get_cookies_from_str(data)
+
+        print_pretty(cookies)
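
Both accepted inputs end up as the same dict: a devtools-style JSON list of {name, value} objects, or a raw Cookie header string handled by get_cookies_from_str. A quick sketch of the string path (result shown as expected for a standard Cookie header):

    from feapder.utils.tools import get_cookies_from_str

    cookies = get_cookies_from_str("sessionid=abc123; token=xyz")
    # cookies -> {"sessionid": "abc123", "token": "xyz"}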

+ 30 - 0
FworkSpider/feapder/commands/create/create_init.py

@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-08-28 17:38:43
+---------
+@summary: 创建__init__.py
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+from feapder.utils.tools import dumps_json
+
+
+class CreateInit:
+    def create(self):
+        __all__ = []
+
+        import os
+
+        path = os.getcwd()
+        for file in os.listdir(path):
+            if file.endswith(".py") and not file.startswith("__init__"):
+                model = file.split(".")[0]
+                __all__.append(model)
+
+        del os
+
+        with open("__init__.py", "w", encoding="utf-8") as file:
+            text = "__all__ = %s" % dumps_json(__all__)
+            file.write(text)

+ 165 - 0
FworkSpider/feapder/commands/create/create_item.py

@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-08-28 17:38:43
+---------
+@summary: 创建item
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import getpass
+import os
+
+import feapder.utils.tools as tools
+from feapder import setting
+from feapder.db.mysqldb import MysqlDB
+from .create_init import CreateInit
+
+
+def deal_file_info(file):
+    file = file.replace("{DATE}", tools.get_current_date())
+    file = file.replace("{USER}", getpass.getuser())
+
+    return file
+
+
+class CreateItem:
+    def __init__(self):
+        self._db = MysqlDB()
+        self._create_init = CreateInit()
+
+    def select_columns(self, table_name):
+        # sql = 'SHOW COLUMNS FROM ' + table_name
+        sql = f"SELECT COLUMN_NAME, COLUMN_TYPE, IS_NULLABLE, COLUMN_DEFAULT, EXTRA, COLUMN_KEY, COLUMN_COMMENT FROM INFORMATION_SCHEMA.Columns WHERE table_name = '{table_name}' and table_schema = '{setting.MYSQL_DB}'"
+        columns = self._db.find(sql)
+
+        return columns
+
+    def select_tables_name(self, tables_name):
+        """
+        @summary:
+        ---------
+        @param tables_name: 一类tables 如 qidian*
+        ---------
+        @result:
+        """
+        sql = f"select table_name from information_schema.tables where table_name like '{tables_name}' and table_schema = '{setting.MYSQL_DB}'"
+        tables_name = self._db.find(sql)
+
+        return tables_name
+
+    def convert_table_name_to_hump(self, table_name):
+        """
+        @summary: 格式化表名为驼峰格式
+        ---------
+        @param table:
+        ---------
+        @result:
+        """
+        table_hump_format = ""
+
+        words = table_name.split("_")
+        for word in words:
+            table_hump_format += word.capitalize()  # 首字母大写
+
+        return table_hump_format
+
+    def get_item_template(self):
+        template_path = os.path.abspath(
+            os.path.join(__file__, "../../../templates/item_template.tmpl")
+        )
+        with open(template_path, "r", encoding="utf-8") as file:
+            item_template = file.read()
+
+        return item_template
+
+    def create_item(self, item_template, columns, table_name, support_dict):
+        table_name_hump_format = self.convert_table_name_to_hump(table_name)
+        # 组装 类名
+        item_template = item_template.replace("${item_name}", table_name_hump_format)
+        if support_dict:
+            item_template = item_template.replace("${table_name}", table_name + " 1")
+        else:
+            item_template = item_template.replace("${table_name}", table_name)
+
+        # 组装 属性
+        propertys = ""
+        for column in columns:
+            column_name = column[0]
+            column_type = column[1]
+            is_nullable = column[2]
+            column_default = column[3]
+            column_extra = column[4]
+            column_key = column[5]
+            column_comment = column[6]
+
+            try:
+                value = (
+                    "kwargs.get('{column_name}')".format(column_name=column_name)
+                    if support_dict
+                    else (
+                        column_default != "CURRENT_TIMESTAMP" and column_default or None
+                    )
+                    and eval(column_default)
+                )
+            except:
+                value = (
+                    "kwargs.get('{column_name}')".format(column_name=column_name)
+                    if support_dict
+                    else (
+                        column_default != "CURRENT_TIMESTAMP" and column_default or None
+                    )
+                    and column_default
+                )
+
+            if column_extra == "auto_increment" or column_default is not None:
+                propertys += f"# self.{column_name} = {value}"
+
+            else:
+                if value is None or isinstance(value, (float, int)) or support_dict:
+                    propertys += f"self.{column_name} = {value}"
+                else:
+                    propertys += f"self.{column_name} = '{value}'"
+
+            if column_comment:
+                propertys += f"  # {column_comment}"
+            propertys += "\n" + " " * 8
+
+        item_template = item_template.replace("${propertys}", propertys.strip())
+        item_template = deal_file_info(item_template)
+
+        return item_template
+
+    def save_template_to_file(self, item_template, table_name):
+        item_file = table_name + "_item.py"
+        if os.path.exists(item_file):
+            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % item_file)
+            if confirm != "y":
+                print("取消覆盖  退出")
+                return
+
+        with open(item_file, "w", encoding="utf-8") as file:
+            file.write(item_template)
+            print("\n%s 生成成功" % item_file)
+
+        self._create_init.create()
+
+    def create(self, tables_name, support_dict):
+        input_tables_name = tables_name
+
+        tables_name = self.select_tables_name(tables_name)
+        if not tables_name:
+            print(tables_name)
+            tip = "mysql数据库中无 %s 表 " % input_tables_name
+            raise KeyError(tip)
+
+        for table_name in tables_name:
+            table_name = table_name[0]
+
+            columns = self.select_columns(table_name)
+            item_template = self.get_item_template()
+            item_template = self.create_item(
+                item_template, columns, table_name, support_dict
+            )
+            self.save_template_to_file(item_template, table_name)

+ 52 - 0
FworkSpider/feapder/commands/create/create_json.py

@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-08-28 17:38:43
+---------
+@summary: 字符串转json
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import sys
+
+import feapder.utils.tools as tools
+
+
+class CreateJson:
+    def get_data(self):
+        """
+        @summary: 从控制台读取多行
+        ---------
+        ---------
+        @result:
+        """
+        print("请输入需要转换的内容: (xxx:xxx格式,支持多行)")
+        data = []
+        while True:
+            line = sys.stdin.readline().strip().replace("\t", " " * 4)
+            if not line:
+                break
+
+            data.append(line)
+
+        return data
+
+    def create(self, sort_keys=False):
+        contents = self.get_data()
+
+        json = {}
+        for content in contents:
+            content = content.strip()
+            if not content or content.startswith(":"):
+                continue
+
+            regex = "([^:\s]*)[:|\s]*(.*)"
+
+            result = tools.get_info(content, regex, fetch_one=True)
+            if result[0] in json:
+                json[result[0]] = json[result[0]] + "&" + result[1]
+            else:
+                json[result[0]] = result[1].strip()
+
+        print(tools.dumps_json(json, sort_keys=sort_keys))
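
The regex splits each pasted line on the first colon or run of whitespace, which makes the command handy for turning copied request headers into a dict. An illustrative run:

    # console input:
    #   Host: example.com
    #   User-Agent: Mozilla/5.0
    # prints roughly:
    #   {
    #       "Host": "example.com",
    #       "User-Agent": "Mozilla/5.0"
    #   }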

+ 51 - 0
FworkSpider/feapder/commands/create/create_params.py

@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/4/25 10:22 上午
+---------
+@summary: 解析请求地址中的参数
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import sys
+
+from feapder.utils.tools import dumps_json
+
+
+class CreateParams:
+    def get_data(self):
+        """
+        @summary: 从控制台读取多行
+        ---------
+        ---------
+        @result:
+        """
+        print("请输入请求地址")
+        data = []
+        while True:
+            line = sys.stdin.readline().strip()
+            if not line:
+                break
+
+            data.append(line)
+
+        return "".join(data)
+
+    def get_params(self, url):
+        params_json = {}
+        params = url.split("?")[-1].split("&")
+        for param in params:
+            key_value = param.split("=", 1)
+            params_json[key_value[0]] = key_value[1]
+
+        return params_json
+
+    def create(self):
+        data = self.get_data()
+
+        params = self.get_params(data)
+        url = data.split("?")[0]
+
+        print(f'url = "{url}"')
+        print(f"params = {dumps_json(params)}")

+ 52 - 0
FworkSpider/feapder/commands/create/create_project.py

@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-08-28 17:38:43
+---------
+@summary: 创建项目
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import getpass
+import os
+import shutil
+
+import feapder.utils.tools as tools
+
+
+def deal_file_info(file):
+    file = file.replace("{DATE}", tools.get_current_date())
+    file = file.replace("{USER}", getpass.getuser())
+
+    return file
+
+
+class CreateProject:
+    def copy_callback(self, src, dst, *, follow_symlinks=True):
+        if src.endswith(".py"):
+            with open(src, "r", encoding="utf-8") as src_file, open(
+                dst, "w", encoding="utf8"
+            ) as dst_file:
+                content = src_file.read()
+                content = deal_file_info(content)
+                dst_file.write(content)
+
+        else:
+            shutil.copy2(src, dst, follow_symlinks=follow_symlinks)
+
+    def create(self, project_name):
+        if os.path.exists(project_name):
+            print("%s 项目已经存在" % project_name)
+        else:
+            template_path = os.path.abspath(
+                os.path.join(__file__, "../../../templates/project_template")
+            )
+            shutil.copytree(
+                template_path, project_name, copy_function=self.copy_callback
+            )
+
+            print("\n%s 项目生成成功" % project_name)
+
+
+

+ 27 - 0
FworkSpider/feapder/commands/create/create_setting.py

@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/4/23 13:20
+---------
+@summary: 生成配置文件
+---------
+@author: mkdir700
+@email:  mkdir700@gmail.com
+"""
+
+import os
+import shutil
+
+
+class CreateSetting:
+    def create(self):
+        if os.path.exists("setting.py"):
+            confirm = input("配置文件已存在 是否覆盖 (y/n).  ")
+            if confirm != "y":
+                print("取消覆盖  退出")
+                return
+
+        template_file_path = os.path.abspath(
+            os.path.join(__file__, "../../../templates/project_template/setting.py")
+        )
+        shutil.copy(template_file_path, "./", follow_symlinks=False)
+        print("配置文件生成成功")

+ 102 - 0
FworkSpider/feapder/commands/create/create_spider.py

@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-08-28 17:38:43
+---------
+@summary: 创建spider
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import getpass
+import os
+import re
+
+import feapder.utils.tools as tools
+from .create_init import CreateInit
+
+
+def deal_file_info(file):
+    file = file.replace("{DATE}", tools.get_current_date())
+    # file = file.replace("{USER}", getpass.getuser())
+    file = file.replace("{USER}", os.path.basename(os.getcwd()))
+
+    return file
+
+
+class CreateSpider:
+    def __init__(self):
+        self._create_init = CreateInit()
+
+    def cover_to_underline(self, key):
+        regex = "[A-Z]*"
+        capitals = re.findall(regex, key)
+
+        if capitals:
+            for pos, capital in enumerate(capitals):
+                if not capital:
+                    continue
+                if pos == 0:
+                    if len(capital) > 1:
+                        key = key.replace(capital, capital.lower() + "_", 1)
+                    else:
+                        key = key.replace(capital, capital.lower(), 1)
+                else:
+                    if len(capital) > 1:
+                        key = key.replace(capital, "_" + capital.lower() + "_", 1)
+                    else:
+                        key = key.replace(capital, "_" + capital.lower(), 1)
+
+        return key
+
+    def get_spider_template(self, spider_type):
+        if spider_type == 1:
+            template_path = "air_spider_template.tmpl"
+        elif spider_type == 2:
+            template_path = "spider_template.tmpl"
+        elif spider_type == 3:
+            template_path = "batch_spider_template.tmpl"
+        elif spider_type == 4:
+            template_path = "spider_list_template.tmpl"
+        else:
+            raise ValueError("spider type error, support 1 2 3")
+
+        template_path = os.path.abspath(
+            os.path.join(__file__, "../../../templates", template_path)
+        )
+        with open(template_path, "r", encoding="utf-8") as file:
+            spider_template = file.read()
+
+        return spider_template
+
+    def create_spider(self, spider_template, spider_name):
+        spider_template = spider_template.replace("${spider_name}", spider_name)
+        spider_template = deal_file_info(spider_template)
+        return spider_template
+
+    def save_spider_to_file(self, spider, spider_name):
+        spider_underline = self.cover_to_underline(spider_name)
+        spider_file = spider_underline + ".py"
+
+        if os.path.exists(spider_file):
+            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % spider_file)
+            if confirm != "y":
+                print("取消覆盖  退出")
+                return
+
+        with open(spider_file, "w", encoding="utf-8") as file:
+            file.write(spider)
+            print("\n%s 生成成功" % spider_name)
+
+        self._create_init.create()
+
+    def create(self, spider_name, spider_type):
+        # 检查spider_name
+        if not re.search("^[a-zA-Z][a-zA-Z0-9_]*$", spider_name):
+            raise Exception("爬虫名不符合命名规范,请用下划线命名或驼峰命名方式")
+
+        if spider_name.islower():
+            spider_name = tools.key2hump(spider_name)
+        spider_template = self.get_spider_template(spider_type)
+        spider = self.create_spider(spider_template, spider_name)
+        self.save_spider_to_file(spider, spider_name)
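
The naming round-trip matters here: a lower-case name is first converted to CamelCase for the class, then cover_to_underline derives the file name from it. An illustrative sketch:

    from feapder.commands.create import CreateSpider

    CreateSpider().cover_to_underline("NewsDetailSpider")   # -> "news_detail_spider"
    # so "feapder create -s news_detail_spider 2" writes class NewsDetailSpider
    # into news_detail_spider.py using spider_template.tmpl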

+ 135 - 0
FworkSpider/feapder/commands/create/create_table.py

@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-08-28 17:38:43
+---------
+@summary: 根据json生成表
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import sys
+import time
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.db.mysqldb import MysqlDB
+from feapder.utils.tools import key2underline
+
+
+class CreateTable:
+    def __init__(self):
+        self._db = MysqlDB()
+
+    def is_vaild_date(self, date):
+        try:
+            if ":" in date:
+                time.strptime(date, "%Y-%m-%d %H:%M:%S")
+            else:
+                time.strptime(date, "%Y-%m-%d")
+            return True
+        except:
+            return False
+
+    def get_key_type(self, value):
+        try:
+            value = eval(value)
+        except:
+            value = value
+
+        key_type = "varchar(255)"
+        if isinstance(value, int):
+            key_type = "int"
+        elif isinstance(value, float):
+            key_type = "double"
+        elif isinstance(value, str):
+            if self.is_vaild_date(value):
+                if ":" in value:
+                    key_type = "datetime"
+                else:
+                    key_type = "date"
+            elif len(value) > 255:
+                key_type = "text"
+            else:
+                key_type = "varchar(255)"
+
+        return key_type
+
+    def get_data(self):
+        """
+        @summary: 从控制台读取多行
+        ---------
+        ---------
+        @result:
+        """
+        data = ""
+        while True:
+            line = sys.stdin.readline().strip()
+            if not line:
+                break
+            data += line
+
+        return tools.get_json(data)
+
+    def create(self, table_name):
+        # 输入表字段
+        print('请输入表数据 json格式 如 {"name":"张三"}\n等待输入:\n')
+        data = self.get_data()
+
+        if not isinstance(data, dict):
+            raise Exception("表数据格式不正确")
+
+        # 拼接表结构
+        sql = """
+            CREATE TABLE `{db}`.`{table_name}` (
+                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT 'id 自动递增',
+                {other_key}
+                `gtime` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '抓取时间',
+                PRIMARY KEY (`id`),
+                {unique}
+            ) COMMENT='';
+        """
+
+        print("请设置注释 回车跳过")
+        other_key = ""
+        for key, value in data.items():
+            key = key2underline(key)
+            key_type = self.get_key_type(value)
+
+            comment = input("%s : %s  -> comment:" % (key, key_type))
+
+            other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
+                key=key, key_type=key_type, comment=comment
+            )
+
+        print("\n")
+
+        while True:
+            is_need_batch_date = input("是否添加batch_date 字段 (y/n):")
+            if is_need_batch_date == "y":
+                other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
+                    key="batch_date", key_type="date", comment="批次时间"
+                )
+                break
+            elif is_need_batch_date == "n":
+                break
+
+        print("\n")
+
+        while True:
+            unique = input("请设置唯一索引, 多个逗号间隔\n等待输入:\n").replace(",", ",")
+            if unique:
+                break
+        unique = "UNIQUE `idx` USING BTREE (`%s`) comment ''" % "`,`".join(
+            unique.split(",")
+        )
+
+        sql = sql.format(
+            db=setting.MYSQL_DB,
+            table_name=table_name,
+            other_key=other_key,
+            unique=unique,
+        )
+        print(sql)
+        self._db.execute(sql)
+        print("\n%s 创建成功" % table_name)

+ 118 - 0
FworkSpider/feapder/commands/create_builder.py

@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/2/8 11:21 上午
+---------
+@summary: 生成器
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import argparse
+
+import feapder.setting as setting
+from feapder.commands.create import *
+
+
+def main():
+    spider = argparse.ArgumentParser(description="生成器")
+
+    spider.add_argument(
+        "-p", "--project", help="创建项目 如 feapder create -p <project_name>", metavar=""
+    )
+    spider.add_argument(
+        "-s",
+        "--spider",
+        nargs="+",
+        help="创建爬虫\n"
+        "如 feapder create -s <spider_name> <spider_type> "
+        "spider_type=1  AirSpider; "
+        "spider_type=2  Spider; "
+        "spider_type=3  BatchSpider;",
+        metavar="",
+    )
+    spider.add_argument(
+        "-i",
+        "--item",
+        nargs="+",
+        help="创建item 如 feapder create -i test 则生成test表对应的item。 "
+        "支持like语法模糊匹配所要生产的表。 "
+        "若想生成支持字典方式赋值的item,则create -item test 1",
+        metavar="",
+    )
+    spider.add_argument(
+        "-t", "--table", help="根据json创建表 如 feapder create -t <table_name>", metavar=""
+    )
+    spider.add_argument(
+        "-init", help="创建__init__.py 如 feapder create -init", action="store_true"
+    )
+    spider.add_argument("-j", "--json", help="创建json", action="store_true")
+    spider.add_argument("-sj", "--sort_json", help="创建有序json", action="store_true")
+    spider.add_argument("-c", "--cookies", help="创建cookie", action="store_true")
+    spider.add_argument("--params", help="解析地址中的参数", action="store_true")
+    spider.add_argument(
+        "--setting", help="创建全局配置文件" "feapder create --setting", action="store_true"
+    )
+
+    # 指定数据库
+    spider.add_argument("--host", type=str, help="mysql 连接地址", metavar="")
+    spider.add_argument("--port", type=str, help="mysql 端口", metavar="")
+    spider.add_argument("--username", type=str, help="mysql 用户名", metavar="")
+    spider.add_argument("--password", type=str, help="mysql 密码", metavar="")
+    spider.add_argument("--db", type=str, help="mysql 数据库名", metavar="")
+    args = spider.parse_args()
+
+    if args.host:
+        setting.MYSQL_IP = args.host
+    if args.port:
+        setting.MYSQL_PORT = int(args.port)
+    if args.username:
+        setting.MYSQL_USER_NAME = args.username
+    if args.password:
+        setting.MYSQL_USER_PASS = args.password
+    if args.db:
+        setting.MYSQL_DB = args.db
+
+    if args.item:
+        item_name, *support_dict = args.item
+        support_dict = bool(support_dict)
+        CreateItem().create(item_name, support_dict)
+
+    elif args.spider:
+        spider_name, *spider_type = args.spider
+        if not spider_type:
+            spider_type = 1
+        else:
+            spider_type = spider_type[0]
+        try:
+            spider_type = int(spider_type)
+        except:
+            raise ValueError("spider_type error, support 1, 2, 3")
+        CreateSpider().create(spider_name, spider_type)
+
+    elif args.project:
+        CreateProject().create(args.project)
+
+    elif args.table:
+        CreateTable().create(args.table)
+
+    elif args.init:
+        CreateInit().create()
+
+    elif args.json:
+        CreateJson().create()
+
+    elif args.sort_json:
+        CreateJson().create(sort_keys=True)
+
+    elif args.cookies:
+        CreateCookies().create()
+
+    elif args.setting:
+        CreateSetting().create()
+
+    elif args.params:
+        CreateParams().create()
+
+
+if __name__ == "__main__":
+    main()

+ 93 - 0
FworkSpider/feapder/commands/shell.py

@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/5/9 12:37 AM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import json
+import re
+import sys
+
+import IPython
+
+from feapder import Request
+
+
+def request(**kwargs):
+    kwargs.setdefault("proxies", None)
+    response = Request(**kwargs).get_response()
+    print(response)
+
+    IPython.embed(header="now you can use response")
+
+
+def fetch_url(url):
+    request(url=url)
+
+
+def fetch_curl(curl_args):
+    """
+    解析及抓取curl请求
+    :param curl_args:
+    [url, '-H', 'xxx', '-H', 'xxx', '--data-binary', '{"xxx":"xxx"}', '--compressed']
+    :return:
+    """
+    url = curl_args[0]
+    curl_args.pop(0)
+
+    headers = {}
+    data = {}
+    for i in range(0, len(curl_args), 2):
+        if curl_args[i] == "-H":
+            regex = "([^:\s]*)[:|\s]*(.*)"
+            result = re.search(regex, curl_args[i + 1], re.S).groups()
+            if result[0] in headers:
+                headers[result[0]] = headers[result[0]] + "&" + result[1]
+            else:
+                headers[result[0]] = result[1].strip()
+
+        elif curl_args[i] == "--data-binary":
+            data = json.loads(curl_args[i + 1])
+
+    request(url=url, data=data, headers=headers)
+
+
+def usage():
+    """
+下载调试器
+
+usage: feapder shell [options] [args]
+
+optional arguments:
+  -u, --url     抓取指定url
+  -c, --curl    抓取curl格式的请求
+
+    """
+    print(usage.__doc__)
+    sys.exit()
+
+
+def main():
+    args = sys.argv
+    if len(args) < 3:
+        usage()
+
+    elif args[1] in ("-h", "--help"):
+        usage()
+
+    elif args[1] in ("-u", "--url"):
+        fetch_url(args[2])
+
+    elif args[1] in ("-c", "--curl"):
+        fetch_curl(args[2:])
+
+    else:
+        usage()
+
+
+if __name__ == "__main__":
+    main()

+ 9 - 0
FworkSpider/feapder/core/__init__.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+'''
+Created on 2020/4/23 12:09 AM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+'''

+ 252 - 0
FworkSpider/feapder/core/base_parser.py

@@ -0,0 +1,252 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-07-25 11:41:57
+---------
+@summary: parser 的基类
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+import os
+import traceback
+import feapder.utils.tools as tools
+from feapder.db.mysqldb import MysqlDB
+from feapder.network.item import UpdateItem
+from feapder.utils.log import log
+from feapder.utils.aliyun import UploadOSS
+from feapder.db.redisdb import RedisDB
+
+
+class BaseParser(object):
+    def start_requests(self):
+        """
+        @summary: 添加初始url
+        ---------
+        ---------
+        @result: yield Request()
+        """
+
+        pass
+
+    def download_midware(self, request):
+        """
+        @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载,然后返回 request, response
+        ---------
+        @param request:
+        ---------
+        @result: return request / request, response
+        """
+
+        pass
+
+    def validate(self, request, response):
+        """
+        @summary: 校验函数, 可用于校验response是否正确
+        若函数内抛出异常,则重试请求
+        若返回True 或 None,则进入解析函数
+        若返回False,则抛弃当前请求
+        可通过request.callback_name 区分不同的回调函数,编写不同的校验逻辑
+        ---------
+        @param request:
+        @param response:
+        ---------
+        @result: True / None / False
+        """
+        pass
+
+    def parse(self, request, response):
+        """
+        @summary: 默认的解析函数
+        ---------
+        @param request:
+        @param response:
+        ---------
+        @result:
+        """
+
+        pass
+
+    def exception_request(self, request, response):
+        """
+        @summary: 请求或者parser里解析出异常的request
+        ---------
+        @param request:
+        @param response:
+        ---------
+        @result: request / callback / None (返回值必须可迭代)
+        """
+
+        pass
+
+    def failed_request(self, request, response):
+        """
+        @summary: 超过最大重试次数的request
+        可返回修改后的request  若不返回request,则将传进来的request直接入redis的failed表。否则将修改后的request入failed表
+        ---------
+        @param request:
+        ---------
+        @result: request / item / callback / None (返回值必须可迭代)
+        """
+
+        pass
+    def push_files(self, request, response):
+        """
+        @summary: 下载 并上传附件文件,传进来的request的auto_request必须为False,否则可能会因为响应失败而无法下载文件
+        ---------
+        @param request:  request.url 为文件下载地址, 该方法需要自行调用
+        request.INFO  为上传文件时所需要提供的部分参数  必传
+         info = {
+            "org_url": "http://www...",  # 文件下载连接
+            "filename": f"{list_item.title}.docx",  # 文件名
+            "channel": list_item.channel,
+            "ftype": 'docx,zip,ftp', # 文件类型
+        }
+        request.headers 则存放请求的必要参数,如:parmas,headers  必传
+        ---------
+        @result: request / item / callback / None (返回值必须可迭代),正常处理为 None 即可
+        """
+        list_item = request.item
+        res = None
+        for i in range(5):
+            try:
+                parameter = request.parameter
+                res = UploadOSS().get_state(request.info,**parameter)
+            except:
+                log.error(traceback.format_exc())
+            if res is not None:
+                list_item.projectinfo = res
+                yield list_item
+                log.info(f"{res.get('filename')}附件下载完成,大小为:{res.get('size')},fid为:{res.get('fid')}")
+                return
+            else:
+                log.error(f"{res.get('filename')}附件下载失败,失败连接为:{res.get('org_url')}")
+        if res is None:
+            _db = RedisDB()
+            request_dict = request.to_dict
+            _db.zadd("forwork:files_failed", request_dict)
+
+    def start_callback(self):
+        """
+        @summary: 程序开始的回调
+        ---------
+        ---------
+        @result: None
+        """
+
+        pass
+
+    def end_callback(self):
+        """
+        @summary: 程序结束的回调
+        ---------
+        ---------
+        @result: None
+        """
+
+        pass
+
+    @property
+    def name(self):
+        return self.__class__.__name__
+
+    def close(self):
+        pass
+
+
+class BatchParser(BaseParser):
+    """
+    @summary: 批次爬虫模版
+    ---------
+    """
+
+    def __init__(
+        self, task_table, batch_record_table, task_state, date_format, mysqldb=None
+    ):
+        self._mysqldb = mysqldb or MysqlDB()  # mysqldb
+
+        self._task_table = task_table  # mysql中的任务表
+        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
+        self._task_state = task_state  # mysql中任务表的state字段名
+        self._date_format = date_format  # 批次日期格式
+
+    def add_task(self):
+        """
+        @summary: 添加任务, 每次启动start_monitor 都会调用,且在init_task之前调用
+        ---------
+        ---------
+        @result:
+        """
+
+    def start_requests(self, task):
+        """
+        @summary:
+        ---------
+        @param task: 任务信息 list
+        ---------
+        @result:
+        """
+
+    def update_task_state(self, task_id, state=1, **kwargs):
+        """
+        @summary: 更新任务表中任务状态,做完每个任务时代码逻辑中要主动调用。可能会重写
+        调用方法为 yield lambda : self.update_task_state(task_id, state)
+        ---------
+        @param task_id:
+        @param state:
+        ---------
+        @result:
+        """
+
+        kwargs["id"] = task_id
+        kwargs[self._task_state] = state
+
+        sql = tools.make_update_sql(
+            self._task_table, kwargs, condition="id = {task_id}".format(task_id=task_id)
+        )
+
+        if self._mysqldb.update(sql):
+            log.debug("置任务%s状态成功" % task_id)
+        else:
+            log.error("置任务%s状态失败  sql=%s" % (task_id, sql))
+
+    def update_task_batch(self, task_id, state=1, **kwargs):
+        """
+        批量更新任务 多处调用,更新的字段必须一致
+        注意:需要 写成 yield update_task_batch(...) 否则不会更新
+        @param task_id:
+        @param state:
+        @param kwargs:
+        @return:
+        """
+        kwargs["id"] = task_id
+        kwargs[self._task_state] = state
+
+        update_item = UpdateItem(**kwargs)
+        update_item.table_name = self._task_table
+        update_item.name_underline = self._task_table + "_item"
+
+        return update_item
+
+    @property
+    def batch_date(self):
+        """
+        @summary: 获取批次时间
+        ---------
+        ---------
+        @result:
+        """
+
+        batch_date = os.environ.get("batch_date")
+        if not batch_date:
+            sql = 'select date_format(batch_date, "{date_format}") from {batch_record_table} order by id desc limit 1'.format(
+                date_format=self._date_format.replace(":%M", ":%i"),
+                batch_record_table=self._batch_record_table,
+            )
+            batch_info = MysqlDB().find(sql)  # (('2018-08-19'),)
+            if batch_info:
+                os.environ["batch_date"] = batch_date = batch_info[0][0]
+            else:
+                log.error("需先运行 start_monitor_task()")
+                os._exit(137)  # 退出码137(对应 wait status 35072),方便爬虫管理器识别并重启
+
+        return batch_date
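
The two task-update helpers are meant to be yielded from a parse method, as the docstrings describe: update_task_state must be wrapped in a lambda so it only runs after the buffered items are flushed, while update_task_batch returns an UpdateItem that is yielded directly. A minimal sketch inside a BatchSpider, which inherits these helpers; request.task_id is assumed to have been attached when the request was created:

    import feapder

    class ExampleBatchSpider(feapder.BatchSpider):
        def parse(self, request, response):
            # ... yield items built from the response here ...
            # runs only after the items above are safely written to the DB
            yield lambda: self.update_task_state(request.task_id, state=1)
            # or, routed through the item buffer as a batched update:
            # yield self.update_task_batch(request.task_id, state=1)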

+ 176 - 0
FworkSpider/feapder/core/collector.py

@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2016-12-23 11:24
+---------
+@summary: request 管理
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import collections
+import threading
+import time
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.db.redisdb import RedisDB
+from feapder.network.request import Request
+from feapder.utils.log import log
+
+
+class Collector(threading.Thread):
+    def __init__(self, redis_key):
+        """
+        @summary:
+        ---------
+        @param redis_key:
+        ---------
+        @result:
+        """
+
+        super(Collector, self).__init__()
+        self._db = RedisDB()
+
+        self._thread_stop = False
+
+        self._todo_requests = collections.deque()
+
+        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
+        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
+
+        self._spider_mark = tools.get_localhost_ip() + f"-{time.time()}"
+
+        self._interval = setting.COLLECTOR_SLEEP_TIME
+        self._request_count = setting.COLLECTOR_TASK_COUNT
+        self._is_collector_task = False
+        self._first_get_task = True
+
+        self.__delete_dead_node()
+
+    def run(self):
+        self._thread_stop = False
+        while not self._thread_stop:
+            try:
+                self.__report_node_heartbeat()
+                self.__input_data()
+            except Exception as e:
+                log.exception(e)
+
+            self._is_collector_task = False
+
+            time.sleep(self._interval)
+
+    def stop(self):
+        self._thread_stop = True
+        self._started.clear()
+
+    def __input_data(self):
+        current_timestamp = tools.get_current_timestamp()
+        if len(self._todo_requests) >= self._request_count:
+            return
+
+        request_count = self._request_count  # 先赋值
+        # 查询最近有心跳的节点数量
+        spider_count = self._db.zget_count(
+            self._tab_spider_status,
+            priority_min=current_timestamp - (self._interval + 10),
+            priority_max=current_timestamp,
+        )
+        # 根据等待节点数量,动态分配request
+        if spider_count:
+            # 任务数量
+            task_count = self._db.zget_count(self._tab_requests)
+            # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
+            request_count = task_count // spider_count + 1
+
+        request_count = (
+            request_count
+            if request_count <= self._request_count
+            else self._request_count
+        )
+
+        if not request_count:
+            return
+
+        # 当前无其他节点,并且是首次取任务,则重置丢失的任务
+        if self._first_get_task and spider_count <= 1:
+            datas = self._db.zrangebyscore_set_score(
+                self._tab_requests,
+                priority_min=current_timestamp,
+                priority_max=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
+                score=300,
+                count=None,
+            )
+            self._first_get_task = False
+            lose_count = len(datas)
+            if lose_count:
+                log.info("重置丢失任务完毕,共{}条".format(len(datas)))
+
+        # 取任务,只取当前时间戳以内的任务,同时将任务分数修改为 current_timestamp + setting.REQUEST_LOST_TIMEOUT
+        requests_list = self._db.zrangebyscore_set_score(
+            self._tab_requests,
+            priority_min="-inf",
+            priority_max=current_timestamp,
+            score=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
+            count=request_count,
+        )
+
+        if requests_list:
+            self._is_collector_task = True
+            # 存request
+            self.__put_requests(requests_list)
+
+    def __report_node_heartbeat(self):
+        """
+        汇报节点心跳,以便任务平均分配
+        """
+        self._db.zadd(
+            self._tab_spider_status, self._spider_mark, tools.get_current_timestamp()
+        )
+
+    def __delete_dead_node(self):
+        """
+        删除没有心跳的节点信息
+        """
+        self._db.zremrangebyscore(
+            self._tab_spider_status,
+            "-inf",
+            tools.get_current_timestamp() - (self._interval + 10),
+        )
+
+    def __put_requests(self, requests_list):
+        for request in requests_list:
+            try:
+                request_dict = {
+                    "request_obj": Request.from_dict(eval(request)),
+                    "request_redis": request,
+                }
+            except Exception as e:
+                log.exception(
+                    """
+                error %s
+                request %s
+                """
+                    % (e, request)
+                )
+
+                request_dict = None
+
+            if request_dict:
+                self._todo_requests.append(request_dict)
+
+    def get_requests(self, count):
+        requests = []
+        count = count if count <= len(self._todo_requests) else len(self._todo_requests)
+        while count:
+            requests.append(self._todo_requests.popleft())
+            count -= 1
+
+        return requests
+
+    def get_requests_count(self):
+        return len(self._todo_requests) or self._db.zget_count(self._tab_requests) or 0
+
+    def is_collector_task(self):
+        return self._is_collector_task
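
The per-poll batch size adapts to how many nodes are reporting heartbeats: each node takes task_count // spider_count + 1 requests, capped at COLLECTOR_TASK_COUNT. With illustrative numbers:

    # 90 queued requests, 4 alive nodes, COLLECTOR_TASK_COUNT = 32
    request_count = 90 // 4 + 1    # -> 23, so each node pulls 23 this round
    # with a single node it would be 90 // 1 + 1 = 91, clamped down to 32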

+ 56 - 0
FworkSpider/feapder/core/handle_failed_requests.py

@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-08-13 11:43:01
+---------
+@summary:
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+import feapder.setting as setting
+from feapder.buffer.request_buffer import RequestBuffer
+from feapder.db.redisdb import RedisDB
+from feapder.network.request import Request
+from feapder.utils.log import log
+
+
+class HandleFailedRequests(object):
+    """docstring for HandleFailedRequests"""
+
+    def __init__(self, redis_key):
+        super(HandleFailedRequests, self).__init__()
+        self._redis_key = redis_key
+
+        self._redisdb = RedisDB()
+        self._request_buffer = RequestBuffer(self._redis_key)
+
+        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
+            redis_key=redis_key
+        )
+
+    def get_failed_requests(self, count=10000):
+        failed_requests = self._redisdb.zget(self._table_failed_request, count=count)
+        failed_requests = [eval(failed_request) for failed_request in failed_requests]
+        return failed_requests
+
+    def reput_failed_requests_to_requests(self):
+        log.debug("正在重置失败的requests...")
+        total_count = 0
+        while True:
+            try:
+                failed_requests = self.get_failed_requests()
+                if not failed_requests:
+                    break
+
+                for request in failed_requests:
+                    request["retry_times"] = 0
+                    request_obj = Request.from_dict(request)
+                    self._request_buffer.put_request(request_obj)
+
+                    total_count += 1
+            except Exception as e:
+                log.exception(e)
+
+        self._request_buffer.flush()
+
+        log.debug("重置%s条失败requests为待抓取requests" % total_count)

+ 724 - 0
FworkSpider/feapder/core/parser_control.py

@@ -0,0 +1,724 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-01-03 16:06
+---------
+@summary: parser 控制类
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import random
+import threading
+import time
+from collections.abc import Iterable  # Python 3.10 起需从 collections.abc 导入
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.buffer.item_buffer import ItemBuffer
+from feapder.db.memory_db import MemoryDB
+from feapder.network.item import Item
+from feapder.network.request import Request
+from feapder.utils import metrics
+from feapder.utils.log import log
+
+
+class PaserControl(threading.Thread):
+    DOWNLOAD_EXCEPTION = "download_exception"
+    DOWNLOAD_SUCCESS = "download_success"
+    DOWNLOAD_TOTAL = "download_total"
+    PAESERS_EXCEPTION = "parser_exception"
+
+    is_show_tip = False
+
+    # 实时统计已做任务数及失败任务数,若失败任务数/已做任务数>0.5 则报警
+    _success_task_count = 0
+    _failed_task_count = 0
+
+    def __init__(self, collector, redis_key, request_buffer, item_buffer):
+        super(PaserControl, self).__init__()
+        self._parsers = []
+        self._collector = collector
+        self._redis_key = redis_key
+        self._request_buffer = request_buffer
+        self._item_buffer = item_buffer
+
+        self._thread_stop = False
+
+        self._wait_task_time = 0
+
+    def run(self):
+        self._thread_stop = False
+        while not self._thread_stop:
+            try:
+                requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
+                if not requests:
+                    if not self.is_show_tip:
+                        log.debug("parser 等待任务...")
+                        self.is_show_tip = True
+
+                    # log.debug('parser 等待任务{}...'.format(tools.format_seconds(self._wait_task_time)))
+
+                    time.sleep(1)
+                    self._wait_task_time += 1
+                    continue
+
+                self.is_show_tip = False
+                self.deal_requests(requests)
+
+            except Exception as e:
+                log.exception(e)
+                time.sleep(3)
+
+    def is_not_task(self):
+        return self.is_show_tip
+
+    @classmethod
+    def get_task_status_count(cls):
+        return cls._failed_task_count, cls._success_task_count
+
+    def deal_requests(self, requests):
+        for request in requests:
+
+            response = None
+            request_redis = request["request_redis"]
+            request = request["request_obj"]
+
+            del_request_redis_after_item_to_db = False
+            del_request_redis_after_request_to_db = False
+
+            for parser in self._parsers:
+                if parser.name == request.parser_name:
+                    used_download_midware_enable = False
+                    try:
+                        # 记录需下载的文档
+                        self.record_download_status(
+                            PaserControl.DOWNLOAD_TOTAL, parser.name
+                        )
+
+                        # 解析request
+                        if request.auto_request:
+                            request_temp = None
+                            response = None
+
+                            # 下载中间件
+                            if request.download_midware:
+                                if isinstance(request.download_midware, (list, tuple)):
+                                    request_temp = request
+                                    for download_midware in request.download_midware:
+                                        download_midware = (
+                                            download_midware
+                                            if callable(download_midware)
+                                            else tools.get_method(
+                                                parser, download_midware
+                                            )
+                                        )
+                                        request_temp = download_midware(request_temp)
+                                else:
+                                    download_midware = (
+                                        request.download_midware
+                                        if callable(request.download_midware)
+                                        else tools.get_method(
+                                            parser, request.download_midware
+                                        )
+                                    )
+                                    request_temp = download_midware(request)
+                            elif request.download_midware != False:
+                                request_temp = parser.download_midware(request)
+
+                            # 请求
+                            if request_temp:
+                                if (
+                                    isinstance(request_temp, (tuple, list))
+                                    and len(request_temp) == 2
+                                ):
+                                    request_temp, response = request_temp
+
+                                if not isinstance(request_temp, Request):
+                                    raise Exception(
+                                        "download_midware need return a request, but received type: {}".format(
+                                            type(request_temp)
+                                        )
+                                    )
+                                used_download_midware_enable = True
+                                if not response:
+                                    response = (
+                                        request_temp.get_response()
+                                        if not setting.RESPONSE_CACHED_USED
+                                        else request_temp.get_response_from_cached(
+                                            save_cached=False
+                                        )
+                                    )
+                            else:
+                                response = (
+                                    request.get_response()
+                                    if not setting.RESPONSE_CACHED_USED
+                                    else request.get_response_from_cached(
+                                        save_cached=False
+                                    )
+                                )
+
+                            if response is None:
+                                raise Exception(
+                                    "连接超时 url: %s" % (request.url or request_temp.url)
+                                )
+
+                        else:
+                            response = None
+
+                        # 校验
+                        if parser.validate(request, response) == False:
+                            continue
+
+                        if request.callback:  # 如果有parser的回调函数,则用回调处理
+                            callback_parser = (
+                                request.callback
+                                if callable(request.callback)
+                                else tools.get_method(parser, request.callback)
+                            )
+                            results = callback_parser(request, response)
+                        else:  # 否则默认用parser处理
+                            results = parser.parse(request, response)
+
+                        if results and not isinstance(results, Iterable):
+                            raise Exception(
+                                "%s.%s返回值必须可迭代"
+                                % (parser.name, request.callback or "parse")
+                            )
+
+                        # 标识上一个result是什么
+                        result_type = 0  # 0\1\2 (初始值\request\item)
+                        # 此处判断是request 还是 item
+                        for result in results or []:
+                            if isinstance(result, Request):
+                                result_type = 1
+                                # 给request的 parser_name 赋值
+                                result.parser_name = result.parser_name or parser.name
+
+                                # 判断是同步的callback还是异步的
+                                if result.request_sync:  # 同步
+                                    request_dict = {
+                                        "request_obj": result,
+                                        "request_redis": None,
+                                    }
+                                    requests.append(request_dict)
+                                else:  # 异步
+                                    # 将next_request 入库
+                                    self._request_buffer.put_request(result)
+                                    del_request_redis_after_request_to_db = True
+
+                            elif isinstance(result, Item):
+                                result_type = 2
+                                # 将item入库
+                                self._item_buffer.put_item(result)
+                                # 需删除正在做的request
+                                del_request_redis_after_item_to_db = True
+
+                            elif callable(result):  # result为可执行的无参函数
+                                if (
+                                    result_type == 2
+                                ):  # item 的 callback,buffer里的item均入库后再执行
+                                    self._item_buffer.put_item(result)
+                                    del_request_redis_after_item_to_db = True
+
+                                else:  # result_type == 1: # request 的 callback,buffer里的request均入库后再执行。可能有的parser直接返回callback
+                                    self._request_buffer.put_request(result)
+                                    del_request_redis_after_request_to_db = True
+
+                            # else:
+                            #     raise TypeError('Expect Request、Item、callback func, but got type: {}'.format(type(result)))
+
+                    except Exception as e:
+                        exception_type = (
+                            str(type(e)).replace("<class '", "").replace("'>", "")
+                        )
+                        if exception_type.startswith("requests"):
+                            # 记录下载失败的文档
+                            self.record_download_status(
+                                PaserControl.DOWNLOAD_EXCEPTION, parser.name
+                            )
+
+                        else:
+                            # 记录解析程序异常
+                            self.record_download_status(
+                                PaserControl.PAESERS_EXCEPTION, parser.name
+                            )
+
+                        if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印, 超时的异常篇幅太多
+                            log.exception(e)
+
+                        log.error(
+                            """
+                            -------------- %s.%s error -------------
+                            error          %s
+                            response       %s
+                            deal request   %s
+                            """
+                            % (
+                                parser.name,
+                                (
+                                    request.callback
+                                    and callable(request.callback)
+                                    and getattr(request.callback, "__name__")
+                                    or request.callback
+                                )
+                                or "parse",
+                                str(e),
+                                response,
+                                tools.dumps_json(request.to_dict, indent=28)
+                                if setting.LOG_LEVEL == "DEBUG"
+                                else request,
+                            )
+                        )
+
+                        request.error_msg = "%s: %s" % (exception_type, e)
+                        request.response = str(response)
+
+                        if "Invalid URL" in str(e):
+                            request.is_abandoned = True
+
+                        requests = parser.exception_request(request, response) or [
+                            request
+                        ]
+                        if not isinstance(requests, Iterable):
+                            raise Exception(
+                                "%s.%s返回值必须可迭代" % (parser.name, "exception_request")
+                            )
+                        for request in requests:
+                            if callable(request):
+                                self._request_buffer.put_request(request)
+                                continue
+
+                            if not isinstance(request, Request):
+                                raise Exception("exception_request 需 yield request")
+
+                            if (
+                                request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
+                                or request.is_abandoned
+                            ):
+                                self.__class__._failed_task_count += 1  # 记录失败任务数
+
+                                # 处理failed_request的返回值 request 或 func
+                                results = parser.failed_request(request, response) or [
+                                    request
+                                ]
+                                if not isinstance(results, Iterable):
+                                    raise Exception(
+                                        "%s.%s返回值必须可迭代"
+                                        % (parser.name, "failed_request")
+                                    )
+
+                                for result in results:
+                                    if isinstance(result, Request):
+                                        if setting.SAVE_FAILED_REQUEST:
+                                            if used_download_midware_enable:
+                                                # 去掉download_midware 添加的属性
+                                                original_request = (
+                                                    Request.from_dict(
+                                                        eval(request_redis)
+                                                    )
+                                                    if request_redis
+                                                    else result
+                                                )
+                                                original_request.error_msg = (
+                                                    request.error_msg
+                                                )
+                                                original_request.response = (
+                                                    request.response
+                                                )
+
+                                                self._request_buffer.put_failed_request(
+                                                    original_request
+                                                )
+                                            else:
+                                                self._request_buffer.put_failed_request(
+                                                    result
+                                                )
+
+                                    elif callable(result):
+                                        self._request_buffer.put_request(result)
+
+                                    elif isinstance(result, Item):
+                                        self._item_buffer.put_item(result)
+
+                                del_request_redis_after_request_to_db = True
+
+                            else:
+                                # 将 requests 重新入库 爬取
+                                request.retry_times += 1
+                                request.filter_repeat = False
+                                log.info(
+                                    """
+                                    入库 等待重试
+                                    url     %s
+                                    重试次数 %s
+                                    最大允许重试次数 %s"""
+                                    % (
+                                        request.url,
+                                        request.retry_times,
+                                        setting.SPIDER_MAX_RETRY_TIMES,
+                                    )
+                                )
+                                if used_download_midware_enable:
+                                    # 去掉download_midware 添加的属性 使用原来的requests
+                                    original_request = (
+                                        Request.from_dict(eval(request_redis))
+                                        if request_redis
+                                        else request
+                                    )
+                                    if hasattr(request, "error_msg"):
+                                        original_request.error_msg = request.error_msg
+                                    if hasattr(request, "response"):
+                                        original_request.response = request.response
+                                    original_request.retry_times = request.retry_times
+                                    original_request.filter_repeat = (
+                                        request.filter_repeat
+                                    )
+
+                                    self._request_buffer.put_request(original_request)
+                                else:
+                                    self._request_buffer.put_request(request)
+                                del_request_redis_after_request_to_db = True
+
+                    else:
+                        # 记录下载成功的文档
+                        self.record_download_status(
+                            PaserControl.DOWNLOAD_SUCCESS, parser.name
+                        )
+                        # 记录成功任务数
+                        self.__class__._success_task_count += 1
+
+                        # 缓存下载成功的文档
+                        if setting.RESPONSE_CACHED_ENABLE:
+                            request.save_cached(
+                                response=response,
+                                expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
+                            )
+
+                    finally:
+                        # 释放浏览器
+                        if response and hasattr(response, "browser"):
+                            request._webdriver_pool.put(response.browser)
+
+                    break
+
+            # 删除正在做的request 跟随item优先
+            if request_redis:
+                if del_request_redis_after_item_to_db:
+                    self._item_buffer.put_item(request_redis)
+
+                elif del_request_redis_after_request_to_db:
+                    self._request_buffer.put_del_request(request_redis)
+
+                else:
+                    self._request_buffer.put_del_request(request_redis)
+
+        if setting.SPIDER_SLEEP_TIME:
+            if (
+                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
+                and len(setting.SPIDER_SLEEP_TIME) == 2
+            ):
+                sleep_time = random.randint(
+                    int(setting.SPIDER_SLEEP_TIME[0]), int(setting.SPIDER_SLEEP_TIME[1])
+                )
+                time.sleep(sleep_time)
+            else:
+                time.sleep(setting.SPIDER_SLEEP_TIME)
+
+    def record_download_status(self, status, spider):
+        """
+        记录html等文档下载状态
+        @return:
+        """
+
+        metrics.emit_counter(f"{spider}:{status}", 1, classify="document")
+
+    def stop(self):
+        self._thread_stop = True
+        self._started.clear()
+
+    def add_parser(self, parser):
+        self._parsers.append(parser)
+
+
+class AirSpiderParserControl(PaserControl):
+    is_show_tip = False
+
+    # 实时统计已做任务数及失败任务数,若失败任务数/已做任务数>0.5 则报警
+    _success_task_count = 0
+    _failed_task_count = 0
+
+    def __init__(self, memory_db: MemoryDB, item_buffer: ItemBuffer):
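+        # Note: super(PaserControl, self) intentionally skips PaserControl.__init__
+        # (which expects collector/redis_key arguments) and initializes threading.Thread directly.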
+        super(PaserControl, self).__init__()
+        self._parsers = []
+        self._memory_db = memory_db
+        self._thread_stop = False
+        self._wait_task_time = 0
+        self._item_buffer = item_buffer
+
+    def run(self):
+        while not self._thread_stop:
+            try:
+                requests = self._memory_db.get()
+                if not requests:
+                    if not self.is_show_tip:
+                        log.debug("parser 等待任务...")
+                        self.is_show_tip = True
+
+                    time.sleep(1)
+                    self._wait_task_time += 1
+                    continue
+
+                self.is_show_tip = False
+                self.deal_requests([requests])
+
+            except Exception as e:
+                log.exception(e)
+                time.sleep(3)
+
+    def deal_requests(self, requests):
+        for request in requests:
+
+            response = None
+
+            for parser in self._parsers:
+                if parser.name == request.parser_name:
+                    try:
+                        # 记录需下载的文档
+                        self.record_download_status(
+                            PaserControl.DOWNLOAD_TOTAL, parser.name
+                        )
+
+                        # 解析request
+                        if request.auto_request:
+                            request_temp = None
+                            response = None
+
+                            # 下载中间件
+                            if request.download_midware:
+                                if isinstance(request.download_midware, (list, tuple)):
+                                    request_temp = request
+                                    for download_midware in request.download_midware:
+                                        download_midware = (
+                                            download_midware
+                                            if callable(download_midware)
+                                            else tools.get_method(
+                                                parser, download_midware
+                                            )
+                                        )
+                                        request_temp = download_midware(request_temp)
+                                else:
+                                    download_midware = (
+                                        request.download_midware
+                                        if callable(request.download_midware)
+                                        else tools.get_method(
+                                            parser, request.download_midware
+                                        )
+                                    )
+                                    request_temp = download_midware(request)
+                            elif request.download_midware is not False:
+                                request_temp = parser.download_midware(request)
+
+                            # 请求
+                            if request_temp:
+                                if (
+                                    isinstance(request_temp, (tuple, list))
+                                    and len(request_temp) == 2
+                                ):
+                                    request_temp, response = request_temp
+
+                                if not isinstance(request_temp, Request):
+                                    raise Exception(
+                                        "download_midware need return a request, but received type: {}".format(
+                                            type(request_temp)
+                                        )
+                                    )
+                                request = request_temp
+
+                            if not response:
+                                response = (
+                                    request.get_response()
+                                    if not setting.RESPONSE_CACHED_USED
+                                    else request.get_response_from_cached(
+                                        save_cached=False
+                                    )
+                                )
+
+                        else:
+                            response = None
+
+                        # 校验
+                        if parser.validate(request, response) is False:
+                            continue
+
+                        if request.callback:  # 如果有parser的回调函数,则用回调处理
+                            callback_parser = (
+                                request.callback
+                                if callable(request.callback)
+                                else tools.get_method(parser, request.callback)
+                            )
+                            results = callback_parser(request, response)
+                        else:  # 否则默认用parser处理
+                            results = parser.parse(request, response)
+
+                        if results and not isinstance(results, Iterable):
+                            raise Exception(
+                                "%s.%s返回值必须可迭代"
+                                % (parser.name, request.callback or "parse")
+                            )
+
+                        # 此处判断是request 还是 item
+                        for result in results or []:
+                            if isinstance(result, Request):
+                                # 给request的 parser_name 赋值
+                                result.parser_name = result.parser_name or parser.name
+
+                                # 判断是同步的callback还是异步的
+                                if result.request_sync:  # 同步
+                                    requests.append(result)
+                                else:  # 异步
+                                    # 将next_request 入库
+                                    self._memory_db.add(result)
+
+                            elif isinstance(result, Item):
+                                self._item_buffer.put_item(result)
+
+                    except Exception as e:
+                        exception_type = (
+                            str(type(e)).replace("<class '", "").replace("'>", "")
+                        )
+                        if exception_type.startswith("requests"):
+                            # 记录下载失败的文档
+                            self.record_download_status(
+                                PaserControl.DOWNLOAD_EXCEPTION, parser.name
+                            )
+
+                        else:
+                            # 记录解析程序异常
+                            self.record_download_status(
+                                PaserControl.PAESERS_EXCEPTION, parser.name
+                            )
+
+                        if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印, 超时的异常篇幅太多
+                            log.exception(e)
+
+                        log.error(
+                            """
+                                -------------- %s.%s error -------------
+                                error          %s
+                                response       %s
+                                deal request   %s
+                                """
+                            % (
+                                parser.name,
+                                (
+                                    request.callback
+                                    and callable(request.callback)
+                                    and getattr(request.callback, "__name__")
+                                    or request.callback
+                                )
+                                or "parse",
+                                str(e),
+                                response,
+                                tools.dumps_json(request.to_dict, indent=28)
+                                if setting.LOG_LEVEL == "DEBUG"
+                                else request,
+                            )
+                        )
+
+                        request.error_msg = "%s: %s" % (exception_type, e)
+                        request.response = str(response)
+
+                        if "Invalid URL" in str(e):
+                            request.is_abandoned = True
+
+                        requests = parser.exception_request(request, response) or [
+                            request
+                        ]
+                        if not isinstance(requests, Iterable):
+                            raise Exception(
+                                "%s.%s返回值必须可迭代" % (parser.name, "exception_request")
+                            )
+                        for request in requests:
+                            if not isinstance(request, Request):
+                                raise Exception("exception_request 需 yield request")
+
+                            if (
+                                request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
+                                or request.is_abandoned
+                            ):
+                                self.__class__._failed_task_count += 1  # 记录失败任务数
+
+                                # 处理failed_request的返回值 request 或 func
+                                results = parser.failed_request(request, response) or [
+                                    request
+                                ]
+                                if not isinstance(results, Iterable):
+                                    raise Exception(
+                                        "%s.%s返回值必须可迭代"
+                                        % (parser.name, "failed_request")
+                                    )
+
+                                log.info(
+                                    """
+                                    任务超过最大重试次数,丢弃
+                                    url     %s
+                                    重试次数 %s
+                                    最大允许重试次数 %s"""
+                                    % (
+                                        request.url,
+                                        request.retry_times,
+                                        setting.SPIDER_MAX_RETRY_TIMES,
+                                    )
+                                )
+
+                            else:
+                                # 将 requests 重新入库 爬取
+                                request.retry_times += 1
+                                request.filter_repeat = False
+                                log.info(
+                                    """
+                                        入库 等待重试
+                                        url     %s
+                                        重试次数 %s
+                                        最大允许重试次数 %s"""
+                                    % (
+                                        request.url,
+                                        request.retry_times,
+                                        setting.SPIDER_MAX_RETRY_TIMES,
+                                    )
+                                )
+                                self._memory_db.add(request)
+
+                    else:
+                        # 记录下载成功的文档
+                        self.record_download_status(
+                            PaserControl.DOWNLOAD_SUCCESS, parser.name
+                        )
+                        # 记录成功任务数
+                        self.__class__._success_task_count += 1
+
+                        # 缓存下载成功的文档
+                        if setting.RESPONSE_CACHED_ENABLE:
+                            request.save_cached(
+                                response=response,
+                                expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
+                            )
+
+                    finally:
+                        # 释放浏览器
+                        if response and hasattr(response, "browser"):
+                            request._webdriver_pool.put(response.browser)
+
+                    break
+
+        if setting.SPIDER_SLEEP_TIME:
+            if (
+                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
+                and len(setting.SPIDER_SLEEP_TIME) == 2
+            ):
+                sleep_time = random.randint(
+                    int(setting.SPIDER_SLEEP_TIME[0]), int(setting.SPIDER_SLEEP_TIME[1])
+                )
+                time.sleep(sleep_time)
+            else:
+                time.sleep(setting.SPIDER_SLEEP_TIME)
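
The control flow above is driven entirely by a small set of parser hooks: download_midware may
return either a request or a (request, response) pair, validate can return False to make the
request/response pair be skipped, and exception_request / failed_request must yield Request
objects (re-queued until setting.SPIDER_MAX_RETRY_TIMES is exceeded). Below is a minimal sketch
of a parser wiring these hooks together; example.com and the header value are placeholders, and
a reachable Redis (required by feapder.Spider) is assumed:

    import feapder


    class DemoParser(feapder.Spider):
        def start_requests(self):
            yield feapder.Request("https://example.com")

        def download_midware(self, request):
            # May also return (request, response) to hand back a ready-made response.
            request.headers = {"User-Agent": "demo-agent"}
            return request

        def validate(self, request, response):
            # Returning False makes deal_requests skip this request/response pair.
            return response.status_code == 200

        def parse(self, request, response):
            print(response.xpath("//title/text()").extract_first())

        def exception_request(self, request, response):
            # Yielded requests are re-queued until SPIDER_MAX_RETRY_TIMES is reached.
            yield request

        def failed_request(self, request, response):
            # Called after retries are exhausted; yielded requests go to the failed-request table.
            yield request


    if __name__ == "__main__":
        DemoParser(redis_key="test:demo").start()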

+ 579 - 0
FworkSpider/feapder/core/scheduler.py

@@ -0,0 +1,579 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2017-01-09 10:38
+---------
+@summary: 组装parser、 parser_control 和 collector
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import threading
+import time
+from collections.abc import Iterable
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.buffer.item_buffer import ItemBuffer
+from feapder.buffer.request_buffer import RequestBuffer
+from feapder.core.base_parser import BaseParser
+from feapder.core.collector import Collector
+from feapder.core.handle_failed_requests import HandleFailedRequests
+from feapder.core.parser_control import PaserControl
+from feapder.db.redisdb import RedisDB
+from feapder.network.item import Item
+from feapder.network.request import Request
+from feapder.utils.log import log
+from feapder.utils.redis_lock import RedisLock
+from feapder.utils import metrics
+
+SPIDER_START_TIME_KEY = "spider_start_time"
+SPIDER_END_TIME_KEY = "spider_end_time"
+SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
+
+
+class Scheduler(threading.Thread):
+    __custom_setting__ = {}
+
+    def __init__(
+        self,
+        redis_key=None,
+        thread_count=None,
+        begin_callback=None,
+        end_callback=None,
+        delete_keys=(),
+        keep_alive=None,
+        auto_start_requests=None,
+        batch_interval=0,
+        wait_lock=True,
+        task_table=None,
+        **kwargs
+    ):
+        """
+        @summary: 调度器
+        ---------
+        @param redis_key: 爬虫request及item存放redis中的文件夹
+        @param thread_count: 线程数,默认为配置文件中的线程数
+        @param begin_callback: 爬虫开始回调函数
+        @param end_callback: 爬虫结束回调函数
+        @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则
+        @param keep_alive: 爬虫是否常驻,默认否
+        @param auto_start_requests: 爬虫是否自动添加任务
+        @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动
+        @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True
+        @param task_table: 任务表, 批次爬虫传递
+        ---------
+        @result:
+        """
+
+        super(Scheduler, self).__init__()
+
+        for key, value in self.__class__.__custom_setting__.items():
+            if key == "AUTO_STOP_WHEN_SPIDER_DONE":  # 兼容老版本的配置
+                setattr(setting, "KEEP_ALIVE", not value)
+            else:
+                setattr(setting, key, value)
+        
+
+        self._redis_key = redis_key or setting.REDIS_KEY
+        if not self._redis_key:
+            raise Exception(
+                """
+                redis_key 为redis中存放request与item的目录。不能为空,
+                可在setting中配置,如 REDIS_KEY = 'test'
+                或spider初始化时传参, 如 TestSpider(redis_key='test')
+                """
+            )
+
+        self._request_buffer = RequestBuffer(redis_key)
+        self._item_buffer = ItemBuffer(redis_key, task_table)
+
+        self._collector = Collector(redis_key)
+        self._parsers = []
+        self._parser_controls = []
+        self._parser_control_obj = PaserControl
+
+        # 兼容老版本的参数
+        if "auto_stop_when_spider_done" in kwargs:
+            self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
+        else:
+            self._keep_alive = (
+                keep_alive if keep_alive is not None else setting.KEEP_ALIVE
+            )
+        self._auto_start_requests = (
+            auto_start_requests
+            if auto_start_requests is not None
+            else setting.SPIDER_AUTO_START_REQUESTS
+        )
+        self._batch_interval = batch_interval
+
+        self._begin_callback = (
+            begin_callback
+            if begin_callback
+            else lambda: log.info("\n********** feapder begin **********")
+        )
+        self._end_callback = (
+            end_callback
+            if end_callback
+            else lambda: log.info("\n********** feapder end **********")
+        )
+
+        self._thread_count = (
+            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
+        )
+
+        self._spider_name = redis_key
+        self._project_name = redis_key.split(":")[0]
+
+        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=redis_key)
+        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
+        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
+        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
+            redis_key=redis_key
+        )
+
+        self._is_notify_end = False  # 是否已经通知结束
+        self._last_task_count = 0  # 最近一次任务数量
+        self._redisdb = RedisDB()
+
+        self._project_total_state_table = "{}_total_state".format(self._project_name)
+        self._is_exist_project_total_state_table = False
+
+        # Request 缓存设置
+        Request.cached_redis_key = redis_key
+        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME
+
+        delete_keys = delete_keys or setting.DELETE_KEYS
+        if delete_keys:
+            self.delete_tables(delete_keys)
+
+        self._last_check_task_status_time = 0
+        self.wait_lock = wait_lock
+
+        self.init_metrics()
+
+    def init_metrics(self):
+        """
+        初始化打点系统
+        """
+        metrics.init(**setting.METRICS_OTHER_ARGS)
+
+    def add_parser(self, parser):
+        parser = parser()  # parser 实例化
+        if isinstance(parser, BaseParser):
+            self._parsers.append(parser)
+        else:
+            raise ValueError("类型错误,爬虫需继承feapder.BaseParser或feapder.BatchParser")
+
+    def run(self):
+        if not self.is_reach_next_spider_time():
+            return
+
+        self._start()
+
+        while True:
+            try:
+                if self.all_thread_is_done():
+                    if not self._is_notify_end:
+                        self.spider_end()  # 跑完一轮
+                        self.record_spider_state(
+                            spider_type=1,
+                            state=1,
+                            spider_end_time=tools.get_current_date(),
+                            batch_interval=self._batch_interval,
+                        )
+
+                        self._is_notify_end = True
+
+                    if not self._keep_alive:
+                        self._stop_all_thread()
+                        break
+
+                else:
+                    self._is_notify_end = False
+
+                self.check_task_status()
+
+            except Exception as e:
+                log.exception(e)
+
+            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
+
+    def __add_task(self):
+        # 启动parser 的 start_requests
+        self.spider_begin()  # 不自动结束的爬虫此处只能执行一遍
+        self.record_spider_state(
+            spider_type=1,
+            state=0,
+            batch_date=tools.get_current_date(),
+            spider_start_time=tools.get_current_date(),
+            batch_interval=self._batch_interval,
+        )
+
+        # 判断任务池中是否还有任务,若有接着抓取
+        todo_task_count = self._collector.get_requests_count()
+        if todo_task_count:
+            log.info("检查到有待做任务 %s 条,不重下发新任务,将接着上回异常终止处继续抓取" % todo_task_count)
+        else:
+            for parser in self._parsers:
+                results = parser.start_requests()
+                # 添加request到请求队列,由请求队列统一入库
+                if results and not isinstance(results, Iterable):
+                    raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
+
+                result_type = 1
+                for result in results or []:
+                    if isinstance(result, Request):
+                        result.parser_name = result.parser_name or parser.name
+                        self._request_buffer.put_request(result)
+                        result_type = 1
+
+                    elif isinstance(result, Item):
+                        self._item_buffer.put_item(result)
+                        result_type = 2
+
+                    elif callable(result):  # callable的request可能是更新数据库操作的函数
+                        if result_type == 1:
+                            self._request_buffer.put_request(result)
+                        else:
+                            self._item_buffer.put_item(result)
+                    else:
+                        raise TypeError(
+                            "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
+                                type(result)
+                            )
+                        )
+
+                self._request_buffer.flush()
+                self._item_buffer.flush()
+
+    def _start(self):
+        # 启动request_buffer
+        self._request_buffer.start()
+        # 启动item_buffer
+        self._item_buffer.start()
+        # 启动collector
+        self._collector.start()
+
+        # 启动parser control
+        for i in range(self._thread_count):
+            parser_control = self._parser_control_obj(
+                self._collector,
+                self._redis_key,
+                self._request_buffer,
+                self._item_buffer,
+            )
+
+            for parser in self._parsers:
+                parser_control.add_parser(parser)
+
+            parser_control.start()
+            self._parser_controls.append(parser_control)
+
+        # 下发任务 因为时间可能比较长,放到最后面
+        if setting.RETRY_FAILED_REQUESTS:
+            # 重设失败的任务, 不用加锁,原子性操作
+            handle_failed_requests = HandleFailedRequests(self._redis_key)
+            handle_failed_requests.reput_failed_requests_to_requests()
+
+        # 下发新任务
+        if self._auto_start_requests:  # 自动下发
+            if self.wait_lock:
+                # 将添加任务处加锁,防止多进程之间添加重复的任务
+                with RedisLock(key=self._spider_name) as lock:
+                    if lock.locked:
+                        self.__add_task()
+            else:
+                self.__add_task()
+
+    def all_thread_is_done(self):
+        for i in range(3):  # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
+            # 检测 collector 状态
+            if (
+                self._collector.is_collector_task()
+                or self._collector.get_requests_count() > 0
+            ):
+                return False
+
+            # 检测 parser_control 状态
+            for parser_control in self._parser_controls:
+                if not parser_control.is_not_task():
+                    return False
+
+            # 检测 item_buffer 状态
+            if (
+                self._item_buffer.get_items_count() > 0
+                or self._item_buffer.is_adding_to_db()
+            ):
+                return False
+
+            # 检测 request_buffer 状态
+            if (
+                self._request_buffer.get_requests_count() > 0
+                or self._request_buffer.is_adding_to_db()
+            ):
+                return False
+
+            tools.delay_time(1)
+
+        return True
+
+    @tools.run_safe_model("check_task_status")
+    def check_task_status(self):
+        """
+        检查任务状态 预警
+        """
+        # 每分钟检查一次
+        now_time = time.time()
+        if now_time - self._last_check_task_status_time > 60:
+            self._last_check_task_status_time = now_time
+        else:
+            return
+
+        # 检查redis中任务状态,若连续20分钟内任务数量未发生变化(parser可能卡死),则发出报警信息
+        task_count = self._redisdb.zget_count(self._tab_requests)
+
+        if task_count:
+            if task_count != self._last_task_count:
+                self._last_task_count = task_count
+                self._redisdb.hset(
+                    self._tab_spider_time,
+                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
+                    tools.get_current_timestamp(),
+                )  # 多进程会重复发消息, 使用redis记录上次统计时间
+            else:
+                # 判断时间间隔是否超过20分钟
+                lua = """
+                    -- local key = KEYS[1]
+                    local field = ARGV[1]
+                    local current_timestamp = ARGV[2]
+
+                    -- 取值
+                    local last_timestamp = redis.call('hget', KEYS[1], field)
+                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
+                        return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
+                    end
+
+                    if not last_timestamp then
+                        redis.call('hset', KEYS[1], field, current_timestamp)
+                    end
+
+                    return 0
+
+                """
+                redis_obj = self._redisdb.get_redis_obj()
+                cmd = redis_obj.register_script(lua)
+                overtime = cmd(
+                    keys=[self._tab_spider_time],
+                    args=[
+                        SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
+                        tools.get_current_timestamp(),
+                    ],
+                )
+
+                if overtime:
+                    # 发送报警
+                    msg = "{}  爬虫任务停滞 {},请检查爬虫是否正常".format(
+                        self._spider_name, tools.format_seconds(overtime)
+                    )
+                    log.error(msg)
+                    self.send_msg(
+                        msg,
+                        level="error",
+                        message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
+                    )
+
+        else:
+            self._last_task_count = 0
+
+        # 检查失败任务数量 超过1000 报警,
+        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
+        log.debug("当前失败任务数:%s" % failed_count)
+        if failed_count > setting.WARNING_FAILED_COUNT:
+            # 发送报警
+            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
+            log.error(msg)
+            self.send_msg(
+                msg,
+                level="error",
+                message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
+            )
+
+        # parser_control实时统计已做任务数及失败任务数,若成功率<0.5 则报警
+        failed_task_count, success_task_count = PaserControl.get_task_status_count()
+        total_count = success_task_count + failed_task_count
+        if total_count > 0:
+            task_success_rate = success_task_count / total_count
+            if task_success_rate < 0.5:
+                # 发送报警
+                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
+                    self._spider_name,
+                    success_task_count,
+                    failed_task_count,
+                    task_success_rate,
+                )
+                log.error(msg)
+                self.send_msg(
+                    msg,
+                    level="error",
+                    message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
+                )
+
+        # 检查入库失败次数
+        if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
+            msg = "《{}》爬虫导出数据失败,失败次数:{}, 请检查爬虫是否正常".format(
+                self._spider_name, self._item_buffer.export_falied_times
+            )
+            log.error(msg)
+            self.send_msg(
+                msg, level="error", message_prefix="《%s》爬虫导出数据失败" % (self._spider_name)
+            )
+
+    def delete_tables(self, delete_tables_list):
+        if isinstance(delete_tables_list, bool):
+            delete_tables_list = [self._redis_key + "*"]
+        elif not isinstance(delete_tables_list, (list, tuple)):
+            delete_tables_list = [delete_tables_list]
+
+        redis = RedisDB()
+        for delete_tab in delete_tables_list:
+            if not delete_tab.startswith(self._redis_key):
+                delete_tab = self._redis_key + delete_tab
+            tables = redis.getkeys(delete_tab)
+            for table in tables:
+                if table != self._tab_spider_time:
+                    log.info("正在删除key %s" % table)
+                    redis.clear(table)
+
+    def _stop_all_thread(self):
+        self._request_buffer.stop()
+        self._item_buffer.stop()
+        # 停止 collector
+        self._collector.stop()
+        # 停止 parser_controls
+        for parser_control in self._parser_controls:
+            parser_control.stop()
+
+        self._started.clear()
+
+    def send_msg(self, msg, level="debug", message_prefix=""):
+        # log.debug("发送报警 level:{} msg{}".format(level, msg))
+        tools.send_msg(msg=msg, level=level, message_prefix=message_prefix)
+
+    def spider_begin(self):
+        """
+        @summary: start_monitor_task 方式启动,此函数与spider_end不在同一进程内,变量不可共享
+        ---------
+        ---------
+        @result:
+        """
+
+        if self._begin_callback:
+            self._begin_callback()
+
+        for parser in self._parsers:
+            parser.start_callback()
+
+        # 记录开始时间
+        if not self._redisdb.hexists(self._tab_spider_time, SPIDER_START_TIME_KEY):
+            current_timestamp = tools.get_current_timestamp()
+            self._redisdb.hset(
+                self._tab_spider_time, SPIDER_START_TIME_KEY, current_timestamp
+            )
+
+            # 发送消息
+            # self.send_msg("《%s》爬虫开始" % self._spider_name)
+
+    def spider_end(self):
+        self.record_end_time()
+
+        if self._end_callback:
+            self._end_callback()
+
+        for parser in self._parsers:
+            if not self._keep_alive:
+                parser.close()
+            parser.end_callback()
+
+        if not self._keep_alive:
+            # 关闭webdriver
+            if Request.webdriver_pool:
+                Request.webdriver_pool.close()
+
+            # 关闭打点
+            metrics.close()
+        else:
+            metrics.flush()
+
+        # 计算抓取时长
+        data = self._redisdb.hget(
+            self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
+        )
+        if data:
+            begin_timestamp = int(data)
+
+            spend_time = tools.get_current_timestamp() - begin_timestamp
+
+            msg = "《%s》爬虫结束,耗时 %s" % (
+                self._spider_name,
+                tools.format_seconds(spend_time),
+            )
+            log.info(msg)
+
+            # self.send_msg(msg)
+
+        if self._keep_alive:
+            log.info("爬虫不自动结束, 等待下一轮任务...")
+        else:
+            self.delete_tables(self._tab_spider_status)
+
+    def record_end_time(self):
+        # 记录结束时间
+        if self._batch_interval:
+            current_timestamp = tools.get_current_timestamp()
+            self._redisdb.hset(
+                self._tab_spider_time, SPIDER_END_TIME_KEY, current_timestamp
+            )
+
+    def is_reach_next_spider_time(self):
+        if not self._batch_interval:
+            return True
+
+        last_spider_end_time = self._redisdb.hget(
+            self._tab_spider_time, SPIDER_END_TIME_KEY
+        )
+        if last_spider_end_time:
+            last_spider_end_time = int(last_spider_end_time)
+            current_timestamp = tools.get_current_timestamp()
+            time_interval = current_timestamp - last_spider_end_time
+
+            if time_interval < self._batch_interval * 86400:
+                log.info(
+                    "上次运行结束时间为 {} 与当前时间间隔 为 {}, 小于规定的抓取时间间隔 {}。爬虫不执行,退出~".format(
+                        tools.timestamp_to_date(last_spider_end_time),
+                        tools.format_seconds(time_interval),
+                        tools.format_seconds(self._batch_interval * 86400),
+                    )
+                )
+                return False
+
+        return True
+
+    def record_spider_state(
+        self,
+        spider_type,
+        state,
+        batch_date=None,
+        spider_start_time=None,
+        spider_end_time=None,
+        batch_interval=None,
+    ):
+        pass
+
+    def join(self, timeout=None):
+        """
+        重写线程的join
+        """
+        if not self._started.is_set():
+            return
+
+        super().join()
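
Most of the thresholds and switches referenced above come from feapder.setting and can be
overridden per spider through the __custom_setting__ dict that Scheduler copies onto the
setting module at startup. A sketch with illustrative values (not the shipped defaults):

    import feapder


    class MySpider(feapder.Spider):
        __custom_setting__ = dict(
            REDIS_KEY="test:my_spider",      # redis folder for this spider's requests/items
            SPIDER_THREAD_COUNT=8,           # number of PaserControl threads started by _start()
            SPIDER_MAX_RETRY_TIMES=5,        # after this many retries failed_request() is called
            SPIDER_SLEEP_TIME=(1, 3),        # fixed seconds, or a (min, max) random range
            KEEP_ALIVE=False,                # True keeps the scheduler alive waiting for new tasks
            DELETE_KEYS=True,                # drop this spider's redis keys on startup
            RETRY_FAILED_REQUESTS=False,     # re-queue the failed-request table on startup
            WARNING_FAILED_COUNT=1000,       # failed-task threshold used by check_task_status
        )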

+ 15 - 0
FworkSpider/feapder/core/spiders/__init__.py

@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/22 12:08 AM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+__all__ = ["AirSpider", "Spider", "BatchSpider"]
+
+from feapder.core.spiders.air_spider import AirSpider
+from feapder.core.spiders.spider import Spider
+from feapder.core.spiders.batch_spider import BatchSpider

+ 125 - 0
FworkSpider/feapder/core/spiders/air_spider.py

@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/22 12:05 AM
+---------
+@summary: 基于内存队列的爬虫,不支持分布式
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+from threading import Thread
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.buffer.item_buffer import ItemBuffer
+from feapder.core.base_parser import BaseParser
+from feapder.core.parser_control import AirSpiderParserControl
+from feapder.db.memory_db import MemoryDB
+from feapder.network.request import Request
+from feapder.utils.log import log
+from feapder.utils import metrics
+
+
+class AirSpider(BaseParser, Thread):
+    __custom_setting__ = {}
+
+    def __init__(self, thread_count=None):
+        """
+        基于内存队列的爬虫,不支持分布式
+        :param thread_count: 线程数
+        """
+        super(AirSpider, self).__init__()
+
+        for key, value in self.__class__.__custom_setting__.items():
+            setattr(setting, key, value)
+
+        self._thread_count = (
+            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
+        )
+
+        self._memory_db = MemoryDB()
+        self._parser_controls = []
+        self._item_buffer = ItemBuffer(redis_key="air_spider")
+
+        metrics.init(**setting.METRICS_OTHER_ARGS)
+
+    def distribute_task(self):
+        for request in self.start_requests():
+            if not isinstance(request, Request):
+                raise ValueError("仅支持 yield Request")
+
+            request.parser_name = request.parser_name or self.name
+            self._memory_db.add(request)
+
+    def all_thread_is_done(self):
+        for i in range(3):  # 降低偶然性, 因为各个环节不是并发的,很有可能当时状态为假,但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
+            # 检测 parser_control 状态
+            for parser_control in self._parser_controls:
+                if not parser_control.is_not_task():
+                    return False
+
+            # 检测 任务队列 状态
+            if not self._memory_db.empty():
+                return False
+
+            # 检测 item_buffer 状态
+            if (
+                self._item_buffer.get_items_count() > 0
+                or self._item_buffer.is_adding_to_db()
+            ):
+                return False
+
+            tools.delay_time(1)
+
+        return True
+
+    def run(self):
+        self.start_callback()
+
+        for i in range(self._thread_count):
+            parser_control = AirSpiderParserControl(self._memory_db, self._item_buffer)
+            parser_control.add_parser(self)
+            parser_control.start()
+            self._parser_controls.append(parser_control)
+
+        self._item_buffer.start()
+
+        self.distribute_task()
+
+        while True:
+            try:
+                if self.all_thread_is_done():
+                    # 停止 parser_controls
+                    for parser_control in self._parser_controls:
+                        parser_control.stop()
+
+                    # 关闭item_buffer
+                    self._item_buffer.stop()
+
+                    # 关闭webdriver
+                    if Request.webdriver_pool:
+                        Request.webdriver_pool.close()
+
+                    log.info("无任务,爬虫结束")
+                    break
+
+            except Exception as e:
+                log.exception(e)
+
+            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
+
+        self.end_callback()
+        # 为了线程可重复start
+        self._started.clear()
+        # 关闭打点
+        metrics.close()
+
+    def join(self, timeout=None):
+        """
+        重写线程的join
+        """
+        if not self._started.is_set():
+            return
+
+        super().join()
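
AirSpider keeps its task queue in MemoryDB and is not distributed (see the docstring above);
a minimal usage sketch, with example.com as a placeholder URL:

    import feapder


    class QuickSpider(feapder.AirSpider):
        def start_requests(self):
            # distribute_task() only accepts Request objects here.
            yield feapder.Request("https://example.com")

        def parse(self, request, response):
            print(response.xpath("//title/text()").extract_first())


    if __name__ == "__main__":
        QuickSpider(thread_count=4).start()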

+ 1273 - 0
FworkSpider/feapder/core/spiders/batch_spider.py

@@ -0,0 +1,1273 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/22 12:06 AM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import datetime
+import os
+import time
+import warnings
+from collections.abc import Iterable
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.buffer.item_buffer import MAX_ITEM_COUNT
+from feapder.core.base_parser import BatchParser
+from feapder.core.scheduler import Scheduler
+from feapder.db.mysqldb import MysqlDB
+from feapder.db.redisdb import RedisDB
+from feapder.network.item import Item
+from feapder.network.item import UpdateItem
+from feapder.network.request import Request
+from feapder.utils.log import log
+from feapder.utils.perfect_dict import PerfectDict
+from feapder.utils.redis_lock import RedisLock
+
+CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
+MYSQL_PIPELINE_PATH = "feapder.pipelines.mysql_pipeline.MysqlPipeline"
+
+
+class BatchSpider(BatchParser, Scheduler):
+    def __init__(
+        self,
+        task_table,
+        batch_record_table,
+        batch_name,
+        batch_interval,
+        task_keys,
+        task_state="state",
+        min_task_count=10000,
+        check_task_interval=5,
+        task_limit=10000,
+        related_redis_key=None,
+        related_batch_record=None,
+        task_condition="",
+        task_order_by="",
+        redis_key=None,
+        thread_count=None,
+        begin_callback=None,
+        end_callback=None,
+        delete_keys=(),
+        keep_alive=None,
+        **kwargs,
+    ):
+        """
+        @summary: 批次爬虫
+        必要条件
+        1、需有任务表
+            任务表中必须有id 及 任务状态字段 如 state。如指定parser_name字段,则任务会自动下发到对应的parser下, 否则会下发到所有的parser下。其他字段可根据爬虫需要的参数自行扩充
+
+            参考建表语句如下:
+            CREATE TABLE `table_name` (
+              `id` int(11) NOT NULL AUTO_INCREMENT,
+              `param` varchar(1000) DEFAULT NULL COMMENT '爬虫需要的抓取数据需要的参数',
+              `state` int(11) DEFAULT NULL COMMENT '任务状态',
+              `parser_name` varchar(255) DEFAULT NULL COMMENT '任务解析器的脚本类名',
+              PRIMARY KEY (`id`),
+              UNIQUE KEY `nui` (`param`) USING BTREE
+            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
+
+        2、需有批次记录表 不存在自动创建
+        ---------
+        @param task_table: mysql中的任务表
+        @param batch_record_table: mysql 中的批次记录表
+        @param batch_name: 批次采集程序名称
+        @param batch_interval: 批次间隔 天为单位。 如想一小时一批次,可写成1/24
+        @param task_keys: 需要获取的任务字段 列表 [] 如需指定解析的parser,则需将parser_name字段取出来。
+        @param task_state: mysql中任务表的任务状态字段
+        @param min_task_count: redis 中最少任务数, 少于这个数量会从mysql的任务表取任务
+        @param check_task_interval: 检查是否还有任务的时间间隔;
+        @param task_limit: 从数据库中取任务的数量
+        @param redis_key: 任务等数据存放在redis中的key前缀
+        @param thread_count: 线程数,默认为配置文件中的线程数
+        @param begin_callback: 爬虫开始回调函数
+        @param end_callback: 爬虫结束回调函数
+        @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬
+        @param keep_alive: 爬虫是否常驻,默认否
+        @param related_redis_key: 有关联的其他爬虫任务表(redis)注意:要避免环路 如 A -> B & B -> A 。
+        @param related_batch_record: 有关联的其他爬虫批次表(mysql)注意:要避免环路 如 A -> B & B -> A 。
+            related_redis_key 与 related_batch_record 选其一配置即可;用于相关联的爬虫没结束时,本爬虫也不结束
+            若相关连的爬虫为批次爬虫,推荐以related_batch_record配置,
+            若相关连的爬虫为普通爬虫,无批次表,可以以related_redis_key配置
+        @param task_condition: 任务条件 用于从一个大任务表中挑选出数据自己爬虫的任务,即where后的条件语句
+        @param task_order_by: 取任务时的排序条件 如 id desc
+        ---------
+        @result:
+        """
+        Scheduler.__init__(
+            self,
+            redis_key=redis_key,
+            thread_count=thread_count,
+            begin_callback=begin_callback,
+            end_callback=end_callback,
+            delete_keys=delete_keys,
+            keep_alive=keep_alive,
+            auto_start_requests=False,
+            batch_interval=batch_interval,
+            task_table=task_table,
+            **kwargs,
+        )
+
+        self._redisdb = RedisDB()
+        self._mysqldb = MysqlDB()
+
+        self._task_table = task_table  # mysql中的任务表
+        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
+        self._batch_name = batch_name  # 批次采集程序名称
+        self._task_keys = task_keys  # 需要获取的任务字段
+
+        self._task_state = task_state  # mysql中任务表的state字段名
+        self._min_task_count = min_task_count  # redis 中最少任务数
+        self._check_task_interval = check_task_interval
+        self._task_limit = task_limit  # mysql中一次取的任务数量
+        self._related_task_tables = [
+            setting.TAB_REQUSETS.format(redis_key=redis_key)
+        ]  # 自己的task表也需要检查是否有任务
+        if related_redis_key:
+            self._related_task_tables.append(
+                setting.TAB_REQUSETS.format(redis_key=related_redis_key)
+            )
+
+        self._related_batch_record = related_batch_record
+        self._task_condition = task_condition
+        self._task_condition_prefix_and = task_condition and " and {}".format(
+            task_condition
+        )
+        self._task_condition_prefix_where = task_condition and " where {}".format(
+            task_condition
+        )
+        self._task_order_by = task_order_by and " order by {}".format(task_order_by)
+
+        self._batch_date_cache = None
+        if self._batch_interval >= 1:
+            self._date_format = "%Y-%m-%d"
+        elif self._batch_interval < 1 and self._batch_interval >= 1 / 24:
+            self._date_format = "%Y-%m-%d %H"
+        else:
+            self._date_format = "%Y-%m-%d %H:%M"
+
+        # 报警相关
+        self._send_msg_interval = datetime.timedelta(hours=1)  # 每隔1小时发送一次报警
+        self._last_send_msg_time = None
+
+        self._spider_last_done_time = None  # 爬虫最近已做任务数量时间
+        self._spider_last_done_count = 0  # 爬虫最近已做任务数量
+        self._spider_deal_speed_cached = None
+
+        self._is_more_parsers = True  # 多模版类爬虫
+
+    def init_property(self):
+        """
+        每个批次开始时需要重置的属性
+        @return:
+        """
+        self._last_send_msg_time = None
+
+        self._spider_last_done_time = None
+        self._spider_last_done_count = 0  # 爬虫刚开始启动时已做任务数量
+
+    def add_parser(self, parser):
+        parser = parser(
+            self._task_table,
+            self._batch_record_table,
+            self._task_state,
+            self._date_format,
+            self._mysqldb,
+        )  # parser 实例化
+        self._parsers.append(parser)
+
+    def start_monitor_task(self):
+        """
+        @summary: 监控任务状态
+        ---------
+        ---------
+        @result:
+        """
+        if not self._parsers:  # 不是多模版模式, 将自己注入到parsers,自己为模版
+            self._is_more_parsers = False
+            self._parsers.append(self)
+
+        elif len(self._parsers) <= 1:
+            self._is_more_parsers = False
+
+        self.create_batch_record_table()
+
+        # 添加任务
+        for parser in self._parsers:
+            parser.add_task()
+
+        is_first_check = True
+        while True:
+            try:
+                if self.check_batch(is_first_check):  # 该批次已经做完
+                    if self._keep_alive:
+                        is_first_check = True
+                        log.info("爬虫所有任务已做完,不自动结束,等待新任务...")
+                        time.sleep(self._check_task_interval)
+                        continue
+                    else:
+                        break
+
+                is_first_check = False
+
+                # 检查redis中是否有任务 任务小于_min_task_count 则从mysql中取
+                tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
+                todo_task_count = self._redisdb.zget_count(tab_requests)
+
+                tasks = []
+                if todo_task_count < self._min_task_count:  # 从mysql中取任务
+                    # 更新batch表的任务状态数量
+                    self.update_task_done_count()
+
+                    log.info("redis 中剩余任务%s 数量过小 从mysql中取任务追加" % todo_task_count)
+                    tasks = self.get_todo_task_from_mysql()
+                    if not tasks:  # 状态为0的任务已经做完,需要检查状态为2的任务是否丢失
+
+                        if (
+                            todo_task_count == 0
+                        ):  # redis 中无待做任务,此时mysql中状态为2的任务为丢失任务。需重新做
+                            lose_task_count = self.get_lose_task_count()
+
+                            if not lose_task_count:
+                                time.sleep(self._check_task_interval)
+                                continue
+
+                            elif (
+                                lose_task_count > self._task_limit * 5
+                            ):  # 丢失任务太多,直接重置,否则每次等redis任务消耗完再取下一批丢失任务,速度过慢
+                                log.info("正在重置丢失任务为待做 共 {} 条".format(lose_task_count))
+                                # 重置正在做的任务为待做
+                                if self.reset_lose_task_from_mysql():
+                                    log.info("重置丢失任务成功")
+                                else:
+                                    log.info("重置丢失任务失败")
+
+                                continue
+
+                            else:  # 丢失任务少,直接取
+                                log.info(
+                                    "正在取丢失任务 共 {} 条, 取 {} 条".format(
+                                        lose_task_count,
+                                        self._task_limit
+                                        if self._task_limit <= lose_task_count
+                                        else lose_task_count,
+                                    )
+                                )
+                                tasks = self.get_doing_task_from_mysql()
+
+                    else:
+                        log.info("mysql 中取到待做任务 %s 条" % len(tasks))
+
+                else:
+                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count)
+
+                if not tasks:
+                    if todo_task_count >= self._min_task_count:
+                        # log.info('任务正在进行 redis中剩余任务 %s' % todo_task_count)
+                        pass
+                    else:
+                        log.info("mysql 中无待做任务 redis中剩余任务 %s" % todo_task_count)
+                else:
+                    # make start requests
+                    self.distribute_task(tasks)
+                    log.info("添加任务到redis成功")
+
+            except Exception as e:
+                log.exception(e)
+
+            time.sleep(self._check_task_interval)
+
+    def create_batch_record_table(self):
+        sql = (
+            "select table_name from information_schema.tables where table_name like '%s'"
+            % self._batch_record_table
+        )
+        tables_name = self._mysqldb.find(sql)
+        if not tables_name:
+            sql = """
+                CREATE TABLE `{table_name}` (
+                      `id` int(11) UNSIGNED NOT NULL AUTO_INCREMENT,
+                      `batch_date` {batch_date} DEFAULT NULL COMMENT '批次时间',
+                      `total_count` int(11) DEFAULT NULL COMMENT '任务总数',
+                      `done_count` int(11) DEFAULT NULL COMMENT '完成数 (1,-1)',
+                      `fail_count` int(11) DEFAULT NULL COMMENT '失败任务数 (-1)',
+                      `interval` float(11) DEFAULT NULL COMMENT '批次间隔',
+                      `interval_unit` varchar(20) DEFAULT NULL COMMENT '批次间隔单位 day, hour',
+                      `create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '批次开始时间',
+                      `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '本条记录更新时间',
+                      `is_done` int(11) DEFAULT '0' COMMENT '批次是否完成 0 未完成  1 完成',
+                      PRIMARY KEY (`id`)
+                    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
+            """.format(
+                table_name=self._batch_record_table,
+                batch_date="date" if self._date_format == "%Y-%m-%d" else "datetime",
+            )
+
+            self._mysqldb.execute(sql)
+
+    def distribute_task(self, tasks):
+        """
+        @summary: 分发任务
+        ---------
+        @param tasks:
+        ---------
+        @result:
+        """
+        if self._is_more_parsers:  # 为多模版类爬虫,需要下发指定的parser
+            for task in tasks:
+                for parser in self._parsers:  # 寻找task对应的parser
+                    if parser.name in task:
+                        task = PerfectDict(
+                            _dict=dict(zip(self._task_keys, task)), _values=list(task)
+                        )
+                        requests = parser.start_requests(task)
+                        if requests and not isinstance(requests, Iterable):
+                            raise Exception(
+                                "%s.%s返回值必须可迭代" % (parser.name, "start_requests")
+                            )
+
+                        result_type = 1
+                        for request in requests or []:
+                            if isinstance(request, Request):
+                                request.parser_name = request.parser_name or parser.name
+                                self._request_buffer.put_request(request)
+                                result_type = 1
+
+                            elif isinstance(request, Item):
+                                self._item_buffer.put_item(request)
+                                result_type = 2
+
+                                if (
+                                    self._item_buffer.get_items_count()
+                                    >= MAX_ITEM_COUNT
+                                ):
+                                    self._item_buffer.flush()
+
+                            elif callable(request):  # callable的request可能是更新数据库操作的函数
+                                if result_type == 1:
+                                    self._request_buffer.put_request(request)
+                                else:
+                                    self._item_buffer.put_item(request)
+
+                                    if (
+                                        self._item_buffer.get_items_count()
+                                        >= MAX_ITEM_COUNT
+                                    ):
+                                        self._item_buffer.flush()
+
+                            else:
+                                raise TypeError(
+                                    "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
+                                        type(requests)
+                                    )
+                                )
+
+                        break
+
+        else:  # task没对应的parser 则将task下发到所有的parser
+            for task in tasks:
+                for parser in self._parsers:
+                    task = PerfectDict(
+                        _dict=dict(zip(self._task_keys, task)), _values=list(task)
+                    )
+                    requests = parser.start_requests(task)
+                    if requests and not isinstance(requests, Iterable):
+                        raise Exception(
+                            "%s.%s返回值必须可迭代" % (parser.name, "start_requests")
+                        )
+
+                    result_type = 1
+                    for request in requests or []:
+                        if isinstance(request, Request):
+                            request.parser_name = request.parser_name or parser.name
+                            self._request_buffer.put_request(request)
+                            result_type = 1
+
+                        elif isinstance(request, Item):
+                            self._item_buffer.put_item(request)
+                            result_type = 2
+
+                            if self._item_buffer.get_items_count() >= MAX_ITEM_COUNT:
+                                self._item_buffer.flush()
+
+                        elif callable(request):  # callable的request可能是更新数据库操作的函数
+                            if result_type == 1:
+                                self._request_buffer.put_request(request)
+                            else:
+                                self._item_buffer.put_item(request)
+
+                                if (
+                                    self._item_buffer.get_items_count()
+                                    >= MAX_ITEM_COUNT
+                                ):
+                                    self._item_buffer.flush()
+
+        self._request_buffer.flush()
+        self._item_buffer.flush()
+
+    def __get_task_state_count(self):
+        sql = "select {state}, count(1) from {task_table}{task_condition} group by {state}".format(
+            state=self._task_state,
+            task_table=self._task_table,
+            task_condition=self._task_condition_prefix_where,
+        )
+        task_state_count = self._mysqldb.find(sql)
+
+        task_state = {
+            "total_count": sum(count for state, count in task_state_count),
+            "done_count": sum(
+                count for state, count in task_state_count if state in (1, -1)
+            ),
+            "failed_count": sum(
+                count for state, count in task_state_count if state == -1
+            ),
+        }
+
+        return task_state
+
+    def update_task_done_count(self):
+        """
+        @summary: 更新批次表中的任务状态
+        ---------
+        ---------
+        @result:
+        """
+        task_count = self.__get_task_state_count()
+
+        # log.info('《%s》 批次进度 %s/%s' % (self._batch_name, done_task_count, total_task_count))
+
+        # 更新批次表
+        sql = "update {} set done_count = {}, total_count = {}, fail_count = {}, update_time = CURRENT_TIME, is_done=0, `interval` = {}, interval_unit = '{}' where batch_date = '{}'".format(
+            self._batch_record_table,
+            task_count.get("done_count"),
+            task_count.get("total_count"),
+            task_count.get("failed_count"),
+            self._batch_interval
+            if self._batch_interval >= 1
+            else self._batch_interval * 24,
+            "day" if self._batch_interval >= 1 else "hour",
+            self.batch_date,
+        )
+        self._mysqldb.update(sql)
+
+    def update_is_done(self):
+        sql = "update {} set is_done = 1, update_time = CURRENT_TIME where batch_date = '{}' and is_done = 0".format(
+            self._batch_record_table, self.batch_date
+        )
+        self._mysqldb.update(sql)
+
+    def get_todo_task_from_mysql(self):
+        """
+        @summary: 取待做的任务
+        ---------
+        ---------
+        @result:
+        """
+        # TODO 分批取数据 每批最大取 1000000个,防止内存占用过大
+        # 查询任务
+        task_keys = ", ".join([f"`{key}`" for key in self._task_keys])
+        sql = "select %s from %s where %s = 0%s%s limit %s" % (
+            task_keys,
+            self._task_table,
+            self._task_state,
+            self._task_condition_prefix_and,
+            self._task_order_by,
+            self._task_limit,
+        )
+        tasks = self._mysqldb.find(sql)
+
+        if tasks:
+            # 更新任务状态
+            for i in range(0, len(tasks), 10000):  # 10000 一批量更新
+                task_ids = str(
+                    tuple([task[0] for task in tasks[i : i + 10000]])
+                ).replace(",)", ")")
+                sql = "update %s set %s = 2 where id in %s" % (
+                    self._task_table,
+                    self._task_state,
+                    task_ids,
+                )
+                self._mysqldb.update(sql)
+
+        return tasks
+
+    def get_doing_task_from_mysql(self):
+        """
+        @summary: 取正在做的任务
+        ---------
+        ---------
+        @result:
+        """
+
+        # 查询任务
+        task_keys = ", ".join([f"`{key}`" for key in self._task_keys])
+        sql = "select %s from %s where %s = 2%s%s limit %s" % (
+            task_keys,
+            self._task_table,
+            self._task_state,
+            self._task_condition_prefix_and,
+            self._task_order_by,
+            self._task_limit,
+        )
+        tasks = self._mysqldb.find(sql)
+
+        return tasks
+
+    def get_lose_task_count(self):
+        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
+            date_format=self._date_format.replace(":%M", ":%i"),
+            batch_record_table=self._batch_record_table,
+        )
+        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)
+        batch_date, total_count, done_count = batch_info[0]
+        return total_count - done_count
+
+    def reset_lose_task_from_mysql(self):
+        """
+        @summary: 重置丢失任务为待做
+        ---------
+        ---------
+        @result:
+        """
+
+        sql = "update {table} set {state} = 0 where {state} = 2{task_condition}".format(
+            table=self._task_table,
+            state=self._task_state,
+            task_condition=self._task_condition_prefix_and,
+        )
+        return self._mysqldb.update(sql)
+
+    def get_deal_speed(self, total_count, done_count, last_batch_date):
+        """
+        获取处理速度
+        @param total_count: 总数量
+        @param done_count: 做完数量
+        @param last_batch_date: 批次时间 datetime
+        @return:
+            deal_speed (条/小时), need_time (秒), overflow_time(秒) ( overflow_time < 0 时表示提前多少秒完成 )
+            或
+            None
+        """
+        if not self._spider_last_done_count:
+            now_date = datetime.datetime.now()
+            self._spider_last_done_count = done_count
+            self._spider_last_done_time = now_date
+
+        if done_count > self._spider_last_done_count:
+            now_date = datetime.datetime.now()
+
+            time_interval = (now_date - self._spider_last_done_time).total_seconds()
+            deal_speed = (
+                done_count - self._spider_last_done_count
+            ) / time_interval  # 条/秒
+            need_time = (total_count - done_count) / deal_speed  # 单位秒
+            overflow_time = (
+                (now_date - last_batch_date).total_seconds()
+                + need_time
+                - datetime.timedelta(days=self._batch_interval).total_seconds()
+            )  # 溢出时间 秒
+            calculate_speed_time = now_date.strftime("%Y-%m-%d %H:%M:%S")  # 统计速度时间
+
+            deal_speed = int(deal_speed * 3600)  # 条/小时
+
+            # 更新最近已做任务数及时间
+            self._spider_last_done_count = done_count
+            self._spider_last_done_time = now_date
+
+            self._spider_deal_speed_cached = (
+                deal_speed,
+                need_time,
+                overflow_time,
+                calculate_speed_time,
+            )
+
+        return self._spider_deal_speed_cached
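
A quick illustration of the figures this method reports; the numbers below are made-up assumptions, shown only to make the arithmetic concrete:

# Illustrative arithmetic only; the values are assumed, not taken from a real batch.
total_count, done_count, last_done_count = 10000, 3000, 1000
time_interval = 3600                                           # seconds since the last sample
deal_speed = (done_count - last_done_count) / time_interval    # ~0.56 tasks/second
need_time = (total_count - done_count) / deal_speed            # ~12600 seconds still needed
deal_speed = int(deal_speed * 3600)                            # 2000 tasks/hour, as reported
# overflow_time = elapsed_batch_time + need_time - batch_interval_in_seconds
# > 0 means the batch is expected to overrun; < 0 means it should finish early.
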
+
+    def init_task(self):
+        """
+        @summary: 初始化任务表中的任务, 新一个批次开始时调用。 可能会重写
+        ---------
+        ---------
+        @result:
+        """
+
+        sql = "update {task_table} set {state} = 0 where {state} != -1{task_condition}".format(
+            task_table=self._task_table,
+            state=self._task_state,
+            task_condition=self._task_condition_prefix_and,
+        )
+        return self._mysqldb.update(sql)
+
+    def check_batch(self, is_first_check=False):
+        """
+        @summary: 检查批次是否完成
+        ---------
+        @param: is_first_check 是否为首次检查,若首次检查,且检查结果为批次已完成,则不发送批次完成消息。因为之前发送过了
+        ---------
+        @result: 完成返回True 否则False
+        """
+
+        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
+            date_format=self._date_format.replace(":%M", ":%i"),
+            batch_record_table=self._batch_record_table,
+        )
+        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)
+
+        if batch_info:
+            batch_date, total_count, done_count = batch_info[0]
+
+            now_date = datetime.datetime.now()
+            last_batch_date = datetime.datetime.strptime(batch_date, self._date_format)
+            time_difference = now_date - last_batch_date
+
+            if total_count == done_count and time_difference < datetime.timedelta(
+                days=self._batch_interval
+            ):  # 若在本批次内,再次检查任务表是否有新增任务
+                # # 改成查询任务表 看是否真的没任务了,因为batch_record表里边的数量可能没来得及更新
+                task_count = self.__get_task_state_count()
+
+                total_count = task_count.get("total_count")
+                done_count = task_count.get("done_count")
+
+            if total_count == done_count:
+                # 检查相关联的爬虫是否完成
+                related_spider_is_done = self.related_spider_is_done()
+                if related_spider_is_done is False:
+                    msg = "《{}》本批次未完成, 正在等待依赖爬虫 {} 结束. 批次时间 {} 批次进度 {}/{}".format(
+                        self._batch_name,
+                        self._related_batch_record or self._related_task_tables,
+                        batch_date,
+                        done_count,
+                        total_count,
+                    )
+                    log.info(msg)
+                    # 检查是否超时 超时发出报警
+                    if time_difference >= datetime.timedelta(
+                        days=self._batch_interval
+                    ):  # 已经超时
+                        if (
+                            not self._last_send_msg_time
+                            or now_date - self._last_send_msg_time
+                            >= self._send_msg_interval
+                        ):
+                            self._last_send_msg_time = now_date
+                            self.send_msg(
+                                msg,
+                                level="error",
+                                message_prefix="《{}》本批次未完成, 正在等待依赖爬虫 {} 结束".format(
+                                    self._batch_name,
+                                    self._related_batch_record
+                                    or self._related_task_tables,
+                                ),
+                            )
+
+                    return False
+
+                elif related_spider_is_done is True:
+                    # 更新is_done 状态
+                    self.update_is_done()
+
+                else:
+                    self.update_is_done()
+
+                msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format(
+                    self._batch_name, batch_date, done_count
+                )
+                log.info(msg)
+                if not is_first_check:
+                    self.send_msg(msg)
+
+                # 判断下一批次是否到
+                if time_difference >= datetime.timedelta(days=self._batch_interval):
+                    msg = "《{}》下一批次开始".format(self._batch_name)
+                    log.info(msg)
+                    self.send_msg(msg)
+
+                    # 初始化任务表状态
+                    if self.init_task() != False:  # 更新失败返回False 其他返回True/None
+                        # 初始化属性
+                        self.init_property()
+
+                        is_success = (
+                            self.record_batch()
+                        )  # 有可能插入不成功,但是任务表已经重置了,不过由于当前时间为下一批次的时间,检查批次是否结束时不会检查任务表,所以下次执行时仍然会重置
+                        if is_success:
+                            # 看是否有等待任务的worker,若有则需要等会再下发任务,防止work批次时间没来得及更新
+                            current_timestamp = tools.get_current_timestamp()
+                            spider_count = self._redisdb.zget_count(
+                                self._tab_spider_status,
+                                priority_min=current_timestamp
+                                - (setting.COLLECTOR_SLEEP_TIME + 10),
+                                priority_max=current_timestamp,
+                            )
+                            if spider_count:
+                                log.info(
+                                    f"插入新批次记录成功,检测到有{spider_count}个爬虫进程在等待任务,本批任务1分钟后开始下发, 防止爬虫端缓存的批次时间没来得及更新"
+                                )
+                                tools.delay_time(60)
+                            else:
+                                log.info("插入新批次记录成功")
+
+                            return False  # 下一批次开始
+
+                        else:
+                            return True  # 下一批次不开始。先不派发任务,因为批次表新批次插入失败了,需要插入成功后再派发任务
+
+                else:
+                    log.info("《{}》下次批次时间未到".format(self._batch_name))
+                    if not is_first_check:
+                        self.send_msg("《{}》下次批次时间未到".format(self._batch_name))
+                    return True
+
+            else:
+                if time_difference >= datetime.timedelta(
+                    days=self._batch_interval
+                ):  # 已经超时
+                    time_out = time_difference - datetime.timedelta(
+                        days=self._batch_interval
+                    )
+                    time_out_pretty = tools.format_seconds(time_out.total_seconds())
+
+                    msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format(
+                        self._batch_name,
+                        time_out_pretty,
+                        batch_date,
+                        done_count,
+                        total_count,
+                    )
+                    if self._batch_interval >= 1:
+                        msg += ", 期望时间{}天".format(self._batch_interval)
+                    else:
+                        msg += ", 期望时间{}小时".format(self._batch_interval * 24)
+
+                    result = self.get_deal_speed(
+                        total_count=total_count,
+                        done_count=done_count,
+                        last_batch_date=last_batch_date,
+                    )
+                    if result:
+                        deal_speed, need_time, overflow_time, calculate_speed_time = (
+                            result
+                        )
+                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
+                            calculate_speed_time,
+                            deal_speed,
+                            tools.format_seconds(need_time),
+                        )
+
+                        if overflow_time > 0:
+                            msg += ", 该批次预计总超时 {}, 请及时处理".format(
+                                tools.format_seconds(overflow_time)
+                            )
+
+                    log.info(msg)
+
+                    if (
+                        not self._last_send_msg_time
+                        or now_date - self._last_send_msg_time
+                        >= self._send_msg_interval
+                    ):
+                        self._last_send_msg_time = now_date
+                        self.send_msg(
+                            msg,
+                            level="error",
+                            message_prefix="《{}》批次超时".format(self._batch_name),
+                        )
+
+                else:  # 未超时
+                    remaining_time = (
+                        datetime.timedelta(days=self._batch_interval) - time_difference
+                    )
+                    remaining_time_pretty = tools.format_seconds(
+                        remaining_time.total_seconds()
+                    )
+
+                    if self._batch_interval >= 1:
+                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format(
+                            self._batch_name,
+                            batch_date,
+                            done_count,
+                            total_count,
+                            self._batch_interval,
+                            remaining_time_pretty,
+                        )
+                    else:
+                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format(
+                            self._batch_name,
+                            batch_date,
+                            done_count,
+                            total_count,
+                            self._batch_interval * 24,
+                            remaining_time_pretty,
+                        )
+
+                    result = self.get_deal_speed(
+                        total_count=total_count,
+                        done_count=done_count,
+                        last_batch_date=last_batch_date,
+                    )
+                    if result:
+                        deal_speed, need_time, overflow_time, calculate_speed_time = (
+                            result
+                        )
+                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
+                            calculate_speed_time,
+                            deal_speed,
+                            tools.format_seconds(need_time),
+                        )
+
+                        if overflow_time > 0:
+                            msg += ", 该批次可能会超时 {}, 请及时处理".format(
+                                tools.format_seconds(overflow_time)
+                            )
+                            # 发送警报
+                            if (
+                                not self._last_send_msg_time
+                                or now_date - self._last_send_msg_time
+                                >= self._send_msg_interval
+                            ):
+                                self._last_send_msg_time = now_date
+                                self.send_msg(
+                                    msg,
+                                    level="error",
+                                    message_prefix="《{}》批次可能超时".format(
+                                        self._batch_name
+                                    ),
+                                )
+
+                        elif overflow_time < 0:
+                            msg += ", 该批次预计提前 {} 完成".format(
+                                tools.format_seconds(-overflow_time)
+                            )
+
+                    log.info(msg)
+
+        else:
+            # 插入batch_date
+            self.record_batch()
+
+            # 初始化任务表状态 可能有产生任务的代码
+            self.init_task()
+
+            return False
+
+    def related_spider_is_done(self):
+        """
+        相关连的爬虫是否跑完
+        @return: True / False / None 表示无相关的爬虫 可由自身的total_count 和 done_count 来判断
+        """
+
+        for related_redis_task_table in self._related_task_tables:
+            if self._redisdb.exists_key(related_redis_task_table):
+                return False
+
+        if self._related_batch_record:
+            sql = "select is_done from {} order by id desc limit 1".format(
+                self._related_batch_record
+            )
+            is_done = self._mysqldb.find(sql)
+            is_done = is_done[0][0] if is_done else None
+
+            if is_done is None:
+                log.warning("相关联的批次表不存在或无批次信息")
+                return None
+
+            if not is_done:
+                return False
+
+        return True
+
+    def record_batch(self):
+        """
+        @summary: 记录批次信息(初始化)
+        ---------
+        ---------
+        @result:
+        """
+
+        # 查询总任务数
+        sql = "select count(1) from %s%s" % (
+            self._task_table,
+            self._task_condition_prefix_where,
+        )
+        total_task_count = self._mysqldb.find(sql)[0][0]
+
+        batch_date = tools.get_current_date(self._date_format)
+
+        sql = (
+            "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
+            % (
+                self._batch_record_table,
+                batch_date,
+                0,
+                total_task_count,
+                self._batch_interval
+                if self._batch_interval >= 1
+                else self._batch_interval * 24,
+                "day" if self._batch_interval >= 1 else "hour",
+            )
+        )
+
+        affect_count = self._mysqldb.add(sql)  # None / 0 / 1 (1 为成功)
+        if affect_count:
+            # 重置批次日期
+            self._batch_date_cache = batch_date
+            # 重新刷下self.batch_date 中的 os.environ.get('batch_date') 否则日期还停留在上一个批次
+            os.environ["batch_date"] = self._batch_date_cache
+
+            # 爬虫开始
+            self.spider_begin()
+            self.record_spider_state(
+                spider_type=2,
+                state=0,
+                batch_date=batch_date,
+                spider_start_time=tools.get_current_date(),
+                batch_interval=self._batch_interval,
+            )
+        else:
+            log.error("插入新批次失败")
+
+        return affect_count
+
+    # -------- 批次结束逻辑 ------------
+
+    def task_is_done(self):
+        """
+        @summary: 检查任务状态 是否做完 同时更新批次时间 (不能挂 挂了批次时间就不更新了)
+        ---------
+        ---------
+        @result: True / False (做完 / 未做完)
+        """
+
+        is_done = False
+
+        # 查看批次记录表任务状态
+        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
+            date_format=self._date_format.replace(":%M", ":%i"),
+            batch_record_table=self._batch_record_table,
+        )
+
+        batch_info = self._mysqldb.find(sql)
+        if batch_info is None:
+            raise Exception("查询批次信息失败")
+
+        if batch_info:
+            self._batch_date_cache, total_count, done_count, is_done = batch_info[
+                0
+            ]  # 更新self._batch_date_cache, 防止新批次已经开始了,但self._batch_date_cache还是原来的批次时间
+
+            log.info(
+                "《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d"
+                % (
+                    self._batch_name,
+                    self._batch_date_cache,
+                    done_count,
+                    total_count,
+                    is_done,
+                )
+            )
+            os.environ["batch_date"] = self._batch_date_cache  # 更新BatchParser里边的批次时间
+
+        if is_done:  # 检查任务表中是否有没做的任务 若有则is_done 为 False
+            # 比较耗时 加锁防止多进程同时查询
+            with RedisLock(key=self._spider_name) as lock:
+                if lock.locked:
+                    log.info("批次表标记已完成,正在检查任务表是否有未完成的任务")
+
+                    sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
+                        self._task_table,
+                        self._task_state,
+                        self._task_state,
+                        self._task_condition_prefix_and,
+                    )
+                    tasks = self._mysqldb.find(sql)  # [(1,)]  / []
+                    if tasks:
+                        log.info("检测到任务表中有未完成任务,等待任务下发")
+                        is_done = False
+
+                        # 更新batch_record 表的is_done 状态,减少查询任务表的次数
+                        sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
+                            batch_record_table=self._batch_record_table,
+                            batch_date=self._batch_date_cache,
+                        )
+                        self._mysqldb.update(sql)
+
+                    else:
+                        log.info("任务表中任务均已完成,爬虫结束")
+                else:
+                    log.info("批次表标记已完成,其他爬虫进程正在检查任务表是否有未完成的任务,本进程跳过检查,继续等待")
+
+                    is_done = False
+
+        return is_done
+
+    def run(self):
+        """
+        @summary: 重写run方法 检查mysql中的任务是否做完, 做完停止
+        ---------
+        ---------
+        @result:
+        """
+        try:
+            self.create_batch_record_table()
+
+            if not self._parsers:  # 不是add_parser 模式
+                self._parsers.append(self)
+
+            self._start()
+
+            while True:
+                try:
+                    if (
+                        self.task_is_done() and self.all_thread_is_done()
+                    ):  # redis全部的任务已经做完 并且mysql中的任务已经做完(检查各个线程all_thread_is_done,防止任务没做完,就更新任务状态,导致程序结束的情况)
+                        if not self._is_notify_end:
+                            self.spider_end()
+                            self.record_spider_state(
+                                spider_type=2,
+                                state=1,
+                                batch_date=self._batch_date_cache,
+                                spider_end_time=tools.get_current_date(),
+                                batch_interval=self._batch_interval,
+                            )
+
+                            self._is_notify_end = True
+
+                        if not self._keep_alive:
+                            self._stop_all_thread()
+                            break
+                    else:
+                        self._is_notify_end = False
+
+                    self.check_task_status()
+
+                except Exception as e:
+                    log.exception(e)
+
+                tools.delay_time(10)  # 10秒钟检查一次爬虫状态
+
+        except Exception as e:
+            msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
+            log.error(msg)
+            self.send_msg(
+                msg, level="error", message_prefix="《%s》爬虫异常结束" % self._batch_name
+            )
+
+            os._exit(137)  # 退出码为137(父进程 wait() 读到的状态值为35072),方便爬虫管理器重启
+
+    @classmethod
+    def to_DebugBatchSpider(cls, *args, **kwargs):
+        # DebugBatchSpider 继承 cls
+        DebugBatchSpider.__bases__ = (cls,)
+        DebugBatchSpider.__name__ = cls.__name__
+        return DebugBatchSpider(*args, **kwargs)
+
+
+class DebugBatchSpider(BatchSpider):
+    """
+    Debug批次爬虫
+    """
+
+    __debug_custom_setting__ = dict(
+        COLLECTOR_SLEEP_TIME=1,
+        COLLECTOR_TASK_COUNT=1,
+        # SPIDER
+        SPIDER_THREAD_COUNT=1,
+        SPIDER_SLEEP_TIME=0,
+        SPIDER_TASK_COUNT=1,
+        SPIDER_MAX_RETRY_TIMES=10,
+        REQUEST_LOST_TIMEOUT=600,  # 10分钟
+        PROXY_ENABLE=False,
+        RETRY_FAILED_REQUESTS=False,
+        # 保存失败的request
+        SAVE_FAILED_REQUEST=False,
+        # 过滤
+        ITEM_FILTER_ENABLE=False,
+        REQUEST_FILTER_ENABLE=False,
+        OSS_UPLOAD_TABLES=(),
+        DELETE_KEYS=True,
+        ITEM_PIPELINES=[CONSOLE_PIPELINE_PATH],
+    )
+
+    def __init__(
+        self,
+        task_id=None,
+        task=None,
+        save_to_db=False,
+        update_stask=False,
+        *args,
+        **kwargs,
+    ):
+        """
+        @param task_id:  任务id
+        @param task:  任务  task 与 task_id 二者选一即可
+        @param save_to_db: 数据是否入库 默认否
+        @param update_stask: 是否更新任务 默认否
+        @param args:
+        @param kwargs:
+        """
+        warnings.warn(
+            "您正处于debug模式下,该模式下不会更新任务状态及数据入库,仅用于调试。正式发布前请更改为正常模式", category=Warning
+        )
+
+        if not task and not task_id:
+            raise Exception("task_id 与 task 不能同时为null")
+
+        kwargs["redis_key"] = kwargs["redis_key"] + "_debug"
+        if save_to_db and not self.__class__.__custom_setting__.get("ITEM_PIPELINES"):
+            self.__class__.__debug_custom_setting__.update(
+                ITEM_PIPELINES=[MYSQL_PIPELINE_PATH]
+            )
+        self.__class__.__custom_setting__.update(
+            self.__class__.__debug_custom_setting__
+        )
+
+        super(DebugBatchSpider, self).__init__(*args, **kwargs)
+
+        self._task_id = task_id
+        self._task = task
+        self._update_task = update_stask
+
+    def start_monitor_task(self):
+        """
+        @summary: 监控任务状态
+        ---------
+        ---------
+        @result:
+        """
+        if not self._parsers:  # 不是多模版模式, 将自己注入到parsers,自己为模版
+            self._is_more_parsers = False
+            self._parsers.append(self)
+
+        elif len(self._parsers) <= 1:
+            self._is_more_parsers = False
+
+        if self._task:
+            self.distribute_task([self._task])
+        else:
+            tasks = self.get_todo_task_from_mysql()
+            if not tasks:
+                raise Exception("未获取到任务 请检查 task_id: {} 是否存在".format(self._task_id))
+            self.distribute_task(tasks)
+
+        os.environ.setdefault("batch_date", "1970-00-00")
+        log.debug("下发任务完毕")
+
+    def get_todo_task_from_mysql(self):
+        """
+        @summary: 取待做的任务
+        ---------
+        ---------
+        @result:
+        """
+
+        # 查询任务
+        task_keys = ", ".join([f"`{key}`" for key in self._task_keys])
+        sql = "select %s from %s where id=%s" % (
+            task_keys,
+            self._task_table,
+            self._task_id,
+        )
+        tasks = self._mysqldb.find(sql)
+
+        return tasks
+
+    def save_cached(self, request, response, table):
+        pass
+
+    def update_task_state(self, task_id, state=1, *args, **kwargs):
+        """
+        @summary: 更新任务表中任务状态,做完每个任务时代码逻辑中要主动调用。可能会重写
+        调用方法为 yield lambda : self.update_task_state(task_id, state)
+        ---------
+        @param task_id:
+        @param state:
+        ---------
+        @result:
+        """
+        if self._update_task:
+            kwargs["id"] = task_id
+            kwargs[self._task_state] = state
+
+            sql = tools.make_update_sql(
+                self._task_table,
+                kwargs,
+                condition="id = {task_id}".format(task_id=task_id),
+            )
+
+            if self._mysqldb.update(sql):
+                log.debug("置任务%s状态成功" % task_id)
+            else:
+                log.error("置任务%s状态失败  sql=%s" % (task_id, sql))
+
+    def update_task_batch(self, task_id, state=1, *args, **kwargs):
+        """
+        批量更新任务 多处调用,更新的字段必须一致
+        注意:需要 写成 yield update_task_batch(...) 否则不会更新
+        @param task_id:
+        @param state:
+        @param kwargs:
+        @return:
+        """
+        if self._update_task:
+            kwargs["id"] = task_id
+            kwargs[self._task_state] = state
+
+            update_item = UpdateItem(**kwargs)
+            update_item.table_name = self._task_table
+            update_item.name_underline = self._task_table + "_item"
+
+            return update_item
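
The two helpers above (update_task_state / update_task_batch) are designed to be yielded from a parse callback rather than called directly, as their docstrings note. A minimal sketch of that pattern, illustrative only; it assumes start_requests attached the task id to the request as a custom task_id attribute:

def parse(self, request, response):
    # ... build and yield data items here ...
    # mark the task as done (state=1); wrap in a lambda so it runs through the buffer
    yield lambda: self.update_task_state(request.task_id, 1)
    # or batch-update several task columns at once; this must also be yielded
    yield self.update_task_batch(request.task_id, 1, remark="done")
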
+
+    def delete_tables(self, delete_tables_list):
+        if isinstance(delete_tables_list, bool):
+            delete_tables_list = [self._redis_key + "*"]
+        elif not isinstance(delete_tables_list, (list, tuple)):
+            delete_tables_list = [delete_tables_list]
+
+        redis = RedisDB()
+        for delete_tab in delete_tables_list:
+            if delete_tab == "*":
+                delete_tab = self._redis_key + "*"
+
+            tables = redis.getkeys(delete_tab)
+            for table in tables:
+                log.info("正在删除表 %s" % table)
+                redis.clear(table)
+
+    def run(self):
+        self.start_monitor_task()
+
+        if not self._parsers:  # 不是add_parser 模式
+            self._parsers.append(self)
+
+        self._start()
+
+        while True:
+            try:
+                if self.all_thread_is_done():
+                    self._stop_all_thread()
+                    break
+
+            except Exception as e:
+                log.exception(e)
+
+            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
+
+        self.delete_tables([self._redis_key + "*"])
+
+    def record_spider_state(
+        self,
+        spider_type,
+        state,
+        batch_date=None,
+        spider_start_time=None,
+        spider_end_time=None,
+        batch_interval=None,
+    ):
+        pass
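
For context, a minimal sketch of how a spider built on the classes above is typically wired up. Illustrative only: it assumes BatchSpider and Request are re-exported by the feapder package (as in upstream feapder), and the table names, task columns and constructor keywords shown (task_table, task_keys, batch_record_table, batch_name, batch_interval, redis_key) are assumptions inferred from the attributes used in this file, not taken from this commit.

import feapder


class MyBatchSpider(feapder.BatchSpider):
    def start_requests(self, task):
        # task is a PerfectDict built from task_keys, see distribute_task()
        yield feapder.Request(task.url, task_id=task.id)

    def parse(self, request, response):
        # ... yield Items here ...
        yield lambda: self.update_task_state(request.task_id, 1)


if __name__ == "__main__":
    spider = MyBatchSpider(
        redis_key="spider:my_batch",            # redis key prefix for this spider
        task_table="my_task",                   # MySQL task table (assumed name)
        task_keys=["id", "url"],                # columns pulled from the task table
        batch_record_table="my_batch_record",   # batch progress table, auto-created
        batch_name="my batch spider",
        batch_interval=1,                       # one batch per day
    )
    spider.start_monitor_task()   # master: maintains the batch record, pushes tasks to redis
    # spider.run()                # worker: consumes tasks from redis
    # Debug a single task without updating task state or writing to the database:
    # MyBatchSpider.to_DebugBatchSpider(task_id=1, redis_key="spider:my_batch", ...).run()
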

+ 437 - 0
FworkSpider/feapder/core/spiders/spider.py

@@ -0,0 +1,437 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/22 12:05 AM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import time
+import warnings
+from collections.abc import Iterable
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.core.base_parser import BaseParser
+from feapder.core.scheduler import Scheduler
+from feapder.db.redisdb import RedisDB
+from feapder.network.item import Item
+from feapder.network.request import Request
+from feapder.utils.log import log
+
+CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
+
+
+class Spider(
+    BaseParser, Scheduler
+):  # threading 中有name属性, 必须先继承BaseParser 否则其内部的name会被Scheduler的基类threading.Thread的name覆盖
+    """
+    @summary: 为了简化搭建爬虫
+    ---------
+    """
+
+    def __init__(
+        self,
+        redis_key=None,
+        min_task_count=1,
+        check_task_interval=5,
+        thread_count=None,
+        begin_callback=None,
+        end_callback=None,
+        delete_keys=(),
+        keep_alive=None,
+        auto_start_requests=None,
+        batch_interval=0,
+        wait_lock=True,
+        **kwargs
+    ):
+        """
+        @summary: 爬虫
+        ---------
+        @param redis_key: 任务等数据存放在redis中的key前缀
+        @param min_task_count: 任务队列中最少任务数, 少于这个数量才会添加任务,默认1。start_monitor_task 模式下生效
+        @param check_task_interval: 检查是否还有任务的时间间隔;默认5秒
+        @param thread_count: 线程数,默认为配置文件中的线程数
+        @param begin_callback: 爬虫开始回调函数
+        @param end_callback: 爬虫结束回调函数
+        @param delete_keys: 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬
+        @param keep_alive: 爬虫是否常驻
+        @param auto_start_requests: 爬虫是否自动添加任务
+        @param batch_interval: 抓取时间间隔 默认为0 天为单位 多次启动时,只有当前时间与第一次抓取结束的时间间隔大于指定的时间间隔时,爬虫才启动
+        @param wait_lock: 下发任务时否等待锁,若不等待锁,可能会存在多进程同时在下发一样的任务,因此分布式环境下请将该值设置True
+        ---------
+        @result:
+        """
+        super(Spider, self).__init__(
+            redis_key=redis_key,
+            thread_count=thread_count,
+            begin_callback=begin_callback,
+            end_callback=end_callback,
+            delete_keys=delete_keys,
+            keep_alive=keep_alive,
+            auto_start_requests=auto_start_requests,
+            batch_interval=batch_interval,
+            wait_lock=wait_lock,
+            **kwargs
+        )
+
+        self._min_task_count = min_task_count
+        self._check_task_interval = check_task_interval
+
+        self._is_distributed_task = False
+        self._is_show_not_task = False
+
+    def start_monitor_task(self, *args, **kws):
+        if not self.is_reach_next_spider_time():
+            return
+
+        self._auto_start_requests = False
+        redisdb = RedisDB()
+
+        if not self._parsers:  # 不是add_parser 模式
+            self._parsers.append(self)
+
+        while True:
+            try:
+                # 检查redis中是否有任务
+                tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
+                todo_task_count = redisdb.zget_count(tab_requests)
+
+                if todo_task_count < self._min_task_count:  # 添加任务
+                    # make start requests
+                    self.distribute_task(*args, **kws)
+
+                else:
+                    log.info("redis 中尚有%s条积压任务,暂时不派发新任务" % todo_task_count)
+
+            except Exception as e:
+                log.exception(e)
+
+            if not self._keep_alive:
+                break
+
+            time.sleep(self._check_task_interval)
+
+    def distribute_task(self, *args, **kws):
+        """
+        @summary: 分发任务 并将返回的request入库
+        ---------
+        @param tasks:
+        ---------
+        @result:
+        """
+        self._is_distributed_task = False
+
+        for parser in self._parsers:
+            requests = parser.start_requests(*args, **kws)
+            if requests and not isinstance(requests, Iterable):
+                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
+
+            result_type = 1
+            for request in requests or []:
+                if isinstance(request, Request):
+                    request.parser_name = request.parser_name or parser.name
+                    self._request_buffer.put_request(request)
+
+                    self._is_distributed_task = True
+                    result_type = 1
+
+                elif isinstance(request, Item):
+                    self._item_buffer.put_item(request)
+                    result_type = 2
+
+                elif callable(request):  # callable的request可能是更新数据库操作的函数
+                    if result_type == 1:
+                        self._request_buffer.put_request(request)
+                    else:
+                        self._item_buffer.put_item(request)
+                else:
+                    raise TypeError(
+                        "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
+                            type(request)
+                        )
+                    )
+
+            self._request_buffer.flush()
+            self._item_buffer.flush()
+
+        if self._is_distributed_task:  # 有任务时才提示启动爬虫
+            # begin
+            self.spider_begin()
+            self.record_spider_state(
+                spider_type=1,
+                state=0,
+                batch_date=tools.get_current_date(),
+                spider_start_time=tools.get_current_date(),
+                batch_interval=self._batch_interval,
+            )
+
+            # 重置已经提示无任务状态为False
+            self._is_show_not_task = False
+
+        elif not self._is_show_not_task:  # 无任务,且没推送过无任务信息
+            # 发送无任务消息
+            msg = "《%s》start_requests无任务添加" % (self._spider_name)
+            log.info(msg)
+
+            # self.send_msg(msg)
+
+            self._is_show_not_task = True
+
+    def run(self):
+        if not self.is_reach_next_spider_time():
+            return
+
+        if not self._parsers:  # 不是add_parser 模式
+            self._parsers.append(self)
+
+        self._start()
+
+        while True:
+            try:
+                if self.all_thread_is_done():
+                    if not self._is_notify_end:
+                        self.spider_end()  # 跑完一轮
+                        self.record_spider_state(
+                            spider_type=1,
+                            state=1,
+                            spider_end_time=tools.get_current_date(),
+                            batch_interval=self._batch_interval,
+                        )
+
+                        self._is_notify_end = True
+
+                    if not self._keep_alive:
+                        self._stop_all_thread()
+                        break
+
+                else:
+                    self._is_notify_end = False
+
+                self.check_task_status()
+            except Exception as e:
+                log.exception(e)
+
+            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
+
+    @classmethod
+    def to_DebugSpider(cls, *args, **kwargs):
+        # DebugSpider 继承 cls
+        DebugSpider.__bases__ = (cls,)
+        DebugSpider.__name__ = cls.__name__
+        return DebugSpider(*args, **kwargs)
+
+
+class DebugSpider(Spider):
+    """
+    Debug爬虫
+    """
+
+    __debug_custom_setting__ = dict(
+        COLLECTOR_SLEEP_TIME=1,
+        COLLECTOR_TASK_COUNT=1,
+        # SPIDER
+        SPIDER_THREAD_COUNT=1,
+        SPIDER_SLEEP_TIME=0,
+        SPIDER_TASK_COUNT=1,
+        SPIDER_MAX_RETRY_TIMES=10,
+        REQUEST_LOST_TIMEOUT=600,  # 10分钟
+        PROXY_ENABLE=False,
+        RETRY_FAILED_REQUESTS=False,
+        # 保存失败的request
+        SAVE_FAILED_REQUEST=False,
+        # 过滤
+        ITEM_FILTER_ENABLE=False,
+        REQUEST_FILTER_ENABLE=False,
+        OSS_UPLOAD_TABLES=(),
+        DELETE_KEYS=True,
+        ITEM_PIPELINES=[CONSOLE_PIPELINE_PATH],
+    )
+
+    def __init__(self, request=None, request_dict=None, *args, **kwargs):
+        """
+        @param request: request 类对象
+        @param request_dict: request 字典。 request 与 request_dict 二者选一即可
+        @param kwargs:
+        """
+        warnings.warn(
+            "您正处于debug模式下,该模式下不会更新任务状态及数据入库,仅用于调试。正式发布前请更改为正常模式", category=Warning
+        )
+
+        if not request and not request_dict:
+            raise Exception("request 与 request_dict 不能同时为null")
+
+        kwargs["redis_key"] = kwargs["redis_key"] + "_debug"
+        self.__class__.__custom_setting__.update(
+            self.__class__.__debug_custom_setting__
+        )
+
+        super(DebugSpider, self).__init__(*args, **kwargs)
+
+        self._request = request or Request.from_dict(request_dict)
+
+    def save_cached(self, request, response, table):
+        pass
+
+    def delete_tables(self, delete_tables_list):
+        if isinstance(delete_tables_list, bool):
+            delete_tables_list = [self._redis_key + "*"]
+        elif not isinstance(delete_tables_list, (list, tuple)):
+            delete_tables_list = [delete_tables_list]
+
+        redis = RedisDB()
+        for delete_tab in delete_tables_list:
+            if delete_tab == "*":
+                delete_tab = self._redis_key + "*"
+
+            tables = redis.getkeys(delete_tab)
+            for table in tables:
+                log.info("正在删除表 %s" % table)
+                redis.clear(table)
+
+    def __start_requests(self):
+        yield self._request
+
+    def distribute_task(self):
+        """
+        @summary: 分发任务 并将返回的request入库
+        ---------
+        ---------
+        @result:
+        """
+        self._is_distributed_task = False
+
+        for parser in self._parsers:
+            requests = parser.__start_requests()
+            if requests and not isinstance(requests, Iterable):
+                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
+
+            result_type = 1
+            for request in requests or []:
+                if isinstance(request, Request):
+                    request.parser_name = request.parser_name or parser.name
+                    self._request_buffer.put_request(request)
+
+                    self._is_distributed_task = True
+                    result_type = 1
+
+                elif isinstance(request, Item):
+                    self._item_buffer.put_item(request)
+                    result_type = 2
+
+                elif callable(request):  # callable的request可能是更新数据库操作的函数
+                    if result_type == 1:
+                        self._request_buffer.put_request(request)
+                    else:
+                        self._item_buffer.put_item(request)
+
+            self._request_buffer.flush()
+            self._item_buffer.flush()
+
+        if self._is_distributed_task:  # 有任务时才提示启动爬虫
+            # begin
+            self.spider_begin()
+            self.record_spider_state(
+                spider_type=1,
+                state=0,
+                batch_date=tools.get_current_date(),
+                spider_start_time=tools.get_current_date(),
+                batch_interval=self._batch_interval,
+            )
+
+            # 重置已经提示无任务状态为False
+            self._is_show_not_task = False
+
+        elif not self._is_show_not_task:  # 无任务,且没推送过无任务信息
+            # 发送无任务消息
+            msg = "《%s》start_requests无任务添加" % (self._spider_name)
+            log.info(msg)
+
+            # self.send_msg(msg)
+
+            self._is_show_not_task = True
+
+    def record_spider_state(
+        self,
+        spider_type,
+        state,
+        batch_date=None,
+        spider_start_time=None,
+        spider_end_time=None,
+        batch_interval=None,
+    ):
+        pass
+
+    def _start(self):
+        # 启动parser 的 start_requests
+        self.spider_begin()  # 不自动结束的爬虫此处只能执行一遍
+
+        for parser in self._parsers:
+            results = parser.__start_requests()
+            # 添加request到请求队列,由请求队列统一入库
+            if results and not isinstance(results, Iterable):
+                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))
+
+            result_type = 1
+            for result in results or []:
+                if isinstance(result, Request):
+                    result.parser_name = result.parser_name or parser.name
+                    self._request_buffer.put_request(result)
+                    result_type = 1
+
+                elif isinstance(result, Item):
+                    self._item_buffer.put_item(result)
+                    result_type = 2
+
+                elif callable(result):  # callable的request可能是更新数据库操作的函数
+                    if result_type == 1:
+                        self._request_buffer.put_request(result)
+                    else:
+                        self._item_buffer.put_item(result)
+
+            self._request_buffer.flush()
+            self._item_buffer.flush()
+
+        # 启动collector
+        self._collector.start()
+
+        # 启动parser control
+        for i in range(self._thread_count):
+            parser_control = self._parser_control_obj(
+                self._collector,
+                self._redis_key,
+                self._request_buffer,
+                self._item_buffer,
+            )
+
+            for parser in self._parsers:
+                parser_control.add_parser(parser)
+
+            parser_control.start()
+            self._parser_controls.append(parser_control)
+
+        # 启动request_buffer
+        self._request_buffer.start()
+
+        # 启动item_buffer
+        self._item_buffer.start()
+
+    def run(self):
+        if not self._parsers:  # 不是add_parser 模式
+            self._parsers.append(self)
+
+        self._start()
+
+        while True:
+            try:
+                if self.all_thread_is_done():
+                    self._stop_all_thread()
+                    break
+            except Exception as e:
+                log.exception(e)
+
+            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
+
+        self.delete_tables([self._redis_key + "*"])
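
A minimal sketch of how the Spider and DebugSpider classes above are typically driven. Illustrative only: it assumes Spider and Request are re-exported from the feapder package as in upstream feapder, and example.com is a placeholder URL.

import feapder


class MySpider(feapder.Spider):
    def start_requests(self):
        yield feapder.Request("https://example.com")

    def parse(self, request, response):
        # ... build and yield Items here ...
        pass


if __name__ == "__main__":
    # normal run: executes start_requests once and consumes the queue
    MySpider(redis_key="spider:my_spider").run()

    # or debug a single request; task state and pipelines are not touched
    # MySpider.to_DebugSpider(
    #     redis_key="spider:my_spider",
    #     request=feapder.Request("https://example.com"),
    # ).run()
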

+ 9 - 0
FworkSpider/feapder/db/__init__.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/23 12:09 AM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""

+ 37 - 0
FworkSpider/feapder/db/memory_db.py

@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/4/21 11:42 PM
+---------
+@summary: 基于内存的队列,代替redis
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+from queue import Empty, PriorityQueue
+
+
+class MemoryDB:
+    def __init__(self):
+        self.priority_queue = PriorityQueue()
+
+    def add(self, item):
+        """
+        添加任务
+        :param item: 数据: 支持小于号比较的类 或者 (priority, item)
+        :return:
+        """
+        self.priority_queue.put(item)
+
+    def get(self):
+        """
+        获取任务
+        :return:
+        """
+        try:
+            item = self.priority_queue.get_nowait()
+            return item
+        except Empty:  # 队列为空
+            return None
+
+    def empty(self):
+        return self.priority_queue.empty()
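
A tiny usage sketch for the in-memory queue above (illustrative only), using the (priority, item) form mentioned in the add() docstring:

db = MemoryDB()
db.add((1, "high-priority task"))   # (priority, item): a smaller number pops first
db.add((5, "low-priority task"))

while not db.empty():
    priority, task = db.get()
    print(priority, task)           # prints 1 ... then 5 ...
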

+ 426 - 0
FworkSpider/feapder/db/mongodb.py

@@ -0,0 +1,426 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-04-18 14:12:21
+---------
+@summary: 操作mongo数据库
+---------
+@author: Mkdir700
+@email:  mkdir700@gmail.com
+"""
+import re
+from typing import List, Dict, Optional
+from urllib import parse
+
+import pymongo
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from pymongo.database import Database
+from pymongo.errors import DuplicateKeyError, BulkWriteError
+
+import feapder.setting as setting
+from feapder.utils.log import log
+
+
+class MongoDB:
+    def __init__(
+        self,
+        ip=None,
+        port=None,
+        db=None,
+        user_name=None,
+        user_pass=None,
+        url=None,
+        **kwargs,
+    ):
+        if url:
+            self.client = MongoClient(url, **kwargs)
+        else:
+            if not ip:
+                ip = setting.MONGO_IP
+            if not port:
+                port = setting.MONGO_PORT
+            if not db:
+                db = setting.MONGO_DB
+            if not user_name:
+                user_name = setting.MONGO_USER_NAME
+            if not user_pass:
+                user_pass = setting.MONGO_USER_PASS
+            self.client = MongoClient(
+                host=ip, port=port, username=user_name, password=user_pass
+            )
+
+        self.db = self.get_database(db)
+
+        # 缓存索引信息
+        self.__index__cached = {}
+
+    @classmethod
+    def from_url(cls, url, **kwargs):
+        """
+        Args:
+            url: mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
+                 参考:http://mongodb.github.io/mongo-java-driver/3.4/javadoc/com/mongodb/MongoClientURI.html
+            **kwargs:
+
+        Returns:
+
+        """
+        url_parsed = parse.urlparse(url)
+
+        db_type = url_parsed.scheme.strip()
+        if db_type != "mongodb":
+            raise Exception(
+                "url error, expect mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]], but get {}".format(
+                    url
+                )
+            )
+
+        return cls(url=url, **kwargs)
+
+    def get_database(self, database, **kwargs) -> Database:
+        """
+        获取数据库对象
+        @param database: 数据库名
+        @return:
+        """
+        return self.client.get_database(database, **kwargs)
+
+    def get_collection(self, coll_name, **kwargs) -> Collection:
+        """
+        根据集合名获取集合对象
+        @param coll_name: 集合名
+        @return:
+        """
+        return self.db.get_collection(coll_name, **kwargs)
+
+    def find(
+        self, coll_name: str, condition: Optional[Dict] = None, limit: int = 0, **kwargs
+    ) -> List[Dict]:
+        """
+        @summary:
+        无数据: 返回[]
+        有数据: [{'_id': 'xx', ...}, ...]
+        ---------
+        @param coll_name: 集合名(表名)
+        @param condition: 查询条件
+        @param limit: 结果数量
+        @param kwargs:
+            更多参数 https://docs.mongodb.com/manual/reference/command/find/#command-fields
+
+        ---------
+        @result:
+        """
+        condition = {} if condition is None else condition
+        command = {"find": coll_name, "filter": condition, "limit": limit}
+        command.update(kwargs)
+        result = self.run_command(command)
+        cursor = result["cursor"]
+        cursor_id = cursor["id"]
+        dataset = cursor["firstBatch"]
+        while True:
+            if cursor_id == 0:
+                break
+            result = self.run_command(
+                {
+                    "getMore": cursor_id,
+                    "collection": coll_name,
+                    "batchSize": kwargs.get("batchSize", 100),
+                }
+            )
+            cursor = result["cursor"]
+            cursor_id = cursor["id"]
+            dataset.extend(cursor["nextBatch"])
+        return dataset
+
+    def add(
+        self,
+        coll_name,
+        data: Dict,
+        replace=False,
+        update_columns=(),
+        update_columns_value=(),
+        insert_ignore=False,
+    ):
+        """
+        添加单条数据
+        Args:
+            coll_name: 集合名
+            data: 单条数据
+            replace: 唯一索引冲突时直接覆盖旧数据,默认为False
+            update_columns: 更新指定的列(如果数据唯一索引冲突,则更新指定字段,如 update_columns = ["name", "title"])
+            update_columns_value: 指定更新的字段对应的值, 不指定则用数据本身的值更新
+            insert_ignore: 索引冲突是否忽略 默认False
+
+        Returns: 插入成功的行数
+
+        """
+        affect_count = 1
+        collection = self.get_collection(coll_name)
+        try:
+            collection.insert_one(data)
+        except DuplicateKeyError as e:
+            data.pop("_id", "")
+            # 存在则更新
+            if update_columns:
+                if not isinstance(update_columns, (tuple, list)):
+                    update_columns = [update_columns]
+
+                condition = self.__get_update_condition(
+                    coll_name, data, e.details.get("errmsg")
+                )
+
+                # 更新指定的列
+                if update_columns_value:
+                    # 使用指定的值更新
+                    doc = {
+                        key: value
+                        for key, value in zip(update_columns, update_columns_value)
+                    }
+                else:
+                    # 使用数据本身的值更新
+                    doc = {key: data[key] for key in update_columns}
+
+                collection.update_one(condition, {"$set": doc})
+
+            # 覆盖更新
+            elif replace:
+                condition = self.__get_update_condition(
+                    coll_name, data, e.details.get("errmsg")
+                )
+                # 替换已存在的数据
+                collection.replace_one(condition, data)
+
+            elif not insert_ignore:
+                raise e
+
+        return affect_count
+
+    def add_batch(
+        self,
+        coll_name: str,
+        datas: List[Dict],
+        replace=False,
+        update_columns=(),
+        update_columns_value=(),
+        condition_fields: dict = None,
+    ):
+        """
+        批量添加数据
+        Args:
+            coll_name: 集合名
+            datas: 数据 [{'_id': 'xx'}, ... ]
+            replace:  唯一索引冲突时直接覆盖旧数据,默认为False
+            update_columns: 更新指定的列(如果数据的唯一索引存在,则更新指定字段,如 update_columns = ["name", "title"])
+            update_columns_value: 指定更新的字段对应的值, 不指定则用数据本身的值更新
+            condition_fields: 用于条件查找的字段,不指定则用索引冲突中的字段查找
+
+        Returns: 添加行数,不包含更新
+
+        """
+        add_count = 0
+
+        if not datas:
+            return add_count
+
+        collection = self.get_collection(coll_name)
+        if not isinstance(update_columns, (tuple, list)):
+            update_columns = [update_columns]
+
+        try:
+            add_count = len(datas)
+            collection.insert_many(datas, ordered=False)
+        except BulkWriteError as e:
+            write_errors = e.details.get("writeErrors")
+            for error in write_errors:
+                if error.get("code") == 11000:
+                    # 数据重复
+                    # 获取重复的数据
+                    data = error.get("op")
+                    data.pop("_id", "")
+
+                    def get_condition():
+                        # 获取更新条件
+                        if condition_fields:
+                            condition = {
+                                condition_field: data[condition_field]
+                                for condition_field in condition_fields
+                            }
+                        else:
+                            # 根据重复的值获取更新条件
+                            condition = self.__get_update_condition(
+                                coll_name, data, error.get("errmsg")
+                            )
+
+                        return condition
+
+                    if update_columns:
+                        # 更新指定的列
+                        if update_columns_value:
+                            # 使用指定的值更新
+                            doc = {
+                                key: value
+                                for key, value in zip(
+                                    update_columns, update_columns_value
+                                )
+                            }
+                        else:
+                            # 使用数据本身的值更新
+                            doc = {key: data.get(key) for key in update_columns}
+
+                        collection.update_one(get_condition(), {"$set": doc})
+                        add_count -= 1
+
+                    elif replace:
+                        # 覆盖更新
+                        collection.replace_one(get_condition(), data)
+                        add_count -= 1
+
+                    else:
+                        # log.error(error)
+                        add_count -= 1
+
+        return add_count
+
+    def count(self, coll_name, condition: Optional[Dict], limit=0, **kwargs):
+        """
+        计数
+        @param coll_name: 集合名
+        @param condition: 查询条件
+        @param limit: 限制数量
+        @param kwargs:
+        ----
+        command = {
+          count: <collection or view>,
+          query: <document>,
+          limit: <integer>,
+          skip: <integer>,
+          hint: <hint>,
+          readConcern: <document>,
+          collation: <document>,
+          comment: <any>
+        }
+        https://docs.mongodb.com/manual/reference/command/count/#mongodb-dbcommand-dbcmd.count
+        @return: 数据数量
+        """
+        command = {"count": coll_name, "query": condition, "limit": limit, **kwargs}
+        result = self.run_command(command)
+        return result["n"]
+
+    def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
+        """
+        更新
+        Args:
+            coll_name: 集合名
+            data: 单条数据 {"xxx":"xxx"}
+            condition: 更新条件 {"_id": "xxxx"}
+            upsert: 数据不存在则插入,默认为 False
+
+        Returns: True / False
+        """
+        try:
+            collection = self.get_collection(coll_name)
+            collection.update_one(condition, {"$set": data}, upsert=upsert)
+        except Exception as e:
+            log.error(
+                """
+                error:{}
+                condition: {}
+            """.format(
+                    e, condition
+                )
+            )
+            return False
+        else:
+            return True
+
+    def delete(self, coll_name, condition: Dict) -> bool:
+        """
+        删除
+        Args:
+            coll_name: 集合名
+            condition: 查找条件
+        Returns: True / False
+
+        """
+        try:
+            collection = self.get_collection(coll_name)
+            collection.delete_one(condition)
+        except Exception as e:
+            log.error(
+                """
+                error:{}
+                condition: {}
+            """.format(
+                    e, condition
+                )
+            )
+            return False
+        else:
+            return True
+
+    def run_command(self, command: Dict):
+        """
+        运行指令
+        参考文档 https://www.geek-book.com/src/docs/mongodb/mongodb/docs.mongodb.com/manual/reference/command/index.html
+        @param command:
+        @return:
+        """
+        return self.db.command(command)
+
+    def create_index(self, coll_name, keys, unique=True):
+        collection = self.get_collection(coll_name)
+        _keys = [(key, pymongo.ASCENDING) for key in keys]
+        collection.create_index(_keys, unique=unique)
+
+    def get_index(self, coll_name):
+        return self.get_collection(coll_name).index_information()
+
+    def drop_collection(self, coll_name):
+        return self.db.drop_collection(coll_name)
+
+    def get_index_key(self, coll_name, index_name):
+        """
+        获取参与索引的key
+        Args:
+            index_name: 索引名
+
+        Returns:
+
+        """
+        cache_key = f"{coll_name}:{index_name}"
+
+        if cache_key in self.__index__cached:
+            return self.__index__cached.get(cache_key)
+
+        index = self.get_index(coll_name)
+        index_detail = index.get(index_name)
+        if not index_detail:
+            errmsg = f"not found index {index_name} in collection {coll_name}"
+            raise Exception(errmsg)
+
+        index_keys = [val[0] for val in index_detail.get("key")]
+        self.__index__cached[cache_key] = index_keys
+        return index_keys
+
+    def __get_update_condition(
+        self, coll_name: str, data: dict, duplicate_errmsg: str
+    ) -> dict:
+        """
+        根据索引冲突的报错信息 获取更新条件
+        Args:
+            duplicate_errmsg: E11000 duplicate key error collection: feapder.test index: a_1_b_1 dup key: { : 1, : "你好" }
+            data: {"a": 1, "b": "你好", "c": "嘻嘻"}
+
+        Returns: {"a": 1, "b": "你好"}
+
+        """
+        index_name = re.search(r"index: (\w+)", duplicate_errmsg).group(1)
+        index_keys = self.get_index_key(coll_name, index_name)
+
+        condition = {key: data.get(key) for key in index_keys}
+        return condition
+
+    def __getattr__(self, name):
+        return getattr(self.db, name)

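A minimal usage sketch of the MongoDB wrapper above, assuming a reachable MongoDB instance; the connection URL, the `test_coll` collection and its unique index on `title` are illustrative placeholders, not part of the project:

    from feapder.db.mongodb import MongoDB

    db = MongoDB.from_url("mongodb://user:pass@127.0.0.1:27017/feapder")  # or MongoDB() to use setting.py
    db.create_index("test_coll", keys=["title"], unique=True)

    # on a duplicate-key conflict only the listed columns are updated
    db.add("test_coll", {"title": "a", "status": 0}, update_columns=["status"])

    # batch insert; conflicting documents are replaced instead of raising
    added = db.add_batch("test_coll", [{"title": "a"}, {"title": "b"}], replace=True)
    docs = db.find("test_coll", condition={"title": "a"}, limit=1)
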
+ 381 - 0
FworkSpider/feapder/db/mysqldb.py

@@ -0,0 +1,381 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2016-11-16 16:25
+---------
+@summary: 操作mysql数据库
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import datetime
+import json
+from urllib import parse
+from typing import List, Dict
+
+import pymysql
+from dbutils.pooled_db import PooledDB
+from pymysql import cursors
+from pymysql import err
+
+import feapder.setting as setting
+from feapder.utils.log import log
+from feapder.utils.tools import make_insert_sql, make_batch_sql, make_update_sql
+
+
+def auto_retry(func):
+    def wrapper(*args, **kwargs):
+        for i in range(3):
+            try:
+                return func(*args, **kwargs)
+            except (err.InterfaceError, err.OperationalError) as e:
+                log.error(
+                    """
+                    error:%s
+                    sql:  %s
+                    """
+                    % (e, kwargs.get("sql") or args[1])
+                )
+
+    return wrapper
+
+
+class MysqlDB:
+    def __init__(
+        self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs
+    ):
+        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
+        if not ip:
+            ip = setting.MYSQL_IP
+        if not port:
+            port = setting.MYSQL_PORT
+        if not db:
+            db = setting.MYSQL_DB
+        if not user_name:
+            user_name = setting.MYSQL_USER_NAME
+        if not user_pass:
+            user_pass = setting.MYSQL_USER_PASS
+
+        try:
+
+            self.connect_pool = PooledDB(
+                creator=pymysql,
+                mincached=1,
+                maxcached=100,
+                maxconnections=100,
+                blocking=True,
+                ping=7,
+                host=ip,
+                port=port,
+                user=user_name,
+                passwd=user_pass,
+                db=db,
+                charset="utf8mb4",
+                cursorclass=cursors.SSCursor,
+            )  # cursorclass 使用服务的游标,默认的在多线程下大批量插入数据会使内存递增
+
+        except Exception as e:
+            log.error(
+                """
+            连接数据库失败:
+            ip: {}
+            port: {}
+            db: {}
+            user_name: {}
+            user_pass: {}
+            exception: {}
+            """.format(
+                    ip, port, db, user_name, user_pass, e
+                )
+            )
+        else:
+            log.debug("连接到mysql数据库 %s : %s" % (ip, db))
+
+    @classmethod
+    def from_url(cls, url, **kwargs):
+        # mysql://username:password@ip:port/db?charset=utf8mb4
+        url_parsed = parse.urlparse(url)
+
+        db_type = url_parsed.scheme.strip()
+        if db_type != "mysql":
+            raise Exception(
+                "url error, expect mysql://username:ip:port/db?charset=utf8mb4, but get {}".format(
+                    url
+                )
+            )
+
+        connect_params = {}
+        connect_params["ip"] = url_parsed.hostname.strip()
+        connect_params["port"] = url_parsed.port
+        connect_params["user_name"] = url_parsed.username.strip()
+        connect_params["user_pass"] = url_parsed.password.strip()
+        connect_params["db"] = url_parsed.path.strip("/").strip()
+
+        connect_params.update(kwargs)
+
+        return cls(**connect_params)
+
+    @staticmethod
+    def unescape_string(value):
+        if not isinstance(value, str):
+            return value
+
+        value = value.replace("\\0", "\0")
+        value = value.replace("\\\\", "\\")
+        value = value.replace("\\n", "\n")
+        value = value.replace("\\r", "\r")
+        value = value.replace("\\Z", "\032")
+        value = value.replace('\\"', '"')
+        value = value.replace("\\'", "'")
+
+        return value
+
+    def get_connection(self):
+        conn = self.connect_pool.connection(shareable=False)
+        # cursor = conn.cursor(cursors.SSCursor)
+        cursor = conn.cursor()
+
+        return conn, cursor
+
+    def close_connection(self, conn, cursor):
+        cursor.close()
+        conn.close()
+
+    def size_of_connections(self):
+        """
+        当前活跃的连接数
+        @return:
+        """
+        return self.connect_pool._connections
+
+    def size_of_connect_pool(self):
+        """
+        池子里一共有多少连接
+        @return:
+        """
+        return len(self.connect_pool._idle_cache)
+
+    @auto_retry
+    def find(self, sql, limit=0, to_json=False):
+        """
+        @summary:
+        无数据: 返回()
+        有数据: 若limit == 1 则返回 (data1, data2)
+                否则返回 ((data1, data2),)
+        ---------
+        @param sql:
+        @param limit:
+        @param to_json 是否将查询结果转为json
+        ---------
+        @result:
+        """
+        conn, cursor = self.get_connection()
+
+        cursor.execute(sql)
+
+        if limit == 1:
+            result = cursor.fetchone()  # 全部查出来,截取 不推荐使用
+        elif limit > 1:
+            result = cursor.fetchmany(limit)  # 全部查出来,截取 不推荐使用
+        else:
+            result = cursor.fetchall()
+
+        if to_json:
+            columns = [i[0] for i in cursor.description]
+
+            # 处理数据
+            def convert(col):
+                if isinstance(col, (datetime.date, datetime.time)):
+                    return str(col)
+                elif isinstance(col, str) and (
+                    col.startswith("{") or col.startswith("[")
+                ):
+                    try:
+                        # col = self.unescape_string(col)
+                        return json.loads(col)
+                    except:
+                        return col
+                else:
+                    # col = self.unescape_string(col)
+                    return col
+
+            if limit == 1:
+                result = [convert(col) for col in result]
+                result = dict(zip(columns, result))
+            else:
+                result = [[convert(col) for col in row] for row in result]
+                result = [dict(zip(columns, r)) for r in result]
+
+        self.close_connection(conn, cursor)
+
+        return result
+
+    def add(self, sql, exception_callfunc=None):
+        """
+
+        Args:
+            sql:
+            exception_callfunc: 异常回调
+
+        Returns: 添加行数
+
+        """
+        affect_count = None
+
+        try:
+            conn, cursor = self.get_connection()
+            affect_count = cursor.execute(sql)
+            conn.commit()
+
+        except Exception as e:
+            log.error(
+                """
+                error:%s
+                sql:  %s
+            """
+                % (e, sql)
+            )
+            if exception_callfunc:
+                exception_callfunc(e)
+        finally:
+            self.close_connection(conn, cursor)
+
+        return affect_count
+
+    def add_smart(self, table, data: Dict, **kwargs):
+        """
+        添加数据, 直接传递json格式的数据,不用拼sql
+        Args:
+            table: 表名
+            data: 字典 {"xxx":"xxx"}
+            **kwargs:
+
+        Returns: 添加行数
+
+        """
+        sql = make_insert_sql(table, data, **kwargs)
+        return self.add(sql)
+
+    def add_batch(self, sql, datas: List[Dict]):
+        """
+        @summary: 批量添加数据
+        ---------
+        @param sql: insert ignore into (xxx,xxx) values (%s, %s, %s)
+        @param datas: 列表 [{}, {}, {}]
+        ---------
+        @result: 添加行数
+        """
+        affect_count = None
+
+        try:
+            conn, cursor = self.get_connection()
+            affect_count = cursor.executemany(sql, datas)
+            conn.commit()
+
+        except Exception as e:
+            log.error(
+                """
+                error:%s
+                sql:  %s
+                """
+                % (e, sql)
+            )
+        finally:
+            self.close_connection(conn, cursor)
+
+        return affect_count
+
+    def add_batch_smart(self, table, datas: List[Dict], **kwargs):
+        """
+        批量添加数据, 直接传递list格式的数据,不用拼sql
+        Args:
+            table: 表名
+            datas: 列表 [{}, {}, {}]
+            **kwargs:
+
+        Returns: 添加行数
+
+        """
+        sql, datas = make_batch_sql(table, datas, **kwargs)
+        return self.add_batch(sql, datas)
+
+    def update(self, sql):
+        try:
+            conn, cursor = self.get_connection()
+            cursor.execute(sql)
+            conn.commit()
+
+        except Exception as e:
+            log.error(
+                """
+                error:%s
+                sql:  %s
+            """
+                % (e, sql)
+            )
+            return False
+        else:
+            return True
+        finally:
+            self.close_connection(conn, cursor)
+
+    def update_smart(self, table, data: Dict, condition):
+        """
+        更新, 不用拼sql
+        Args:
+            table: 表名
+            data: 数据 {"xxx":"xxx"}
+            condition: 更新条件 where后面的条件,如 condition='status=1'
+
+        Returns: True / False
+
+        """
+        sql = make_update_sql(table, data, condition)
+        return self.update(sql)
+
+    def delete(self, sql):
+        """
+        删除
+        Args:
+            sql:
+
+        Returns: True / False
+
+        """
+        try:
+            conn, cursor = self.get_connection()
+            cursor.execute(sql)
+            conn.commit()
+
+        except Exception as e:
+            log.error(
+                """
+                error:%s
+                sql:  %s
+            """
+                % (e, sql)
+            )
+            return False
+        else:
+            return True
+        finally:
+            self.close_connection(conn, cursor)
+
+    def execute(self, sql):
+        try:
+            conn, cursor = self.get_connection()
+            cursor.execute(sql)
+            conn.commit()
+
+        except Exception as e:
+            log.error(
+                """
+                error:%s
+                sql:  %s
+            """
+                % (e, sql)
+            )
+            return False
+        else:
+            return True
+        finally:
+            self.close_connection(conn, cursor)

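A minimal usage sketch of MysqlDB, assuming a reachable MySQL server; the URL and the `task` table with its columns are illustrative placeholders:

    from feapder.db.mysqldb import MysqlDB

    db = MysqlDB.from_url("mysql://root:123456@127.0.0.1:3306/feapder?charset=utf8mb4")

    # the *_smart helpers build the SQL via make_insert_sql / make_batch_sql / make_update_sql
    db.add_smart("task", {"url": "https://example.com", "status": 0})
    db.add_batch_smart("task", [{"url": "https://example.com/1"}, {"url": "https://example.com/2"}])

    rows = db.find("select url, status from task limit 10", to_json=True)  # list of dicts
    db.update_smart("task", data={"status": 1}, condition="status=0")
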
+ 848 - 0
FworkSpider/feapder/db/redisdb.py

@@ -0,0 +1,848 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2016-11-16 16:25
+---------
+@summary: 操作redis数据库
+---------
+@author: Boris
+"""
+
+import time
+
+import redis
+from redis._compat import unicode, long, basestring
+from redis.connection import Encoder as _Encoder
+from redis.exceptions import ConnectionError, TimeoutError
+from redis.exceptions import DataError
+from redis.sentinel import Sentinel
+from rediscluster import RedisCluster
+
+import feapder.setting as setting
+from feapder.utils.log import log
+
+
+class Encoder(_Encoder):
+    def encode(self, value):
+        "Return a bytestring or bytes-like representation of the value"
+        if isinstance(value, (bytes, memoryview)):
+            return value
+        # elif isinstance(value, bool):
+        #     # special case bool since it is a subclass of int
+        #     raise DataError(
+        #         "Invalid input of type: 'bool'. Convert to a "
+        #         "bytes, string, int or float first."
+        #     )
+        elif isinstance(value, float):
+            value = repr(value).encode()
+        elif isinstance(value, (int, long)):
+            # python 2 repr() on longs is '123L', so use str() instead
+            value = str(value).encode()
+        elif isinstance(value, (list, dict, tuple)):
+            value = unicode(value)
+        elif not isinstance(value, basestring):
+            # a value we don't know how to deal with. throw an error
+            typename = type(value).__name__
+            raise DataError(
+                "Invalid input of type: '%s'. Convert to a "
+                "bytes, string, int or float first." % typename
+            )
+        if isinstance(value, unicode):
+            value = value.encode(self.encoding, self.encoding_errors)
+        return value
+
+
+redis.connection.Encoder = Encoder
+
+
+class RedisDB:
+    def __init__(
+        self,
+        ip_ports=None,
+        db=None,
+        user_pass=None,
+        url=None,
+        decode_responses=True,
+        service_name=None,
+        max_connections=32,
+        **kwargs,
+    ):
+        """
+        redis的封装
+        Args:
+            ip_ports: ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
+            db:
+            user_pass:
+            url:
+            decode_responses:
+            service_name: 适用于redis哨兵模式
+        """
+
+        # 可能会改setting中的值,所以此处不能直接赋值为默认值,需要后加载赋值
+        if ip_ports is None:
+            ip_ports = setting.REDISDB_IP_PORTS
+        if db is None:
+            db = setting.REDISDB_DB
+        if user_pass is None:
+            user_pass = setting.REDISDB_USER_PASS
+        if service_name is None:
+            service_name = setting.REDISDB_SERVICE_NAME
+
+        self._is_redis_cluster = False
+
+        self.__redis = None
+        self._url = url
+        self._ip_ports = ip_ports
+        self._db = db
+        self._user_pass = user_pass
+        self._decode_responses = decode_responses
+        self._service_name = service_name
+        self._max_connections = max_connections
+        self._kwargs = kwargs
+        self.get_connect()
+
+    def __repr__(self):
+        if self._url:
+            return "<Redisdb url:{}>".format(self._url)
+
+        return "<Redisdb ip_ports: {} db:{} user_pass:{}>".format(
+            self._ip_ports, self._db, self._user_pass
+        )
+
+    @property
+    def _redis(self):
+        try:
+            if not self.__redis.ping():
+                raise ConnectionError("unable to connect to redis")
+        except:
+            self._reconnect()
+
+        return self.__redis
+
+    @_redis.setter
+    def _redis(self, val):
+        self.__redis = val
+
+    def get_connect(self):
+        # 获取数据库连接
+        try:
+            if not self._url:
+                if not self._ip_ports:
+                    raise Exception("未设置 redis 连接信息")
+
+                ip_ports = (
+                    self._ip_ports
+                    if isinstance(self._ip_ports, list)
+                    else self._ip_ports.split(",")
+                )
+                if len(ip_ports) > 1:
+                    startup_nodes = []
+                    for ip_port in ip_ports:
+                        ip, port = ip_port.split(":")
+                        startup_nodes.append({"host": ip, "port": port})
+
+                    if self._service_name:
+                        # log.debug("使用redis哨兵模式")
+                        hosts = [(node["host"], node["port"]) for node in startup_nodes]
+                        sentinel = Sentinel(hosts, socket_timeout=3, **self._kwargs)
+                        self._redis = sentinel.master_for(
+                            self._service_name,
+                            password=self._user_pass,
+                            db=self._db,
+                            redis_class=redis.StrictRedis,
+                            decode_responses=self._decode_responses,
+                            max_connections=self._max_connections,
+                            **self._kwargs,
+                        )
+
+                    else:
+                        # log.debug("使用redis集群模式")
+                        self._redis = RedisCluster(
+                            startup_nodes=startup_nodes,
+                            decode_responses=self._decode_responses,
+                            password=self._user_pass,
+                            max_connections=self._max_connections,
+                            **self._kwargs,
+                        )
+
+                    self._is_redis_cluster = True
+                else:
+                    ip, port = ip_ports[0].split(":")
+                    self._redis = redis.StrictRedis(
+                        host=ip,
+                        port=port,
+                        db=self._db,
+                        password=self._user_pass,
+                        decode_responses=self._decode_responses,
+                        max_connections=self._max_connections,
+                        **self._kwargs,
+                    )
+                    self._is_redis_cluster = False
+            else:
+                self._redis = redis.StrictRedis.from_url(
+                    self._url, decode_responses=self._decode_responses
+                )
+                self._is_redis_cluster = False
+
+        except Exception as e:
+            raise
+
+        # 不要写成self._redis.ping() 否则循环调用了
+        return self.__redis.ping()
+
+    @classmethod
+    def from_url(cls, url):
+        """
+
+        Args:
+            url: redis://[[username]:[password]]@[host]:[port]/[db]
+
+        Returns:
+
+        """
+        return cls(url=url)
+
+    def sadd(self, table, values):
+        """
+        @summary: 使用无序set集合存储数据, 去重
+        ---------
+        @param table:
+        @param values: 值; 支持list 或 单个值
+        ---------
+        @result: 若库中存在 返回0,否则入库,返回1。 批量添加返回None
+        """
+
+        if isinstance(values, list):
+            pipe = self._redis.pipeline()
+
+            if not self._is_redis_cluster:
+                pipe.multi()
+            for value in values:
+                pipe.sadd(table, value)
+            pipe.execute()
+
+        else:
+            return self._redis.sadd(table, values)
+
+    def sget(self, table, count=1, is_pop=True):
+        """
+        返回 list 如 ['1'] 或 []
+        @param table:
+        @param count:
+        @param is_pop:
+        @return:
+        """
+
+        datas = []
+        if is_pop:
+            count = count if count <= self.sget_count(table) else self.sget_count(table)
+            if count:
+                if count > 1:
+                    pipe = self._redis.pipeline()
+
+                    if not self._is_redis_cluster:
+                        pipe.multi()
+                    while count:
+                        pipe.spop(table)
+                        count -= 1
+                    datas = pipe.execute()
+
+                else:
+                    datas.append(self._redis.spop(table))
+
+        else:
+            datas = self._redis.srandmember(table, count)
+
+        return datas
+
+    def srem(self, table, values):
+        """
+        @summary: 移除集合中的指定元素
+        ---------
+        @param table:
+        @param values: 一个或者列表
+        ---------
+        @result:
+        """
+
+        if isinstance(values, list):
+            pipe = self._redis.pipeline()
+
+            if not self._is_redis_cluster:
+                pipe.multi()
+            for value in values:
+                pipe.srem(table, value)
+            pipe.execute()
+        else:
+            self._redis.srem(table, values)
+
+    def sget_count(self, table):
+        return self._redis.scard(table)
+
+    def sdelete(self, table):
+        """
+        @summary: 删除set集合的大键(数据量大的表)
+        删除大set键,使用sscan命令,每次扫描集合中500个元素,再用srem命令每次删除一个键
+        若直接用delete命令,会导致Redis阻塞,出现故障切换和应用程序崩溃的故障。
+        ---------
+        @param table:
+        ---------
+        @result:
+        """
+
+        # 当 SCAN 命令的游标参数被设置为 0 时, 服务器将开始一次新的迭代, 而当服务器向用户返回值为 0 的游标时, 表示迭代已结束
+        cursor = "0"
+        while cursor != 0:
+            cursor, data = self._redis.sscan(table, cursor=cursor, count=500)
+            for item in data:
+                # pipe.srem(table, item)
+                self._redis.srem(table, item)
+
+            # pipe.execute()
+
+    def sismember(self, table, key):
+        "Return a boolean indicating if ``value`` is a member of set ``name``"
+        return self._redis.sismember(table, key)
+
+    def zadd(self, table, values, prioritys=0):
+        """
+        @summary: 使用有序set集合存储数据, 去重(值存在更新)
+        ---------
+        @param table:
+        @param values: 值; 支持list 或 单个值
+        @param prioritys: 优先级; double类型,支持list 或 单个值。 根据此字段的值来排序, 值越小越优先。 可不传值,默认value的优先级为0
+        ---------
+        @result:若库中存在 返回0,否则入库,返回1。 批量添加返回 [0, 1 ...]
+        """
+        if isinstance(values, list):
+            if not isinstance(prioritys, list):
+                prioritys = [prioritys] * len(values)
+            else:
+                assert len(values) == len(prioritys), "values值要与prioritys值一一对应"
+
+            pipe = self._redis.pipeline()
+
+            if not self._is_redis_cluster:
+                pipe.multi()
+            for value, priority in zip(values, prioritys):
+                pipe.execute_command(
+                    "ZADD", table, priority, value
+                )  # 为了兼容2.x与3.x版本的redis
+            return pipe.execute()
+
+        else:
+            return self._redis.execute_command(
+                "ZADD", table, prioritys, values
+            )  # 为了兼容2.x与3.x版本的redis
+
+    def zget(self, table, count=1, is_pop=True):
+        """
+        @summary: 从有序set集合中获取数据 优先返回分数小的(优先级高的)
+        ---------
+        @param table:
+        @param count: 数量 -1 返回全部数据
+        @param is_pop:获取数据后,是否在原set集合中删除,默认是
+        ---------
+        @result: 列表
+        """
+
+        start_pos = 0  # 包含
+        end_pos = count - 1 if count > 0 else count
+
+        pipe = self._redis.pipeline()
+
+        if not self._is_redis_cluster:
+            pipe.multi()  # 标记事务的开始 参考 http://www.runoob.com/redis/redis-transactions.html
+        pipe.zrange(table, start_pos, end_pos)  # 取值
+        if is_pop:
+            pipe.zremrangebyrank(table, start_pos, end_pos)  # 删除
+        results, *count = pipe.execute()
+        return results
+
+    def zremrangebyscore(self, table, priority_min, priority_max):
+        """
+        根据分数移除成员 闭区间
+        @param table:
+        @param priority_min:
+        @param priority_max:
+        @return: 被移除的成员个数
+        """
+        return self._redis.zremrangebyscore(table, priority_min, priority_max)
+
+    def zrangebyscore(self, table, priority_min, priority_max, count=None, is_pop=True):
+        """
+        @summary: 返回指定分数区间的数据 闭区间
+        ---------
+        @param table:
+        @param priority_min: 优先级越小越优先
+        @param priority_max:
+        @param count: 获取的数量,为空则表示分数区间内的全部数据
+        @param is_pop: 是否删除
+        ---------
+        @result:
+        """
+
+        # 使用lua脚本, 保证操作的原子性
+        lua = """
+            -- local key = KEYS[1]
+            local min_score = ARGV[2]
+            local max_score = ARGV[3]
+            local is_pop = ARGV[4]
+            local count = ARGV[5]
+
+            -- 取值
+            local datas = nil
+            if count then
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
+            else
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
+            end
+
+            -- 删除redis中刚取到的值
+            if (is_pop=='True' or is_pop=='1') then
+                for i=1, #datas do
+                    redis.call('zrem', KEYS[1], datas[i])
+                end
+            end
+
+
+            return datas
+
+        """
+        cmd = self._redis.register_script(lua)
+        if count:
+            res = cmd(
+                keys=[table], args=[table, priority_min, priority_max, is_pop, count]
+            )
+        else:
+            res = cmd(keys=[table], args=[table, priority_min, priority_max, is_pop])
+
+        return res
+
+    def zrangebyscore_increase_score(
+        self, table, priority_min, priority_max, increase_score, count=None
+    ):
+        """
+        @summary: 返回指定分数区间的数据 闭区间, 同时修改分数
+        ---------
+        @param table:
+        @param priority_min: 最小分数
+        @param priority_max: 最大分数
+        @param increase_score: 分数值增量 正数则在原有的分数上叠加,负数则相减
+        @param count: 获取的数量,为空则表示分数区间内的全部数据
+        ---------
+        @result:
+        """
+
+        # 使用lua脚本, 保证操作的原子性
+        lua = """
+            -- local key = KEYS[1]
+            local min_score = ARGV[1]
+            local max_score = ARGV[2]
+            local increase_score = ARGV[3]
+            local count = ARGV[4]
+
+            -- 取值
+            local datas = nil
+            if count then
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
+            else
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
+            end
+
+            --修改优先级
+            for i=1, #datas do
+                redis.call('zincrby', KEYS[1], increase_score, datas[i])
+            end
+
+            return datas
+
+        """
+        cmd = self._redis.register_script(lua)
+        if count:
+            res = cmd(
+                keys=[table], args=[priority_min, priority_max, increase_score, count]
+            )
+        else:
+            res = cmd(keys=[table], args=[priority_min, priority_max, increase_score])
+
+        return res
+
+    def zrangebyscore_set_score(
+        self, table, priority_min, priority_max, score, count=None
+    ):
+        """
+        @summary: 返回指定分数区间的数据 闭区间, 同时修改分数
+        ---------
+        @param table:
+        @param priority_min: 最小分数
+        @param priority_max: 最大分数
+        @param score: 分数值
+        @param count: 获取的数量,为空则表示分数区间内的全部数据
+        ---------
+        @result:
+        """
+
+        # 使用lua脚本, 保证操作的原子性
+        lua = """
+            -- local key = KEYS[1]
+            local min_score = ARGV[1]
+            local max_score = ARGV[2]
+            local set_score = ARGV[3]
+            local count = ARGV[4]
+
+            -- 取值
+            local datas = nil
+            if count then
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores','limit', 0, count)
+            else
+                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores')
+            end
+
+            local real_datas = {} -- 数据
+            --修改优先级
+            for i=1, #datas, 2 do
+               local data = datas[i]
+               local score = datas[i+1]
+
+               table.insert(real_datas, data) -- 添加数据
+
+               redis.call('zincrby', KEYS[1], set_score - score, datas[i])
+            end
+
+            return real_datas
+
+        """
+        cmd = self._redis.register_script(lua)
+        if count:
+            res = cmd(keys=[table], args=[priority_min, priority_max, score, count])
+        else:
+            res = cmd(keys=[table], args=[priority_min, priority_max, score])
+
+        return res
+
+    def zincrby(self, table, amount, value):
+        return self._redis.zincrby(table, amount, value)
+
+    def zget_count(self, table, priority_min=None, priority_max=None):
+        """
+        @summary: 获取表数据的数量
+        ---------
+        @param table:
+        @param priority_min:优先级范围 最小值(包含)
+        @param priority_max:优先级范围 最大值(包含)
+        ---------
+        @result:
+        """
+
+        if priority_min != None and priority_max != None:
+            return self._redis.zcount(table, priority_min, priority_max)
+        else:
+            return self._redis.zcard(table)
+
+    def zrem(self, table, values):
+        """
+        @summary: 移除集合中的指定元素
+        ---------
+        @param table:
+        @param values: 一个或者列表
+        ---------
+        @result:
+        """
+
+        if isinstance(values, list):
+            self._redis.zrem(table, *values)
+        else:
+            self._redis.zrem(table, values)
+
+    def zexists(self, table, values):
+        """
+        利用zscore判断某元素是否存在
+        @param values:
+        @return:
+        """
+
+        is_exists = []
+
+        if isinstance(values, list):
+            pipe = self._redis.pipeline()
+            pipe.multi()
+            for value in values:
+                pipe.zscore(table, value)
+            is_exists_temp = pipe.execute()
+            for is_exist in is_exists_temp:
+                if is_exist != None:
+                    is_exists.append(1)
+                else:
+                    is_exists.append(0)
+
+        else:
+            is_exists = self._redis.zscore(table, values)
+            is_exists = 1 if is_exists != None else 0
+
+        return is_exists
+
+    def lpush(self, table, values):
+
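+        # note: values are appended with rpush so that lpop pops them in FIFO (queue) order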
+        if isinstance(values, list):
+            pipe = self._redis.pipeline()
+
+            if not self._is_redis_cluster:
+                pipe.multi()
+            for value in values:
+                pipe.rpush(table, value)
+            pipe.execute()
+
+        else:
+            return self._redis.rpush(table, values)
+
+    def lpop(self, table, count=1):
+        """
+        @summary:
+        ---------
+        @param table:
+        @param count:
+        ---------
+        @result: count>1时返回列表
+        """
+
+        datas = None
+
+        count = count if count <= self.lget_count(table) else self.lget_count(table)
+
+        if count:
+            if count > 1:
+                pipe = self._redis.pipeline()
+
+                if not self._is_redis_cluster:
+                    pipe.multi()
+                while count:
+                    pipe.lpop(table)
+                    count -= 1
+                datas = pipe.execute()
+
+            else:
+                datas = self._redis.lpop(table)
+
+        return datas
+
+    def rpoplpush(self, from_table, to_table=None):
+        """
+        将列表 from_table 中的最后一个元素(尾元素)弹出,并返回给客户端。
+        将 from_table 弹出的元素插入到列表 to_table ,作为 to_table 列表的头元素。
+        如果 from_table 和 to_table 相同,则列表中的表尾元素被移动到表头,并返回该元素,可以把这种特殊情况视作列表的旋转(rotation)操作
+        @param from_table:
+        @param to_table:
+        @return:
+        """
+
+        if not to_table:
+            to_table = from_table
+
+        return self._redis.rpoplpush(from_table, to_table)
+
+    def lget_count(self, table):
+        return self._redis.llen(table)
+
+    def lrem(self, table, value, num=0):
+        """
+        @summary:
+        删除value
+        ---------
+        @param table:
+        @param value:
+        @param num:
+        ---------
+        @result: 删除的条数
+        """
+        return self._redis.lrem(table, num, value)
+
+    def lrange(self, table, start=0, end=-1):
+        return self._redis.lrange(table, start, end)
+
+    def hset(self, table, key, value):
+        """
+        @summary:
+        如果 key 不存在,一个新的哈希表被创建并进行 HSET 操作。
+        如果域 field 已经存在于哈希表中,旧值将被覆盖
+        ---------
+        @param table:
+        @param key:
+        @param value:
+        ---------
+        @result: 1 新插入; 0 覆盖
+        """
+        return self._redis.hset(table, key, value)
+
+    def hset_batch(self, table, datas):
+        """
+        批量插入
+        Args:
+            datas:
+                [[key, value]]
+        Returns:
+
+        """
+        pipe = self._redis.pipeline()
+
+        if not self._is_redis_cluster:
+            pipe.multi()
+        for key, value in datas:
+            pipe.hset(table, key, value)
+        return pipe.execute()
+
+    def hincrby(self, table, key, increment):
+        return self._redis.hincrby(table, key, increment)
+
+    def hget(self, table, key, is_pop=False):
+        if not is_pop:
+            return self._redis.hget(table, key)
+        else:
+            lua = """
+                -- local key = KEYS[1]
+                local field = ARGV[1]
+
+                -- 取值
+                local datas = redis.call('hget', KEYS[1], field)
+                -- 删除值
+                redis.call('hdel', KEYS[1], field)
+
+                return datas
+
+                    """
+            cmd = self._redis.register_script(lua)
+            res = cmd(keys=[table], args=[key])
+
+            return res
+
+    def hgetall(self, table):
+        return self._redis.hgetall(table)
+
+    def hexists(self, table, key):
+        return self._redis.hexists(table, key)
+
+    def hdel(self, table, *keys):
+        """
+        @summary: 删除对应的key 可传多个
+        ---------
+        @param table:
+        @param *keys:
+        ---------
+        @result:
+        """
+        self._redis.hdel(table, *keys)
+
+    def hget_count(self, table):
+        return self._redis.hlen(table)
+
+    def setbit(self, table, offsets, values):
+        """
+        设置字符串数组某一位的值, 返回之前的值
+        @param table:
+        @param offsets: 支持列表或单个值
+        @param values: 支持列表或单个值
+        @return: list / 单个值
+        """
+        if isinstance(offsets, list):
+            if not isinstance(values, list):
+                values = [values] * len(offsets)
+            else:
+                assert len(offsets) == len(values), "offsets值要与values值一一对应"
+
+            pipe = self._redis.pipeline()
+            pipe.multi()
+
+            for offset, value in zip(offsets, values):
+                pipe.setbit(table, offset, value)
+
+            return pipe.execute()
+
+        else:
+            return self._redis.setbit(table, offsets, values)
+
+    def getbit(self, table, offsets):
+        """
+        取字符串数组某一位的值
+        @param table:
+        @param offsets: 支持列表
+        @return: list / 单个值
+        """
+        if isinstance(offsets, list):
+            pipe = self._redis.pipeline()
+            pipe.multi()
+            for offset in offsets:
+                pipe.getbit(table, offset)
+
+            return pipe.execute()
+
+        else:
+            return self._redis.getbit(table, offsets)
+
+    def bitcount(self, table):
+        return self._redis.bitcount(table)
+
+    def strset(self, table, value, **kwargs):
+        return self._redis.set(table, value, **kwargs)
+
+    def str_incrby(self, table, value):
+        return self._redis.incrby(table, value)
+
+    def strget(self, table):
+        return self._redis.get(table)
+
+    def strlen(self, table):
+        return self._redis.strlen(table)
+
+    def getkeys(self, regex):
+        return self._redis.keys(regex)
+
+    def exists_key(self, key):
+        return self._redis.exists(key)
+
+    def set_expire(self, key, seconds):
+        """
+        @summary: 设置过期时间
+        ---------
+        @param key:
+        @param seconds: 秒
+        ---------
+        @result:
+        """
+        self._redis.expire(key, seconds)
+
+    def get_expire(self, key):
+        """
+        @summary: 查询过期时间
+        ---------
+        @param key:
+        @param seconds: 秒
+        ---------
+        @result:
+        """
+        return self._redis.ttl(key)
+
+    def clear(self, table):
+        try:
+            self._redis.delete(table)
+        except Exception as e:
+            log.error(e)
+
+    def get_redis_obj(self):
+        return self._redis
+
+    def _reconnect(self):
+        # 检测连接状态, 当数据库重启或设置 timeout 导致断开连接时自动重连
+        retry_count = 0
+        while True:
+            try:
+                retry_count += 1
+                log.error(f"redis 连接断开, 重新连接 {retry_count}")
+                if self.get_connect():
+                    log.info(f"redis 连接成功")
+                    return True
+            except (ConnectionError, TimeoutError) as e:
+                log.error(f"连接失败 e: {e}")
+
+            time.sleep(2)
+
+    def __getattr__(self, name):
+        return getattr(self._redis, name)

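A minimal usage sketch of RedisDB against a single-node Redis; the key names are illustrative placeholders:

    from feapder.db.redisdb import RedisDB

    db = RedisDB(ip_ports="127.0.0.1:6379", db=0)  # or RedisDB.from_url("redis://:password@127.0.0.1:6379/0")

    # sorted-set priority queue: the smaller the score, the higher the priority
    db.zadd("demo:tasks_z", values=["task_a", "task_b"], prioritys=[1, 5])
    tasks = db.zget("demo:tasks_z", count=2)  # pops in priority order

    # unordered set used for simple dedup
    db.sadd("demo:seen", "https://example.com")
    print(db.sismember("demo:seen", "https://example.com"))
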
+ 178 - 0
FworkSpider/feapder/dedup/__init__.py

@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-12-13 21:08
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import copy
+from typing import Any, List, Union, Optional, Tuple, Callable
+
+from feapder.utils.tools import get_md5
+from .bloomfilter import BloomFilter, ScalableBloomFilter
+from .expirefilter import ExpireFilter
+
+
+class Dedup:
+    BloomFilter = 1
+    MemoryFilter = 2
+    ExpireFilter = 3
+
+    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
+        """
+        去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
+        Args:
+            filter_type: 过滤器类型 BloomFilter
+            name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
+            absolute_name: 过滤器绝对名称 不会加dedup前缀,当此值不为空时name参数无效
+            expire_time: ExpireFilter的过期时间 单位为秒,其他两种过滤器不用指定
+            error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
+            to_md5: 去重前是否将数据转为MD5,默认是
+            redis_url: redis://[[username]:[password]]@localhost:6379/0
+                       BloomFilter 与 ExpireFilter 使用
+                       默认会读取setting中的redis配置,若无setting,则需要专递redis_url
+            initial_capacity: 单个布隆过滤器去重容量 默认100000000,当布隆过滤器容量满时会扩展下一个布隆过滤器
+            error_rate:布隆过滤器的误判率 默认0.00001
+            **kwargs:
+        """
+
+        if filter_type == Dedup.ExpireFilter:
+            try:
+                expire_time = kwargs["expire_time"]
+            except:
+                raise ValueError("需传参数 expire_time")
+
+            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
+                "name", expire_time
+            )
+            expire_time_record_key = "dedup:expire_set:expire_time"
+
+            self.dedup = ExpireFilter(
+                name=name,
+                expire_time=expire_time,
+                expire_time_record_key=expire_time_record_key,
+                redis_url=kwargs.get("redis_url"),
+            )
+
+        else:
+            initial_capacity = kwargs.get("initial_capacity", 100000000)
+            error_rate = kwargs.get("error_rate", 0.00001)
+            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
+                "name", "bloomfilter"
+            )
+            if filter_type == Dedup.BloomFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
+                    redis_url=kwargs.get("redis_url"),
+                )
+            elif filter_type == Dedup.MemoryFilter:
+                self.dedup = ScalableBloomFilter(
+                    name=name,
+                    initial_capacity=initial_capacity,
+                    error_rate=error_rate,
+                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
+                )
+            else:
+                raise ValueError(
+                    "filter_type 类型错误,仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
+                )
+
+        self._to_md5 = to_md5
+
+    def __repr__(self):
+        return str(self.dedup)
+
+    def _deal_datas(self, datas):
+        if self._to_md5:
+            if isinstance(datas, list):
+                keys = [get_md5(data) for data in datas]
+            else:
+                keys = get_md5(datas)
+        else:
+            keys = copy.deepcopy(datas)
+
+        return keys
+
+    def add(
+        self, datas: Union[List[Any], Any], skip_check: bool = False
+    ) -> Union[List[Any], Any]:
+        """
+        添加数据
+        @param datas: list / 单个值
+        @param skip_check: 是否直接添加,不检查是否存在 适用于bloomfilter,加快add速度
+        @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
+        """
+
+        keys = self._deal_datas(datas)
+        is_added = self.dedup.add(keys, skip_check)
+
+        return is_added
+
+    def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
+        """
+        检查数据是否存在
+        @param datas: list / 单个值
+        @return: list / 单个值 (存在返回1 不存在返回0)
+        """
+        keys = self._deal_datas(datas)
+        is_exists = self.dedup.get(keys)
+
+        return is_exists
+
+    def filter_exist_data(
+        self,
+        datas: List[Any],
+        *,
+        datas_fingerprints: Optional[List] = None,
+        callback: Callable[[Any], None] = None
+    ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
+        """
+        过滤掉已存在的数据
+        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
+        @param datas_fingerprints: 数据的唯一指纹 列表
+        @param datas: 数据 列表
+        @param callback: 数据已存在时的回调 callback(data)
+        @return: datas 或 (datas, datas_fingerprints)
+        """
+
+        is_exists = self.get(datas_fingerprints or datas)
+
+        dedup_datas = []
+
+        if datas_fingerprints:
+            dedup_datas_fingerprints = []
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+                data_fingerprint = datas_fingerprints.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                    dedup_datas_fingerprints.append(data_fingerprint)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas_fingerprints.extend(dedup_datas_fingerprints)
+            datas.extend(dedup_datas)
+            return datas, datas_fingerprints
+
+        else:
+            while is_exists:
+                data = datas.pop(0)
+                is_exist = is_exists.pop(0)
+
+                if not is_exist:
+                    dedup_datas.append(data)
+                else:
+                    if callback:
+                        callback(data)
+
+            datas.extend(dedup_datas)
+            return datas

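A minimal usage sketch of Dedup with the redis-backed bloom filter; the filter name and redis_url are illustrative placeholders:

    from feapder.dedup import Dedup

    dedup = Dedup(Dedup.BloomFilter, name="demo", redis_url="redis://127.0.0.1:6379/0")

    print(dedup.add(["url_1", "url_2"]))  # e.g. [1, 1] -> both newly added
    print(dedup.get("url_1"))             # 1 -> already seen

    datas = ["url_1", "url_3"]
    dedup.filter_exist_data(datas)        # datas is filtered in place -> ["url_3"]
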
+ 143 - 0
FworkSpider/feapder/dedup/bitarray.py

@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018/12/14 1:05 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+from __future__ import absolute_import
+
+
+from feapder.db.redisdb import RedisDB
+
+
+class BitArray:
+    def setall(self, value):
+        pass
+
+    def __repr__(self):
+        raise NotImplementedError("this method must be implemented")
+
+    def set(self, offsets, values):
+        """
+        设置字符串数组某一位的值, 返回之前的值
+        @param offsets: 支持列表或单个值
+        @param values: 支持列表或单个值
+        @return: list / 单个值
+        """
+        raise NotImplementedError("this method must be implemented")
+
+    def get(self, offsets):
+        """
+        取字符串数组某一位的值
+        @param offsets: 支持列表或单个值
+        @return: list / 单个值
+        """
+        raise NotImplementedError("this method must be implemented")
+
+    def count(self, value=True):
+        raise NotImplementedError("this method must be implemented")
+
+
+class MemoryBitArray(BitArray):
+    def __init__(self, num_bits):
+        try:
+            import bitarray
+        except Exception as e:
+            raise Exception(
+                "需要安装feapder完整版\ncommand: pip install feapder[all]\n若安装出错,参考:https://boris.org.cn/feapder/#/question/%E5%AE%89%E8%A3%85%E9%97%AE%E9%A2%98"
+            )
+
+        self.num_bits = num_bits
+        self.bitarray = bitarray.bitarray(num_bits, endian="little")
+
+        self.setall(0)
+
+    def __repr__(self):
+        return "MemoryBitArray: {}".format(self.num_bits)
+
+    def setall(self, value):
+        self.bitarray.setall(value)
+
+    def set(self, offsets, values):
+        """
+        设置字符串数组某一位的值, 返回之前的值
+        @param offsets: 支持列表或单个值
+        @param values: 支持列表或单个值
+        @return: list / 单个值
+        """
+
+        old_values = []
+
+        if isinstance(offsets, list):
+            if not isinstance(values, list):
+                values = [values] * len(offsets)
+            else:
+                assert len(offsets) == len(values), "offsets值要与values值一一对应"
+
+            for offset, value in zip(offsets, values):
+                old_values.append(int(self.bitarray[offset]))
+                self.bitarray[offset] = value
+
+        else:
+            old_values = int(self.bitarray[offsets])
+            self.bitarray[offsets] = values
+
+        return old_values
+
+    def get(self, offsets):
+        """
+        取字符串数组某一位的值
+        @param offsets: 支持列表或单个值
+        @return: list / 单个值
+        """
+        if isinstance(offsets, list):
+            return [self.bitarray[offset] for offset in offsets]
+        else:
+            return self.bitarray[offsets]
+
+    def count(self, value=True):
+        return self.bitarray.count(value)
+
+
+class RedisBitArray(BitArray):
+    """
+    仿bitarray 基于redis
+    """
+
+    redis_db = None
+
+    def __init__(self, name, redis_url=None):
+        self.name = name
+        self.count_cached_name = name + "_count_cached"
+
+        if not self.__class__.redis_db:
+            self.__class__.redis_db = RedisDB(url=redis_url)
+
+    def __repr__(self):
+        return "RedisBitArray: {}".format(self.name)
+
+    def set(self, offsets, values):
+        """
+        设置字符串数组某一位的值, 返回之前的值
+        @param offsets: 支持列表或单个值
+        @param values: 支持列表或单个值
+        @return: list / 单个值
+        """
+        return self.redis_db.setbit(self.name, offsets, values)
+
+    def get(self, offsets):
+        return self.redis_db.getbit(self.name, offsets)
+
+    def count(self, value=True):
+        # 先查redis的缓存,若没有 在统计数量
+        count = self.redis_db.strget(self.count_cached_name)
+        if count:
+            return int(count)
+        else:
+            count = self.redis_db.bitcount(self.name)
+            self.redis_db.strset(self.count_cached_name, count, ex=1800)  # 半小时过期
+            return count

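A minimal usage sketch of the two bit-array backends; MemoryBitArray needs the optional bitarray dependency, and the RedisBitArray key and redis_url are illustrative placeholders:

    from feapder.dedup.bitarray import MemoryBitArray, RedisBitArray

    bits = MemoryBitArray(num_bits=1024)
    bits.set([3, 7], 1)                    # returns the previous bit values
    print(bits.get([3, 7]), bits.count())

    rbits = RedisBitArray("demo:bits", redis_url="redis://127.0.0.1:6379/0")
    rbits.set(3, 1)
    print(rbits.get(3), rbits.count())     # count() is cached in redis for half an hour
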
+ 385 - 0
FworkSpider/feapder/dedup/bloomfilter.py

@@ -0,0 +1,385 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018/12/13 4:11 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import hashlib
+import math
+import threading
+import time
+from struct import unpack, pack
+
+from feapder.db.redisdb import RedisDB
+from feapder.utils.redis_lock import RedisLock
+from . import bitarray
+
+
+def make_hashfuncs(num_slices, num_bits):
+    if num_bits >= (1 << 31):
+        fmt_code, chunk_size = "Q", 8
+    elif num_bits >= (1 << 15):
+        fmt_code, chunk_size = "I", 4
+    else:
+        fmt_code, chunk_size = "H", 2
+    total_hash_bits = 8 * num_slices * chunk_size
+    if total_hash_bits > 384:
+        hashfn = hashlib.sha512
+    elif total_hash_bits > 256:
+        hashfn = hashlib.sha384
+    elif total_hash_bits > 160:
+        hashfn = hashlib.sha256
+    elif total_hash_bits > 128:
+        hashfn = hashlib.sha1
+    else:
+        hashfn = hashlib.md5
+    fmt = fmt_code * (hashfn().digest_size // chunk_size)
+    num_salts, extra = divmod(num_slices, len(fmt))
+    if extra:
+        num_salts += 1
+    salts = tuple(hashfn(hashfn(pack("I", i)).digest()) for i in range(num_salts))
+
+    def _make_hashfuncs(key):
+        if isinstance(key, str):
+            key = key.encode("utf-8")
+        else:
+            key = str(key).encode("utf-8")
+
+        i = 0
+        for salt in salts:
+            h = salt.copy()
+            h.update(key)
+            for uint in unpack(fmt, h.digest()):
+                yield uint % num_bits
+                i += 1
+                if i >= num_slices:
+                    return
+
+    return _make_hashfuncs
+
+
+class BloomFilter(object):
+    BASE_MEMORY = 1
+    BASE_REDIS = 2
+
+    def __init__(
+        self,
+        capacity: int,
+        error_rate: float = 0.00001,
+        bitarray_type=BASE_REDIS,
+        name=None,
+        redis_url=None,
+    ):
+        if not (0 < error_rate < 1):
+            raise ValueError("Error_Rate must be between 0 and 1.")
+        if not capacity > 0:
+            raise ValueError("Capacity must be > 0")
+
+        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
+        # k = log2(1/P)
+        # solving for m = bits_per_slice
+        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
+        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
+        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
+        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
+        bits_per_slice = int(
+            math.ceil(
+                (capacity * abs(math.log(error_rate)))
+                / (num_slices * (math.log(2) ** 2))
+            )
+        )
+        self._setup(error_rate, num_slices, bits_per_slice, capacity)
+
+        if bitarray_type == BloomFilter.BASE_MEMORY:
+            self.bitarray = bitarray.MemoryBitArray(self.num_bits)
+            self.bitarray.setall(False)
+        elif bitarray_type == BloomFilter.BASE_REDIS:
+            assert name, "name can't be None "
+            self.bitarray = bitarray.RedisBitArray(name, redis_url)
+        else:
+            raise ValueError("not support this bitarray type")
+
+    def _setup(self, error_rate, num_slices, bits_per_slice, capacity):
+        self.error_rate = error_rate
+        self.num_slices = num_slices
+        self.bits_per_slice = bits_per_slice
+        self.capacity = capacity
+        self.num_bits = num_slices * bits_per_slice
+        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
+
+        self._is_at_capacity = False
+        self._check_capacity_time = 0
+
+    def __repr__(self):
+        return "<BloomFilter: {}>".format(self.bitarray)
+
+    def get(self, keys, to_list=False):
+        is_list = isinstance(keys, list)
+        keys = keys if is_list else [keys]
+        is_exists = []
+
+        offsets = []
+        for key in keys:
+            hashes = self.make_hashes(key)
+            offset = 0
+            for k in hashes:
+                offsets.append(offset + k)
+                offset += self.bits_per_slice
+
+        old_values = self.bitarray.get(offsets)
+        for i in range(0, len(old_values), self.num_slices):
+            is_exists.append(int(all(old_values[i : i + self.num_slices])))
+
+        if to_list:
+            return is_exists
+        else:
+            return is_exists if is_list else is_exists[0]
+
+    @property
+    def is_at_capacity(self):
+        """
+        Whether the filter is at capacity: it is considered full once half of the bits are set.
+        The count is expensive, so it is intended to be checked about once every half hour.
+        @return:
+        """
+        # if self._is_at_capacity:
+        #     return self._is_at_capacity
+        #
+        # if not self._check_capacity_time or time.time() - self._check_capacity_time > 1800:
+        #     bit_count = self.bitarray.count()
+        #     if bit_count and bit_count / self.num_bits > 0.5:
+        #         self._is_at_capacity = True
+        #
+        #     self._check_capacity_time = time.time()
+        #
+        # return self._is_at_capacity
+
+        if self._is_at_capacity:
+            return self._is_at_capacity
+
+        bit_count = self.bitarray.count()
+        if bit_count and bit_count / self.num_bits > 0.5:
+            self._is_at_capacity = True
+
+        return self._is_at_capacity
+
+    def add(self, keys):
+        """
+        Adds a key to this bloom filter. If the key already exists in this
+        filter it will return False. Otherwise True. keys support list
+        @param keys: list or one key
+        @return:
+        """
+        if self.is_at_capacity:
+            raise IndexError("BloomFilter is at capacity")
+
+        is_list = isinstance(keys, list)
+
+        keys = keys if is_list else [keys]
+        is_added = []
+
+        offsets = []
+        for key in keys:
+            hashes = self.make_hashes(key)
+            offset = 0
+            for k in hashes:
+                offsets.append(offset + k)
+                offset += self.bits_per_slice
+
+        old_values = self.bitarray.set(offsets, 1)
+        for i in range(0, len(old_values), self.num_slices):
+            is_added.append(1 ^ int(all(old_values[i : i + self.num_slices])))
+
+        return is_added if is_list else is_added[0]
+
+
+class ScalableBloomFilter(object):
+    """
+    A bloom filter that grows automatically: once the current filter is half full, the next one is created
+    """
+
+    BASE_MEMORY = BloomFilter.BASE_MEMORY
+    BASE_REDIS = BloomFilter.BASE_REDIS
+
+    def __init__(
+        self,
+        initial_capacity: int = 100000000,
+        error_rate: float = 0.00001,
+        bitarray_type=BASE_REDIS,
+        name=None,
+        redis_url=None,
+    ):
+
+        if not error_rate or not (0 < error_rate < 1):
+            raise ValueError("Error_Rate must be a decimal between 0 and 1.")
+
+        self._setup(
+            initial_capacity, error_rate, name, bitarray_type, redis_url=redis_url
+        )
+
+    def _setup(self, initial_capacity, error_rate, name, bitarray_type, redis_url):
+        self.initial_capacity = initial_capacity
+        self.error_rate = error_rate
+        self.name = name
+        self.bitarray_type = bitarray_type
+        self.redis_url = redis_url
+
+        self.filters = []
+
+        self.filters.append(self.create_filter())
+        self._thread_lock = threading.RLock()
+        self._check_capacity_time = 0
+
+    def __repr__(self):
+        return "<ScalableBloomFilter: {}>".format(self.filters[-1].bitarray)
+
+    def create_filter(self):
+        filter = BloomFilter(
+            capacity=self.initial_capacity,
+            error_rate=self.error_rate,
+            bitarray_type=self.bitarray_type,
+            name=self.name + str(len(self.filters)) if self.name else self.name,
+            redis_url=self.redis_url,
+        )
+
+        return filter
+
+    def check_filter_capacity(self):
+        """
+        Check the filter state; if the current filter is at capacity, create and append a new one
+        @return:
+        """
+        if (
+            not self._check_capacity_time
+            or time.time() - self._check_capacity_time > 1800
+        ):
+            if self.bitarray_type == ScalableBloomFilter.BASE_MEMORY:
+                with self._thread_lock:
+                    while True:
+                        if self.filters[-1].is_at_capacity:
+                            self.filters.append(self.create_filter())
+                        else:
+                            break
+
+                    self._check_capacity_time = time.time()
+            else:
+                # Global lock: only one process actually creates the new filter at a time; once it is done, the other processes simply append the filter it created
+                key = (
+                    f"ScalableBloomFilter:{self.name}"
+                    if self.name
+                    else "ScalableBloomFilter"
+                )
+                with RedisLock(key=key) as lock:
+                    if lock.locked:
+                        while True:
+                            if self.filters[-1].is_at_capacity:
+                                self.filters.append(self.create_filter())
+                            else:
+                                break
+
+                        self._check_capacity_time = time.time()
+
+    def add(self, keys, skip_check=False):
+        """
+        Adds a key to this bloom filter. If the key already exists in this
+        filter it will return False. Otherwise True. keys support list
+        @param keys: list or one key
+        @param skip_check: add directly,not check if is exist in bloomfilters
+        @return:
+        """
+
+        self.check_filter_capacity()
+
+        current_filter = self.filters[-1]
+
+        if skip_check:
+            return current_filter.add(keys)
+
+        else:
+            is_list = isinstance(keys, list)
+
+            keys = keys if is_list else [keys]
+            not_exist_keys = list(set(keys))
+
+            # Check whether the keys already exist in earlier bloom filters
+            # Keys found in a filter are recorded; keys not found are checked against the next filter
+            for filter in reversed(self.filters):
+                current_filter_is_exists = filter.get(
+                    not_exist_keys, to_list=True
+                )  # existence flags for the current filter
+
+                not_exist_keys_temp = []
+
+                for key, is_exist in zip(not_exist_keys, current_filter_is_exists):
+                    if not is_exist:  # keys missing from this filter still need to be checked in the next one
+                        not_exist_keys_temp.append(key)
+
+                not_exist_keys = not_exist_keys_temp
+
+                if not not_exist_keys:
+                    break
+
+            # Keys not found in any filter are added to the current filter
+            if not_exist_keys:
+                current_filter.add(not_exist_keys)
+
+            # Mark each key as added or existing; for duplicate keys that did not exist, only one occurrence counts as newly added, the rest count as existing
+            for i, key in enumerate(keys):
+                for j, not_exist_key in enumerate(not_exist_keys):
+                    if key == not_exist_key:
+                        keys[i] = 1
+                        not_exist_keys.pop(j)
+                        break
+                else:
+                    keys[i] = 0
+
+            is_added = keys
+            return is_added if is_list else is_added[0]
+
+    def get(self, keys):
+        self.check_filter_capacity()
+
+        is_list = isinstance(keys, list)
+
+        keys = keys if is_list else [keys]  # rewritten in place to [0, 1, ...] where 0 = not exists and 1 = exists
+        not_exist_keys = list(set(keys))
+
+        # Check whether the keys already exist in earlier bloom filters
+        # Keys found in a filter are recorded; keys not found are checked against the next filter
+        for filter in reversed(self.filters):
+            current_filter_is_exists = filter.get(
+                not_exist_keys, to_list=True
+            )  # existence flags for the current filter
+
+            not_exist_keys_temp = []
+
+            for checked_key, is_exist in zip(not_exist_keys, current_filter_is_exists):
+                if not is_exist:  # keys missing from this filter still need to be checked in the next one
+                    not_exist_keys_temp.append(checked_key)
+
+            not_exist_keys = not_exist_keys_temp
+
+            if not not_exist_keys:
+                break
+
+        # Mark each key as existing or not; for duplicate keys that did not exist, only one occurrence counts as not existing, the rest count as existing
+        for i, key in enumerate(keys):
+            for j, not_exist_key in enumerate(not_exist_keys):
+                if key == not_exist_key:
+                    keys[i] = 0
+                    not_exist_keys.pop(j)
+                    break
+            else:
+                keys[i] = 1
+
+        is_exists = keys
+        return is_exists if is_list else is_exists[0]
+
+    @property
+    def capacity(self):
+        """Returns the total capacity for all filters in this SBF"""
+        return sum(f.capacity for f in self.filters)
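
Note: a minimal usage sketch for the Redis-backed scalable filter above (the filter name and redis_url are illustrative; feapder's Redis connection must be reachable):

    from feapder.dedup.bloomfilter import ScalableBloomFilter

    bf = ScalableBloomFilter(
        initial_capacity=100000,
        error_rate=0.00001,
        name="test:bloomfilter",
        redis_url="redis://localhost:6379/0",
    )
    print(bf.add(["a", "b", "a"]))  # e.g. [1, 1, 0]: 1 = newly added, a repeated key counts as existing
    print(bf.get("a"))              # 1: already recorded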

+ 70 - 0
FworkSpider/feapder/dedup/expirefilter.py

@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018/12/13 9:44 PM
+---------
+@summary: deduplication set with an expiration time
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import time
+
+from feapder.db.redisdb import RedisDB
+
+
+class ExpireFilter:
+    redis_db = None
+
+    def __init__(
+        self, name: str, expire_time: int, expire_time_record_key=None, redis_url=None
+    ):
+        if not name:
+            raise ValueError("name cant't be None")
+        if not expire_time:
+            raise ValueError("please set expire time, units is seconds")
+
+        if not self.__class__.redis_db:
+            self.__class__.redis_db = RedisDB(url=redis_url)
+
+        self.name = name
+        self.expire_time = expire_time
+        self.expire_time_record_key = expire_time_record_key
+        self.del_expire_key_time = None
+
+        self.record_expire_time()
+
+        self.del_expire_key()
+
+    def __repr__(self):
+        return "<ExpireSet: {}>".format(self.name)
+
+    @property
+    def current_timestamp(self):
+        return int(time.time())
+
+    def add(self, keys, *args, **kwargs):
+        """
+        @param keys: key(s) to add to the zset; a list is supported for batch adds
+        @return: list / single value
+        """
+        if self.current_timestamp - self.del_expire_key_time > self.expire_time:
+            self.del_expire_key()
+
+        is_added = self.redis_db.zadd(self.name, keys, self.current_timestamp)
+        return is_added
+
+    def get(self, keys):
+        return self.redis_db.zexists(self.name, keys)
+
+    def del_expire_key(self):
+        self.redis_db.zremrangebyscore(
+            self.name, "-inf", self.current_timestamp - self.expire_time
+        )
+        self.del_expire_key_time = self.current_timestamp
+
+    def record_expire_time(self):
+        if self.expire_time_record_key:
+            self.redis_db.hset(
+                self.expire_time_record_key, key=self.name, value=self.expire_time
+            )
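
Note: a minimal usage sketch for the expiring dedup set above (the set name is illustrative; it relies on the Redis connection configured in feapder's settings):

    from feapder.dedup.expirefilter import ExpireFilter

    ef = ExpireFilter("test:expire_set", expire_time=7 * 24 * 3600)
    ef.add(["key1", "key2"])  # keys are scored with the current timestamp
    print(ef.get("key1"))     # truthy while "key1" has not yet expired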

+ 0 - 0
FworkSpider/feapder/network/__init__.py


+ 821 - 0
FworkSpider/feapder/network/cookie_pool.py

@@ -0,0 +1,821 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018/12/27 11:32 AM
+---------
+@summary: cookie pool
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import abc
+import datetime
+import random
+import time
+import warnings
+from collections.abc import Iterable
+from enum import Enum, unique
+
+import feapder.utils.tools as tools
+from feapder import setting
+from feapder.db.mysqldb import MysqlDB
+from feapder.db.redisdb import RedisDB
+from feapder.utils import metrics
+from feapder.utils.log import log
+from feapder.utils.redis_lock import RedisLock
+from feapder.utils.tools import send_msg
+from feapder.utils.webdriver import WebDriver
+
+
+class CookiePoolInterface(metaclass=abc.ABCMeta):
+    """
+    cookie pool interface
+    """
+
+    @abc.abstractmethod
+    def create_cookie(self, *args, **kwargs):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_cookie(self, *args, **kwargs):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def del_cookie(self, *args, **kwargs):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def run(self):
+        raise NotImplementedError
+
+
+class PageCookiePool(CookiePoolInterface):
+    """
+    Cookies produced by visiting a page; no user login required
+    """
+
+    def __init__(
+        self,
+        redis_key,
+        page_url=None,
+        min_cookies=10000,
+        must_contained_keys=(),
+        keep_alive=False,
+        **kwargs,
+    ):
+        """
+        @param redis_key: project name
+        @param page_url: url used to produce cookies
+        @param min_cookies: minimum number of cookies to keep in the pool
+        @param must_contained_keys: keys the cookie must contain
+        @param keep_alive: whether to stay on standby and keep producing cookies once there are enough. False means exit when the minimum is met
+        ---
+        @param kwargs: WebDriver parameters
+            load_images: whether to load images
+            user_agent_pool: user-agent pool; None to disable
+            proxies_pool: proxy pool; None to disable
+            headless: whether to run headless
+            driver_type: web driver type
+            timeout: request timeout, default 16s
+            window_size: screen resolution (width, height)
+
+        """
+
+        self._redisdb = RedisDB()
+
+        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
+        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
+            redis_key
+        )  # stores the time of the last cookie count, formatted as timestamp:count
+        self._page_url = page_url
+        self._min_cookies = min_cookies
+        self._must_contained_keys = must_contained_keys
+        self._keep_alive = keep_alive
+
+        self._kwargs = kwargs
+        self._kwargs.setdefault("load_images", False)
+        self._kwargs.setdefault("headless", True)
+
+    def create_cookie(self):
+        """
+        May be overridden by subclasses
+        @return:
+        """
+        with WebDriver(**self._kwargs) as driver:
+            driver.get(self._page_url)
+
+            cookies = driver.get_cookies()
+
+            cookies_json = {}
+            for cookie in cookies:
+                cookies_json[cookie["name"]] = cookie["value"]
+
+            for key in self._must_contained_keys:
+                if key not in cookies_json:
+                    break
+            else:
+                return cookies_json
+
+            log.error("获取cookie失败 cookies = {}".format(cookies_json))
+            return None
+
+    def add_cookies(self, cookies):
+        log.info("添加cookie {}".format(cookies))
+        self._redisdb.lpush(self._tab_cookie_pool, cookies)
+
+    def run(self):
+        while True:
+            try:
+                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
+                need_cookie_count = self._min_cookies - now_cookie_count
+
+                if need_cookie_count > 0:
+                    log.info(
+                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
+                            now_cookie_count, self._min_cookies
+                        )
+                    )
+                    try:
+                        cookies = self.create_cookie()
+                        if cookies:
+                            self.add_cookies(cookies)
+                    except Exception as e:
+                        log.exception(e)
+                else:
+                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
+
+                    # If the pool size has not changed in the last minute, assume the spider is no longer consuming cookies and exit
+                    last_count_info = self._redisdb.strget(
+                        self._tab_cookie_pool_last_count
+                    )
+                    if not last_count_info:
+                        self._redisdb.strset(
+                            self._tab_cookie_pool_last_count,
+                            "{}:{}".format(time.time(), now_cookie_count),
+                        )
+                    else:
+                        last_time, last_count = last_count_info.split(":")
+                        last_time = float(last_time)
+                        last_count = int(last_count)
+
+                        if time.time() - last_time > 60:
+                            if now_cookie_count == last_count:
+                                log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
+                                break
+                            else:
+                                self._redisdb.strset(
+                                    self._tab_cookie_pool_last_count,
+                                    "{}:{}".format(time.time(), now_cookie_count),
+                                )
+
+                    if self._keep_alive:
+                        log.info("sleep 10")
+                        tools.delay_time(10)
+                    else:
+                        break
+
+            except Exception as e:
+                log.exception(e)
+                tools.delay_time(1)
+
+    def get_cookie(self, wait_when_null=True):
+        while True:
+            try:
+                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
+                if not cookie_info and wait_when_null:
+                    log.info("暂无cookie 生产中...")
+                    self._keep_alive = False
+                    self._min_cookies = 1
+                    with RedisLock(
+                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
+                    ) as _lock:
+                        if _lock.locked:
+                            self.run()
+                    continue
+                return eval(cookie_info) if cookie_info else {}
+            except Exception as e:
+                log.exception(e)
+                tools.delay_time(1)
+
+    def del_cookie(self, cookies):
+        self._redisdb.lrem(self._tab_cookie_pool, cookies)
+
+
+class User:
+    def __init__(self, username, cookie):
+        self.username = username
+        self.cookie = cookie
+
+
+class LoginCookiePool(CookiePoolInterface):
+    """
+    Cookie pool that requires login; account credentials are stored in MySQL
+    """
+
+    def __init__(
+        self,
+        redis_key,
+        *,
+        table_userbase,
+        login_state_key="login_state",
+        lock_state_key="lock_state",
+        username_key="username",
+        password_key="password",
+        login_retry_times=10,
+    ):
+        """
+        @param redis_key: project name
+        @param table_userbase: user table name
+        @param login_state_key: login-state column name
+        @param lock_state_key: lock-state column name
+        @param username_key: username column name
+        @param password_key: password column name
+        @param login_retry_times: number of login retries on failure
+        """
+
+        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
+        self._login_retry_times = login_retry_times
+        self._table_userbase = table_userbase
+        self._login_state_key = login_state_key
+        self._lock_state_key = lock_state_key
+        self._username_key = username_key
+        self._password_key = password_key
+
+        self._redisdb = RedisDB()
+        self._mysqldb = MysqlDB()
+
+        self.create_userbase()
+
+    def create_userbase(self):
+        sql = f"""
+            CREATE TABLE IF NOT EXISTS `{self._table_userbase}` (
+              `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
+              `{self._username_key}` varchar(50) DEFAULT NULL COMMENT 'username',
+              `{self._password_key}` varchar(255) DEFAULT NULL COMMENT 'password',
+              `{self._login_state_key}` int(11) DEFAULT '0' COMMENT 'login state (0 not logged in, 1 logged in)',
+              `{self._lock_state_key}` int(11) DEFAULT '0' COMMENT 'whether the account is blocked (0 no, 1 yes)',
+              PRIMARY KEY (`id`),
+              UNIQUE KEY `{self._username_key}` (`{self._username_key}`) USING BTREE
+            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+        """
+        self._mysqldb.execute(sql)
+
+    def create_cookie(self, username, password):
+        """
+        Create a cookie
+        @param username: username
+        @param password: password
+        @return: cookie / None
+        """
+        raise NotImplementedError
+
+    def get_user_info(self):
+        """
+        Return user info
+        @return: yield username, password
+        """
+
+        sql = "select {username_key}, {password_key} from {table_userbase} where {lock_state_key} != 1 and {login_state_key} != 1".format(
+            username_key=self._username_key,
+            password_key=self._password_key,
+            table_userbase=self._table_userbase,
+            lock_state_key=self._lock_state_key,
+            login_state_key=self._login_state_key,
+        )
+
+        return self._mysqldb.find(sql)
+
+    def handle_login_failed_user(self, username, password):
+        """
+        Handle a user whose login failed
+        @param username:
+        @param password:
+        @return:
+        """
+
+        pass
+
+    def handel_exception(self, e):
+        """
+        Handle an exception
+        @param e:
+        @return:
+        """
+        log.exception(e)
+
+    def save_cookie(self, username, cookie):
+        user_cookie = {"username": username, "cookie": cookie}
+
+        self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
+
+        sql = "update {table_userbase} set {login_state_key} = 1 where {username_key} = '{username}'".format(
+            table_userbase=self._table_userbase,
+            login_state_key=self._login_state_key,
+            username_key=self._username_key,
+            username=username,
+        )
+
+        self._mysqldb.update(sql)
+
+    def get_cookie(self, wait_when_null=True) -> User:
+        while True:
+            try:
+                user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
+                if not user_cookie and wait_when_null:
+                    log.info("暂无cookie 生产中...")
+                    self.login()
+                    continue
+
+                if user_cookie:
+                    user_cookie = eval(user_cookie)
+                    return User(**user_cookie)
+
+                return None
+            except Exception as e:
+                log.exception(e)
+                tools.delay_time(1)
+
+    def del_cookie(self, user: User):
+        """
+        Delete an invalid cookie
+        @param user:
+        @return:
+        """
+        user_info = {"username": user.username, "cookie": user.cookie}
+        self._redisdb.lrem(self._tab_cookie_pool, user_info)
+
+        sql = "update {table_userbase} set {login_state_key} = 0 where {username_key} = '{username}'".format(
+            table_userbase=self._table_userbase,
+            login_state_key=self._login_state_key,
+            username_key=self._username_key,
+            username=user.username,
+        )
+
+        self._mysqldb.update(sql)
+
+    def user_is_locked(self, user: User):
+        sql = "update {table_userbase} set {lock_state_key} = 1 where {username_key} = '{username}'".format(
+            table_userbase=self._table_userbase,
+            lock_state_key=self._lock_state_key,
+            username_key=self._username_key,
+            username=user.username,
+        )
+
+        self._mysqldb.update(sql)
+
+    def run(self):
+        with RedisLock(
+            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
+        ) as _lock:
+            if _lock.locked:
+                user_infos = self.get_user_info()
+                if not isinstance(user_infos, Iterable):
+                    raise ValueError("get_user_info 返回值必须可迭代")
+
+                if not user_infos:
+                    log.info("无可用用户")
+
+                for username, password in user_infos:
+                    for i in range(self._login_retry_times):
+                        try:
+                            cookie = self.create_cookie(username, password)
+                            if cookie:
+                                self.save_cookie(username, cookie)
+                            else:
+                                self.handle_login_failed_user(username, password)
+
+                            break
+                        except Exception as e:
+                            self.handel_exception(e)
+
+                    else:
+                        self.handle_login_failed_user(username, password)
+
+    login = run
+
+
+@unique
+class LimitTimesUserStatus(Enum):
+    # usage status
+    USED = "used"
+    SUCCESS = "success"
+    OVERDUE = "overdue"  # cookie expired
+    SLEEP = "sleep"
+    EXCEPTION = "exception"
+    # login status
+    LOGIN_SUCCESS = "login_success"
+    LOGIN_FALIED = "login_failed"
+
+
+class LimitTimesUser:
+    """
+    Account with a limited number of uses
+    Cached locally; not safe to share across multiple processes
+    """
+
+    ACCOUNT_INFO_KEY = "accounts:h_account_info"  # redis key that stores the account/cookie info
+    SITE_NAME = ""  # site name
+
+    redisdb = None
+
+    def __init__(
+        self,
+        username,
+        password,
+        max_search_times,
+        proxies=None,
+        search_interval=0,
+        **kwargs,
+    ):
+        """
+        @param username:
+        @param password:
+        @param max_search_times:
+        @param proxies:
+        @param search_interval: interval between uses; either an integer or a tuple giving a range, e.g. (5, 10) means 5 to 10 seconds
+        """
+        self.__dict__.update(kwargs)
+        self.username = username
+        self.password = password
+        self.max_search_times = max_search_times
+        self.proxies = proxies
+        self.search_interval = search_interval
+        self.delay_use = 0  # delayed use, for accounts waiting to be unblocked
+
+        if isinstance(search_interval, (tuple, list)):
+            if len(search_interval) != 2:
+                raise ValueError("search_interval 需传递两个值的元组或列表。如(5,10)即5到10秒")
+
+            self.used_for_time_length = (
+                search_interval[1] * 5
+            )  # exclusive-use window for a preemptive spider; other spiders may not take the cookie during this time
+        else:
+            self.used_for_time_length = (
+                search_interval * 5
+            )  # exclusive-use window for a preemptive spider; other spiders may not take the cookie during this time
+
+        self.account_info = {
+            "login_time": 0,
+            "cookies": {},
+            "search_times": 0,
+            "last_search_time": 0,
+            "used_for_spider_name": None,  # 只被某个爬虫使用 其他爬虫不可使用
+            "init_search_times_time": 0,  # 初始化搜索次数的时间
+        }
+
+        if not self.__class__.redisdb:
+            self.__class__.redisdb = RedisDB()
+
+        self.sync_account_info_from_redis()
+
+        self.__init_metrics()
+
+    def __init_metrics(self):
+        """
+        Initialize the metrics system
+        @return:
+        """
+        metrics.init(**setting.METRICS_OTHER_ARGS)
+
+    def record_user_status(self, status: LimitTimesUserStatus):
+        metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
+
+    def __repr__(self):
+        return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
+
+    def __eq__(self, other):
+        return self.username == other.username
+
+    def sync_account_info_from_redis(self):
+        account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
+        if account_info:
+            account_info = eval(account_info)
+            self.account_info.update(account_info)
+
+    @property
+    def cookies(self):
+        cookies = self.account_info.get("cookies")
+        return cookies
+
+    def set_cookies(self, cookies):
+        self.account_info["cookies"] = cookies
+        return self.redisdb.hset(
+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
+        )
+
+    def set_login_time(self, login_time=None):
+        self.account_info["login_time"] = login_time or time.time()
+        return self.redisdb.hset(
+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
+        )
+
+    def get_login_time(self):
+        return self.account_info.get("login_time")
+
+    def is_time_to_login(self):
+        return time.time() - self.get_login_time() > 40 * 60
+
+    def get_last_search_time(self):
+        return self.account_info.get("last_search_time", 0)
+
+    def is_time_to_search(self):
+        if self.delay_use:
+            is_time = time.time() - self.get_last_search_time() > self.delay_use
+            if is_time:
+                self.delay_use = 0
+
+        else:
+            is_time = time.time() - self.get_last_search_time() > (
+                random.randint(*self.search_interval)
+                if isinstance(self.search_interval, (tuple, list))
+                else self.search_interval
+            )
+
+        return is_time
+
+    @property
+    def used_for_spider_name(self):
+        return self.account_info.get("used_for_spider_name")
+
+    @used_for_spider_name.setter
+    def used_for_spider_name(self, spider_name):
+        self.account_info["used_for_spider_name"] = spider_name
+
+    def update_status(self):
+        """
+        Update search-related state
+        @return:
+        """
+        self.account_info["search_times"] += 1
+        self.account_info["last_search_time"] = time.time()
+
+        return self.redisdb.hset(
+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
+        )
+
+    @property
+    def search_times(self):
+        init_search_times_time = self.account_info.get("init_search_times_time")
+        current_time = time.time()
+        if (
+            current_time - init_search_times_time >= 86400
+        ):  # reset the search count if more than a day has passed since it was last initialized
+            self.account_info["search_times"] = 0
+            self.account_info["init_search_times_time"] = current_time
+
+            self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
+
+        return self.account_info["search_times"]
+
+    def is_overwork(self):
+        if self.search_times > self.max_search_times:
+            log.warning("账号 {} 请求次数超限制".format(self.username))
+            return True
+
+        return False
+
+    def is_at_work_time(self):
+        if datetime.datetime.now().hour in list(range(7, 23)):
+            return True
+
+        log.warning("账号 {} 不再工作时间内".format(self.username))
+        return False
+
+    def del_cookie(self):
+        self.account_info["cookies"] = {}
+        return self.redisdb.hset(
+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
+        )
+
+    def create_cookie(self):
+        """
+        Produce a cookie; exceptions should be raised instead of swallowed
+        @return: cookie_dict
+        """
+
+        raise NotImplementedError
+
+    def login(self):
+        """
+        @return: 1 on success, 0 on failure
+        """
+
+        try:
+            # 预检查
+            if not self.is_time_to_login():
+                log.info("此账号尚未到登陆时间: {}".format(self.username))
+                time.sleep(5)
+                return 0
+
+            cookies = self.create_cookie()
+            if not cookies:
+                raise Exception("登陆失败 未获取到合法cookie")
+
+            if not isinstance(cookies, dict):
+                raise Exception("cookie 必须为字典格式")
+
+            # 保存cookie
+            self.set_login_time()
+            self.set_cookies(cookies)
+            log.info("登录成功 {}".format(self.username))
+            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
+            return 1
+
+        except Exception as e:
+            log.exception(e)
+            send_msg(
+                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
+                level="error",
+                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
+            )
+
+        log.info("登录失败 {}".format(self.username))
+        self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
+        return 0
+
+
+class LimitTimesUserPool:
+    """
+    User pool for accounts with limited query counts
+    Cached locally; not safe to share across multiple processes
+    """
+
+    LOAD_USER_INTERVAL = 60
+
+    def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
+        """
+        @param accounts_dict: account info dict
+            {
+                "15011300228": {
+                    "password": "300228",
+                    "proxies": {},
+                    "max_search_times": 500,
+                    "search_interval": 1,  # interval between uses
+                    # any other info to carry along
+                }
+            }
+        @param limit_user_class: the user-defined subclass of LimitTimesUser
+        @param support_more_client: whether multiple clients (multi-thread / multi-process) are supported; counts and usage frequency may then be slightly off
+        """
+        self.accounts_dict = accounts_dict
+        self.limit_user_class = limit_user_class
+
+        self.limit_times_users = []
+        self.current_user_index = -1
+
+        self.support_more_client = support_more_client
+
+        self.last_load_user_time = 0
+
+    def __load_users(self, username=None):
+        # 装载user
+        log.info("更新可用用户")
+
+        for _username, detail in self.accounts_dict.items():
+            if username and username != _username:
+                continue
+
+            limit_times_users = self.limit_user_class(username=_username, **detail)
+            if limit_times_users in self.limit_times_users:
+                continue
+
+            if limit_times_users.is_overwork():
+                continue
+            else:
+                if (
+                    limit_times_users.cookies or limit_times_users.login()
+                ):  # add to the available user list if it has a cookie or logs in successfully
+                    self.limit_times_users.append(limit_times_users)
+
+        self.last_load_user_time = time.time()
+
+    def get_user(
+        self,
+        username=None,
+        used_for_spider_name=None,
+        wait_when_null=True,
+        not_limit_frequence=False,
+    ) -> LimitTimesUser:
+        """
+        @params username: get the specified user
+        @params used_for_spider_name: exclusive use; the name of the spider that owns the user. Other spiders may not take it
+        @params wait_when_null: whether to wait when no user is available
+        @params not_limit_frequence: do not limit the usage frequency
+        @return: LimitTimesUser
+        """
+        if not self.support_more_client:
+            warnings.warn(
+                "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存,不支持多进程或多线程",
+                category=Warning,
+            )
+            self._is_show_warning = True
+
+        while True:
+            if (
+                not self.limit_times_users
+                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
+            ):
+                self.__load_users(username)
+                if not self.limit_times_users:
+                    log.warning("无可用的用户")
+                    if wait_when_null:
+                        time.sleep(1)
+                        continue
+                    else:
+                        return None
+
+            self.current_user_index += 1
+            self.current_user_index = self.current_user_index % len(
+                self.limit_times_users
+            )
+
+            limit_times_user = self.limit_times_users[self.current_user_index]
+            if self.support_more_client:  # sync the latest data from redis first
+                limit_times_user.sync_account_info_from_redis()
+
+            if username and limit_times_user.username != username:
+                log.info(
+                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
+                )
+                time.sleep(1)
+                continue
+
+            # Exclusive use: if owned by another spider, check whether the exclusive window has elapsed; if so, the user may be taken
+            if (
+                limit_times_user.used_for_spider_name
+                and limit_times_user.used_for_spider_name != used_for_spider_name
+            ):
+                wait_time = time.time() - limit_times_user.get_last_search_time()
+                if wait_time < limit_times_user.used_for_time_length:
+                    log.info(
+                        "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
+                            limit_times_user.username,
+                            limit_times_user.used_for_spider_name,
+                            limit_times_user.used_for_time_length - wait_time,
+                        )
+                    )
+                    time.sleep(1)
+                    continue
+
+            if (
+                not limit_times_user.is_overwork()
+                and limit_times_user.is_at_work_time()
+            ):
+                if not limit_times_user.cookies:
+                    self.limit_times_users.remove(limit_times_user)
+                    continue
+
+                if not_limit_frequence or limit_times_user.is_time_to_search():
+                    limit_times_user.used_for_spider_name = used_for_spider_name
+
+                    limit_times_user.update_status()
+                    log.info("使用用户 {}".format(limit_times_user.username))
+                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
+                    return limit_times_user
+                else:
+                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
+                    time.sleep(1)
+                    continue
+            else:
+                self.limit_times_users.remove(limit_times_user)
+                self.current_user_index -= 1
+
+                if not limit_times_user.is_at_work_time():
+                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
+                    if wait_when_null:
+                        time.sleep(30)
+                        continue
+                    else:
+                        return None
+
+    def del_user(self, username):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.del_cookie()
+                self.limit_times_users.remove(limit_times_user)
+                limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
+                self.__load_users(username)
+                break
+
+    def update_cookies(self, username, cookies):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.set_cookies(cookies)
+                break
+
+    def delay_use(self, username, delay_seconds):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.delay_use = delay_seconds
+                limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
+                break
+
+    def record_success_user(self, username):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
+
+    def record_exception_user(self, username):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)
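
Note: a minimal usage sketch for PageCookiePool above (redis_key, page_url and the cookie key are illustrative; a WebDriver-compatible browser and the configured Redis are assumed):

    from feapder.network.cookie_pool import PageCookiePool

    cookie_pool = PageCookiePool(
        redis_key="example_site",
        page_url="https://example.com/",
        min_cookies=100,
        must_contained_keys=("SESSIONID",),
        keep_alive=False,
    )
    cookie_pool.run()                   # produce cookies until min_cookies is reached
    cookies = cookie_pool.get_cookie()  # rotate one cookie dict out of the pool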

+ 145 - 0
FworkSpider/feapder/network/item.py

@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-07-26 22:28:10
+---------
+@summary: item (entity) definitions
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import feapder.utils.tools as tools
+
+
+class ItemMetaclass(type):
+    def __new__(cls, name, bases, attrs):
+        attrs.setdefault("__name__", None)
+        attrs.setdefault("__table_name__", None)
+        attrs.setdefault("__name_underline__", None)
+        attrs.setdefault("__update_key__", None)
+        attrs.setdefault("__unique_key__", None)
+
+        return type.__new__(cls, name, bases, attrs)
+
+
+class Item(metaclass=ItemMetaclass):
+    __unique_key__ = []
+
+    def __init__(self, **kwargs):
+        self.__dict__ = kwargs
+
+    def __repr__(self):
+        return "<{}: {}>".format(self.item_name, tools.dumps_json(self.to_dict))
+
+    def __getitem__(self, key):
+        return self.__dict__[key]
+
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+
+    def pre_to_db(self):
+        """
+        Processing before the item is written to the database
+        """
+        pass
+
+    @property
+    def to_dict(self):
+        propertys = {}
+        for key, value in self.__dict__.items():
+            if key not in (
+                "__name__",
+                "__table_name__",
+                "__name_underline__",
+                "__update_key__",
+                "__unique_key__",
+            ):
+                if key.startswith(f"_{self.__class__.__name__}"):
+                    key = key.replace(f"_{self.__class__.__name__}", "")
+                propertys[key] = value
+
+        return propertys
+
+    def to_sql(self, auto_update=False, update_columns=()):
+        return tools.make_insert_sql(
+            self.table_name, self.to_dict, auto_update, update_columns
+        )
+
+    @property
+    def item_name(self):
+        return self.__name__ or self.__class__.__name__
+
+    @item_name.setter
+    def item_name(self, name):
+        self.__name__ = name
+        self.__table_name__ = self.name_underline.replace("_item", "")
+
+    @property
+    def table_name(self):
+        if not self.__table_name__:
+            self.__table_name__ = self.name_underline.replace("_item", "")
+        return self.__table_name__
+
+    @table_name.setter
+    def table_name(self, name):
+        self.__table_name__ = name
+        self.__name__ = tools.key2hump(name) + "Item"
+
+    @property
+    def name_underline(self):
+        if not self.__name_underline__:
+            self.__name_underline__ = tools.key2underline(self.item_name)
+
+        return self.__name_underline__
+
+    @name_underline.setter
+    def name_underline(self, name):
+        self.__name_underline__ = name
+
+    @property
+    def unique_key(self):
+        return self.__unique_key__ or self.__class__.__unique_key__
+
+    @unique_key.setter
+    def unique_key(self, keys):
+        if isinstance(keys, (tuple, list)):
+            self.__unique_key__ = keys
+        else:
+            self.__unique_key__ = (keys,)
+
+    @property
+    def fingerprint(self):
+        args = []
+        for key, value in self.to_dict.items():
+            if value:
+                if (self.unique_key and key in self.unique_key) or not self.unique_key:
+                    args.append(str(value))
+
+        if args:
+            args = sorted(args)
+            return tools.get_md5(*args)
+        else:
+            return None
+
+    def to_UpdateItem(self):
+        update_item = UpdateItem(**self.__dict__)
+        update_item.item_name = self.item_name
+        return update_item
+
+
+class UpdateItem(Item):
+    __update_key__ = []
+
+    def __init__(self, **kwargs):
+        super(UpdateItem, self).__init__(**kwargs)
+
+    @property
+    def update_key(self):
+        return self.__update_key__ or self.__class__.__update_key__
+
+    @update_key.setter
+    def update_key(self, keys):
+        if isinstance(keys, (tuple, list)):
+            self.__update_key__ = keys
+        else:
+            self.__update_key__ = (keys,)
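
Note: a minimal usage sketch for Item above (ArticleItem and its fields are hypothetical; the table name is derived from the class name unless __table_name__ is set):

    from feapder.network.item import Item

    class ArticleItem(Item):
        pass  # fields are passed as keyword arguments

    item = ArticleItem(title="hello", url="https://example.com")
    print(item.table_name)   # "article" (ArticleItem -> article_item -> article)
    print(item.to_dict)      # {"title": "hello", "url": "https://example.com"}
    print(item.fingerprint)  # md5 over the sorted non-empty values (or the unique_key fields)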

+ 20 - 0
FworkSpider/feapder/network/proxy_file/1c718b9e5cc682d4ca7154958d0919c0.txt

@@ -0,0 +1,20 @@
+117.88.5.96:8860

+111.179.93.27:8861

+111.179.93.27:8860

+113.226.100.155:8861

+113.226.100.155:8860

+114.99.103.81:8861

+171.13.51.41:8861

+114.99.103.81:8860

+171.13.51.41:8860

+125.41.17.67:8861

+125.41.17.67:8860

+113.123.0.127:8861

+117.88.5.96:8861

+182.101.196.230:8861

+113.123.0.127:8860

+182.101.196.230:8860

+182.34.102.234:8861

+182.34.102.234:8860

+117.88.4.100:8861

+117.88.4.100:8860

+ 20 - 0
FworkSpider/feapder/network/proxy_file/a62f3217a0981b7b2117d9d0af64c2db.txt

@@ -0,0 +1,20 @@
+175.162.217.157:8860&&1643361380
+222.86.85.51:8861&&1643361867
+222.86.85.51:8860&&1643361867
+182.101.215.123:8861&&1643361013
+182.34.32.132:8860&&1643361124
+182.101.215.123:8860&&1643361013
+182.34.32.132:8861&&1643361124
+113.123.0.11:8861&&1643361579
+113.123.0.11:8860&&1643361579
+117.66.140.217:8860&&1643361016
+117.66.140.217:8861&&1643361016
+123.10.66.129:8860&&1643361437
+123.10.66.129:8861&&1643361437
+123.169.34.75:8860&&1643360309
+123.169.34.75:8861&&1643360309
+175.162.217.157:8861&&1643361379
+111.179.73.220:8860&&1643360596
+111.179.73.220:8861&&1643360596
+36.62.71.201:8861&&1643360585
+36.62.71.201:8860&&1643360585
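
Note: the cache file above uses the rewritten format ip:port&&expiry_timestamp, one proxy per line; get_proxy_from_file in proxy_pool.py below keeps only entries whose expiry is still in the future and turns them into socks5 proxies. A rough sketch of that parsing (the sample line comes from the file above and has long since expired):

    import time

    line = "36.62.71.201:8861&&1643360585"
    addr, end = line.split("&&")
    ip, port = addr.split(":")
    if time.time() < int(end):  # only keep proxies that have not expired yet
        proxies = {"http": f"socks5://{ip}:{port}", "https": f"socks5://{ip}:{port}"}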

+ 763 - 0
FworkSpider/feapder/network/proxy_pool.py

@@ -0,0 +1,763 @@
+# coding:utf8
+"""
+proxy pool
+"""
+import datetime
+import json
+import os
+import random
+import socket
+import time
+from urllib import parse
+
+import redis
+import requests
+
+from feapder import setting
+from feapder.utils import tools
+from feapder.utils.log import log
+
+
+def decrypt(input_str: str) -> str:
+    """
+    Modification: newly added
+    Custom base64 decode function
+
+    :param input_str:
+    :return:
+    """
+    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
+    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
+    output_str = ''
+    # ascii_list above: index of every non-"=" character, formatted as 6 binary digits
+    # count the "=" padding characters
+    equal_num = input_str.count('=')
+    while ascii_list:
+        temp_list = ascii_list[:4]
+        # join into one binary string
+        temp_str = ''.join(temp_list)
+        # drop the padding bits so the length is a multiple of 8
+        if len(temp_str) % 8 != 0:
+            temp_str = temp_str[0:-1 * equal_num * 2]
+        # regroup four 6-bit chunks into three 8-bit chunks
+        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
+        # binary to decimal
+        temp_str_list = [int(x, 2) for x in temp_str_list if x]
+        # join the decoded characters into the output string
+        output_str += ''.join([chr(x) for x in temp_str_list])
+        ascii_list = ascii_list[4:]
+    return output_str
+
+
+# create the local proxy-cache folder
+proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
+if not os.path.exists(proxy_path):
+    os.mkdir(proxy_path)
+
+
+# def get_proxies_by_host(host, port):
+#     proxy_id = "{}:{}".format(host, port)
+#     return get_proxies_by_id(proxy_id)
+
+
+# def get_proxies_by_id(proxy_id):
+#     proxies = {
+#         "http": "http://{}".format(proxy_id),
+#         "https": "https://{}".format(proxy_id),
+#     }
+#     return proxies
+
+
+def get_proxy_from_url(**kwargs):
+    """
+    Fetch proxies from the given source url(s)
+    :param kwargs:
+    :return:
+    """
+    proxy_source_url = kwargs.get("proxy_source_url", [])
+    # proxy_source_url = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"
+
+    if not isinstance(proxy_source_url, list):
+        proxy_source_url = [proxy_source_url]
+        proxy_source_url = [x for x in proxy_source_url if x]
+    if not proxy_source_url:
+        raise ValueError("no specify proxy_source_url: {}".format(proxy_source_url))
+    kwargs = kwargs.copy()
+    kwargs.pop("proxy_source_url")
+    proxies_list = []
+    for url in proxy_source_url:
+        if url.startswith("http"):
+            proxies_list.extend(get_proxy_from_http(url, **kwargs))
+        elif url.startswith("redis"):
+            proxies_list.extend(get_proxy_from_redis(url, **kwargs))
+
+    if proxies_list:
+        # 顺序打乱
+        random.shuffle(proxies_list)
+    return proxies_list
+
+
+def get_proxy_from_http(proxy_source_url, **kwargs):
+    """
+    Fetch proxies from the given http address
+    :param proxy_source_url:
+    :param kwargs:
+    :return:
+    """
+    filename = tools.get_md5(proxy_source_url) + ".txt"
+    abs_filename = os.path.join(proxy_path, filename)
+    update_interval = kwargs.get("local_proxy_file_cache_timeout", 30)
+    update_flag = 0
+    if not update_interval:
+        # force a refresh
+        update_flag = 1
+    elif not os.path.exists(abs_filename):
+        # refresh if the cache file does not exist
+        update_flag = 1
+    elif time.time() - os.stat(abs_filename).st_mtime > update_interval:
+        # refresh once the update interval has elapsed
+        update_flag = 1
+    if update_flag:
+        pool = []
+        response = requests.get(proxy_source_url, timeout=20)
+        # Modification: handle the response of the socks proxy API
+        for proxy in response.json():
+            host = decrypt(proxy['host'])
+            port = proxy['port']
+            endTime = proxy['EndTime']
+            pool.append(f"{host}:{port}&&{endTime}")
+
+        with open(os.path.join(proxy_path, filename), "w") as f:
+            f.write('\n'.join(pool))
+    return get_proxy_from_file(filename)
+
+
+def get_proxy_from_file(filename, **kwargs):
+    """
+    Read proxies from a local cache file
+        line format (this rewrite appends the expiry timestamp):
+        ip:port&&end_timestamp
+        ip:port:protocol&&end_timestamp
+        user:password@ip:port&&end_timestamp
+    :param filename:
+    :param kwargs:
+    :return:
+    """
+    proxies_list = []
+    with open(os.path.join(proxy_path, filename), "r") as f:
+        lines = f.readlines()
+
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+        # 解析
+        auth = ""
+        if "@" in line:
+            auth, line = line.split("@")
+        # Modification: parse the proxy expiry timestamp
+        line, end = line.split("&&")
+
+        items = line.split(":")
+        if len(items) < 2:
+            continue
+
+        ip, port, *protocol = items
+        if not all([port, ip]):
+            continue
+        if auth:
+            ip = "{}@{}".format(auth, ip)
+        if not protocol:
+            # Modification: keep only proxies that are still valid, and build them as socks5 instead of http
+            if time.time() < int(end):
+                proxies = {
+                    "https": "socks5://%s:%s" % (ip, port),
+                    "http": "socks5://%s:%s" % (ip, port),
+                    # "end":end
+                }
+            else:
+                continue
+        else:
+            proxies = {protocol[0]: "%s://%s:%s" % (protocol[0], ip, port)}
+        proxies_list.append(proxies)
+
+    return proxies_list
+
+
+def get_proxy_from_redis(proxy_source_url, **kwargs):
+    """
+    Fetch proxies from the given redis address
+    @param proxy_source_url: redis://:passwd@host:port/db
+        stored in redis as a zset
+        ip:port ts
+    @param kwargs:
+        {"redis_proxies_key": "xxx"}
+    @return: [{'http':'http://xxx.xxx.xxx:xxx', 'https':'https://xxx.xxx.xxx.xxx:xxx'}]
+    """
+
+    redis_conn = redis.StrictRedis.from_url(proxy_source_url)
+    key = kwargs.get("redis_proxies_key")
+    assert key, "从redis中获取代理 需要指定 redis_proxies_key"
+    proxies = redis_conn.zrange(key, 0, -1)
+    proxies_list = []
+    for proxy in proxies:
+        proxy = proxy.decode()
+        proxies_list.append(
+            {"https": "https://%s" % proxy, "http": "http://%s" % proxy}
+        )
+    return proxies_list
+
+
+def check_proxy(
+        ip="",
+        port="",
+        proxies=None,
+        type=0,
+        timeout=5,
+        logger=None,
+        show_error_log=True,
+        **kwargs,
+):
+    """
+    Check whether a proxy works
+    :param ip:
+    :param port:
+    :param type: 0:socket  1:requests
+    :param timeout:
+    :param logger:
+    :return:
+    """
+    if not logger:
+        logger = log
+    ok = 0
+    if type == 0 and ip and port:
+        # a successful socket connect does not guarantee the proxy works, e.g. "Connection closed by foreign host"
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
+            sk.settimeout(timeout)
+            try:
+                # must actually connect, otherwise the proxy is never refreshed
+                sk.connect((ip, int(port)))
+                ok = 1
+            except Exception as e:
+                if show_error_log:
+                    logger.debug("check proxy failed: {} {}:{}".format(e, ip, port))
+            sk.close()
+    else:
+        if not proxies:
+            proxies = {
+                "http": "socks5://{}:{}".format(ip, port),
+                "https": "socks5//{}:{}".format(ip, port),
+            }
+        try:
+            # Modification: url used for the proxy check
+            r = requests.get(
+                "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
+            )
+            ok = 1
+            r.close()
+        except Exception as e:
+            if show_error_log:
+                logger.debug(
+                    "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
+                )
+    return ok
+
+
+class ProxyItem(object):
+    """单个代理对象"""
+
+    # 代理标记
+    proxy_tag_list = (-1, 0, 1)
+
+    def __init__(
+            self,
+            proxies=None,
+            valid_timeout=20,
+            check_interval=180,
+            max_proxy_use_num=10000,
+            delay=30,
+            use_interval=None,
+            **kwargs,
+    ):
+        """
+        :param proxies:
+        :param valid_timeout: proxy check timeout; since 20181008 validity is no longer checked by default
+        :param check_interval:
+        :param max_proxy_use_num:
+        :param delay:
+        :param use_interval: interval between uses, in seconds; unlimited by default
+        :param logger: log handler, defaults to log.get_logger()
+        :param kwargs:
+        """
+        # {"http": ..., "https": ...}
+        self.proxies = proxies
+        # check timeout, in seconds
+        self.valid_timeout = valid_timeout
+        # check interval, in seconds
+        self.check_interval = check_interval
+
+        # flag: 0 normal, -1 discard, 1 retry later ...
+        self.flag = 0
+        # time of the last flag change
+        self.flag_ts = 0
+        # last update (validity check) time
+        self.update_ts = 0
+        # maximum number of uses
+        self.max_proxy_use_num = max_proxy_use_num
+        # number of times used
+        self.use_num = 0
+        # delay before reuse
+        self.delay = delay
+        # interval between uses, in seconds
+        self.use_interval = use_interval
+        # time of last use
+        self.use_ts = 0
+
+        self.proxy_args = self.parse_proxies(self.proxies)
+        self.proxy_ip = self.proxy_args["ip"]
+        self.proxy_port = self.proxy_args["port"]
+        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
+        if self.proxy_args["user"]:
+            self.proxy_id = "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
+        else:
+            self.proxy_id = self.proxy_ip_port
+
+        # log handler
+        self.logger = log
+
+    def get_proxies(self):
+        self.use_num += 1
+        return self.proxies
+
+    def is_delay(self):
+        return self.flag == 1
+
+    def is_valid(self, force=0, type=0):
+        """
+        检测代理是否有效
+            1 有效
+            2 延时使用
+            0 无效 直接在代理池删除
+        :param force:
+        :param type:
+        :return:
+        """
+        if self.use_num > self.max_proxy_use_num > 0:
+            self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
+            return 0
+        if self.flag == -1:
+            self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
+            return 0
+        if self.delay > 0 and self.flag == 1:
+            if time.time() - self.flag_ts < self.delay:
+                self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
+                return 2
+            else:
+                self.flag = 0
+                self.logger.debug("延迟代理释放: {}".format(self.proxies))
+        if self.use_interval:
+            if time.time() - self.use_ts < self.use_interval:
+                return 2
+        if not force:
+            if time.time() - self.update_ts < self.check_interval:
+                return 1
+        if self.valid_timeout > 0:
+            ok = check_proxy(
+                proxies=self.proxies,
+                type=type,
+                timeout=self.valid_timeout,
+                logger=self.logger,
+            )
+        else:
+            ok = 1
+        self.update_ts = time.time()
+        return ok
+
+    @classmethod
+    def parse_proxies(self, proxies):
+        """
+        Split a proxies dict into its components
+        :param proxies:
+        :return:
+        """
+        if not proxies:
+            return {}
+        if isinstance(proxies, (str, bytes)):
+            proxies = json.loads(proxies)
+        protocol = list(proxies.keys())
+        if not protocol:
+            return {}
+        _url = proxies.get(protocol[0])
+        # Modification: the http:// prefixing is commented out so the proxy pool is built correctly
+        # if not _url.startswith("http"):
+        #     _url = "http://" + _url
+        _url_parse = parse.urlparse(_url)
+        netloc = _url_parse.netloc
+        if "@" in netloc:
+            netloc_auth, netloc_host = netloc.split("@")
+        else:
+            netloc_auth, netloc_host = "", netloc
+        ip, *port = netloc_host.split(":")
+        port = port[0] if port else "80"
+        user, *password = netloc_auth.split(":")
+        password = password[0] if password else ""
+        return {
+            "protocol": protocol,
+            "ip": ip,
+            "port": port,
+            "user": user,
+            "password": password,
+            "ip_port": "{}:{}".format(ip, port),
+        }
+
+
+class ProxyPoolBase(object):
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def get(self, *args, **kwargs):
+        raise NotImplementedError
+
+
+class ProxyPool(ProxyPoolBase):
+    """代理池"""
+
+    def __init__(self, **kwargs):
+        """
+        :param size: pool size, -1 means unlimited
+        :param proxy_source_url: url of the proxy list file, a list of urls is also supported
+        :param proxy_instance:  an instance that supplies proxies
+        :param reset_interval:  minimum interval between pool resets
+        :param reset_interval_max:  maximum interval between pool resets, 180 seconds by default
+        :param check_valid: whether to validate a proxy when it is fetched
+        :param local_proxy_file_cache_timeout: expiration time of the locally cached proxy file
+        :param logger: log handler, defaults to log.get_logger()
+        :param kwargs: other parameters
+        """
+        kwargs.setdefault("size", -1)
+        kwargs.setdefault("proxy_source_url", setting.PROXY_EXTRACT_API)
+
+        super(ProxyPool, self).__init__(**kwargs)
+        # maximum queue length
+        self.max_queue_size = kwargs.get("size", -1)
+        # actual number of proxies available
+        self.real_max_proxy_count = 1000
+        # maximum number of times a proxy may be used
+        # proxy list url, e.g. http://localhost/proxy.txt
+        self.proxy_source_url = kwargs.get("proxy_source_url", [])
+        if not isinstance(self.proxy_source_url, list):
+            self.proxy_source_url = [self.proxy_source_url]
+        self.proxy_source_url = [x for x in self.proxy_source_url if x]
+        self.proxy_source_url = list(set(self.proxy_source_url))
+        kwargs.update({"proxy_source_url": self.proxy_source_url})
+        # logger
+        self.logger = kwargs.get("logger") or log
+        kwargs["logger"] = self.logger
+        if not self.proxy_source_url:
+            self.logger.warn("need set proxy_source_url or proxy_instance")
+
+        # minimum interval between pool resets
+        self.reset_interval = kwargs.get("reset_interval", 5)
+        # force a reset after this long so new proxies come in and stale, banned ones are not reused forever
+        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
+        # whether to check proxy validity before handing a proxy out
+        self.check_valid = kwargs.get("check_valid", True)
+
+        # proxy queue
+        self.proxy_queue = None
+        # {proxy id: ProxyItem, ...}
+        self.proxy_dict = {}
+        # invalid proxies
+        self.invalid_proxy_dict = {}
+
+        self.kwargs = kwargs
+
+        # lock guarding pool resets
+        self.reset_lock = None
+        # time of the last reset
+        self.last_reset_time = 0
+        # counter for resets that happen too quickly
+        self.reset_fast_count = 0
+        # counter of times a proxy could not be obtained after 3 retries
+        self.no_valid_proxy_times = 0
+
+        # time a proxy was last fetched
+        self.last_get_ts = time.time()
+
+        # keep ProxyItem update_ts values so validity is not re-checked when the pool is reset too quickly
+        self.proxy_item_update_ts_dict = {}
+
+        # warning flag
+        self.warn_flag = False
+
+    def warn(self):
+        if not self.warn_flag:
+            for url in self.proxy_source_url:
+                if "zhima" in url:
+                    continue
+            self.warn_flag = True
+        return
+
+    @property
+    def queue_size(self):
+        """
+        Number of proxies currently in the pool
+        :return:
+        """
+        return self.proxy_queue.qsize() if self.proxy_queue is not None else 0
+
+    def clear(self):
+        """
+        Clear the pool's internal state
+        :return:
+        """
+        self.proxy_queue = None
+        # {proxy ip: ProxyItem, ...}
+        self.proxy_dict = {}
+        # drop invalid-proxy records older than 10 minutes
+        _limit = datetime.datetime.now() - datetime.timedelta(minutes=10)
+        self.invalid_proxy_dict = {
+            k: v for k, v in self.invalid_proxy_dict.items() if v > _limit
+        }
+        # drop expired update_ts records
+        _limit = time.time() - 600
+        self.proxy_item_update_ts_dict = {
+            k: v for k, v in self.proxy_item_update_ts_dict.items() if v > _limit
+        }
+        return
+
+    def get(self, retry: int = 0) -> dict:
+        """
+        Fetch a proxy from the pool
+        :param retry:
+        :return:
+        """
+        retry += 1
+        if retry > 3:
+            self.no_valid_proxy_times += 1
+            return None
+        # if time.time() - self.last_get_ts > 3 * 60:
+        #     # reset if no proxy has been fetched for 3 minutes
+        #     try:
+        #         self.reset_proxy_pool()
+        #     except Exception as e:
+        #         self.logger.exception(e)
+        # record the fetch time
+        self.last_get_ts = time.time()
+        #
+        self.warn()
+        proxy_item = self.get_random_proxy()
+        if proxy_item:
+            # validity check disabled
+            if not self.check_valid:
+                # put the proxy back into the queue
+                proxies = proxy_item.get_proxies()
+                self.put_proxy_item(proxy_item)
+                return proxies
+            else:
+                is_valid = proxy_item.is_valid()
+                if is_valid:
+                    # record update_ts
+                    self.proxy_item_update_ts_dict[
+                        proxy_item.proxy_id
+                    ] = proxy_item.update_ts
+                    # put the proxy back into the queue
+                    proxies = proxy_item.get_proxies()
+                    self.put_proxy_item(proxy_item)
+                    if is_valid == 1:
+                        if proxy_item.use_interval:
+                            proxy_item.use_ts = time.time()
+                        return proxies
+                else:
+                    # drop the invalid proxy
+                    self.proxy_dict.pop(proxy_item.proxy_id, "")
+                    self.invalid_proxy_dict[
+                        proxy_item.proxy_id
+                    ] = datetime.datetime.now()
+        else:
+            try:
+                time.sleep(3)
+                self.reset_proxy_pool()
+            except Exception as e:
+                self.logger.exception(e)
+        if self.no_valid_proxy_times >= 5:
+            # Bug fix: when only one task is left, only one thread is checking proxies; if many proxies happen to be
+            # unusable (more of them as time goes on) the spider may never get a proxy and stall before finishing
+            try:
+                time.sleep(3)
+                self.reset_proxy_pool()
+            except Exception as e:
+                self.logger.exception(e)
+        return self.get(retry)
+
+    get_proxy = get
+
+    def get_random_proxy(self) -> ProxyItem:
+        """
+        Pick a proxy at random
+        :return:
+        """
+        if self.proxy_queue is not None:
+            if random.random() < 0.5:
+                # check with 50% probability; this is a hot path, so keep it cheap
+                if time.time() - self.last_reset_time > self.reset_interval_max:
+                    time.sleep(3)
+                    self.reset_proxy_pool(force=True)
+                else:
+                    min_q_size = (
+                        min(self.max_queue_size / 2, self.real_max_proxy_count / 2)
+                        if self.max_queue_size > 0
+                        else self.real_max_proxy_count / 2
+                    )
+                    if self.proxy_queue.qsize() < min_q_size:
+                        time.sleep(3)
+                        self.reset_proxy_pool()
+            try:
+                return self.proxy_queue.get_nowait()
+            except Exception:
+                pass
+        return None
+
+    def append_proxies(self, proxies_list: list) -> int:
+        """
+        Add proxies to the pool
+        :param proxies_list:
+        :return:
+        """
+        count = 0
+        if not isinstance(proxies_list, list):
+            proxies_list = [proxies_list]
+        for proxies in proxies_list:
+            if proxies:
+                proxy_item = ProxyItem(proxies=proxies, **self.kwargs)
+                # skip proxies already marked invalid (added 2018/12/18)
+                if proxy_item.proxy_id in self.invalid_proxy_dict:
+                    continue
+                if proxy_item.proxy_id not in self.proxy_dict:
+                    # restore the recorded update_ts
+                    if not proxy_item.update_ts:
+                        proxy_item.update_ts = self.proxy_item_update_ts_dict.get(
+                            proxy_item.proxy_id, 0
+                        )
+                    self.put_proxy_item(proxy_item)
+                    self.proxy_dict[proxy_item.proxy_id] = proxy_item
+                    count += 1
+        return count
+
+    def put_proxy_item(self, proxy_item: ProxyItem):
+        """
+        Put a ProxyItem into the pool queue
+        :param proxy_item:
+        :return:
+        """
+        return self.proxy_queue.put_nowait(proxy_item)
+
+    def reset_proxy_pool(self, force: bool = False):
+        """
+        Reset the proxy pool
+        :param force: whether to force a reset
+        :return:
+        """
+        if not self.reset_lock:
+            # create the lock lazily; otherwise threading may already be imported before the gevent patch, which would break the RLock patch
+            import threading
+
+            self.reset_lock = threading.RLock()
+        with self.reset_lock:
+            if (
+                force
+                or self.proxy_queue is None
+                or (
+                    self.max_queue_size > 0
+                    and self.proxy_queue.qsize() < self.max_queue_size / 2
+                )
+                or (
+                    self.max_queue_size < 0
+                    and self.proxy_queue.qsize() < self.real_max_proxy_count / 2
+                )
+                or self.no_valid_proxy_times >= 5
+            ):
+                if time.time() - self.last_reset_time < self.reset_interval:
+                    self.reset_fast_count += 1
+                    if self.reset_fast_count % 10 == 0:
+                        self.logger.debug(
+                            "代理池重置的太快了:) {}".format(self.reset_fast_count)
+                        )
+                        time.sleep(1)
+                else:
+                    self.clear()
+                    if self.proxy_queue is None:
+                        import queue
+
+                        self.proxy_queue = queue.Queue()
+                    # TODO: the proxies fetched here may contain duplicates
+                    proxies_list = get_proxy_from_url(**self.kwargs)
+                    self.real_max_proxy_count = len(proxies_list)
+                    if 0 < self.max_queue_size < self.real_max_proxy_count:
+                        proxies_list = random.sample(proxies_list, self.max_queue_size)
+                    _valid_count = self.append_proxies(proxies_list)
+                    self.last_reset_time = time.time()
+                    self.no_valid_proxy_times = 0
+                    self.logger.debug(
+                        "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
+                            len(proxies_list),
+                            _valid_count,
+                            len(self.invalid_proxy_dict),
+                            len(self.proxy_dict),
+                        )
+                    )
+        return
+
+    def tag_proxy(self, proxies_list: list, flag: int, *, delay=30) -> bool:
+        """
+        Tag proxies
+        :param proxies_list:
+        :param flag:
+                    -1  discard
+                    1 delay before reuse
+        :param delay: delay in seconds
+        :return:
+        """
+        if int(flag) not in ProxyItem.proxy_tag_list or not proxies_list:
+            return False
+        if not isinstance(proxies_list, list):
+            proxies_list = [proxies_list]
+        for proxies in proxies_list:
+            if not proxies:
+                continue
+            proxy_id = ProxyItem(proxies).proxy_id
+            if proxy_id not in self.proxy_dict:
+                continue
+            self.proxy_dict[proxy_id].flag = flag
+            self.proxy_dict[proxy_id].flag_ts = time.time()
+            self.proxy_dict[proxy_id].delay = delay
+
+        return True
+
+    def get_proxy_item(self, proxy_id="", proxies=None):
+        """
+        Get the ProxyItem for a proxy
+        :param proxy_id:
+        :param proxies:
+        :return:
+        """
+        if proxy_id:
+            return self.proxy_dict.get(proxy_id)
+        if proxies:
+            proxy_id = ProxyItem(proxies).proxy_id
+            return self.proxy_dict.get(proxy_id)
+        return
+
+    def copy(self):
+        return ProxyPool(**self.kwargs)
+
+    def all(self) -> list:
+        """
+        Get all proxies from the configured proxy source
+        :return:
+        """
+        return get_proxy_from_url(**self.kwargs)
+# 
+# 
+# if __name__ == '__main__':
+#     ProxyPool().get()
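
A minimal usage sketch of the pool defined above (assuming setting.PROXY_EXTRACT_API points at a reachable proxy list; names and values below are illustrative only):

    from feapder.network.proxy_pool import ProxyPool

    pool = ProxyPool(size=50, check_valid=False)  # do not re-validate on every get()
    proxies = pool.get()                          # e.g. {"http": "1.2.3.4:8888", "https": "1.2.3.4:8888"}
    if proxies:
        # mark this proxy as "use later" for 60 seconds, e.g. after hitting a captcha
        pool.tag_proxy([proxies], flag=1, delay=60)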

+ 506 - 0
FworkSpider/feapder/network/request.py

@@ -0,0 +1,506 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-07-25 11:49:08
+---------
+@summary: request object
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.cookies import RequestsCookieJar
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+
+import feapder.setting as setting
+import feapder.utils.tools as tools
+from feapder.db.redisdb import RedisDB
+from feapder.network import user_agent
+from feapder.network.proxy_pool import ProxyPool
+from feapder.network.response import Response
+from feapder.utils.log import log
+from feapder.utils.webdriver import WebDriverPool
+
+# suppress urllib3 warnings
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+
+
+class Request(object):
+    session = None
+    webdriver_pool: WebDriverPool = None
+    user_agent_pool = user_agent
+    proxies_pool: ProxyPool = None
+
+    cache_db = None  # redis / pika
+    cached_redis_key = None  # key prefix of cached responses: response_cached:cached_redis_key:md5
+    cached_expire_time = 1200  # cache expiration time in seconds
+
+    local_filepath = None
+    oss_handler = None
+
+    __REQUEST_ATTRS__ = {
+        # 'method' and 'url' are required and are not passed through **kwargs
+        "params",
+        "data",
+        "headers",
+        "cookies",
+        "files",
+        "auth",
+        "timeout",
+        "allow_redirects",
+        "proxies",
+        "hooks",
+        "stream",
+        "verify",
+        "cert",
+        "json",
+    }
+
+    DEFAULT_KEY_VALUE = dict(
+        url="",
+        retry_times=0,
+        priority=300,
+        parser_name=None,
+        callback=None,
+        filter_repeat=True,
+        auto_request=True,
+        request_sync=False,
+        use_session=None,
+        random_user_agent=True,
+        download_midware=None,
+        is_abandoned=False,
+        render=False,
+        render_time=0,
+    )
+
+    def __init__(
+        self,
+        url="",
+        retry_times=0,
+        priority=300,
+        parser_name=None,
+        callback=None,
+        filter_repeat=True,
+        auto_request=True,
+        request_sync=False,
+        use_session=None,
+        random_user_agent=True,
+        download_midware=None,
+        is_abandoned=False,
+        render=False,
+        render_time=0,
+        **kwargs,
+    ):
+        """
+        @summary: Request parameters
+        ---------
+        Framework parameters
+        @param url: url to crawl
+        @param retry_times: current retry count
+        @param priority: priority, the smaller the value the higher the priority, 300 by default
+        @param parser_name: name of the class the callback belongs to, the current class by default
+        @param callback: callback, either a function or a function name (for cross-class callbacks set parser_name to the target class and callback to the method name on that class)
+        @param filter_repeat: whether to deduplicate (True/False); only effective when REQUEST_FILTER_ENABLE is True in setting, True by default
+        @param auto_request: whether to download the page automatically, True by default. When False the response is None and the page has to be requested manually
+        @param request_sync: whether to download the page synchronously, asynchronous by default. Set True for urls that expire quickly so the yielded request is handled immediately instead of being queued
+        @param use_session: whether to use a session
+        @param random_user_agent: whether to use a random User-Agent (True/False); only effective when RANDOM_HEADERS is True in setting, True by default
+        @param download_midware: download middleware, defaults to the parser's download_midware
+        @param is_abandoned: whether to give up retrying when an exception occurs, True/False, False by default
+        @param render: whether to render the page with a browser
+        @param render_time: render time, i.e. how long to wait after opening the page before taking the source
+        --
+        The following parameters behave the same as in requests
+        @param method: request method such as POST or GET, inferred from whether data is empty by default
+        @param params: query parameters
+        @param data: request body
+        @param json: request json string, same as json.dumps(data)
+        @param headers:
+        @param cookies: dict or CookieJar object
+        @param files:
+        @param auth:
+        @param timeout: (float or tuple) how long to wait for server data, either a float or a (connect timeout, read timeout) tuple
+        @param allow_redirects : Boolean. True allows following redirects for POST/PUT/DELETE
+        @param proxies: proxies, {"http":"http://xxx", "https":"https://xxx"}
+        @param verify: verify the SSL certificate when True
+        @param stream: when False the response content is downloaded immediately
+        @param cert:
+        --
+        @param **kwargs: extra values, e.g. Request(item=item) makes item available as request.item
+        ---------
+        @result:
+        """
+
+        self.url = url
+        self.retry_times = retry_times
+        self.priority = priority
+        self.parser_name = parser_name
+        self.callback = callback
+        self.filter_repeat = filter_repeat
+        self.auto_request = auto_request
+        self.request_sync = request_sync
+        self.use_session = use_session
+        self.random_user_agent = random_user_agent
+        self.download_midware = download_midware
+        self.is_abandoned = is_abandoned
+        self.render = render
+        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
+
+        self.requests_kwargs = {}
+        for key, value in kwargs.items():
+            if key in self.__class__.__REQUEST_ATTRS__:  # pick out requests keyword arguments
+                self.requests_kwargs[key] = value
+
+            self.__dict__[key] = value
+
+    def __repr__(self):
+        try:
+            return "<Request {}>".format(self.url)
+        except:
+            return "<Request {}>".format(str(self.to_dict)[:40])
+
+    def __setattr__(self, key, value):
+        """
+        For assignments of the form request.xxx = xxx, update the request and its internal requests kwargs
+        @param key:
+        @param value:
+        @return:
+        """
+        self.__dict__[key] = value
+
+        if key in self.__class__.__REQUEST_ATTRS__:
+            self.requests_kwargs[key] = value
+
+    def __lt__(self, other):
+        return self.priority < other.priority
+
+    @property
+    def _session(self):
+        use_session = (
+            setting.USE_SESSION if self.use_session is None else self.use_session
+        )  # self.use_session takes precedence
+        if use_session and not self.__class__.session:
+            self.__class__.session = requests.Session()
+            # pool_connections - number of cached urllib3 connection pools; pool_maxsize - maximum number of connections kept per pool
+            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
+            # the adapter is used for every HTTP request made through this session whose URL starts with the given prefix
+            self.__class__.session.mount("http", http_adapter)
+
+        return self.__class__.session
+
+    @property
+    def _webdriver_pool(self):
+        if not self.__class__.webdriver_pool:
+            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
+
+        return self.__class__.webdriver_pool
+
+    @property
+    def _proxies_pool(self):
+        if not self.__class__.proxies_pool:
+            self.__class__.proxies_pool = ProxyPool()
+
+        return self.__class__.proxies_pool
+
+    @property
+    def to_dict(self):
+        request_dict = {}
+
+        self.callback = (
+            getattr(self.callback, "__name__")
+            if callable(self.callback)
+            else self.callback
+        )
+        self.download_midware = (
+            getattr(self.download_midware, "__name__")
+            if callable(self.download_midware)
+            else self.download_midware
+        )
+
+        for key, value in self.__dict__.items():
+            if (
+                key in self.__class__.DEFAULT_KEY_VALUE
+                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
+                or key == "requests_kwargs"
+            ):
+                continue
+
+            if key in self.__class__.__REQUEST_ATTRS__:
+                if not isinstance(
+                    value, (bytes, bool, float, int, str, tuple, list, dict)
+                ):
+                    value = tools.dumps_obj(value)
+            else:
+                if not isinstance(value, (bytes, bool, float, int, str)):
+                    value = tools.dumps_obj(value)
+
+            request_dict[key] = value
+
+        return request_dict
+
+    @property
+    def callback_name(self):
+        return (
+            getattr(self.callback, "__name__")
+            if callable(self.callback)
+            else self.callback
+        )
+
+    def get_response(self, save_cached=False):
+        """
+        Get a response with selector support
+        @param save_cached: cache the response so it does not need to be re-downloaded while debugging
+        @return:
+        """
+        # default timeout
+        self.requests_kwargs.setdefault(
+            "timeout", setting.REQUEST_TIMEOUT
+        )  # connect=22 read=22
+
+        # stream
+        # By default the response body is downloaded as soon as the request is made. With stream the download is deferred until Response.content is accessed, so only the headers are fetched at first. Drawback: with stream=True, Requests cannot release the connection back to the pool until all data has been consumed or Response.close has been called, which hurts connection reuse.
+        self.requests_kwargs.setdefault("stream", True)
+
+        # disable certificate verification
+        self.requests_kwargs.setdefault("verify", False)
+
+        # request method
+        method = self.__dict__.get("method")
+        if not method:
+            if "data" in self.requests_kwargs:
+                method = "POST"
+            else:
+                method = "GET"
+
+        # random User-Agent
+        headers = self.requests_kwargs.get("headers", {})
+        if "user-agent" not in headers and "User-Agent" not in headers:
+            if self.render:  # in render mode, prefer the user agent configured in WEBDRIVER
+                ua = setting.WEBDRIVER.get(
+                    "user_agent"
+                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
+            else:
+                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
+
+            if self.random_user_agent and setting.RANDOM_HEADERS:
+                headers.update({"User-Agent": ua})
+                self.requests_kwargs.update(headers=headers)
+        else:
+            self.requests_kwargs.setdefault(
+                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
+            )
+
+        # proxies
+        proxies = self.requests_kwargs.get("proxies", -1)
+        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
+            while True:
+                proxies = self._proxies_pool.get()
+                if proxies:
+                    self.requests_kwargs.update(proxies=proxies)
+                    break
+                else:
+                    log.debug("暂无可用代理 ...")
+
+        log.debug(
+            """
+                -------------- %srequest for ----------------
+                url  = %s
+                method = %s
+                body = %s
+                """
+            % (
+                ""
+                if not self.parser_name
+                else "%s.%s "
+                % (
+                    self.parser_name,
+                    (
+                        self.callback
+                        and callable(self.callback)
+                        and getattr(self.callback, "__name__")
+                        or self.callback
+                    )
+                    or "parse",
+                ),
+                self.url,
+                method,
+                self.requests_kwargs,
+            )
+        )
+
+        # def hooks(response, *args, **kwargs):
+        #     print(response.url)
+        #
+        # self.requests_kwargs.update(hooks={'response': hooks})
+
+        use_session = (
+            setting.USE_SESSION if self.use_session is None else self.use_session
+        )  # self.use_session takes precedence
+
+        if self.render:
+            # use the request's user_agent, cookies and proxy
+            user_agent = headers.get("User-Agent") or headers.get("user-agent")
+            cookies = self.requests_kwargs.get("cookies")
+            if cookies and isinstance(cookies, RequestsCookieJar):
+                cookies = cookies.get_dict()
+
+            if not cookies:
+                cookie_str = headers.get("Cookie") or headers.get("cookie")
+                if cookie_str:
+                    cookies = tools.get_cookies_from_str(cookie_str)
+
+            proxy = None
+            if proxies and proxies != -1:
+                proxy = proxies.get("http", "").strip("http://") or proxies.get(
+                    "https", ""
+                ).strip("https://")
+
+            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)
+
+            try:
+                browser.get(self.url)
+                if cookies:
+                    browser.cookies = cookies
+                if self.render_time:
+                    tools.delay_time(self.render_time)
+
+                html = browser.page_source
+                response = Response.from_dict(
+                    {
+                        "url": browser.current_url,
+                        "cookies": browser.cookies,
+                        "_content": html.encode(),
+                        "status_code": 200,
+                        "elapsed": 666,
+                        "headers": {
+                            "User-Agent": browser.execute_script(
+                                "return navigator.userAgent"
+                            ),
+                            "Cookie": tools.cookies2str(browser.cookies),
+                        },
+                    }
+                )
+
+                response.browser = browser
+            except Exception as e:
+                self._webdriver_pool.remove(browser)
+                raise e
+
+        elif use_session:
+            response = self._session.request(method, self.url, **self.requests_kwargs)
+            response = Response(response)
+        else:
+            response = requests.request(method, self.url, **self.requests_kwargs)
+            response = Response(response)
+
+        if save_cached:
+            self.save_cached(response, expire_time=self.__class__.cached_expire_time)
+
+        return response
+
+    def proxies(self):
+        """
+
+        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
+
+        """
+        return self.requests_kwargs.get("proxies")
+
+    def proxy(self):
+        """
+
+        Returns: ip:port
+
+        """
+        proxies = self.proxies()
+        if proxies:
+            return proxies.get("http", "").strip("http://") or proxies.get(
+                "https", ""
+            ).strip("https://")
+
+    def user_agent(self):
+        headers = self.requests_kwargs.get("headers")
+        if headers:
+            return headers.get("user_agent") or headers.get("User-Agent")
+
+    @property
+    def fingerprint(self):
+        """
+        unique fingerprint of the request
+        @return:
+        """
+        url = self.__dict__.get("url", "")
+        # canonicalize the url
+        url = tools.canonicalize_url(url)
+        args = [url]
+
+        for arg in ["params", "data", "files", "auth", "cert", "json"]:
+            if self.requests_kwargs.get(arg):
+                args.append(self.requests_kwargs.get(arg))
+
+        return tools.get_md5(*args)
+
+    @property
+    def _cache_db(self):
+        if not self.__class__.cache_db:
+            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
+
+        return self.__class__.cache_db
+
+    @property
+    def _cached_redis_key(self):
+        if self.__class__.cached_redis_key:
+            return (
+                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
+            )
+        else:
+            return f"response_cached:test:{self.fingerprint}"
+
+    def save_cached(self, response, expire_time=1200):
+        """
+        Cache the response in redis for debugging so it does not have to be downloaded every time
+        @param response:
+        @param expire_time: expiration time
+        @return:
+        """
+
+        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
+
+    def get_response_from_cached(self, save_cached=True):
+        """
+        Get the response from the cache
+        Note:
+            attributes that will be empty:
+                -raw : urllib3.response.HTTPResponse
+                -connection: requests.adapters.HTTPAdapter
+                -history
+
+            attributes whose meaning changes:
+                - request is a Request instead of a requests request
+        @param: save_cached when there is no cache the page is downloaded; whether to cache it after downloading
+        @return:
+        """
+        response_dict = self._cache_db.strget(self._cached_redis_key)
+        if not response_dict:
+            log.info("无response缓存  重新下载")
+            response_obj = self.get_response(save_cached=save_cached)
+        else:
+            response_dict = eval(response_dict)
+            response_obj = Response.from_dict(response_dict)
+        return response_obj
+
+    def del_response_cached(self):
+        self._cache_db.clear(self._cached_redis_key)
+
+    @classmethod
+    def from_dict(cls, request_dict):
+        for key, value in request_dict.items():
+            if isinstance(value, bytes):  # deserialize values such as item
+                request_dict[key] = tools.loads_obj(value)
+
+        return cls(**request_dict)
+
+    def copy(self):
+        return self.__class__.from_dict(self.to_dict)
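
A minimal sketch of driving the Request class directly, assuming the project's setting module is configured (REQUEST_TIMEOUT set, PROXY_ENABLE off) and that the example url is reachable:

    from feapder.network.request import Request

    request = Request("https://httpbin.org/get", params={"q": "feapder"})
    response = request.get_response(save_cached=False)
    print(response.status_code)  # 200
    print(response.json)         # note: json is a property on this Response, not a method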

+ 356 - 0
FworkSpider/feapder/network/response.py

@@ -0,0 +1,356 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-07-26 11:40:28
+---------
+@summary:
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import datetime
+import os
+import re
+import time
+from urllib.parse import urlparse, urlunparse, urljoin
+
+from bs4 import UnicodeDammit, BeautifulSoup
+from requests.cookies import RequestsCookieJar
+from requests.models import Response as res
+from w3lib.encoding import http_content_type_encoding, html_body_declared_encoding
+
+from feapder.network.selector import Selector
+from feapder.utils.log import log
+
+FAIL_ENCODING = "ISO-8859-1"
+
+# special characters in the html source; they must be removed or they break building the etree
+SPECIAL_CHARACTERS = [
+    # remove control characters, full list: https://zh.wikipedia.org/wiki/%E6%8E%A7%E5%88%B6%E5%AD%97%E7%AC%A6
+    "[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]"
+]
+
+SPECIAL_CHARACTER_PATTERNS = [
+    re.compile(special_character) for special_character in SPECIAL_CHARACTERS
+]
+
+
+class Response(res):
+    def __init__(self, response):
+        super(Response, self).__init__()
+        self.__dict__.update(response.__dict__)
+
+        self._cached_selector = None
+        self._cached_text = None
+        self._cached_json = None
+
+        self._encoding = None
+
+        self.encoding_errors = "strict"  # strict / replace / ignore
+
+    @classmethod
+    def from_dict(cls, response_dict):
+        """
+        Build a Response object from a dict
+        @param response_dict: the native response.__dict__
+        @return:
+        """
+        cookie_jar = RequestsCookieJar()
+        cookie_jar.update(other=response_dict["cookies"])
+        response_dict["cookies"] = cookie_jar
+
+        response_dict["elapsed"] = datetime.timedelta(
+            0, 0, response_dict["elapsed"]
+        )  # elapsed time
+        response_dict["connection"] = None
+        response_dict["_content_consumed"] = True
+
+        response = res()
+        response.__dict__.update(response_dict)
+        return cls(response)
+
+    @property
+    def to_dict(self):
+        response_dict = {
+            "_content": self.content,
+            "cookies": self.cookies.get_dict(),
+            "encoding": self.encoding,
+            "headers": self.headers,
+            "status_code": self.status_code,
+            "elapsed": self.elapsed.microseconds,  # elapsed time
+            "url": self.url,
+        }
+
+        return response_dict
+
+    def __clear_cache(self):
+        self.__dict__["_cached_selector"] = None
+        self.__dict__["_cached_text"] = None
+        self.__dict__["_cached_json"] = None
+
+    @property
+    def encoding(self):
+        """
+        Encoding precedence: custom encoding > header encoding > page-declared encoding > encoding guessed from content
+        """
+        self._encoding = (
+            self._encoding
+            or self._headers_encoding()
+            or self._body_declared_encoding()
+            or self.apparent_encoding
+        )
+        return self._encoding
+
+    @encoding.setter
+    def encoding(self, val):
+        self.__clear_cache()
+        self._encoding = val
+
+    code = encoding
+
+    def _headers_encoding(self):
+        """
+        Get the charset encoding from the headers
+        """
+        content_type = self.headers.get("Content-Type") or self.headers.get(
+            "content-type"
+        )
+        if content_type:
+            return (
+                http_content_type_encoding(content_type) or "utf-8"
+                if "application/json" in content_type
+                else None
+            )
+
+    def _body_declared_encoding(self):
+        """
+        Get <meta charset="..."> from html, xml, etc.
+        """
+
+        return html_body_declared_encoding(self.content)
+
+    def _get_unicode_html(self, html):
+        if not html or not isinstance(html, bytes):
+            return html
+
+        converted = UnicodeDammit(html, is_html=True)
+        if not converted.unicode_markup:
+            raise Exception(
+                "Failed to detect encoding of article HTML, tried: %s"
+                % ", ".join(converted.tried_encodings)
+            )
+
+        html = converted.unicode_markup
+        return html
+
+    def _make_absolute(self, link):
+        """Makes a given link absolute."""
+        try:
+
+            link = link.strip()
+
+            # Parse the link with stdlib.
+            parsed = urlparse(link)._asdict()
+
+            # If link is relative, then join it with base_url.
+            if not parsed["netloc"]:
+                return urljoin(self.url, link)
+
+            # Link is absolute; if it lacks a scheme, add one from base_url.
+            if not parsed["scheme"]:
+                parsed["scheme"] = urlparse(self.url).scheme
+
+                # Reconstruct the URL to incorporate the new scheme.
+                parsed = (v for v in parsed.values())
+                return urlunparse(parsed)
+
+        except Exception as e:
+            log.error(
+                "Invalid URL <{}> can't make absolute_link. exception: {}".format(
+                    link, e
+                )
+            )
+
+        # Link is absolute and complete with scheme; nothing to be done here.
+        return link
+
+    def _absolute_links(self, text):
+        regexs = [
+            r'(?i)(<a.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # a
+            r'(?i)(<img.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # img
+            r'(?i)(<link.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # css
+            r'(?i)(<script.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # js
+        ]
+
+        for regex in regexs:
+
+            def replace_href(text):
+                # html = text.group(0)
+                link = text.group(2)
+                absolute_link = self._make_absolute(link)
+
+                # return re.sub(regex, r'\1{}\3'.format(absolute_link), html)  # regex substitution fails on some characters, e.g. the source of http://permit.mep.gov.cn/permitExt/syssb/xxgk/xxgk!showImage.action?dataid=0b092f8115ff45c5a50947cdea537726
+                return text.group(1) + absolute_link + text.group(3)
+
+            text = re.sub(regex, replace_href, text, flags=re.S)
+
+        return text
+
+    def _del_special_character(self, text):
+        """
+        Remove special characters
+        """
+        for special_character_pattern in SPECIAL_CHARACTER_PATTERNS:
+            text = special_character_pattern.sub("", text)
+
+        return text
+
+    @property
+    def __text(self):
+        """Content of the response, in unicode.
+
+        If Response.encoding is None, encoding will be guessed using
+        ``chardet``.
+
+        The encoding of the response content is determined based solely on HTTP
+        headers, following RFC 2616 to the letter. If you can take advantage of
+        non-HTTP knowledge to make a better guess at the encoding, you should
+        set ``r.encoding`` appropriately before accessing this property.
+        """
+
+        if not self.content:
+            return ""
+
+        # Decode unicode from given encoding.
+        try:
+            content = str(self.content, self.encoding, errors=self.encoding_errors)
+        except (LookupError, TypeError):
+            # A LookupError is raised if the encoding was not found which could
+            # indicate a misspelling or similar mistake.
+            #
+            # A TypeError can be raised if encoding is None
+            #
+            # So we try blindly encoding.
+            content = str(self.content, errors=self.encoding_errors)
+
+        return content
+
+    @property
+    def text(self):
+        if self._cached_text is None:
+            if self.encoding and self.encoding.upper() != FAIL_ENCODING:
+                try:
+                    self._cached_text = self.__text
+                except UnicodeDecodeError:
+                    self._cached_text = self._get_unicode_html(self.content)
+            else:
+                self._cached_text = self._get_unicode_html(self.content)
+
+            if self._cached_text:
+                self._cached_text = self._absolute_links(self._cached_text)
+                self._cached_text = self._del_special_character(self._cached_text)
+
+        return self._cached_text
+
+    @text.setter
+    def text(self, html):
+        self._cached_text = html
+        self._cached_text = self._absolute_links(self._cached_text)
+        self._cached_text = self._del_special_character(self._cached_text)
+        self._cached_selector = Selector(self.text)
+
+    @property
+    def json(self, **kwargs):
+        if self._cached_json is None:
+            self.encoding = self.encoding or "utf-8"
+            self._cached_json = super(Response, self).json(**kwargs)
+
+        return self._cached_json
+
+    @property
+    def content(self):
+        content = super(Response, self).content
+        return content
+
+    @property
+    def is_html(self):
+        content_type = self.headers.get("Content-Type", "")
+        if "text/html" in content_type:
+            return True
+        else:
+            return False
+
+    @property
+    def selector(self):
+        if self._cached_selector is None:
+            self._cached_selector = Selector(self.text)
+        return self._cached_selector
+
+    def bs4(self, features="html.parser"):
+        soup = BeautifulSoup(self.text, features)
+        return soup
+
+    def extract(self):
+        return self.selector.get()
+
+    def xpath(self, query, **kwargs):
+        return self.selector.xpath(query, **kwargs)
+
+    def css(self, query):
+        return self.selector.css(query)
+
+    def re(self, regex, replace_entities=False):
+        """
+        @summary: regex match
+        Note: <a class='page-numbers'... in the page source is normalised to <a class="page-numbers"; write patterns as <a class="(.*?)". The quote style of non-html text is not changed
+        For convenience, single and double quotes in the pattern are treated as interchangeable
+        ---------
+        @param regex: pattern string or re.compile object
+        @param replace_entities: when True, characters such as &nbsp; are removed and entities such as &quot; are unescaped to ", which changes the page structure. Set to False when extracting json from the page source
+        ---------
+        @result: list
+        """
+
+        # make single and double quotes interchangeable
+        if isinstance(regex, str):
+            regex = re.sub("['\"]", "['\"]", regex)
+
+        return self.selector.re(regex, replace_entities)
+
+    def re_first(self, regex, default=None, replace_entities=False):
+        """
+        @summary: regex match
+        Note: <a class='page-numbers'... in the page source is normalised to <a class="page-numbers"; write patterns as <a class="(.*?)". The quote style of non-html text is not changed
+        For convenience, single and double quotes in the pattern are treated as interchangeable
+        ---------
+        @param regex: pattern string or re.compile object
+        @param default: default value when there is no match
+        @param replace_entities: when True, characters such as &nbsp; are removed and entities such as &quot; are unescaped to ", which changes the page structure. Set to False when extracting json from the page source
+        ---------
+        @result: the first match or the default value
+        """
+
+        # make single and double quotes interchangeable
+        if isinstance(regex, str):
+            regex = re.sub("['\"]", "['\"]", regex)
+
+        return self.selector.re_first(regex, default, replace_entities)
+
+    def close_browser(self, request):
+        if hasattr(self, "browser"):
+            request._webdriver_pool.remove(self.browser)
+            del self.browser
+
+    def __del__(self):
+        self.close()
+
+    def open(self, delete_temp_file=False):
+        with open("temp.html", "w", encoding=self.encoding, errors="replace") as html:
+            self.encoding_errors = "replace"
+            html.write(self.text)
+
+        os.system("open temp.html")
+
+        if delete_temp_file:
+            time.sleep(1)
+            os.remove("temp.html")
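
A small sketch of the to_dict/from_dict round trip that the request cache relies on, built from a hand-written dict (the url and html are placeholders):

    from feapder.network.response import Response

    cached = {
        "_content": b"<html><body><a href='/next'>next</a></body></html>",
        "cookies": {},
        "encoding": "utf-8",
        "headers": {"Content-Type": "text/html"},
        "status_code": 200,
        "elapsed": 666,
        "url": "https://example.com/list",
    }
    response = Response.from_dict(cached)
    print(response.is_html)                             # True
    print(response.xpath("//a/@href").extract_first())  # https://example.com/next (.text rewrites relative links)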

+ 155 - 0
FworkSpider/feapder/network/selector.py

@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-10-08 15:33:37
+---------
+@summary: redefined selector
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+import re
+
+import six
+from lxml import etree
+from parsel import Selector as ParselSelector
+from parsel import SelectorList as ParselSelectorList
+from w3lib.html import replace_entities as w3lib_replace_entities
+
+
+def extract_regex(regex, text, replace_entities=True, flags=0):
+    """Extract a list of unicode strings from the given text/encoding using the following policies:
+    * if the regex contains a named group called "extract" that will be returned
+    * if the regex contains multiple numbered groups, all those will be returned (flattened)
+    * if the regex doesn't contain any group the entire regex matching is returned
+    """
+    if isinstance(regex, six.string_types):
+        regex = re.compile(regex, flags=flags)
+
+    if "extract" in regex.groupindex:
+        # named group
+        try:
+            extracted = regex.search(text).group("extract")
+        except AttributeError:
+            strings = []
+        else:
+            strings = [extracted] if extracted is not None else []
+    else:
+        # full regex or numbered groups
+        strings = regex.findall(text)
+
+    # strings = flatten(strings)  # this would flatten nested lists
+    if not replace_entities:
+        return strings
+
+    values = []
+    for value in strings:
+        if isinstance(value, (list, tuple)):  # w3lib_replace_entities cannot take a list or tuple
+            values.append(
+                [w3lib_replace_entities(v, keep=["lt", "amp"]) for v in value]
+            )
+        else:
+            values.append(w3lib_replace_entities(value, keep=["lt", "amp"]))
+
+    return values
+
+
+def create_root_node(text, parser_cls, base_url=None):
+    """Create root node for text using given parser class.
+    """
+    body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
+    parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
+    root = etree.fromstring(body, parser=parser, base_url=base_url)
+    if root is None:
+        root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
+    return root
+
+
+class SelectorList(ParselSelectorList):
+    """
+    The :class:`SelectorList` class is a subclass of the builtin ``list``
+    class, which provides a few additional methods.
+    """
+
+    def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
+        """
+        Call the ``.re()`` method for the first element in this list and
+        return the result in an unicode string. If the list is empty or the
+        regex doesn't match anything, return the default value (``None`` if
+        the argument is not provided).
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``.
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
+        """
+
+        datas = self.re(regex, replace_entities=replace_entities, flags=flags)
+        return datas[0] if datas else default
+
+    def re(self, regex, replace_entities=True, flags=re.S):
+        """
+        Call the ``.re()`` method for each element in this list and return
+        their results flattened, as a list of unicode strings.
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``.
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
+        """
+        datas = [
+            x.re(regex, replace_entities=replace_entities, flags=flags) for x in self
+        ]
+        return datas[0] if len(datas) == 1 else datas
+
+
+class Selector(ParselSelector):
+    selectorlist_cls = SelectorList
+
+    def __str__(self):
+        data = repr(self.get())
+        return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
+
+    __repr__ = __str__
+
+    def __init__(self, text=None, *args, **kwargs):
+        # convert &nbsp; to a space first, otherwise the selector turns it into \xa0
+        if text:
+            text = re.sub("&nbsp;", "\x20", text)
+        super(Selector, self).__init__(text, *args, **kwargs)
+
+    def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
+        """
+        Apply the given regex and return the first unicode string which
+        matches. If there is no match, return the default value (``None`` if
+        the argument is not provided).
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``.
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
+        """
+
+        datas = self.re(regex, replace_entities=replace_entities, flags=flags)
+
+        return datas[0] if datas else default
+
+    def re(self, regex, replace_entities=True, flags=re.S):
+        """
+        Apply the given regex and return a list of unicode strings with the
+        matches.
+
+        ``regex`` can be either a compiled regular expression or a string which
+        will be compiled to a regular expression using ``re.compile(regex)``.
+
+        By default, character entity references are replaced by their
+        corresponding character (except for ``&amp;`` and ``&lt;``.
+        Passing ``replace_entities`` as ``False`` switches off these
+        replacements.
+        """
+
+        return extract_regex(
+            regex, self.get(), replace_entities=replace_entities, flags=flags
+        )
+
+    def _get_root(self, text, base_url=None):
+        return create_root_node(text, self._parser, base_url=base_url)
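
A short standalone sketch of the Selector defined above, showing the &nbsp; normalisation and the re_first helper:

    from feapder.network.selector import Selector

    sel = Selector(text='<div class="price">&nbsp;12.5</div>')
    print(sel.xpath('//div[@class="price"]/text()').get())     # ' 12.5' (the &nbsp; became a plain space)
    print(sel.re_first(r'class=["\']price["\']>\s*([\d.]+)'))  # '12.5'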

+ 389 - 0
FworkSpider/feapder/network/user_agent.py

@@ -0,0 +1,389 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2016-12-28 17:55
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import random
+
+USER_AGENTS = {
+    "chrome": [
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
+        "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
+        "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
+        "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
+        "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
+    ],
+    "opera": [
+        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
+        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
+        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
+        "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
+        "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
+        "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
+        "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
+        "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
+        "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
+        "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
+        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
+        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
+        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
+        "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
+        "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
+        "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
+        "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
+        "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
+        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
+        "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
+        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
+        "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
+        "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
+        "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
+        "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
+        "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
+        "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
+        "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
+        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
+    ],
+    "firefox": [
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
+        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
+        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
+        "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
+        "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
+        "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101  Firefox/28.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
+        "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
+        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
+        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
+        "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
+        "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
+        "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0)  Gecko/20100101 Firefox/18.0",
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
+    ],
+    "internetexplorer": [
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
+        "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0;  rv:11.0) like Gecko",
+        "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
+        "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
+        "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
+        "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
+        "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
+        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)",
+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)",
+        "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)",
+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)",
+    ],
+    "safari": [
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
+    ],
+    "mobile": [
+        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
+        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
+        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
+        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
+        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
+        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
+        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
+        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
+        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
+        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
+        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
+        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
+        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
+        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Safari/605.1.15",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
+    ],
+}
+
+
+def get(ua_type: str = None):
+    if not ua_type:
+        ua_type = random.choice(list(USER_AGENTS.keys()))
+    elif ua_type not in USER_AGENTS:
+        raise ValueError(
+            "ua_type error, expect one of {}".format(list(USER_AGENTS.keys()))
+        )
+
+    return random.choice(USER_AGENTS[ua_type])
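For reference, a minimal usage sketch of the helper above (assuming the module is importable as `feapder.network.user_agent`, matching the file path in this commit):

```python
# Minimal usage sketch for the user-agent helper above.
# Assumes the module path feapder/network/user_agent.py from this commit.
from feapder.network import user_agent

print(user_agent.get())            # random UA from a random browser family
print(user_agent.get("firefox"))   # random UA restricted to the "firefox" list
# user_agent.get("unknown")        # would raise ValueError listing the valid keys
```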

+ 56 - 0
FworkSpider/feapder/pipelines/__init__.py

@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/3/17 10:57 下午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import abc
+from typing import Dict, List, Tuple
+
+
+class BasePipeline(metaclass=abc.ABCMeta):
+    """
+    pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等
+    """
+
+    @abc.abstractmethod
+    def save_items(self, table, items: List[Dict]) -> bool:
+        """
+        保存数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+
+        Returns: 是否保存成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+
+        return True
+
+    def update_items(self, table, items: List[Dict], update_keys: Tuple = None) -> bool:
+        """
+        更新数据, 与UpdateItem配合使用,若爬虫中没使用UpdateItem,则可不实现此接口
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+            update_keys: 更新的字段, 如 ("title", "publish_time")
+
+        Returns: 是否更新成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+
+        return True
+
+    def close(self):
+        """
+        关闭,爬虫结束时调用
+        Returns:
+
+        """
+        pass
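A minimal sketch of a custom pipeline built against this interface; the CSV target and field handling are illustrative only, and the class still has to be registered in `ITEM_PIPELINES` to take effect:

```python
import csv
from typing import Dict, List, Tuple

from feapder.pipelines import BasePipeline


class CsvPipeline(BasePipeline):
    """Illustrative pipeline: appends each exported batch to <table>.csv."""

    def save_items(self, table, items: List[Dict]) -> bool:
        try:
            with open(f"{table}.csv", "a", newline="", encoding="utf8") as f:
                writer = csv.DictWriter(f, fieldnames=list(items[0].keys()))
                if f.tell() == 0:       # write a header only for a fresh file
                    writer.writeheader()
                writer.writerows(items)
            return True                 # batch exported; feapder may dedup it
        except Exception:
            return False                # failed batches are kept for re-export
```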

+ 47 - 0
FworkSpider/feapder/pipelines/console_pipeline.py

@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/3/18 12:39 上午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+from feapder.pipelines import BasePipeline
+from typing import Dict, List, Tuple
+
+
+class ConsolePipeline(BasePipeline):
+    """
+    pipeline 是单线程的,批量保存数据的操作,不建议在这里写网络请求代码,如下载图片等
+    """
+
+    def save_items(self, table, items: List[Dict]) -> bool:
+        """
+        保存数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+
+        Returns: 是否保存成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+
+        return True
+
+    def update_items(self, table, items: List[Dict], update_keys: Tuple = None) -> bool:
+        """
+        更新数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+            update_keys: 更新的字段, 如 ("title", "publish_time")
+
+        Returns: 是否更新成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+
+        return True

+ 84 - 0
FworkSpider/feapder/pipelines/mongo_pipeline.py

@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-04-18 14:12:21
+---------
+@summary: 导出数据
+---------
+@author: Mkdir700
+@email:  mkdir700@gmail.com
+"""
+from typing import Dict, List, Tuple
+
+from feapder.db.mongodb import MongoDB
+from feapder.pipelines import BasePipeline
+from feapder.utils.log import log
+
+
+class MongoPipeline(BasePipeline):
+    def __init__(self):
+        self._to_db = None
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+
+        return self._to_db
+
+    def save_items(self, table, items: List[Dict]) -> bool:
+        """
+        保存数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+
+        Returns: 是否保存成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+        try:
+            add_count = self.to_db.add_batch(coll_name=table, datas=items)
+            datas_size = len(items)
+            log.info(
+                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
+                % (datas_size, table, add_count, datas_size - add_count)
+            )
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False
+
+    def update_items(self, table, items: List[Dict], update_keys: Tuple = None) -> bool:
+        """
+        更新数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+            update_keys: 更新的字段, 如 ("title", "publish_time")
+
+        Returns: 是否更新成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+        try:
+            add_count = self.to_db.add_batch(
+                coll_name=table,
+                datas=items,
+                update_columns=update_keys or list(items[0].keys()),
+            )
+            datas_size = len(items)
+            update_count = datas_size - add_count
+            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
+                datas_size,
+                table,
+                add_count,
+                update_count,
+            )
+            if update_keys:
+                msg += " 更新字段为 {}".format(update_keys)
+            log.info(msg)
+
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False

+ 74 - 0
FworkSpider/feapder/pipelines/mysql_pipeline.py

@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-07-29 22:48:30
+---------
+@summary: 导出数据
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+from typing import Dict, List, Tuple
+
+import feapder.utils.tools as tools
+from feapder.db.mysqldb import MysqlDB
+from feapder.pipelines import BasePipeline
+from feapder.utils.log import log
+
+
+class MysqlPipeline(BasePipeline):
+    def __init__(self):
+        self._to_db = None
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MysqlDB()
+
+        return self._to_db
+
+    def save_items(self, table, items: List[Dict]) -> bool:
+        """
+        保存数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+
+        Returns: 是否保存成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+
+        sql, datas = tools.make_batch_sql(table, items)
+        add_count = self.to_db.add_batch(sql, datas)
+        datas_size = len(datas)
+        if add_count:
+            log.info(
+                "共导出 %s 条数据 到 %s, 重复 %s 条" % (datas_size, table, datas_size - add_count)
+            )
+
+        return add_count is not None
+
+    def update_items(self, table, items: List[Dict], update_keys: Tuple = None) -> bool:
+        """
+        更新数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+            update_keys: 更新的字段, 如 ("title", "publish_time")
+
+        Returns: 是否更新成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+
+        sql, datas = tools.make_batch_sql(
+            table, items, update_columns=update_keys or list(items[0].keys())
+        )
+        update_count = self.to_db.add_batch(sql, datas)
+        if update_count:
+            msg = "共更新 %s 条数据 到 %s" % (update_count // 2, table)
+            if update_keys:
+                msg += " 更新字段为 {}".format(update_keys)
+            log.info(msg)
+
+        return update_count is not None

+ 17 - 0
FworkSpider/feapder/requirements.txt

@@ -0,0 +1,17 @@
+better-exceptions>=0.2.2
+DBUtils>=2.0
+parsel>=1.5.2
+PyExecJS>=1.5.1
+pymongo>=3.10.1
+PyMySQL>=0.9.3
+redis>=2.10.6
+requests>=2.22.0
+selenium>=3.141.0
+bs4>=0.0.1
+ipython>=7.14.0
+bitarray>=1.5.3
+redis-py-cluster>=2.1.0
+cryptography>=3.3.2
+urllib3>=1.25.8
+loguru>=0.5.3
+influxdb>=5.3.1

+ 172 - 0
FworkSpider/feapder/setting.py

@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+"""爬虫配置文件"""
+import os
+
+# redis 表名
+# 任务表模版
+TAB_REQUSETS = "{redis_key}:z_requsets"
+# 任务失败模板
+TAB_FAILED_REQUSETS = "{redis_key}:z_failed_requsets"
+# 数据保存失败模板
+TAB_FAILED_ITEMS = "{redis_key}:s_failed_items"
+# 爬虫状态表模版
+TAB_SPIDER_STATUS = "{redis_key}:z_spider_status"
+# 爬虫时间记录表
+TAB_SPIDER_TIME = "{redis_key}:h_spider_time"
+
+# MYSQL
+MYSQL_IP = os.getenv("MYSQL_IP")
+MYSQL_PORT = int(os.getenv("MYSQL_PORT", 3306))
+MYSQL_DB = os.getenv("MYSQL_DB")
+MYSQL_USER_NAME = os.getenv("MYSQL_USER_NAME")
+MYSQL_USER_PASS = os.getenv("MYSQL_USER_PASS")
+
+# MONGODB
+MONGO_IP = os.getenv("MONGO_IP", "localhost")
+MONGO_PORT = int(os.getenv("MONGO_PORT", 27017))
+MONGO_DB = os.getenv("MONGO_DB")
+MONGO_USER_NAME = os.getenv("MONGO_USER_NAME")
+MONGO_USER_PASS = os.getenv("MONGO_USER_PASS")
+
+# REDIS
+# ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
+REDISDB_IP_PORTS = os.getenv("REDISDB_IP_PORTS")
+REDISDB_USER_PASS = os.getenv("REDISDB_USER_PASS")
+REDISDB_DB = int(os.getenv("REDISDB_DB", 0))
+# 适用于redis哨兵模式
+REDISDB_SERVICE_NAME = os.getenv("REDISDB_SERVICE_NAME")
+
+# 数据入库的pipeline,可自定义,默认MysqlPipeline
+ITEM_PIPELINES = [
+    "feapder.pipelines.mysql_pipeline.MysqlPipeline",
+    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
+]
+EXPORT_DATA_MAX_FAILED_TIMES = 10  # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
+EXPORT_DATA_MAX_RETRY_TIMES = 10  # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
+
+# 爬虫相关
+# COLLECTOR
+COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
+COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
+
+# SPIDER
+SPIDER_THREAD_COUNT = 1  # 爬虫并发数
+SPIDER_SLEEP_TIME = (
+    0
+)  # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
+SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
+SPIDER_MAX_RETRY_TIMES = 100  # 每个请求最大重试次数
+SPIDER_AUTO_START_REQUESTS = (
+    True
+)  # 是否主动执行添加 设置为False 需要手动调用start_monitor_task,适用于多进程情况下
+KEEP_ALIVE = False  # 爬虫是否常驻
+
+# 浏览器渲染
+WEBDRIVER = dict(
+    pool_size=1,  # 浏览器的数量
+    load_images=True,  # 是否加载图片
+    user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
+    proxy=None,  # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
+    headless=False,  # 是否为无头浏览器
+    driver_type="CHROME",  # CHROME、PHANTOMJS、FIREFOX
+    timeout=30,  # 请求超时时间
+    window_size=(1024, 800),  # 窗口大小
+    executable_path=None,  # 浏览器路径,默认为默认路径
+    render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
+    custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
+)
+
+# 爬虫启动时,重新抓取失败的requests
+RETRY_FAILED_REQUESTS = False
+# 保存失败的request
+SAVE_FAILED_REQUEST = True
+# request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
+REQUEST_LOST_TIMEOUT = 600  # 10分钟
+# request网络请求超时时间
+REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组
+
+# 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求
+RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
+RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
+RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
+
+# redis 存放item与request的根目录
+REDIS_KEY = ""
+# 爬虫启动时删除的key,类型: 元组/bool/string。 支持正则; 常用于清空任务队列,否则重启时会断点续爬
+DELETE_KEYS = []
+
+# 设置代理
+PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
+PROXY_ENABLE = True
+
+# 随机headers
+RANDOM_HEADERS = True
+# UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型
+USER_AGENT_TYPE = "chrome"
+# 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
+DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
+# requests 使用session
+USE_SESSION = False
+
+# 去重
+ITEM_FILTER_ENABLE = False  # item 去重
+ITEM_FILTER_SETTING = dict(
+    filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+)
+REQUEST_FILTER_ENABLE = False  # request 去重
+REQUEST_FILTER_SETTING = dict(
+    filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+    expire_time=2592000,  # 过期时间1个月
+)
+
+# 报警 支持钉钉、企业微信、邮件
+# 钉钉报警
+DINGDING_WARNING_URL = ""  # 钉钉机器人api
+DINGDING_WARNING_PHONE = ""  # 报警人 支持列表,可指定多个
+DINGDING_WARNING_ALL = False  # 是否提示所有人, 默认为False
+# 邮件报警
+EMAIL_SENDER = ""  # 发件人
+EMAIL_PASSWORD = ""  # 授权码
+EMAIL_RECEIVER = ""  # 收件人 支持列表,可指定多个
+EMAIL_SMTPSERVER = "smtp.163.com"  # 邮件服务器 默认为163邮箱
+# 企业微信报警
+WECHAT_WARNING_URL = ""  # 企业微信机器人api
+WECHAT_WARNING_PHONE = ""  # 报警人 将会在群内@此人, 支持列表,可指定多人
+WECHAT_WARNING_ALL = False  # 是否提示所有人, 默认为False
+# 时间间隔
+WARNING_INTERVAL = 3600  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
+WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
+WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
+
+LOG_NAME = os.path.basename(os.getcwd())
+LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
+LOG_LEVEL = "DEBUG"
+LOG_COLOR = True  # 是否带有颜色
+LOG_IS_WRITE_TO_CONSOLE = True  # 是否打印到控制台
+LOG_IS_WRITE_TO_FILE = False  # 是否写文件
+LOG_MODE = "w"  # 写文件的模式
+LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
+LOG_BACKUP_COUNT = 20  # 日志文件保留数量
+LOG_ENCODING = "utf8"  # 日志文件编码
+OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
+
+# 打点监控 influxdb 配置
+INFLUXDB_HOST = os.getenv("INFLUXDB_HOST", "localhost")
+INFLUXDB_PORT = int(os.getenv("INFLUXDB_PORT", 8086))
+INFLUXDB_UDP_PORT = int(os.getenv("INFLUXDB_UDP_PORT", 8089))
+INFLUXDB_USER = os.getenv("INFLUXDB_USER")
+INFLUXDB_PASSWORD = os.getenv("INFLUXDB_PASSWORD")
+INFLUXDB_DATABASE = os.getenv("INFLUXDB_DB")
+# 监控数据存储的表名,爬虫管理系统上会以task_id命名
+INFLUXDB_MEASUREMENT = "task_" + os.getenv("TASK_ID") if os.getenv("TASK_ID") else None
+# 打点监控其他参数,若这里也配置了influxdb的参数, 则会覆盖外面的配置
+METRICS_OTHER_ARGS = dict(retention_policy_duration="180d", emit_interval=60)
+
+############# 导入用户自定义的setting #############
+try:
+    from setting import *
+
+    # 兼容老版本的配置
+    KEEP_ALIVE = not AUTO_STOP_WHEN_SPIDER_DONE
+except:
+    pass
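Because of the `from setting import *` hook above, a project-level `setting.py` only needs to declare the values it wants to override; everything else keeps the defaults defined in this file. A minimal sketch (the connection values below are placeholders, not recommendations):

```python
# setting.py at the project root -- only the overrides need to be listed
REDISDB_IP_PORTS = "localhost:6379"
REDISDB_DB = 0

MONGO_IP = "localhost"
MONGO_DB = "py_spider"

# switch the export pipeline from MySQL to MongoDB
ITEM_PIPELINES = ["feapder.pipelines.mongo_pipeline.MongoPipeline"]

LOG_LEVEL = "INFO"
KEEP_ALIVE = True  # keep the worker alive instead of exiting when idle
```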

+ 22 - 0
FworkSpider/feapder/templates/air_spider_template.tmpl

@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+"""
+Created on {DATE}
+---------
+@summary:
+---------
+@author: {USER}
+"""
+
+import feapder
+
+
+class ${spider_name}(feapder.AirSpider):
+    def start_requests(self):
+        yield feapder.Request("https://www.baidu.com")
+
+    def parse(self, request, response):
+        print(response)
+
+
+if __name__ == "__main__":
+    ${spider_name}().start()

+ 45 - 0
FworkSpider/feapder/templates/batch_spider_template.tmpl

@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+"""
+Created on {DATE}
+---------
+@summary:
+---------
+@author: {USER}
+"""
+
+import feapder
+
+
+class ${spider_name}(feapder.BatchSpider):
+    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
+    __custom_setting__ = dict(
+        REDISDB_IP_PORTS="localhost:6379",
+        REDISDB_USER_PASS="",
+        REDISDB_DB=0,
+        MYSQL_IP="localhost",
+        MYSQL_PORT=3306,
+        MYSQL_DB="feapder",
+        MYSQL_USER_NAME="feapder",
+        MYSQL_USER_PASS="feapder123",
+    )
+
+    def start_requests(self, task):
+        yield feapder.Request("https://www.baidu.com")
+
+    def parse(self, request, response):
+        print(response)
+
+
+if __name__ == "__main__":
+    spider = ${spider_name}(
+        redis_key="xxx:xxxx",  # redis中存放任务等信息的根key
+        task_table="",  # mysql中的任务表
+        task_keys=["id", "xxx"],  # 需要获取任务表里的字段名,可添加多个
+        task_state="state",  # mysql中任务状态字段
+        batch_record_table="xxx_batch_record",  # mysql中的批次记录表
+        batch_name="xxx(周全)",  # 批次名字
+        batch_interval=7,  # 批次周期 天为单位 若为小时 可写 1 / 24
+    )
+
+    # spider.start_monitor_task() # 下发及监控任务
+    spider.start() # 采集

+ 105 - 0
FworkSpider/feapder/templates/detail_template.tmpl

@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+"""
+Created on {DATE}
+---------
+@summary:  ${spider_name}
+---------
+@author: {USER}
+"""
+import sys
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
+import time
+from urllib.parse import urljoin
+
+import feapder
+from feapder.utils.tools import wechat_warning
+import execjs
+from items.spider_item import DataBakItem, MgpListItem
+from feapder.db.mongodb import MongoDB
+
+
+
+class ${spider_name}(feapder.Spider):
+    _to_db = None
+    db_name = 'mgp_list'
+    send_list = []
+    # 定义mongo链接
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+    def start_requests(self):
+        while True:
+            data_list = self.to_db.find(self.db_name, {"parser_name": "${spider_name}"}, sort={"failed": -1}, limit=50)
+            for item in data_list:
+                request_params = item.get("request_params")
+
+                '''可自定义'''
+
+                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
+                                      deal_detail=item.get("deal_detail"),**request_params,
+                                      callback=eval(item.get("parse")),base_info=item,proxies=item.get("proxies"))
+                self.to_db.delete(self.db_name,item)
+            break
+
+    def detail_get(self,request,response):
+        '''需自定义解析规则'''
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key,items[key])
+        html = ''
+        # for xpath in request.deal_detail:
+        #    html = response.xpath(xpath).extract_first()
+        #    if html is not None:
+        #        break
+
+        list_item.contenthtml = html
+        # if request.files:
+        #     files_info = request.files
+        #     files =  response.xpath(files_info.get("xpath")).extract()
+        #     for file_url in files:
+        #         if files_info.get("host"):
+        #             file_url = urljoin(files_info.get("host"), file_url)
+        #         if file_url.split(".")[-1] in files_info.get("other_files"):
+        #             continue
+        yield list_item
+
+
+
+    def failed_request(self, request, response):
+        '''请求、解析次数超过上限后,将原信息重新保存至mongo,并修改failed字段'''
+        mgp = MgpListItem()
+        items = request.base_info
+        for key in items:
+            mgp.__setitem__(key,items[key])
+        mgp.failed += 1
+        print(f'......{mgp.failed}')
+        if mgp.pri > 5:
+            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
+                    '''
+                    根据爬虫优先级报警'''
+                    info= f'''`
+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
+        > **爬虫名称:** {mgp.item.get("site")}
+        > **栏目名称:** {mgp.item.get("channel")}
+        > **爬虫代码:** {mgp.item.get("spidercode")}
+        > **爬虫等级:** {mgp.pri}
+        > **所属管理人员:** {mgp.author}
+        请登录剑鱼爬虫管理平台查看详情。
+        `'''
+                    wechat_warning(info)
+                    self.send_list.append(mgp.item.get("site"))
+        yield mgp
+
+
+    def end_callback(self):
+        print("爬虫结束")
+
+
+
+if __name__ == "__main__":
+    ${spider_name}(redis_key="fwork:details1").start()
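For context, the tasks this template pulls from the `mgp_list` collection are documents shaped roughly like the sketch below; the field names mirror the `item.get(...)` calls in `start_requests`, and the values are illustrative:

```python
# Illustrative mgp_list task document (values invented for the example)
task = {
    "parser_name": "my_detail_spider",   # routes the task to this spider
    "parse": "self.detail_get",          # callback name resolved via eval()
    "parse_url": "http://example.com/notice/1.html",
    "deal_detail": ['//div[@class="content"]'],   # xpaths tried in order
    "request_params": {},                # extra kwargs passed to feapder.Request
    "proxies": False,
    "failed": 0,                         # retry counter used by failed_request()
    "item": {"title": "...", "href": "http://example.com/notice/1.html"},
}
```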

+ 22 - 0
FworkSpider/feapder/templates/item_template.tmpl

@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+"""
+Created on {DATE}
+---------
+@summary:
+---------
+@author: {USER}
+"""
+
+from feapder import Item
+
+
+class ${item_name}Item(Item):
+    """
+    This class was generated by feapder.
+    command: feapder create -i ${table_name}.
+    """
+
+    __table_name__ = "${table_name}"
+
+    def __init__(self, *args, **kwargs):
+        ${propertys}
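For reference, running `feapder create -i <table_name>` against a hypothetical `news_article` table would fill this template roughly as follows (the column names are illustrative):

```python
from feapder import Item


class NewsArticleItem(Item):
    """Illustrative output of `feapder create -i news_article`."""

    __table_name__ = "news_article"

    def __init__(self, *args, **kwargs):
        # one attribute per table column, generated in place of ${propertys}
        self.title = None
        self.url = None
        self.publish_time = None
```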

+ 49 - 0
FworkSpider/feapder/templates/project_template/CHECK_DATA.md

@@ -0,0 +1,49 @@
+# 数据审核 
+## 表说明:
+
+> 表名 含义(更新策略)
+
+## 一、准确性
+
+**字段设计是否满足需求? 表之间的关联字段是否满足要求? (需要人工检查)**
+
+> 注意:是否设计了自增 id,id 的类型是否设置为 bigint?
+> 注意:unique index 是否需要设计?
+> 注意:各张表之间是否需要设计关联字段;
+
+* [ ] 是
+* [ ] 否
+
+**各字段采集内容及存储格式是否满足要求?是否与网页一致?是否有信息缺失?**
+
+> 备注:可尝试对每个字段进行升降序排列,然后抽样检查;
+     
+**是否考虑了网站同一类数据可能出现的数据格式不一致情况?**
+
+> 建议:代码对各个字段不做兼容性处理、数据不一致则抛出异常并记录 
+
+* [ ] 是
+* [ ] 否
+
+## 二、全量性
+
+**如果是增量采集,是否最早信息和最晚信息都采集了,同时条目总数是否正确;**
+**如果是批次采集,是否每个批次都有?**
+
+>备注:需要去网页端评估单个批次的总量;
+>参考sql语句:SELECT count(1), batch_date from [table_name] GROUP BY batch_date;
+
+**如果与另外一张表有关联关系,是否信息关联完整?**
+
+## 三、稳定性
+
+* [ ] 是否能够长期稳定采集? 
+* [ ] 是否加IP代理?
+* [ ] 是否支持断点续跑?
+* [ ] 是否能确保按时启动,定期采集?
+* [ ] 是否已开启报警? 
+
+## 四、采集频次、类型、存储方式
+
+* [ ] 采集频次是否满足要求?
+* [ ] 采集类型是否满足要求:增量采集 or 批次采集? 

+ 8 - 0
FworkSpider/feapder/templates/project_template/README.md

@@ -0,0 +1,8 @@
+# xxx爬虫文档
+## 调研
+
+## 数据库设计
+
+## 爬虫逻辑
+
+## 项目架构

+ 0 - 0
FworkSpider/feapder/templates/project_template/items/__init__.py


+ 79 - 0
FworkSpider/feapder/templates/project_template/main.py

@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+"""
+Created on {DATE}
+---------
+@summary: 爬虫入口
+---------
+@author: {USER}
+"""
+
+from feapder import ArgumentParser
+
+from spiders import *
+
+def crawl_xxx():
+    """
+    AirSpider爬虫
+    """
+    spider = xxx.XXXSpider()
+    spider.start()
+
+def crawl_xxx():
+    """
+    Spider爬虫
+    """
+    spider = xxx.XXXSpider(redis_key="xxx:xxx")
+    spider.start()
+
+
+def crawl_xxx(args):
+    """
+    BatchSpider爬虫
+    """
+    spider = xxx_spider.XXXSpider(
+        task_table="",  # mysql中的任务表
+        batch_record_table="",  # mysql中的批次记录表
+        batch_name="xxx(周全)",  # 批次名字
+        batch_interval=7,  # 批次时间 天为单位 若为小时 可写 1 / 24
+        task_keys=["id", "xxx"],  # 需要获取任务表里的字段名,可添加多个
+        redis_key="xxx:xxxx",  # redis中存放request等信息的根key
+        task_state="state",  # mysql中任务状态字段
+    )
+
+    if args == 1:
+        spider.start_monitor_task()
+    elif args == 2:
+        spider.start()
+    elif args == 3:
+        spider.init_task()
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="xxx爬虫")
+
+    parser.add_argument(
+        "--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx
+    )
+    parser.add_argument(
+        "--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx
+    )
+    parser.add_argument(
+        "--crawl_xxx",
+        type=int,
+        nargs=1,
+        help="xxx爬虫",
+        choices=[1, 2, 3],
+        function=crawl_xxx,
+    )
+
+    parser.start()
+
+    # main.py作为爬虫启动的统一入口,提供命令行的方式启动多个爬虫,若只有一个爬虫,可不编写main.py
+    # 将上面的xxx修改为自己实际的爬虫名
+    # 查看运行命令 python main.py --help
+    # AirSpider与Spider爬虫运行方式 python main.py --crawl_xxx
+    # BatchSpider运行方式
+    # 1. 下发任务:python main.py --crawl_xxx 1
+    # 2. 采集:python main.py --crawl_xxx 2
+    # 3. 重置任务:python main.py --crawl_xxx 3
+

+ 137 - 0
FworkSpider/feapder/templates/project_template/setting.py

@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+"""爬虫配置文件"""
+# import os
+# import sys
+#
+# # MYSQL
+# MYSQL_IP = "localhost"
+# MYSQL_PORT = 3306
+# MYSQL_DB = ""
+# MYSQL_USER_NAME = ""
+# MYSQL_USER_PASS = ""
+#
+# # MONGODB
+# MONGO_IP = "localhost"
+# MONGO_PORT = 27017
+# MONGO_DB = ""
+# MONGO_USER_NAME = ""
+# MONGO_USER_PASS = ""
+#
+# # REDIS
+# # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
+# REDISDB_IP_PORTS = "localhost:6379"
+# REDISDB_USER_PASS = ""
+# REDISDB_DB = 0
+# # 适用于redis哨兵模式
+# REDISDB_SERVICE_NAME = ""
+#
+# # 数据入库的pipeline,可自定义,默认MysqlPipeline
+# ITEM_PIPELINES = [
+#     "feapder.pipelines.mysql_pipeline.MysqlPipeline",
+#     # "feapder.pipelines.mongo_pipeline.MongoPipeline",
+# ]
+# EXPORT_DATA_MAX_FAILED_TIMES = 10 # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
+# EXPORT_DATA_MAX_RETRY_TIMES = 10 # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
+#
+# # 爬虫相关
+# # COLLECTOR
+# COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
+# COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
+#
+# # SPIDER
+# SPIDER_THREAD_COUNT = 1  # 爬虫并发数
+# SPIDER_SLEEP_TIME = 0  # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
+# SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
+# SPIDER_MAX_RETRY_TIMES = 100  # 每个请求最大重试次数
+# KEEP_ALIVE = False  # 爬虫是否常驻
+#
+# # 浏览器渲染
+# WEBDRIVER = dict(
+#     pool_size=1,  # 浏览器的数量
+#     load_images=True,  # 是否加载图片
+#     user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
+#     proxy=None,  # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
+#     headless=False,  # 是否为无头浏览器
+#     driver_type="CHROME",  # CHROME、PHANTOMJS、FIREFOX
+#     timeout=30,  # 请求超时时间
+#     window_size=(1024, 800),  # 窗口大小
+#     executable_path=None,  # 浏览器路径,默认为默认路径
+#     render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
+#     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
+# )
+#
+# # 爬虫启动时,重新抓取失败的requests
+# RETRY_FAILED_REQUESTS = False
+# # 保存失败的request
+# SAVE_FAILED_REQUEST = True
+# # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
+# REQUEST_LOST_TIMEOUT = 600  # 10分钟
+# # request网络请求超时时间
+# REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组
+#
+# # 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求
+# RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
+# RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
+# RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
+#
+# # 设置代理
+# PROXY_EXTRACT_API = None  # 代理提取API ,返回的代理分割符为\r\n
+# PROXY_ENABLE = True
+#
+# # 随机headers
+# RANDOM_HEADERS = True
+# # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型
+# USER_AGENT_TYPE = "chrome"
+# # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
+# DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
+# # requests 使用session
+# USE_SESSION = False
+#
+# # 去重
+# ITEM_FILTER_ENABLE = False  # item 去重
+# REQUEST_FILTER_ENABLE = False  # request 去重
+# ITEM_FILTER_SETTING = dict(
+#     filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+# )
+# REQUEST_FILTER_ENABLE = False  # request 去重
+# REQUEST_FILTER_SETTING = dict(
+#     filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+#     expire_time=2592000,  # 过期时间1个月
+# )
+#
+# # 报警 支持钉钉、企业微信、邮件
+# # 钉钉报警
+# DINGDING_WARNING_URL = ""  # 钉钉机器人api
+# DINGDING_WARNING_PHONE = ""  # 报警人 支持列表,可指定多个
+# DINGDING_WARNING_ALL = False # 是否提示所有人, 默认为False
+# # 邮件报警
+# EMAIL_SENDER = ""  # 发件人
+# EMAIL_PASSWORD = ""  # 授权码
+# EMAIL_RECEIVER = ""  # 收件人 支持列表,可指定多个
+# EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱
+# # 企业微信报警
+# WECHAT_WARNING_URL = ""  # 企业微信机器人api
+# WECHAT_WARNING_PHONE = ""  # 报警人 将会在群内@此人, 支持列表,可指定多人
+# WECHAT_WARNING_ALL = False  # 是否提示所有人, 默认为False
+# # 时间间隔
+# WARNING_INTERVAL = 3600  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
+# WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
+# WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
+#
+# LOG_NAME = os.path.basename(os.getcwd())
+# LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
+# LOG_LEVEL = "DEBUG"
+# LOG_COLOR = True  # 是否带有颜色
+# LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
+# LOG_IS_WRITE_TO_FILE = False  # 是否写文件
+# LOG_MODE = "w"  # 写文件的模式
+# LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
+# LOG_BACKUP_COUNT = 20  # 日志文件保留数量
+# LOG_ENCODING = "utf8"  # 日志文件编码
+# OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
+#
+# # 切换工作路径为当前项目路径
+# project_path = os.path.abspath(os.path.dirname(__file__))
+# os.chdir(project_path)  # 切换工作路经
+# sys.path.insert(0, project_path)
+# print('当前工作路径为 ' + os.getcwd())
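In practice a project-level setting.py keeps only the blocks it needs uncommented; a minimal sketch with placeholder values (not part of this template):

# setting.py -- illustrative sketch, all values are placeholders
import os
import sys

MONGO_IP = "127.0.0.1"
MONGO_PORT = 27017
MONGO_DB = "spider_db"

REDISDB_IP_PORTS = "127.0.0.1:6379"
REDISDB_USER_PASS = ""
REDISDB_DB = 0

LOG_NAME = os.path.basename(os.getcwd())
LOG_PATH = "log/%s.log" % LOG_NAME
LOG_LEVEL = "INFO"

# switch the working directory to the project path
project_path = os.path.abspath(os.path.dirname(__file__))
os.chdir(project_path)
sys.path.insert(0, project_path)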

+ 0 - 0
FworkSpider/feapder/templates/project_template/spiders/__init__.py


+ 88 - 0
FworkSpider/feapder/templates/spider_list_template.tmpl

@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+"""
+Created on {DATE}
+---------
+@summary: ${spider_name}
+---------
+@author: {USER}
+"""
+import sys
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
+import feapder
+from items.spider_item import DataBakItem,MgpListItem,ListItem
+from feapder.dedup import Dedup
+from collections import namedtuple
+
+
+class ${spider_name}(feapder.Spider):
+
+    def start_callback(self):
+         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
+
+         self.menus = [
+             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
+             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "Notice", 1),
+         ]
+    def start_requests(self):
+         for menu in self.menus:
+             for page in range(1,menu.crawl_page+1):
+                 start_url = f''
+                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
+
+    def parse(self, request, response):
+        menu = request.item
+        dedup = Dedup(Dedup.BloomFilter)
+        href_list = []
+        info_list = []
+        for info in info_list:
+            href = ''
+            title = ''
+            create_time = ''
+
+            data_item = DataBakItem()  # 存储数据的管道
+            data_item.href = href  # 标书链接
+            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
+            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
+            data_item.title = title  # 标题
+            data_item.publishtime = create_time  # 标书发布时间
+            data_item.site = "*******记得编辑平台名称"
+            data_item.area = "全国"  # 城市默认:全国
+            data_item.city = ""  # 城市 默认为空
+            ss = dedup.filter_exist_data([href])
+            if ss == []:
+                continue
+            list_item =  MgpListItem()
+            list_item.parse = "self.detail_get"
+            list_item.parser_name = "details"
+            list_item.item = data_item.to_dict
+            list_item.deal_detail = ['//div[@class="****"]',"*****"]
+            list_item.proxies = False
+            list_item.parse_url = href
+            list_item.pri = 1
+            list_item.files = {
+                "list_xpath": '//div[@class="notice-foot"]/a',
+                "url_xpath": './@href',
+                "name_xpath": './text()',
+                "files_type": ('zip', 'docx', 'ftp'),
+                "file_type": 'zip',
+                "url_key": 'attachmentDownload',
+                # "host": 'http',
+                "kwargs": {"headers": {
+                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
+                }}
+            }
+            href_list.append(href)
+            yield list_item
+        list_info = ListItem()  # renamed to avoid shadowing the built-in list
+        list_info.site = self.site
+        list_info.channel = menu.get("channel")
+        list_info.spidercode = menu.get("code")
+        list_info.url = request.url
+        list_info.count = len(info_list)
+        list_info.rel_count = len(href_list)
+        dedup.add(href_list)
+        yield list_info
+
+    def end_callback(self):
+        print("爬虫结束")
+
+if __name__ == "__main__":
+    ${spider_name}(redis_key="{USER}:${spider_name}").start()

+ 67 - 0
FworkSpider/feapder/templates/spider_template.tmpl

@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+Created on {DATE}
+---------
+@summary:
+---------
+@author: {USER}
+"""
+
+import feapder
+from items.spider_item import DataBakItem
+from untils.proxy_pool import ProxyPool
+from feapder.dedup import Dedup
+from collections import namedtuple
+
+
+class ${spider_name}(feapder.Spider):
+    # 自定义数据库,若项目中有setting.py文件,此自定义可删除
+    def start_callback(self):
+         self.count = 0
+         self.prox_pool = ProxyPool()
+         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
+
+         self.menus = [
+             Menu('${spider_name}', '${spider_name}', "Notice", 1),
+             Menu('${spider_name}', '${spider_name}', "Notice", 1),
+         ]
+    def start_requests(self):
+         for menu in self.menus:
+            start_url = f''
+            yield feapder.Request(url=start_url, item=menu._asdict())
+
+    def parse(self, request, response):
+        menu = request.item
+        self.count += 1   # 一个计数器
+        info_list = []
+        for info in info_list:
+            href = ''
+            title = ''
+            create_time = ''
+
+            list_item = DataBakItem()  # 存储数据的管道
+            list_item.href = href  # 标书链接
+            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
+            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
+            list_item.title = title  # 标题
+            list_item.publishtime = create_time  # 标书发布时间
+
+            list_item.site = "#######记得编辑平台名称"
+            list_item.area = "全国"  # 城市默认:全国
+            list_item.city = ""  # 城市 默认为空
+            dedup = Dedup(Dedup.BloomFilter)
+            ss = dedup.filter_exist_data([href])
+            if ss == []:
+                continue
+            yield feapder.Request(href, callback=self.detail, item=list_item)
+    def detail(self,request,response):
+        list_item = request.item
+        html = response.xpath("//div[@class='']").extract_first()  # 标书详细内容
+        list_item.contenthtml = html
+        yield list_item
+
+    def end_callback(self):
+        print("爬虫结束")
+        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
+
+    def download_midware(self, request):
+        request.proxies = self.prox_pool.get()
+        return request
+
+if __name__ == "__main__":
+    ${spider_name}(redis_key="{USER}:${spider_name}").start()

+ 9 - 0
FworkSpider/feapder/utils/__init__.py

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+'''
+Created on 2019/11/5 4:41 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+'''

+ 168 - 0
FworkSpider/feapder/utils/aliyun.py

@@ -0,0 +1,168 @@
+import hashlib
+import os
+import traceback
+import oss2
+import requests
+from feapder import setting
+import time
+
+class UploadOSS:
+    """阿里云 oss"""
+
+    def __init__(self):
+        oss_conf = setting.oss_
+        self.file_path: str = ""
+        self.file_stream: bytes = b''
+        self.__acc_key_id = oss_conf['key_id']
+        self.__acc_key_secret = oss_conf['key_secret']
+        self.__endpoint = oss_conf['endpoint']
+        self.__bucket_name = oss_conf['bucket_name']
+
+    @property
+    def fid(self):
+        """
+        文本摘要值
+
+        @return: 十六进制摘要值
+        """
+        sha1 = hashlib.sha1()
+        sha1.update(str(self.file_stream).encode("utf-8"))
+        return sha1.hexdigest()
+
+    @property
+    def file_size(self):
+        """
+        文件的大小,将字节(bytes)转化(kb/M/G单位)
+
+        @return: 文件大小
+        """
+        try:
+            size = os.path.getsize(self.file_path)
+        except Exception:
+            traceback.print_exc()
+        else:
+            try:
+                _kb = float(size) / 1024
+            except:
+                return "Error"
+            else:
+                if _kb >= 1024:
+                    _M = _kb / 1024
+                    if _M >= 1024:
+                        _G = _M / 1024
+                        return "{:.1f} G".format(_G)
+                    else:
+                        return "{:.1f} M".format(_M)
+                else:
+                    return "{:.1f} kb".format(_kb)
+
+    def get_state(self, attachment,count=0, **kwargs):
+        """
+        下载附件并上传阿里oss
+
+        @param attachment: 附件
+        @return: 附件处理结果
+        """
+        request_params = {
+            'headers': setting.headers,
+            'timeout': 20,
+            'stream': True,
+            **kwargs
+        }
+        with requests.get(attachment["org_url"], **request_params) as req:
+            if req.status_code == 200:
+                self.file_stream = req.content
+                # img_dir = "file"
+                img_dir = f"file/{attachment['channel']}"
+                # 文件夹不存在则创建文件夹
+                if not os.path.exists(img_dir):
+                    os.makedirs(img_dir, mode=0o777, exist_ok=True)
+                # 打开目录,放入下载的附件
+                filename = hashlib.md5(attachment["filename"].encode("utf-8"))
+                filename = filename.hexdigest()  # 加密1次
+                types = attachment["ftype"]
+                self.file_path = "{}/{}".format(img_dir, filename + '.' + types)
+                with open(self.file_path, 'wb') as f:
+                    f.write(self.file_stream)
+                # 上传附件
+                self.put_oss_from_local()
+                file_state = self.file_state(attachment)
+                # 删除附件
+                os.remove(self.file_path)
+                # 返回附件上传处理信息
+                return file_state
+            else:
+                if count < 3:
+                    return self.get_state(attachment, count=count + 1, **kwargs)
+                else:
+                    # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
+                    attachment["url"] = 'oss'
+                    attachment["fid"] = self.fid + "." + attachment["ftype"]
+                    attachment["size"] = '0kb'
+                    attachment["false"] = True
+                    return attachment
+    def post_state(self, attachment,count=0, **kwargs):
+        """
+        下载附件并上传阿里oss
+
+        @param attachment: 附件
+        @return: 附件处理结果
+        """
+        request_params = {
+            'headers': setting.headers,
+            'timeout': 20,
+            'stream': True,
+            **kwargs
+        }
+        with requests.post(attachment["org_url"], **request_params) as req:
+            if req.status_code == 200:
+                self.file_stream = req.content
+                img_dir = f"file/{attachment['channel']}"
+                # 文件夹不存在则创建文件夹
+                if not os.path.exists(img_dir):
+                    os.makedirs(img_dir, mode=0o777, exist_ok=True)
+                # 打开目录,放入下载的附件
+                filename = hashlib.md5(attachment["filename"].encode("utf-8"))
+                filename = filename.hexdigest()  # 加密1次
+                types = attachment["ftype"]
+                self.file_path = "{}/{}".format(img_dir, filename + '.' + types)
+
+                with open(self.file_path, 'wb') as f:
+                    f.write(self.file_stream)
+                # 上传附件
+                self.put_oss_from_local()
+                file_state = self.file_state(attachment)
+                # 删除附件
+                # os.remove(self.file_path)
+                # 返回附件上传处理信息
+                return file_state
+            else:
+                if count < 3:
+                    return self.post_state(attachment, count=count + 1, **kwargs)
+                else:
+                    attachment["url"] = 'oss'
+                    attachment["fid"] = self.fid + "." + attachment["ftype"]
+                    attachment["size"] = '0kb'
+                    attachment["false"] = True
+                    return attachment
+
+    def put_oss_from_local(self):
+        """上传一个本地文件到阿里OSS的普通文件"""
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object_from_file(self.fid, self.file_path)
+
+    def file_state(self, attachment):
+        """
+        文件信息
+
+        @param attachment: 附件
+        @return: 附件上传处理信息
+        """
+        # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
+        attachment["url"] = 'oss'
+        attachment["fid"] = self.fid + "." + attachment["ftype"]
+        attachment["size"] = self.file_size
+        return attachment
+
+

+ 63 - 0
FworkSpider/feapder/utils/custom_argparse.py

@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-10-15 14:32:12
+---------
+@summary: 封装ArgumentParser, 使其支持function, 调用start自动执行
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import argparse
+
+
+class ArgumentParser(argparse.ArgumentParser):
+    def __init__(self, *args, **kwargs):
+        self.functions = {}
+
+        super(ArgumentParser, self).__init__(*args, **kwargs)
+
+    def add_argument(self, *args, **kwargs):
+        function = kwargs.pop("function") if "function" in kwargs else None
+        key = self._get_optional_kwargs(*args, **kwargs).get("dest")
+        self.functions[key] = function
+
+        return super(ArgumentParser, self).add_argument(*args, **kwargs)
+
+    def start(self, args=None, namespace=None):
+        args = self.parse_args(args=args, namespace=namespace)
+        for key, value in vars(args).items():  # vars() 函数返回对象object的属性和属性值的字典对象
+            if value not in (None, False):
+                if callable(self.functions[key]):
+                    if value != True:
+                        if isinstance(value, list) and len(value) == 1:
+                            value = value[0]
+                        self.functions[key](value)
+                    else:
+                        self.functions[key]()
+
+    def run(self, args, values=None):
+        if args in self.functions:
+            if values:
+                self.functions[args](values)
+            else:
+                self.functions[args]()
+
+        else:
+            raise Exception(f"无此方法: {args}")
+
+
+if __name__ == "__main__":
+
+    def test():
+        print("test not args func")
+
+    def test2(args):
+        print("test args func", args)
+
+    parser = ArgumentParser(description="测试")
+
+    parser.add_argument("--test2", type=int, nargs=1, help="(1|2)", function=test2)
+    parser.add_argument("--test", action="store_true", help="", function=test)
+
+    parser.start()

+ 93 - 0
FworkSpider/feapder/utils/email_sender.py

@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2020/2/19 12:57 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import os
+import smtplib
+from email.header import Header
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from email.utils import formataddr
+
+from feapder.utils.log import log
+
+
+class EmailSender(object):
+    SENDER = "feapder报警系统"
+
+    def __init__(self, username, password, smtpserver="smtp.163.com"):
+        self.username = username
+        self.password = password
+        self.smtpserver = smtpserver
+        self.smtp_client = smtplib.SMTP_SSL(smtpserver)
+        self.sender = EmailSender.SENDER
+
+    def __enter__(self):
+        self.login()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.quit()
+
+    def quit(self):
+        self.smtp_client.quit()
+
+    def login(self):
+        self.smtp_client.connect(self.smtpserver)
+        self.smtp_client.login(self.username, self.password)
+
+    def send(
+        self,
+        receivers: list,
+        title: str,
+        content: str,
+        content_type: str = "plain",
+        filepath: str = None,
+    ):
+        """
+
+        Args:
+            receivers:
+            title:
+            content:
+            content_type: html / plain
+            filepath:
+
+        Returns:
+
+        """
+        # 创建一个带附件的实例
+        message = MIMEMultipart()
+        message["From"] = formataddr(
+            (self.sender, self.username)
+        )  # 括号里的对应发件人邮箱昵称、发件人邮箱账号
+        message["To"] = formataddr((receivers[0], receivers[0]))  # ",".join(receivers)
+
+        message["Subject"] = Header(title, "utf-8")
+
+        content = MIMEText(content, content_type, "utf-8")
+        message.attach(content)
+
+        # 构造附件
+        if filepath:
+            attach = MIMEText(open(filepath, "rb").read(), "base64", "utf-8")
+            attach.add_header(
+                "content-disposition",
+                "attachment",
+                filename=("utf-8", "", os.path.basename(filepath)),
+            )
+            message.attach(attach)
+
+        msg = message.as_string()
+        # 此处直接发送多个邮箱有问题,改成一个个发送
+        for receiver in receivers:
+            log.debug("发送邮件到 {}".format(receiver))
+            self.smtp_client.sendmail(self.username, receiver, msg)
+        log.debug("邮件发送成功!!!")
+        return True

File diff suppressed because it is too large
+ 6 - 0
FworkSpider/feapder/utils/js/stealth.min.js


+ 265 - 0
FworkSpider/feapder/utils/log.py

@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-12-08 16:50
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import logging
+import os
+import sys
+import time
+from logging.handlers import BaseRotatingHandler
+
+import loguru
+import pymongo
+from better_exceptions import format_exception
+
+import feapder.setting as setting
+
+LOG_FORMAT = "%(threadName)s|%(asctime)s|%(filename)s|%(funcName)s|line:%(lineno)d|%(levelname)s| %(message)s"
+PRINT_EXCEPTION_DETAILS = True
+
+
+class InterceptHandler(logging.Handler):
+    def emit(self, record):
+        # Retrieve context where the logging call occurred, this happens to be in the 6th frame upward
+        logger_opt = loguru.logger.opt(depth=6, exception=record.exc_info)
+        logger_opt.log(record.levelname, record.getMessage())
+
+
+# 重写 RotatingFileHandler 自定义log的文件名
+# 原来 xxx.log xxx.log.1 xxx.log.2 xxx.log.3 文件由近及远
+# 现在 xxx.log xxx1.log xxx2.log  如果backup_count 是2位数时  则 01  02  03 三位数 001 002 .. 文件由近及远
+class RotatingFileHandler(BaseRotatingHandler):
+    def __init__(
+        self, filename, mode="a", max_bytes=0, backup_count=0, encoding=None, delay=0
+    ):
+        BaseRotatingHandler.__init__(self, filename, mode, encoding, delay)
+        self.max_bytes = max_bytes
+        self.backup_count = backup_count
+        self.placeholder = str(len(str(backup_count)))
+        self._to_db = None
+        self.filename = filename
+
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = pymongo.MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
+
+        return self._to_db.pyspider
+
+
+    def shouldRollover(self, record):
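+        # NOTE: this override never triggers a rollover -- it always falls
+        # through and returns None (treated as False by BaseRotatingHandler),
+        # and the MongoDB insert of ERROR records below is currently commented out.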
+        parmars = {
+            "spider_name": record.name,
+            "msg": record.msg,
+            "Message": str(record.getMessage())
+        }
+        if record.levelname == "ERROR":
+            crawl_type = 'list'
+            if 'detail' in record.name:
+                crawl_type = 'detail'
+            url = ''
+            item={
+                "recordname":record.name,
+                "spidercode":"spidercode",
+                "author":self.filename,
+                "account":"",
+                "crawl_time":time.time(),
+                "crawl_type": crawl_type,
+                "status_code":"status_code",
+                "url":url,
+                "reason":record.msg,
+                'parmars': parmars,
+            }
+
+            # print('<<<<<<<<<<<<<<<<<<<<<<<插入error_info')
+            # print(item)
+            # print(self.to_db.error_info)
+            # self.to_db.error_info.insert_one(item)
+
+
+
+def get_logger(
+    name=None,
+    path=None,
+    log_level=None,
+    is_write_to_console=None,
+    is_write_to_file=None,
+    color=None,
+    mode=None,
+    max_bytes=None,
+    backup_count=None,
+    encoding=None,
+):
+    """
+    @summary: 获取log
+    ---------
+    @param name: log名
+    @param path: log文件存储路径 如 D://xxx.log
+    @param log_level: log等级 CRITICAL/ERROR/WARNING/INFO/DEBUG
+    @param is_write_to_console: 是否输出到控制台
+    @param is_write_to_file: 是否写入到文件 默认否
+    @param color:是否有颜色
+    @param mode:写文件模式
+    @param max_bytes: 每个日志文件的最大字节数
+    @param backup_count:日志文件保留数量
+    @param encoding:日志文件编码
+    ---------
+    @result:
+    """
+    # 加载setting里最新的值
+    name = name or setting.LOG_NAME
+    path = path or setting.LOG_PATH
+    log_level = log_level or setting.LOG_LEVEL
+    is_write_to_console = (
+        is_write_to_console
+        if is_write_to_console is not None
+        else setting.LOG_IS_WRITE_TO_CONSOLE
+    )
+    is_write_to_file = (
+        is_write_to_file
+        if is_write_to_file is not None
+        else setting.LOG_IS_WRITE_TO_FILE
+    )
+    color = color if color is not None else setting.LOG_COLOR
+    mode = mode or setting.LOG_MODE
+    max_bytes = max_bytes or setting.LOG_MAX_BYTES
+    backup_count = backup_count or setting.LOG_BACKUP_COUNT
+    encoding = encoding or setting.LOG_ENCODING
+
+    # logger 配置
+    name = name.split(os.sep)[-1].split(".")[0]  # 取文件名
+
+    logger = logging.getLogger(name)
+    logger.setLevel(log_level)
+
+    formatter = logging.Formatter(LOG_FORMAT)
+    if PRINT_EXCEPTION_DETAILS:
+        formatter.formatException = lambda exc_info: format_exception(*exc_info)
+
+    # 定义一个RotatingFileHandler,最多备份5个日志文件,每个日志文件最大10M
+    if is_write_to_file:
+        # if path and not os.path.exists(os.path.dirname(path)):
+        #     os.makedirs(os.path.dirname(path))
+
+        rf_handler = RotatingFileHandler(
+            path,
+            mode=mode,
+            max_bytes=max_bytes,
+            backup_count=backup_count,
+            encoding=encoding,
+        )
+        rf_handler.setFormatter(formatter)
+        logger.addHandler(rf_handler)
+    if color and is_write_to_console:
+        loguru_handler = InterceptHandler()
+        loguru_handler.setFormatter(formatter)
+        # logging.basicConfig(handlers=[loguru_handler], level=0)
+        logger.addHandler(loguru_handler)
+    elif is_write_to_console:
+        stream_handler = logging.StreamHandler()
+        stream_handler.stream = sys.stdout
+        stream_handler.setFormatter(formatter)
+        logger.addHandler(stream_handler)
+
+    _handler_list = []
+    _handler_name_list = []
+    # 检查是否存在重复handler
+    for _handler in logger.handlers:
+        if str(_handler) not in _handler_name_list:
+            _handler_name_list.append(str(_handler))
+            _handler_list.append(_handler)
+    logger.handlers = _handler_list
+    return logger
+
+
+# logging.disable(logging.DEBUG) # 关闭所有log
+
+# 不让打印log的配置
+STOP_LOGS = [
+    # ES
+    "urllib3.response",
+    "urllib3.connection",
+    "elasticsearch.trace",
+    "requests.packages.urllib3.util",
+    "requests.packages.urllib3.util.retry",
+    "urllib3.util",
+    "requests.packages.urllib3.response",
+    "requests.packages.urllib3.contrib.pyopenssl",
+    "requests.packages",
+    "urllib3.util.retry",
+    "requests.packages.urllib3.contrib",
+    "requests.packages.urllib3.connectionpool",
+    "requests.packages.urllib3.poolmanager",
+    "urllib3.connectionpool",
+    "requests.packages.urllib3.connection",
+    "elasticsearch",
+    "log_request_fail",
+    # requests
+    "requests",
+    "selenium.webdriver.remote.remote_connection",
+    "selenium.webdriver.remote",
+    "selenium.webdriver",
+    "selenium",
+    # markdown
+    "MARKDOWN",
+    "build_extension",
+    # newspaper
+    "calculate_area",
+    "largest_image_url",
+    "newspaper.images",
+    "newspaper",
+    "Importing",
+    "PIL",
+]
+
+# 关闭日志打印
+log_level = getattr(logging, setting.OTHERS_LOG_LEVAL)
+for STOP_LOG in STOP_LOGS:
+    logging.getLogger(STOP_LOG).setLevel(log_level)
+
+# print(logging.Logger.manager.loggerDict) # 取使用debug模块的name
+
+# 日志级别大小关系为:CRITICAL > ERROR > WARNING > INFO > DEBUG
+
+
+class Log:
+    log = None
+
+    def __getattr__(self, name):
+        # 调用log时再初始化,为了加载最新的setting
+        if self.__class__.log is None:
+            self.__class__.log = get_logger()
+        return getattr(self.__class__.log, name)
+
+    @property
+    def debug(self):
+        return self.__class__.log.debug
+
+    @property
+    def info(self):
+        return self.__class__.log.info
+
+    @property
+    def warning(self):
+        return self.__class__.log.warning
+
+    @property
+    def exception(self):
+        return self.__class__.log.exception
+
+    @property
+    def error(self):
+        return self.__class__.log.error
+
+    @property
+    def critical(self):
+        return self.__class__.log.critical
+
+
+log = Log()
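The module-level log object is the lazy singleton that spiders import; a short usage sketch:

# illustrative usage of the shared logger
from feapder.utils.log import log

log.info("list page parsed, %s items", 30)
log.error("detail request failed: %s", "timeout")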

+ 539 - 0
FworkSpider/feapder/utils/metrics.py

@@ -0,0 +1,539 @@
+import concurrent.futures
+import json
+import os
+import queue
+import random
+import socket
+import threading
+import time
+from collections import Counter
+from typing import Any
+
+from influxdb import InfluxDBClient
+
+from feapder import setting
+from feapder.utils.log import log
+from feapder.utils.tools import aio_wrap, ensure_float, ensure_int
+
+_inited_pid = None
+# this thread should stop running in the forked process
+_executor = concurrent.futures.ThreadPoolExecutor(
+    max_workers=1, thread_name_prefix="metrics"
+)
+
+
+class MetricsEmitter:
+    def __init__(
+        self,
+        influxdb,
+        *,
+        batch_size=10,
+        max_timer_seq=0,
+        emit_interval=10,
+        retention_policy=None,
+        ratio=1.0,
+        debug=False,
+        add_hostname=False,
+        max_points=10240,
+        default_tags=None,
+        time_precision="s",
+    ):
+        """
+        Args:
+            influxdb: influxdb instance
+            batch_size: 打点的批次大小
+            max_timer_seq: 每个时间间隔内最多收集多少个 timer 类型点, 0 表示不限制
+            emit_interval: 最多等待多长时间必须打点
+            retention_policy: 对应的 retention policy
+            ratio: store 和 timer 类型采样率,比如 0.1 表示只有 10% 的点会留下
+            debug: 是否打印调试日志
+            add_hostname: 是否添加 hostname 作为 tag
+            max_points: 本地 buffer 最多累计多少个点
+            time_precision: 打点精度 默认 s
+        """
+        self.pending_points = queue.Queue()
+        self.batch_size = batch_size
+        self.influxdb: InfluxDBClient = influxdb
+        self.tagkv = {}
+        self.max_timer_seq = max_timer_seq
+        self.lock = threading.Lock()
+        self.hostname = socket.gethostname()
+        self.last_emit_ts = time.time()  # 上次提交时间
+        self.emit_interval = emit_interval  # 提交间隔
+        self.max_points = max_points
+        self.retention_policy = retention_policy  # 支持自定义保留策略
+        self.debug = debug
+        self.add_hostname = add_hostname
+        self.ratio = ratio
+        self.default_tags = default_tags or {}
+        self.time_precision = time_precision
+
+    def define_tagkv(self, tagk, tagvs):
+        self.tagkv[tagk] = set(tagvs)
+
+    def _point_tagset(self, p):
+        return f"{p['measurement']}-{sorted(p['tags'].items())}-{p['time']}"
+
+    def _accumulate_points(self, points):
+        """
+        对于处于同一个 key 的点做聚合
+
+          - 对于 counter 类型,同一个 key 的值(_count)可以累加
+          - 对于 store 类型,不做任何操作,influxdb 会自行覆盖
+          - 对于 timer 类型,通过添加一个 _seq 值来区分每个不同的点
+        """
+        counters = {}  # 临时保留 counter 类型的值
+        timer_seqs = Counter()  # 记录不同 key 的 timer 序列号
+        new_points = []
+
+        for point in points:
+            point_type = point["tags"].get("_type", None)
+            tagset = self._point_tagset(point)
+
+            # counter 类型全部聚合,不做丢弃
+            if point_type == "counter":
+                if tagset not in counters:
+                    counters[tagset] = point
+                else:
+                    counters[tagset]["fields"]["_count"] += point["fields"]["_count"]
+            elif point_type == "timer":
+                if self.max_timer_seq and timer_seqs[tagset] > self.max_timer_seq:
+                    continue
+                # 掷一把骰子,如果足够幸运才打点
+                if self.ratio < 1.0 and random.random() > self.ratio:
+                    continue
+                # 增加 _seq tag,以便区分不同的点
+                point["tags"]["_seq"] = timer_seqs[tagset]
+                timer_seqs[tagset] += 1
+                new_points.append(point)
+            else:
+                if self.ratio < 1.0 and random.random() > self.ratio:
+                    continue
+                new_points.append(point)
+
+        # 把累加得到的 counter 值添加进来
+        new_points.extend(counters.values())
+        return new_points
+
+    def _get_ready_emit(self, force=False):
+        """
+        把当前 pending 的值做聚合并返回
+        """
+        if self.debug:
+            log.info("got %s raw points", self.pending_points.qsize())
+
+        # 从 pending 中读取点, 设定一个最大值,避免一直打点,一直获取
+        points = []
+        while len(points) < self.max_points or force:
+            try:
+                points.append(self.pending_points.get_nowait())
+            except queue.Empty:
+                break
+
+        # 聚合点
+        points = self._accumulate_points(points)
+
+        if self.debug:
+            log.info("got %s point", len(points))
+            log.info(json.dumps(points, indent=4))
+
+        return points
+
+    def emit(self, point=None, force=False):
+        """
+        1. 添加新点到 pending
+        2. 如果符合条件,尝试聚合并打点
+        3. 更新打点时间
+
+        :param point:
+        :param force: 强制提交所有点 默认False
+        :return:
+        """
+        if point:
+            self.pending_points.put(point)
+
+        # 判断是否需要提交点 1、数量 2、间隔 3、强力打点
+        if not (
+            force
+            or self.pending_points.qsize() >= self.max_points  # noqa: W503
+            or time.time() - self.last_emit_ts > self.emit_interval  # noqa: W503
+        ):
+            return
+
+        # 需要打点,读取可以打点的值, 确保只有一个线程在做点的压缩
+        with self.lock:
+            points = self._get_ready_emit(force=force)
+
+            if not points:
+                return
+            try:
+                self.influxdb.write_points(
+                    points,
+                    batch_size=self.batch_size,
+                    time_precision=self.time_precision,
+                    retention_policy=self.retention_policy,
+                )
+            except Exception:
+                log.exception("error writing points")
+
+            self.last_emit_ts = time.time()
+
+    def flush(self):
+        if self.debug:
+            log.info("start draining points %s", self.pending_points.qsize())
+        self.emit(force=True)
+
+    def close(self):
+        self.flush()
+        try:
+            self.influxdb.close()
+        except Exception as e:
+            log.exception(e)
+
+    def make_point(self, measurement, tags: dict, fields: dict, timestamp=None):
+        """
+        默认的时间戳是"秒"级别的
+        """
+        assert measurement, "measurement can't be null"
+        tags = tags.copy() if tags else {}
+        tags.update(self.default_tags)
+        fields = fields.copy() if fields else {}
+        if timestamp is None:
+            timestamp = int(time.time())
+        # 支持自定义hostname
+        if self.add_hostname and "hostname" not in tags:
+            tags["hostname"] = self.hostname
+        point = dict(measurement=measurement, tags=tags, fields=fields, time=timestamp)
+        if self.tagkv:
+            for tagk, tagv in tags.items():
+                if tagv not in self.tagkv[tagk]:
+                    raise ValueError(
+                        "tag value = {} not in {}".format(tagv, self.tagkv[tagk])
+                    )
+        return point
+
+    def get_counter_point(
+        self,
+        measurement: str,
+        key: str = None,
+        count: int = 1,
+        tags: dict = None,
+        timestamp: int = None,
+    ):
+        """
+        counter 不能被覆盖
+        """
+        tags = tags.copy() if tags else {}
+        if key is not None:
+            tags["_key"] = key
+        tags["_type"] = "counter"
+        count = ensure_int(count)
+        fields = dict(_count=count)
+        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
+        return point
+
+    def get_store_point(
+        self,
+        measurement: str,
+        key: str = None,
+        value: Any = 0,
+        tags: dict = None,
+        timestamp=None,
+    ):
+        tags = tags.copy() if tags else {}
+        if key is not None:
+            tags["_key"] = key
+        tags["_type"] = "store"
+        fields = dict(_value=value)
+        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
+        return point
+
+    def get_timer_point(
+        self,
+        measurement: str,
+        key: str = None,
+        duration: float = 0,
+        tags: dict = None,
+        timestamp=None,
+    ):
+        tags = tags.copy() if tags else {}
+        if key is not None:
+            tags["_key"] = key
+        tags["_type"] = "timer"
+        fields = dict(_duration=ensure_float(duration))
+        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
+        return point
+
+    def emit_any(self, *args, **kwargs):
+        point = self.make_point(*args, **kwargs)
+        self.emit(point)
+
+    def emit_counter(self, *args, **kwargs):
+        point = self.get_counter_point(*args, **kwargs)
+        self.emit(point)
+
+    def emit_store(self, *args, **kwargs):
+        point = self.get_store_point(*args, **kwargs)
+        self.emit(point)
+
+    def emit_timer(self, *args, **kwargs):
+        point = self.get_timer_point(*args, **kwargs)
+        self.emit(point)
+
+
+_emitter: MetricsEmitter = None
+_measurement: str = None
+
+
+def init(
+    *,
+    influxdb_host=None,
+    influxdb_port=None,
+    influxdb_udp_port=None,
+    influxdb_database=None,
+    influxdb_user=None,
+    influxdb_password=None,
+    influxdb_measurement=None,
+    retention_policy=None,
+    retention_policy_duration="180d",
+    emit_interval=60,
+    batch_size=10,
+    debug=False,
+    use_udp=False,
+    timeout=10,
+    time_precision="s",
+    **kwargs,
+):
+    """
+    打点监控初始化
+    Args:
+        influxdb_host:
+        influxdb_port:
+        influxdb_udp_port:
+        influxdb_database:
+        influxdb_user:
+        influxdb_password:
+        influxdb_measurement: 存储的表,也可以在打点的时候指定
+        retention_policy: 保留策略
+        retention_policy_duration: 保留策略过期时间
+        emit_interval: 打点最大间隔
+        batch_size: 打点的批次大小
+        debug: 是否开启调试
+        use_udp: 是否使用udp协议打点
+        timeout: 与influxdb建立连接时的超时时间
+        time_precision: 打点精度 默认秒
+        **kwargs: 可传递MetricsEmitter类的参数
+
+    Returns:
+
+    """
+    global _inited_pid, _emitter, _measurement
+    if _inited_pid == os.getpid():
+        return
+
+    influxdb_host = influxdb_host or setting.INFLUXDB_HOST
+    influxdb_port = influxdb_port or setting.INFLUXDB_PORT
+    influxdb_udp_port = influxdb_udp_port or setting.INFLUXDB_UDP_PORT
+    influxdb_database = influxdb_database or setting.INFLUXDB_DATABASE
+    influxdb_user = influxdb_user or setting.INFLUXDB_USER
+    influxdb_password = influxdb_password or setting.INFLUXDB_PASSWORD
+    _measurement = influxdb_measurement or setting.INFLUXDB_MEASUREMENT
+    retention_policy = (
+        retention_policy or f"{influxdb_database}_{retention_policy_duration}"
+    )
+
+    if not all(
+        [
+            influxdb_host,
+            influxdb_port,
+            influxdb_udp_port,
+            influxdb_database,
+            influxdb_user,
+            influxdb_password,
+        ]
+    ):
+        return
+
+    influxdb_client = InfluxDBClient(
+        host=influxdb_host,
+        port=influxdb_port,
+        udp_port=influxdb_udp_port,
+        database=influxdb_database,
+        use_udp=use_udp,
+        timeout=timeout,
+        username=influxdb_user,
+        password=influxdb_password,
+    )
+    # 创建数据库
+    if influxdb_database:
+        try:
+            influxdb_client.create_database(influxdb_database)
+            influxdb_client.create_retention_policy(
+                retention_policy,
+                retention_policy_duration,
+                replication="1",
+                default=True,
+            )
+        except Exception as e:
+            log.error("metrics init falied: {}".format(e))
+            return
+
+    _emitter = MetricsEmitter(
+        influxdb_client,
+        debug=debug,
+        batch_size=batch_size,
+        time_precision=time_precision,
+        retention_policy=retention_policy,
+        emit_interval=emit_interval,
+        **kwargs,
+    )
+    _inited_pid = os.getpid()
+    log.info("metrics init successfully")
+
+
+def emit_any(
+    tags: dict,
+    fields: dict,
+    *,
+    classify: str = "",
+    measurement: str = None,
+    timestamp=None,
+):
+    """
+    原生的打点,不进行额外的处理
+    Args:
+        tags: influxdb的tag的字段和值
+        fields: influxdb的field的字段和值
+        classify: 点的类别
+        measurement: 存储的表
+        timestamp: 点的时间戳,默认为当前时间
+
+    Returns:
+
+    """
+    if not _emitter:
+        return
+
+    tags = tags or {}
+    tags["_classify"] = classify
+    measurement = measurement or _measurement
+    _emitter.emit_any(measurement, tags, fields, timestamp)
+
+
+def emit_counter(
+    key: str = None,
+    count: int = 1,
+    *,
+    classify: str = "",
+    tags: dict = None,
+    measurement: str = None,
+    timestamp: int = None,
+):
+    """
+    聚合打点,即会将一段时间内的点求和,然后打一个点数和
+    Args:
+        key: 与点绑定的key值
+        count: 点数
+        classify: 点的类别
+        tags: influxdb的tag的字段和值
+        measurement: 存储的表
+        timestamp: 点的时间戳,默认为当前时间
+
+    Returns:
+
+    """
+    if not _emitter:
+        return
+
+    tags = tags or {}
+    tags["_classify"] = classify
+    measurement = measurement or _measurement
+    _emitter.emit_counter(measurement, key, count, tags, timestamp)
+
+
+def emit_timer(
+    key: str = None,
+    duration: float = 0,
+    *,
+    classify: str = "",
+    tags: dict = None,
+    measurement: str = None,
+    timestamp=None,
+):
+    """
+    时间打点,用于监控程序的运行时长等,每个duration一个点,不会被覆盖
+    Args:
+        key: 与点绑定的key值
+        duration: 时长
+        classify: 点的类别
+        tags: influxdb的tag的字段和值
+        measurement: 存储的表
+        timestamp: 点的时间戳,默认为当前时间
+
+    Returns:
+
+    """
+    if not _emitter:
+        return
+
+    tags = tags or {}
+    tags["_classify"] = classify
+    measurement = measurement or _measurement
+    _emitter.emit_timer(measurement, key, duration, tags, timestamp)
+
+
+def emit_store(
+    key: str = None,
+    value: Any = 0,
+    *,
+    classify: str = "",
+    tags: dict = None,
+    measurement: str = None,
+    timestamp=None,
+):
+    """
+    直接打点,不进行额外的处理
+    Args:
+        key: 与点绑定的key值
+        value: 点的值
+        classify: 点的类别
+        tags: influxdb的tag的字段和值
+        measurement: 存储的表
+        timestamp: 点的时间戳,默认为当前时间
+
+    Returns:
+
+    """
+    if not _emitter:
+        return
+
+    tags = tags or {}
+    tags["_classify"] = classify
+    measurement = measurement or _measurement
+    _emitter.emit_store(measurement, key, value, tags, timestamp)
+
+
+def flush():
+    """
+    强刷点到influxdb
+    Returns:
+
+    """
+    if not _emitter:
+        return
+    _emitter.flush()
+
+
+def close():
+    """
+    关闭
+    Returns:
+
+    """
+    if not _emitter:
+        return
+    _emitter.close()
+
+
+# 协程打点
+aemit_counter = aio_wrap(executor=_executor)(emit_counter)
+aemit_store = aio_wrap(executor=_executor)(emit_store)
+aemit_timer = aio_wrap(executor=_executor)(emit_timer)
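A sketch of the intended call pattern, assuming the INFLUXDB_* values are present in setting.py (init silently returns and the emit_* helpers become no-ops when they are missing); the measurement and key names are placeholders:

# illustrative usage of the metrics helpers
from feapder.utils import metrics

metrics.init(influxdb_measurement="spider_metrics")

metrics.emit_counter(key="downloaded", count=1, classify="request")
metrics.emit_timer(key="parse", duration=0.35, classify="parser")
metrics.emit_store(key="queue_size", value=120, measurement="spider_metrics")

metrics.flush()
metrics.close()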

+ 94 - 0
FworkSpider/feapder/utils/perfect_dict.py

@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/4/8 11:32 上午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+
+def ensure_value(value):
+    if isinstance(value, (list, tuple)):
+        _value = []
+        for v in value:
+            _value.append(ensure_value(v))
+
+        if isinstance(value, tuple):
+            value = tuple(_value)
+        else:
+            value = _value
+
+    if isinstance(value, dict):
+        return PerfectDict(value)
+    else:
+        return value
+
+
+class PerfectDict(dict):
+    """
+    >>> data = PerfectDict({"id":1, "url":"xxx"})
+    >>> data
+    {'id': 1, 'url': 'xxx'}
+    >>> data = PerfectDict(id=1, url="xxx")
+    >>> data
+    {'id': 1, 'url': 'xxx'}
+    >>> data.id
+    1
+    >>> data.get("id")
+    1
+    >>> data["id"]
+    1
+    >>> id, url = data
+    >>> id
+    1
+    >>> url
+    'xxx'
+    >>> data[0]
+    1
+    >>> data[1]
+    'xxx'
+    >>> data = PerfectDict({"a": 1, "b": {"b1": 2}, "c": [{"c1": [{"d": 1}]}]})
+    >>> data.b.b1
+    2
+    >>> data[1].b1
+    2
+    >>> data.get("b").b1
+    2
+    >>> data.c[0].c1
+    [{'d': 1}]
+    >>> data.c[0].c1[0]
+    {'d': 1}
+    """
+
+    def __init__(self, _dict: dict = None, _values: list = None, **kwargs):
+        self.__dict__ = _dict or kwargs or {}
+        self.__dict__.pop("__values__", None)
+        super().__init__(self.__dict__, **kwargs)
+        self.__values__ = _values or list(self.__dict__.values())
+
+    def __getitem__(self, key):
+        if isinstance(key, int):
+            value = self.__values__[key]
+        else:
+            value = self.__dict__[key]
+
+        return ensure_value(value)
+
+    def __iter__(self, *args, **kwargs):
+        for value in self.__values__:
+            yield ensure_value(value)
+
+    def __getattribute__(self, item):
+        value = object.__getattribute__(self, item)
+        if item == "__dict__" or item == "__values__":
+            return value
+        return ensure_value(value)
+
+    def get(self, key, default=None):
+        if key in self.__dict__:
+            value = self.__dict__[key]
+            return ensure_value(value)
+
+        return default

+ 115 - 0
FworkSpider/feapder/utils/redis_lock.py

@@ -0,0 +1,115 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2019/11/5 5:25 PM
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import threading
+import time
+
+from feapder.db.redisdb import RedisDB
+from feapder.utils.log import log
+
+
+class RedisLock:
+    redis_cli = None
+
+    def __init__(self, key, redis_cli=None, wait_timeout=0, lock_timeout=86400):
+        """
+        redis超时锁
+        :param key: 存储锁的key redis_lock:[key]
+        :param redis_cli: redis客户端对象
+        :param wait_timeout: 等待加锁超时时间,为0时则不等待加锁,加锁失败
+        :param lock_timeout: 锁超时时间 为0时则不会超时,直到锁释放或意外退出,默认超时为1天
+
+        用法示例:
+        with RedisLock(key="test") as _lock:
+            if _lock.locked:
+                # 用来判断是否加上了锁
+                # do somethings
+        """
+        self.redis_conn = redis_cli
+        self.lock_key = "redis_lock:{}".format(key)
+        # 锁超时时间
+        self.lock_timeout = lock_timeout
+        # 等待加锁时间
+        self.wait_timeout = wait_timeout
+        self.locked = False
+        self.stop_prolong_life = False
+
+    @property
+    def redis_conn(self):
+        if not self.__class__.redis_cli:
+            self.__class__.redis_cli = RedisDB().get_redis_obj()
+
+        return self.__class__.redis_cli
+
+    @redis_conn.setter
+    def redis_conn(self, cli):
+        self.__class__.redis_cli = cli
+
+    def __enter__(self):
+        if not self.locked:
+            self.acquire()
+            # 延长锁的时间
+            thread = threading.Thread(target=self.prolong_life)
+            thread.setDaemon(True)
+            thread.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.stop_prolong_life = True
+        self.release()
+
+    def __repr__(self):
+        return "<RedisLock: {} >".format(self.lock_key)
+
+    def acquire(self):
+        start = time.time()
+        while True:
+            # 尝试加锁
+            if self.redis_conn.set(self.lock_key, time.time(), nx=True, ex=5):
+                self.locked = True
+                break
+
+            if self.wait_timeout > 0:
+                if time.time() - start > self.wait_timeout:
+                    log.info("加锁失败")
+                    break
+            else:
+                break
+            log.debug("等待加锁: {} wait:{}".format(self, time.time() - start))
+            if self.wait_timeout > 10:
+                time.sleep(5)
+            else:
+                time.sleep(1)
+        return
+
+    def release(self):
+        if self.locked:
+            self.redis_conn.delete(self.lock_key)
+            self.locked = False
+        return
+
+    def prolong_life(self):
+        """
+        延长锁的过期时间
+        :return:
+        """
+
+        spend_time = 0
+        while not self.stop_prolong_life:
+            expire = self.redis_conn.ttl(self.lock_key)
+            if expire < 0:  # key 不存在
+                time.sleep(1)
+                continue
+            self.redis_conn.expire(self.lock_key, expire + 5)  # 延长5秒
+            time.sleep(expire)  # 临过期5秒前,再次延长
+            spend_time += expire
+            if self.lock_timeout and spend_time > self.lock_timeout:
+                log.info("锁超时,释放")
+                self.redis_conn.delete(self.lock_key)
+                break

+ 2554 - 0
FworkSpider/feapder/utils/tools.py

@@ -0,0 +1,2554 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018-09-06 14:21
+---------
+@summary: 工具
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+import asyncio
+import calendar
+import codecs
+import configparser  # 读配置文件的
+import datetime
+import functools
+import hashlib
+import html
+import json
+import os
+import pickle
+import random
+import re
+import socket
+import ssl
+import string
+import sys
+import time
+import traceback
+import urllib
+import urllib.parse
+import uuid
+import weakref
+from functools import partial, wraps
+from hashlib import md5
+from pprint import pformat
+from pprint import pprint
+from urllib import request
+from urllib.parse import urljoin
+
+import execjs  # pip install PyExecJS
+import redis
+import requests
+import six
+from requests.cookies import RequestsCookieJar
+from w3lib.url import canonicalize_url as _canonicalize_url
+
+import feapder.setting as setting
+from feapder.utils.email_sender import EmailSender
+from feapder.utils.log import log
+os.environ["EXECJS_RUNTIME"] = "Node"  # 设置使用node执行js
+
+# 全局取消ssl证书验证
+ssl._create_default_https_context = ssl._create_unverified_context
+
+TIME_OUT = 30
+TIMER_TIME = 5
+
+redisdb = None
+
+
+def get_redisdb():
+    global redisdb
+    if not redisdb:
+        ip, port = setting.REDISDB_IP_PORTS.split(":")
+        redisdb = redis.Redis(
+            host=ip,
+            port=port,
+            db=setting.REDISDB_DB,
+            password=setting.REDISDB_USER_PASS,
+            decode_responses=True,
+        )  # redis默认端口是6379
+    return redisdb
+
+
+# 装饰器
+class Singleton(object):
+    def __init__(self, cls):
+        self._cls = cls
+        self._instance = {}
+
+    def __call__(self, *args, **kwargs):
+        if self._cls not in self._instance:
+            self._instance[self._cls] = self._cls(*args, **kwargs)
+        return self._instance[self._cls]
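+# Usage (illustrative): decorating a class with @Singleton makes repeated
+# instantiation return the same object, e.g.
+#   @Singleton
+#   class ConfigHolder: ...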
+
+
+def log_function_time(func):
+    try:
+
+        @functools.wraps(func)  # 将函数的原来属性付给新函数
+        def calculate_time(*args, **kw):
+            began_time = time.time()
+            callfunc = func(*args, **kw)
+            end_time = time.time()
+            log.debug(func.__name__ + " run time  = " + str(end_time - began_time))
+            return callfunc
+
+        return calculate_time
+    except:
+        log.debug("求取时间无效 因为函数参数不符")
+        return func
+
+
+def run_safe_model(module_name):
+    def inner_run_safe_model(func):
+        try:
+
+            @functools.wraps(func)  # 将函数的原来属性付给新函数
+            def run_func(*args, **kw):
+                callfunc = None
+                try:
+                    callfunc = func(*args, **kw)
+                except Exception as e:
+                    log.error(module_name + ": " + func.__name__ + " - " + str(e))
+                    traceback.print_exc()
+                return callfunc
+
+            return run_func
+        except Exception as e:
+            log.error(module_name + ": " + func.__name__ + " - " + str(e))
+            traceback.print_exc()
+            return func
+
+    return inner_run_safe_model
+
+
+def memoizemethod_noargs(method):
+    """Decorator to cache the result of a method (without arguments) using a
+    weak reference to its object
+    """
+    cache = weakref.WeakKeyDictionary()
+
+    @functools.wraps(method)
+    def new_method(self, *args, **kwargs):
+        if self not in cache:
+            cache[self] = method(self, *args, **kwargs)
+        return cache[self]
+
+    return new_method
+
+
+########################【网页解析相关】###############################
+
+
+# @log_function_time
+def get_html_by_requests(
+    url, headers=None, code="utf-8", data=None, proxies={}, with_response=False
+):
+    html = ""
+    r = None
+    try:
+        if data:
+            r = requests.post(
+                url, headers=headers, timeout=TIME_OUT, data=data, proxies=proxies
+            )
+        else:
+            r = requests.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies)
+
+        if code:
+            r.encoding = code
+        html = r.text
+
+    except Exception as e:
+        log.error(e)
+    finally:
+        r and r.close()
+
+    if with_response:
+        return html, r
+    else:
+        return html
+
+
+def get_json_by_requests(
+    url,
+    params=None,
+    headers=None,
+    data=None,
+    proxies={},
+    with_response=False,
+    cookies=None,
+):
+    json = {}
+    response = None
+    try:
+        # response = requests.get(url, params = params)
+        if data:
+            response = requests.post(
+                url,
+                headers=headers,
+                data=data,
+                params=params,
+                timeout=TIME_OUT,
+                proxies=proxies,
+                cookies=cookies,
+            )
+        else:
+            response = requests.get(
+                url,
+                headers=headers,
+                params=params,
+                timeout=TIME_OUT,
+                proxies=proxies,
+                cookies=cookies,
+            )
+        response.encoding = "utf-8"
+        json = response.json()
+    except Exception as e:
+        log.error(e)
+    finally:
+        response and response.close()
+
+    if with_response:
+        return json, response
+    else:
+        return json
+
+
+def get_cookies(response):
+    cookies = requests.utils.dict_from_cookiejar(response.cookies)
+    return cookies
+
+
+def get_cookies_from_str(cookie_str):
+    """
+    >>> get_cookies_from_str("key=value; key2=value2; key3=; key4=; ")
+    {'key': 'value', 'key2': 'value2', 'key3': '', 'key4': ''}
+
+    Args:
+        cookie_str: key=value; key2=value2; key3=; key4=
+
+    Returns:
+
+    """
+    cookies = {}
+    for cookie in cookie_str.split(";"):
+        cookie = cookie.strip()
+        if not cookie:
+            continue
+        key, value = cookie.split("=", 1)
+        key = key.strip()
+        value = value.strip()
+        cookies[key] = value
+
+    return cookies
+
+
+def get_cookies_jar(cookies):
+    """
+    @summary: 适用于selenium生成的cookies转requests的cookies
+    requests.get(xxx, cookies=jar)
+    参考:https://www.cnblogs.com/small-bud/p/9064674.html
+
+    ---------
+    @param cookies: [{},{}]
+    ---------
+    @result: cookie jar
+    """
+
+    cookie_jar = RequestsCookieJar()
+    for cookie in cookies:
+        cookie_jar.set(cookie["name"], cookie["value"])
+
+    return cookie_jar
+
+
+def get_cookies_from_selenium_cookie(cookies):
+    """
+    @summary: 适用于selenium生成的cookies转requests的cookies
+    requests.get(xxx, cookies=jar)
+    参考:https://www.cnblogs.com/small-bud/p/9064674.html
+
+    ---------
+    @param cookies: [{},{}]
+    ---------
+    @result: cookie jar
+    """
+
+    cookie_dict = {}
+    for cookie in cookies:
+        if cookie.get("name"):
+            cookie_dict[cookie["name"]] = cookie["value"]
+
+    return cookie_dict
+
+
+def cookiesjar2str(cookies):
+    str_cookie = ""
+    for k, v in requests.utils.dict_from_cookiejar(cookies).items():
+        str_cookie += k
+        str_cookie += "="
+        str_cookie += v
+        str_cookie += "; "
+    return str_cookie
+
+
+def cookies2str(cookies):
+    str_cookie = ""
+    for k, v in cookies.items():
+        str_cookie += k
+        str_cookie += "="
+        str_cookie += v
+        str_cookie += "; "
+    return str_cookie
+
+
+def get_urls(
+    html,
+    stop_urls=(
+        "javascript",
+        "+",
+        ".css",
+        ".js",
+        ".rar",
+        ".xls",
+        ".exe",
+        ".apk",
+        ".doc",
+        ".jpg",
+        ".png",
+        ".flv",
+        ".mp4",
+    ),
+):
+    # 不匹配javascript、 +、 # 这样的url
+    regex = r'<a.*?href.*?=.*?["|\'](.*?)["|\']'
+
+    urls = get_info(html, regex)
+    urls = sorted(set(urls), key=urls.index)
+    if stop_urls:
+        stop_urls = isinstance(stop_urls, str) and [stop_urls] or stop_urls
+        use_urls = []
+        for url in urls:
+            for stop_url in stop_urls:
+                if stop_url in url:
+                    break
+            else:
+                use_urls.append(url)
+
+        urls = use_urls
+    return urls
+
+
+def get_full_url(root_url, sub_url):
+    """
+    @summary: 得到完整的url
+    ---------
+    @param root_url: 根url (网页的url)
+    @param sub_url:  子url (带有相对路径的 可以拼接成完整的)
+    ---------
+    @result: 返回完整的url
+    """
+
+    return urljoin(root_url, sub_url)
+
+
+def joint_url(url, params):
+    # param_str = "?"
+    # for key, value in params.items():
+    #     value = isinstance(value, str) and value or str(value)
+    #     param_str += key + "=" + value + "&"
+    #
+    # return url + param_str[:-1]
+
+    if not params:
+        return url
+
+    params = urlencode(params)
+    separator = "?" if "?" not in url else "&"
+    return url + separator + params
+
+
+def canonicalize_url(url):
+    """
+    url 归一化 会参数排序 及去掉锚点
+    """
+    return _canonicalize_url(url)
+
+
+def get_url_md5(url):
+    url = canonicalize_url(url)
+    url = re.sub("^http://", "https://", url)
+    return get_md5(url)
+
+
+def fit_url(urls, identis):
+    identis = isinstance(identis, str) and [identis] or identis
+    fit_urls = []
+    for link in urls:
+        for identi in identis:
+            if identi in link:
+                fit_urls.append(link)
+    return list(set(fit_urls))
+
+
+def get_param(url, key):
+    params = url.split("?")[-1].split("&")
+    for param in params:
+        key_value = param.split("=", 1)
+        if key == key_value[0]:
+            return key_value[1]
+    return None
+
+
+def urlencode(params):
+    """
+    字典类型的参数转为字符串
+    @param params:
+    {
+        'a': 1,
+        'b': 2
+    }
+    @return: a=1&b=2
+    """
+    return urllib.parse.urlencode(params)
+
+
+def urldecode(url):
+    """
+    将字符串类型的参数转为json
+    @param url: xxx?a=1&b=2
+    @return:
+    {
+        'a': 1,
+        'b': 2
+    }
+    """
+    params_json = {}
+    params = url.split("?")[-1].split("&")
+    for param in params:
+        key, value = param.split("=", 1)
+        params_json[key] = unquote_url(value)
+
+    return params_json
+
+
+def unquote_url(url, encoding="utf-8"):
+    """
+    @summary: 将url解码
+    ---------
+    @param url:
+    ---------
+    @result:
+    """
+
+    return urllib.parse.unquote(url, encoding=encoding)
+
+
+def quote_url(url, encoding="utf-8"):
+    """
+    @summary: 将url编码 编码意思http://www.w3school.com.cn/tags/html_ref_urlencode.html
+    ---------
+    @param url:
+    ---------
+    @result:
+    """
+
+    return urllib.parse.quote(url, safe="%;/?:@&=+$,", encoding=encoding)
+
+
+def quote_chinese_word(text, encoding="utf-8"):
+    def quote_chinese_word_func(text):
+        chinese_word = text.group(0)
+        return urllib.parse.quote(chinese_word, encoding=encoding)
+
+    return re.sub("([\u4e00-\u9fa5]+)", quote_chinese_word_func, text, flags=re.S)
+
+
+def unescape(str):
+    """
+    反转义
+    """
+    return html.unescape(str)
+
+
+def excape(str):
+    """
+    转义
+    """
+    return html.escape(str)
+
+
+_regexs = {}
+
+
+# @log_function_time
+def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None):
+    regexs = isinstance(regexs, str) and [regexs] or regexs
+
+    infos = []
+    for regex in regexs:
+        if regex == "":
+            continue
+
+        if regex not in _regexs.keys():
+            _regexs[regex] = re.compile(regex, re.S)
+
+        if fetch_one:
+            infos = _regexs[regex].search(html)
+            if infos:
+                infos = infos.groups()
+            else:
+                continue
+        else:
+            infos = _regexs[regex].findall(str(html))
+
+        if len(infos) > 0:
+            # print(regex)
+            break
+
+    if fetch_one:
+        infos = infos if infos else ("",)
+        return infos if len(infos) > 1 else infos[0]
+    else:
+        infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
+        infos = split.join(infos) if split else infos
+        return infos
+
+
+def table_json(table, save_one_blank=True):
+    """
+    将表格转为json 适应于 key:value 在一行类的表格
+    @param table: 使用selector封装后的具有xpath的selector
+    @param save_one_blank: 保留一个空白符
+    @return:
+    """
+    data = {}
+
+    trs = table.xpath(".//tr")
+    for tr in trs:
+        tds = tr.xpath("./td|./th")
+
+        for i in range(0, len(tds), 2):
+            if i + 1 > len(tds) - 1:
+                break
+
+            key = tds[i].xpath("string(.)").extract_first(default="").strip()
+            value = tds[i + 1].xpath("string(.)").extract_first(default="").strip()
+            value = replace_str(value, "[\f\n\r\t\v]", "")
+            value = replace_str(value, " +", " " if save_one_blank else "")
+
+            if key:
+                data[key] = value
+
+    return data
+
+
+def get_table_row_data(table):
+    """
+    获取表格里每一行数据
+    @param table: 使用selector封装后的具有xpath的selector
+    @return: [[],[]..]
+    """
+
+    datas = []
+    rows = table.xpath(".//tr")
+    for row in rows:
+        cols = row.xpath("./td|./th")
+        row_datas = []
+        for col in cols:
+            data = col.xpath("string(.)").extract_first(default="").strip()
+            row_datas.append(data)
+        datas.append(row_datas)
+
+    return datas
+
+
+def rows2json(rows, keys=None):
+    """
+    将行数据转为json
+    @param rows: 每一行的数据
+    @param keys: json的key,空时将rows的第一行作为key
+    @return:
+    """
+    data_start_pos = 0 if keys else 1
+    datas = []
+    keys = keys or rows[0]
+    for values in rows[data_start_pos:]:
+        datas.append(dict(zip(keys, values)))
+
+    return datas
+
+
+def get_form_data(form):
+    """
+    提取form中提交的数据
+    :param form: 使用selector封装后的具有xpath的selector
+    :return:
+    """
+    data = {}
+    inputs = form.xpath(".//input")
+    for input in inputs:
+        name = input.xpath("./@name").extract_first()
+        value = input.xpath("./@value").extract_first()
+        if name:
+            data[name] = value
+
+    return data
+
+
+# mac上不好使
+# def get_domain(url):
+#     domain = ''
+#     try:
+#         domain = get_tld(url)
+#     except Exception as e:
+#         log.debug(e)
+#     return domain
+
+
+def get_domain(url):
+    # splittype / splithost 已被废弃,改用 urlparse 提取域名
+    return urllib.parse.urlparse(url).netloc
+
+
+def get_index_url(url):
+    return "/".join(url.split("/")[:3])
+
+
+def get_ip(domain):
+    ip = socket.getaddrinfo(domain, "http")[0][4][0]
+    return ip
+
+
+def get_localhost_ip():
+    """
+    利用 UDP 协议来实现的,生成一个UDP包,把自己的 IP 放入到 UDP 协议头中,然后从UDP包中获取本机的IP。
+    这个方法并不会真实的向外部发包,所以用抓包工具是看不到的
+    :return:
+    """
+    s = None
+    try:
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        s.connect(("8.8.8.8", 80))
+        ip = s.getsockname()[0]
+    finally:
+        if s:
+            s.close()
+
+    return ip
+
+
+def ip_to_num(ip):
+    import struct
+
+    ip_num = socket.ntohl(struct.unpack("I", socket.inet_aton(str(ip)))[0])
+    return ip_num
+
+
+def is_valid_proxy(proxy, check_url=None):
+    """
+    检验代理是否有效
+    @param proxy: xxx.xxx.xxx:xxx
+    @param check_url: 利用目标网站检查,目标网站url。默认为None, 使用代理服务器的socket检查, 但不能排除Connection closed by foreign host
+    @return: True / False
+    """
+    is_valid = False
+
+    if check_url:
+        proxies = {"http": f"http://{proxy}", "https": f"https://{proxy}"}
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
+        }
+        response = None
+        try:
+            response = requests.get(
+                check_url, headers=headers, proxies=proxies, stream=True, timeout=20
+            )
+            is_valid = True
+
+        except Exception as e:
+            log.error("check proxy failed: {} {}".format(e, proxy))
+
+        finally:
+            if response:
+                response.close()
+
+    else:
+        ip, port = proxy.split(":")
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
+            sk.settimeout(7)
+            try:
+                sk.connect((ip, int(port)))  # 检查代理服务器是否开着
+                is_valid = True
+
+            except Exception as e:
+                log.error("check proxy failed: {} {}:{}".format(e, ip, port))
+
+    return is_valid
+
+
+def is_valid_url(url):
+    """
+    验证url是否合法
+    :param url:
+    :return:
+    """
+    if re.match(r"(^https?:/{2}\w.+$)|(ftp://)", url):
+        return True
+    else:
+        return False
+
+
+def get_text(soup, *args):
+    try:
+        return soup.get_text()
+    except Exception as e:
+        log.error(e)
+        return ""
+
+
+def del_html_tag(content, except_line_break=False, save_img=False, white_replaced=""):
+    """
+    删除html标签
+    @param content: html内容
+    @param except_line_break: 保留p标签
+    @param save_img: 保留图片
+    @param white_replaced: 空白符替换
+    @return:
+    """
+    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?)忽略大小写
+    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
+    content = replace_str(content, "<!--(.|\n)*?-->")
+    content = replace_str(
+        content, "(?!&[a-z]+=)&[a-z]+;?"
+    )  # 干掉&nbsp等无用的字符 但&xxx= 这种表示参数的除外
+    if except_line_break:
+        content = content.replace("</p>", "/p")
+        content = replace_str(content, "<[^p].*?>")
+        content = content.replace("/p", "</p>")
+        content = replace_str(content, "[ \f\r\t\v]")
+
+    elif save_img:
+        content = replace_str(content, "(?!<img.+?>)<.+?>")  # 替换掉除图片外的其他标签
+        content = replace_str(content, "(?! +)\s+", "\n")  # 保留空格
+        content = content.strip()
+
+    else:
+        content = replace_str(content, "<(.|\n)*?>")
+        content = replace_str(content, "\s", white_replaced)
+        content = content.strip()
+
+    return content
+
+
+def del_html_js_css(content):
+    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?)忽略大小写
+    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
+    content = replace_str(content, "<!--(.|\n)*?-->")
+
+    return content
+
+
+def is_have_chinese(content):
+    regex = "[\u4e00-\u9fa5]+"
+    chinese_word = get_info(content, regex)
+    return chinese_word and True or False
+
+
+def is_have_english(content):
+    regex = "[a-zA-Z]+"
+    english_words = get_info(content, regex)
+    return english_words and True or False
+
+
+def get_chinese_word(content):
+    regex = "[\u4e00-\u9fa5]+"
+    chinese_word = get_info(content, regex)
+    return chinese_word
+
+
+def get_english_words(content):
+    regex = "[a-zA-Z]+"
+    english_words = get_info(content, regex)
+    return english_words or ""
+
+
+##################################################
+def get_json(json_str):
+    """
+    @summary: 取json对象
+    ---------
+    @param json_str: json格式的字符串
+    ---------
+    @result: 返回json对象
+    """
+
+    try:
+        return json.loads(json_str) if json_str else {}
+    except Exception as e1:
+        try:
+            json_str = json_str.strip()
+            json_str = json_str.replace("'", '"')
+            keys = get_info(json_str, "(\w+):")
+            for key in keys:
+                json_str = json_str.replace(key, '"%s"' % key)
+
+            return json.loads(json_str) if json_str else {}
+
+        except Exception as e2:
+            log.error(
+                """
+                e1: %s
+                format json_str: %s
+                e2: %s
+                """
+                % (e1, json_str, e2)
+            )
+
+        return {}
+
+
+def jsonp2json(jsonp):
+    """
+    将jsonp转为json
+    @param jsonp: jQuery172013600082560040794_1553230569815({})
+    @return:
+    """
+    try:
+        return json.loads(re.match(".*?({.*}).*", jsonp, re.S).group(1))
+    except:
+        raise ValueError("Invalid Input")
+
+
+def dumps_json(json_, indent=4, sort_keys=False):
+    """
+    @summary: 格式化json 用于打印
+    ---------
+    @param json_: json格式的字符串或json对象
+    ---------
+    @result: 格式化后的字符串
+    """
+    try:
+        if isinstance(json_, str):
+            json_ = get_json(json_)
+
+        json_ = json.dumps(
+            json_, ensure_ascii=False, indent=indent, skipkeys=True, sort_keys=sort_keys
+        )
+
+    except Exception as e:
+        log.error(e)
+        json_ = pformat(json_)
+
+    return json_
+
+
+def get_json_value(json_object, key):
+    """
+    @summary:
+    ---------
+    @param json_object: json对象或json格式的字符串
+    @param key: 建值 如果在多个层级目录下 可写 key1.key2  如{'key1':{'key2':3}}
+    ---------
+    @result: 返回对应的值,如果没有,返回''
+    """
+    current_key = ""
+    value = ""
+    try:
+        json_object = (
+            isinstance(json_object, str) and get_json(json_object) or json_object
+        )
+
+        current_key = key.split(".")[0]
+        value = json_object[current_key]
+
+        key = key[key.find(".") + 1 :]
+    except Exception as e:
+        return value
+
+    if key == current_key:
+        return value
+    else:
+        return get_json_value(value, key)
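+
+# Example (illustrative): nested keys are addressed with a dotted path; missing keys fall back to ''.
+# get_json_value({"key1": {"key2": 3}}, "key1.key2")  # -> 3
+# get_json_value({"key1": {"key2": 3}}, "key1.key3")  # -> ''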
+
+
+def get_all_keys(datas, depth=None, current_depth=0):
+    """
+    @summary: 获取json里所有的key
+    ---------
+    @param datas: dict / list
+    @param depth: 字典key的层级 默认不限制层级 层级从1开始
+    @param current_depth: 字典key的当前层级 不用传参
+    ---------
+    @result: 返回json所有的key
+    """
+
+    keys = []
+    if depth and current_depth >= depth:
+        return keys
+
+    if isinstance(datas, list):
+        for data in datas:
+            keys.extend(get_all_keys(data, depth, current_depth=current_depth + 1))
+    elif isinstance(datas, dict):
+        for key, value in datas.items():
+            keys.append(key)
+            if isinstance(value, dict):
+                keys.extend(get_all_keys(value, depth, current_depth=current_depth + 1))
+
+    return keys
+
+
+def to_chinese(unicode_str):
+    format_str = json.loads('{"chinese":"%s"}' % unicode_str)
+    return format_str["chinese"]
+
+
+##################################################
+def replace_str(source_str, regex, replace_str=""):
+    """
+    @summary: 替换字符串
+    ---------
+    @param source_str: 原字符串
+    @param regex: 正则
+    @param replace_str: 用什么来替换 默认为''
+    ---------
+    @result: 返回替换后的字符串
+    """
+    str_info = re.compile(regex)
+    return str_info.sub(replace_str, source_str)
+
+
+def del_redundant_blank_character(text):
+    """
+    删除冗余的空白符, 只保留一个
+    :param text:
+    :return:
+    """
+    return re.sub("\s+", " ", text)
+
+
+##################################################
+def get_conf_value(config_file, section, key):
+    cp = configparser.ConfigParser(allow_no_value=True)
+    with codecs.open(config_file, "r", encoding="utf-8") as f:
+        cp.read_file(f)
+    return cp.get(section, key)
+
+
+def mkdir(path):
+    try:
+        if not os.path.exists(path):
+            os.makedirs(path)
+    except OSError as exc:  # Python >2.5
+        pass
+
+
+def write_file(filename, content, mode="w", encoding="utf-8"):
+    """
+    @summary: 写文件
+    ---------
+    @param filename: 文件名(有路径)
+    @param content: 内容
+    @param mode: 模式 w/a (覆盖/追加)
+    ---------
+    @result:
+    """
+
+    directory = os.path.dirname(filename)
+    mkdir(directory)
+    with open(filename, mode, encoding=encoding) as file:
+        file.writelines(content)
+
+
+def read_file(filename, readlines=False, encoding="utf-8"):
+    """
+    @summary: 读文件
+    ---------
+    @param filename: 文件名(有路径)
+    @param readlines: 按行读取 (默认False)
+    ---------
+    @result: 按行读取返回List,否则返回字符串
+    """
+
+    content = None
+    try:
+        with open(filename, "r", encoding=encoding) as file:
+            content = file.readlines() if readlines else file.read()
+    except Exception as e:
+        log.error(e)
+
+    return content
+
+
+def get_oss_file_list(oss_handler, prefix, date_range_min, date_range_max=None):
+    """
+    获取文件列表
+    @param prefix: 路径前缀 如 data/car_service_line/yiche/yiche_serial_zongshu_info
+    @param date_range_min: 时间范围 最小值 日期分隔符为/ 如 2019/03/01 或 2019/03/01/00/00/00
+    @param date_range_max: 时间范围 最大值 日期分隔符为/ 如 2019/03/01 或 2019/03/01/00/00/00
+    @return: 每个文件路径 如 html/e_commerce_service_line/alibaba/alibaba_shop_info/2019/03/22/15/53/15/8ca8b9e4-4c77-11e9-9dee-acde48001122.json.snappy
+    """
+
+    # 计算时间范围
+    date_range_max = date_range_max or date_range_min
+    date_format = "/".join(
+        ["%Y", "%m", "%d", "%H", "%M", "%S"][: date_range_min.count("/") + 1]
+    )
+    time_interval = [
+        {"days": 365},
+        {"days": 31},
+        {"days": 1},
+        {"hours": 1},
+        {"minutes": 1},
+        {"seconds": 1},
+    ][date_range_min.count("/")]
+    date_range = get_between_date(
+        date_range_min, date_range_max, date_format=date_format, **time_interval
+    )
+
+    for date in date_range:
+        file_folder_path = os.path.join(prefix, date)
+        objs = oss_handler.list(prefix=file_folder_path)
+        for obj in objs:
+            filename = obj.key
+            yield filename
+
+
+def is_html(url):
+    if not url:
+        return False
+
+    try:
+        content_type = request.urlopen(url).info().get("Content-Type", "")
+
+        if "text/html" in content_type:
+            return True
+        else:
+            return False
+    except Exception as e:
+        log.error(e)
+        return False
+
+
+def is_exist(file_path):
+    """
+    @summary: 文件是否存在
+    ---------
+    @param file_path:
+    ---------
+    @result:
+    """
+
+    return os.path.exists(file_path)
+
+
+def download_file(url, file_path, *, call_func=None, proxies=None, data=None):
+    """
+    下载文件,会自动创建文件存储目录
+    Args:
+        url: 地址
+        file_path: 文件存储地址
+        call_func: 下载成功的回调
+        proxies: 代理
+        data: 请求体
+
+    Returns:
+
+    """
+    directory = os.path.dirname(file_path)
+    mkdir(directory)
+
+    # 进度条
+    def progress_callfunc(blocknum, blocksize, totalsize):
+        """回调函数
+        @blocknum : 已经下载的数据块
+        @blocksize : 数据块的大小
+        @totalsize: 远程文件的大小
+        """
+        percent = 100.0 * blocknum * blocksize / totalsize if totalsize > 0 else 0
+        if percent > 100:
+            percent = 100
+        # print ('进度条 %.2f%%' % percent, end = '\r')
+        sys.stdout.write("进度条 %.2f%%" % percent + "\r")
+        sys.stdout.flush()
+
+    if url:
+        try:
+            if proxies:
+                # create the object, assign it to a variable
+                proxy = request.ProxyHandler(proxies)
+                # construct a new opener using your proxy settings
+                opener = request.build_opener(proxy)
+                # install the opener on the module-level
+                request.install_opener(opener)
+
+            request.urlretrieve(url, file_path, progress_callfunc, data)
+
+            if callable(call_func):
+                call_func()
+            return 1
+        except Exception as e:
+            log.error(e)
+            return 0
+    else:
+        return 0
+
+
+def get_file_list(path, ignore=[]):
+    templist = path.split("*")
+    path = templist[0]
+    file_type = templist[1] if len(templist) >= 2 else ""
+
+    # 递归遍历文件
+    def get_file_list_(path, file_type, ignore, all_file=None):
+        # 避免可变默认参数在多次调用间共享累积
+        all_file = all_file if all_file is not None else []
+        file_list = os.listdir(path)
+
+        for file_name in file_list:
+            if file_name in ignore:
+                continue
+
+            file_path = os.path.join(path, file_name)
+            if os.path.isdir(file_path):
+                get_file_list_(file_path, file_type, ignore, all_file)
+            else:
+                if not file_type or file_name.endswith(file_type):
+                    all_file.append(file_path)
+
+        return all_file
+
+    return get_file_list_(path, file_type, ignore) if os.path.isdir(path) else [path]
+
+
+def rename_file(old_name, new_name):
+    os.rename(old_name, new_name)
+
+
+def del_file(path, ignore=()):
+    files = get_file_list(path, ignore)
+    for file in files:
+        try:
+            os.remove(file)
+        except Exception as e:
+            log.error(
+                """
+                删除出错: %s
+                Exception : %s
+                """
+                % (file, str(e))
+            )
+        finally:
+            pass
+
+
+def get_file_type(file_name):
+    """
+    @summary: 取文件后缀名
+    ---------
+    @param file_name:
+    ---------
+    @result:
+    """
+    try:
+        return os.path.splitext(file_name)[1]
+    except Exception as e:
+        log.exception(e)
+
+
+def get_file_path(file_path):
+    """
+    @summary: 取文件路径
+    ---------
+    @param file_path: /root/a.py
+    ---------
+    @result: /root
+    """
+    try:
+        return os.path.split(file_path)[0]
+    except Exception as e:
+        log.exception(e)
+
+
+#############################################
+
+
+def exec_js(js_code):
+    """
+    @summary: 执行js代码
+    ---------
+    @param js_code: js代码
+    ---------
+    @result: 返回执行结果
+    """
+
+    return execjs.eval(js_code)
+
+
+def compile_js(js_func):
+    """
+    @summary: 编译js函数
+    ---------
+    @param js_func:js函数
+    ---------
+    @result: 返回函数对象 调用 fun('js_funName', param1,param2)
+    """
+
+    ctx = execjs.compile(js_func)
+    return ctx.call
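+
+# Example (illustrative, assumes execjs has a local JS runtime such as node available):
+# call = compile_js("function add(a, b) { return a + b; }")
+# call("add", 1, 2)  # -> 3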
+
+
+###############################################
+
+#############################################
+
+
+def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
+    """
+    @summary:
+    ---------
+    @param date:将"2011-09-28 10:00:00"时间格式转化为时间戳
+    @param time_format: 时间格式
+    ---------
+    @result: 返回时间戳
+    """
+
+    timestamp = time.mktime(time.strptime(date, time_format))
+    return int(timestamp)
+
+
+def timestamp_to_date(timestamp, time_format="%Y-%m-%d %H:%M:%S"):
+    """
+    @summary:
+    ---------
+    @param timestamp: 将时间戳转化为日期
+    @param time_format: 日期格式
+    ---------
+    @result: 返回日期
+    """
+    if timestamp is None:
+        raise ValueError("timestamp is null")
+
+    date = time.localtime(timestamp)
+    return time.strftime(time_format, date)
+
+
+def get_current_timestamp():
+    return int(time.time())
+
+
+def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
+    return datetime.datetime.now().strftime(date_format)
+    # return time.strftime(date_format, time.localtime(time.time()))
+
+
+def get_date_number(year=None, month=None, day=None):
+    """
+    @summary: 获取指定日期对应的日期数
+    默认当前周
+    ---------
+    @param year: 2010
+    @param month: 6
+    @param day: 16
+    ---------
+    @result: (年号,第几周,第几天) 如 (2010, 24, 3)
+    """
+    if year and month and day:
+        return datetime.date(year, month, day).isocalendar()
+    elif not any([year, month, day]):
+        return datetime.datetime.now().isocalendar()
+    else:
+        assert year, "year 不能为空"
+        assert month, "month 不能为空"
+        assert day, "day 不能为空"
+
+
+def get_between_date(
+    begin_date, end_date=None, date_format="%Y-%m-%d", **time_interval
+):
+    """
+    @summary: 获取一段时间间隔内的日期,默认为每一天
+    ---------
+    @param begin_date: 开始日期 str 如 2018-10-01
+    @param end_date: 默认为今日
+    @param date_format: 日期格式,应与begin_date的日期格式相对应
+    @param time_interval: 时间间隔 默认一天 支持 days、seconds、microseconds、milliseconds、minutes、hours、weeks
+    ---------
+    @result: list 值为字符串
+    """
+
+    date_list = []
+
+    begin_date = datetime.datetime.strptime(begin_date, date_format)
+    end_date = (
+        datetime.datetime.strptime(end_date, date_format)
+        if end_date
+        else datetime.datetime.strptime(
+            time.strftime(date_format, time.localtime(time.time())), date_format
+        )
+    )
+    time_interval = time_interval or dict(days=1)
+
+    while begin_date <= end_date:
+        date_str = begin_date.strftime(date_format)
+        date_list.append(date_str)
+
+        begin_date += datetime.timedelta(**time_interval)
+
+    if end_date.strftime(date_format) not in date_list:
+        date_list.append(end_date.strftime(date_format))
+
+    return date_list
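+
+# Example (illustrative):
+# get_between_date("2021-01-01", "2021-01-03")
+# -> ['2021-01-01', '2021-01-02', '2021-01-03']
+# get_between_date("2021/01/01/00", "2021/01/01/02", date_format="%Y/%m/%d/%H", hours=1)
+# -> ['2021/01/01/00', '2021/01/01/01', '2021/01/01/02']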
+
+
+def get_between_months(begin_date, end_date=None):
+    """
+    @summary: 获取一段时间间隔内的月份
+    需要满一整月
+    ---------
+    @param begin_date: 开始时间 如 2018-01-01
+    @param end_date: 默认当前时间
+    ---------
+    @result: 列表 如 ['2018-01', '2018-02']
+    """
+
+    def add_months(dt, months):
+        month = dt.month - 1 + months
+        year = dt.year + month // 12
+        month = month % 12 + 1
+        day = min(dt.day, calendar.monthrange(year, month)[1])
+        return dt.replace(year=year, month=month, day=day)
+
+    date_list = []
+    begin_date = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
+    end_date = (
+        datetime.datetime.strptime(end_date, "%Y-%m-%d")
+        if end_date
+        else datetime.datetime.strptime(
+            time.strftime("%Y-%m-%d", time.localtime(time.time())), "%Y-%m-%d"
+        )
+    )
+    while begin_date <= end_date:
+        date_str = begin_date.strftime("%Y-%m")
+        date_list.append(date_str)
+        begin_date = add_months(begin_date, 1)
+    return date_list
+
+
+def get_today_of_day(day_offset=0):
+    return str(datetime.date.today() + datetime.timedelta(days=day_offset))
+
+
+def get_days_of_month(year, month):
+    """
+    返回天数
+    """
+
+    return calendar.monthrange(year, month)[1]
+
+
+def get_firstday_of_month(date):
+    """''
+    date format = "YYYY-MM-DD"
+    """
+
+    year, month, day = date.split("-")
+    year, month, day = int(year), int(month), int(day)
+
+    days = "01"
+    if int(month) < 10:
+        month = "0" + str(int(month))
+    arr = (year, month, days)
+    return "-".join("%s" % i for i in arr)
+
+
+def get_lastday_of_month(date):
+    """''
+    get the last day of month
+    date format = "YYYY-MM-DD"
+    """
+    year, month, day = date.split("-")
+    year, month, day = int(year), int(month), int(day)
+
+    days = calendar.monthrange(year, month)[1]
+    month = add_zero(month)
+    arr = (year, month, days)
+    return "-".join("%s" % i for i in arr)
+
+
+def get_firstday_month(month_offset=0):
+    """''
+    get the first day of month from today
+    month_offset is how many months
+    """
+    (y, m, d) = get_year_month_and_days(month_offset)
+    d = "01"
+    arr = (y, m, d)
+    return "-".join("%s" % i for i in arr)
+
+
+def get_lastday_month(month_offset=0):
+    """''
+    get the last day of month from today
+    month_offset is how many months
+    """
+    return "-".join("%s" % i for i in get_year_month_and_days(month_offset))
+
+
+def get_last_month(month_offset=0):
+    """''
+    get the last day of month from today
+    month_offset is how many months
+    """
+    return "-".join("%s" % i for i in get_year_month_and_days(month_offset)[:2])
+
+
+def get_year_month_and_days(month_offset=0):
+    """
+    @summary:
+    ---------
+    @param month_offset: 月份偏移量
+    ---------
+    @result: ('2019', '04', '30')
+    """
+
+    today = datetime.datetime.now()
+    year, month = today.year, today.month
+
+    this_year = int(year)
+    this_month = int(month)
+    total_month = this_month + month_offset
+    if month_offset >= 0:
+        if total_month <= 12:
+            days = str(get_days_of_month(this_year, total_month))
+            total_month = add_zero(total_month)
+            return (year, total_month, days)
+        else:
+            i = total_month // 12
+            j = total_month % 12
+            if j == 0:
+                i -= 1
+                j = 12
+            this_year += i
+            days = str(get_days_of_month(this_year, j))
+            j = add_zero(j)
+            return (str(this_year), str(j), days)
+    else:
+        if (total_month > 0) and (total_month < 12):
+            days = str(get_days_of_month(this_year, total_month))
+            total_month = add_zero(total_month)
+            return (year, total_month, days)
+        else:
+            i = total_month // 12
+            j = total_month % 12
+            if j == 0:
+                i -= 1
+                j = 12
+            this_year += i
+            days = str(get_days_of_month(this_year, j))
+            j = add_zero(j)
+            return (str(this_year), str(j), days)
+
+
+def add_zero(n):
+    return "%02d" % n
+
+
+def get_month(month_offset=0):
+    """''
+    获取当前日期前后N月的日期
+    if month_offset>0, 获取当前日期前N月的日期
+    if month_offset<0, 获取当前日期后N月的日期
+    date format = "YYYY-MM-DD"
+    """
+    today = datetime.datetime.now()
+    day = add_zero(today.day)
+
+    (y, m, d) = get_year_month_and_days(month_offset)
+    arr = (y, m, d)
+    if int(day) < int(d):
+        arr = (y, m, day)
+    return "-".join("%s" % i for i in arr)
+
+
+@run_safe_model("format_date")
+def format_date(date, old_format="", new_format="%Y-%m-%d %H:%M:%S"):
+    """
+    @summary: 格式化日期格式
+    ---------
+    @param date: 日期 eg:2017年4月17日 3时27分12秒
+    @param old_format: 原来的日期格式 如 '%Y年%m月%d日 %H时%M分%S秒'
+        %y 两位数的年份表示(00-99)
+        %Y 四位数的年份表示(000-9999)
+        %m 月份(01-12)
+        %d 月内中的一天(0-31)
+        %H 24小时制小时数(0-23)
+        %I 12小时制小时数(01-12)
+        %M 分钟数(00-59)
+        %S 秒(00-59)
+    @param new_format: 输出的日期格式
+    ---------
+    @result: 格式化后的日期,类型为字符串 如2017-4-17 03:27:12
+    """
+    if not date:
+        return ""
+
+    if not old_format:
+        regex = "(\d+)"
+        numbers = get_info(date, regex, allow_repeat=True)
+        formats = ["%Y", "%m", "%d", "%H", "%M", "%S"]
+        old_format = date
+        for i, number in enumerate(numbers[:6]):
+            if i == 0 and len(number) == 2:  # 年份可能是两位 用小%y
+                old_format = old_format.replace(
+                    number, formats[i].lower(), 1
+                )  # 替换一次 '2017年11月30日 11:49' 防止替换11月时,替换11小时
+            else:
+                old_format = old_format.replace(number, formats[i], 1)  # 替换一次
+
+    try:
+        date_obj = datetime.datetime.strptime(date, old_format)
+        if "T" in date and "Z" in date:
+            date_obj += datetime.timedelta(hours=8)
+            date_str = date_obj.strftime("%Y-%m-%d %H:%M:%S")
+        else:
+            date_str = datetime.datetime.strftime(date_obj, new_format)
+
+    except Exception as e:
+        log.error("日期格式化出错,old_format = %s 不符合 %s 格式" % (old_format, date))
+        date_str = date
+
+    return date_str
+
+
+def transform_lower_num(data_str: str):
+    num_map = {
+        "一": "1",
+        "二": "2",
+        "三": "3",
+        "四": "4",
+        "五": "5",
+        "六": "6",
+        "七": "7",
+        "八": "8",
+        "九": "9",
+        "十": "0",
+    }
+    pattern = f'[{"|".join(num_map.keys())}|零]'
+    res = re.search(pattern, data_str)
+    if not res:
+        #  如果字符串中没有包含中文数字 不做处理 直接返回
+        return data_str
+
+    data_str = data_str.replace("0", "零")
+    for n in num_map:
+        data_str = data_str.replace(n, num_map[n])
+
+    re_data_str = re.findall("\d+", data_str)
+    for i in re_data_str:
+        if len(i) == 3:
+            new_i = i.replace("0", "")
+            data_str = data_str.replace(i, new_i, 1)
+        elif len(i) == 4:
+            new_i = i.replace("10", "")
+            data_str = data_str.replace(i, new_i, 1)
+        elif len(i) == 2 and int(i) < 10:
+            new_i = int(i) + 10
+            data_str = data_str.replace(i, str(new_i), 1)
+        elif len(i) == 1 and int(i) == 0:
+            new_i = int(i) + 10
+            data_str = data_str.replace(i, str(new_i), 1)
+
+    return data_str.replace("零", "0")
+
+
+@run_safe_model("format_time")
+def format_time(release_time, date_format="%Y-%m-%d %H:%M:%S"):
+    """
+    >>> format_time("2个月前")
+    '2021-08-15 16:24:21'
+    >>> format_time("2月前")
+    '2021-08-15 16:24:36'
+    """
+    release_time = transform_lower_num(release_time)
+    release_time = release_time.replace("日", "天").replace("/", "-")
+
+    if "年前" in release_time:
+        years = re.compile("(\d+)\s*年前").findall(release_time)
+        years_ago = datetime.datetime.now() - datetime.timedelta(
+            days=int(years[0]) * 365
+        )
+        release_time = years_ago.strftime("%Y-%m-%d %H:%M:%S")
+
+    elif "月前" in release_time:
+        months = re.compile("(\d+)[\s个]*月前").findall(release_time)
+        months_ago = datetime.datetime.now() - datetime.timedelta(
+            days=int(months[0]) * 30
+        )
+        release_time = months_ago.strftime("%Y-%m-%d %H:%M:%S")
+
+    elif "周前" in release_time:
+        weeks = re.compile("(\d+)\s*周前").findall(release_time)
+        weeks_ago = datetime.datetime.now() - datetime.timedelta(days=int(weeks[0]) * 7)
+        release_time = weeks_ago.strftime("%Y-%m-%d %H:%M:%S")
+
+    elif "天前" in release_time:
+        ndays = re.compile("(\d+)\s*天前").findall(release_time)
+        days_ago = datetime.datetime.now() - datetime.timedelta(days=int(ndays[0]))
+        release_time = days_ago.strftime("%Y-%m-%d %H:%M:%S")
+
+    elif "小时前" in release_time:
+        nhours = re.compile("(\d+)\s*小时前").findall(release_time)
+        hours_ago = datetime.datetime.now() - datetime.timedelta(hours=int(nhours[0]))
+        release_time = hours_ago.strftime("%Y-%m-%d %H:%M:%S")
+
+    elif "分钟前" in release_time:
+        nminutes = re.compile("(\d+)\s*分钟前").findall(release_time)
+        minutes_ago = datetime.datetime.now() - datetime.timedelta(
+            minutes=int(nminutes[0])
+        )
+        release_time = minutes_ago.strftime("%Y-%m-%d %H:%M:%S")
+
+    elif "前天" in release_time:
+        today = datetime.date.today()
+        yesterday = today - datetime.timedelta(days=2)
+        release_time = release_time.replace("前天", str(yesterday))
+
+    elif "昨天" in release_time:
+        today = datetime.date.today()
+        yesterday = today - datetime.timedelta(days=1)
+        release_time = release_time.replace("昨天", str(yesterday))
+
+    elif "今天" in release_time:
+        release_time = release_time.replace("今天", get_current_date("%Y-%m-%d"))
+
+    elif "刚刚" in release_time:
+        release_time = get_current_date()
+
+    elif re.search("^\d\d:\d\d", release_time):
+        release_time = get_current_date("%Y-%m-%d") + " " + release_time
+
+    elif not re.compile("\d{4}").findall(release_time):
+        month = re.compile("\d{1,2}").findall(release_time)
+        if month and int(month[0]) <= int(get_current_date("%m")):
+            release_time = get_current_date("%Y") + "-" + release_time
+        else:
+            release_time = str(int(get_current_date("%Y")) - 1) + "-" + release_time
+
+    # 把日和小时粘在一起的拆开
+    template = re.compile("(\d{4}-\d{1,2}-\d{2})(\d{1,2})")
+    release_time = re.sub(template, r"\1 \2", release_time)
+    release_time = format_date(release_time, new_format=date_format)
+
+    return release_time
+
+
+def to_date(date_str, date_format="%Y-%m-%d %H:%M:%S"):
+    return datetime.datetime.strptime(date_str, date_format)
+
+
+def get_before_date(
+    current_date,
+    days,
+    current_date_format="%Y-%m-%d %H:%M:%S",
+    return_date_format="%Y-%m-%d %H:%M:%S",
+):
+    """
+    @summary: 获取之前时间
+    ---------
+    @param current_date: 当前时间 str类型
+    @param days: 时间间隔 -1 表示前一天 1 表示后一天
+    @param return_date_format: 返回的时间格式
+    ---------
+    @result: 字符串
+    """
+
+    current_date = to_date(current_date, current_date_format)
+    date_obj = current_date + datetime.timedelta(days=days)
+    return datetime.datetime.strftime(date_obj, return_date_format)
+
+
+def delay_time(sleep_time=60):
+    """
+    @summary: 睡眠  默认1分钟
+    ---------
+    @param sleep_time: 以秒为单位
+    ---------
+    @result:
+    """
+
+    time.sleep(sleep_time)
+
+
+def format_seconds(seconds):
+    """
+    @summary: 将秒转为时分秒
+    ---------
+    @param seconds:
+    ---------
+    @result: 2天3小时2分49秒
+    """
+
+    seconds = int(seconds + 0.5)  # 四舍五入取整
+
+    m, s = divmod(seconds, 60)
+    h, m = divmod(m, 60)
+    d, h = divmod(h, 24)
+
+    times = ""
+    if d:
+        times += "{}天".format(d)
+    if h:
+        times += "{}小时".format(h)
+    if m:
+        times += "{}分".format(m)
+    if s:
+        times += "{}秒".format(s)
+
+    return times
+
+
+################################################
+def get_md5(*args):
+    """
+    @summary: 获取唯一的32位md5
+    ---------
+    @param *args: 参与联合去重的值
+    ---------
+    @result: 7c8684bcbdfcea6697650aa53d7b1405
+    """
+
+    m = hashlib.md5()
+    for arg in args:
+        m.update(str(arg).encode())
+
+    return m.hexdigest()
+
+
+def get_sha1(*args):
+    """
+    @summary: 获取唯一的40位值, 用于获取唯一的id
+    ---------
+    @param *args: 参与联合去重的值
+    ---------
+    @result: ba4868b3f277c8e387b55d9e3d0be7c045cdd89e
+    """
+
+    sha1 = hashlib.sha1()
+    for arg in args:
+        sha1.update(str(arg).encode())
+    return sha1.hexdigest()  # 40位
+
+
+def get_base64(secret, message):
+    """
+    @summary: 数字证书签名算法是:"HMAC-SHA256"
+              参考:https://www.jokecamp.com/blog/examples-of-creating-base64-hashes-using-hmac-sha256-in-different-languages/
+    ---------
+    @param secret: 秘钥
+    @param message: 消息
+    ---------
+    @result: 签名输出类型是:"base64"
+    """
+
+    import hashlib
+    import hmac
+    import base64
+
+    message = bytes(message, "utf-8")
+    secret = bytes(secret, "utf-8")
+
+    signature = base64.b64encode(
+        hmac.new(secret, message, digestmod=hashlib.sha256).digest()
+    ).decode("utf8")
+    return signature
+
+
+def get_uuid(key1="", key2=""):
+    """
+    @summary: 计算uuid值
+    可用于将两个字符串组成唯一的值。如可将域名和新闻标题组成uuid,形成联合索引
+    ---------
+    @param key1:str
+    @param key2:str
+    ---------
+    @result:
+    """
+
+    uuid_object = ""
+
+    if not key1 and not key2:
+        uuid_object = uuid.uuid1()
+    else:
+        hash = md5(bytes(key1, "utf-8") + bytes(key2, "utf-8")).digest()
+        uuid_object = uuid.UUID(bytes=hash[:16], version=3)
+
+    return str(uuid_object)
+
+
+def get_hash(text):
+    return hash(text)
+
+
+##################################################
+
+
+def cut_string(text, length):
+    """
+    @summary: 将文本按指定长度拆分
+    ---------
+    @param text: 文本
+    @param length: 拆分长度
+    ---------
+    @result: 返回按指定长度拆分后形成的list
+    """
+
+    text_list = re.findall(".{%d}" % length, text, re.S)
+    leave_text = text[len(text_list) * length :]
+    if leave_text:
+        text_list.append(leave_text)
+
+    return text_list
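+
+# Example (illustrative): the tail shorter than length is kept as the last piece.
+# cut_string("abcdefg", 3)  # -> ['abc', 'def', 'g']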
+
+
+def get_random_string(length=1):
+    random_string = "".join(random.sample(string.ascii_letters + string.digits, length))
+    return random_string
+
+
+def get_random_password(length=8, special_characters=""):
+    """
+    @summary: 创建随机密码 默认长度为8,包含大写字母、小写字母、数字
+    ---------
+    @param length: 密码长度 默认8
+    @param special_characters: 特殊字符
+    ---------
+    @result: 指定长度的密码
+    """
+
+    while True:
+        random_password = "".join(
+            random.sample(
+                string.ascii_letters + string.digits + special_characters, length
+            )
+        )
+        if (
+            re.search("[0-9]", random_password)
+            and re.search("[A-Z]", random_password)
+            and re.search("[a-z]", random_password)
+        ):
+            if not special_characters:
+                break
+            elif set(random_password).intersection(special_characters):
+                break
+
+    return random_password
+
+
+def get_random_email(length=None, email_types: list = None, special_characters=""):
+    """
+    随机生成邮箱
+    :param length: 邮箱长度
+    :param email_types: 邮箱类型
+    :param special_characters: 特殊字符
+    :return:
+    """
+    if not length:
+        length = random.randint(4, 12)
+    if not email_types:
+        email_types = [
+            "qq.com",
+            "163.com",
+            "gmail.com",
+            "yahoo.com",
+            "hotmail.com",
+            "yeah.net",
+            "126.com",
+            "139.com",
+            "sohu.com",
+        ]
+
+    email_body = get_random_password(length, special_characters)
+    email_type = random.choice(email_types)
+
+    email = email_body + "@" + email_type
+    return email
+
+
+#################################
+
+
+def dumps_obj(obj):
+    return pickle.dumps(obj)
+
+
+def loads_obj(obj_str):
+    return pickle.loads(obj_str)
+
+
+def get_method(obj, name):
+    name = str(name)
+    try:
+        return getattr(obj, name)
+    except AttributeError:
+        log.error("Method %r not found in: %s" % (name, obj))
+        return None
+
+
+def witch_workspace(project_path):
+    """
+    @summary:
+    ---------
+    @param project_path:
+    ---------
+    @result:
+    """
+
+    os.chdir(project_path)  # 切换工作路径
+
+
+############### 数据库相关 #######################
+def format_sql_value(value):
+    if isinstance(value, str):
+        value = value.strip()
+
+    elif isinstance(value, (list, dict)):
+        value = dumps_json(value, indent=None)
+
+    elif isinstance(value, (datetime.date, datetime.time)):
+        value = str(value)
+
+    elif isinstance(value, bool):
+        value = int(value)
+
+    return value
+
+
+def list2str(datas):
+    """
+    列表转字符串
+    :param datas: [1, 2]
+    :return: (1, 2)
+    """
+    data_str = str(tuple(datas))
+    data_str = re.sub(",\)$", ")", data_str)
+    return data_str
+
+
+def make_insert_sql(
+    table, data, auto_update=False, update_columns=(), insert_ignore=False
+):
+    """
+    @summary: 适用于mysql, oracle数据库时间需要to_date 处理(TODO)
+    ---------
+    @param table:
+    @param data: 表数据 json格式
+    @param auto_update: 使用的是replace into, 为完全覆盖已存在的数据
+    @param update_columns: 需要更新的列 默认全部,当指定值时,auto_update设置无效,当duplicate key冲突时更新指定的列
+    @param insert_ignore: 数据存在忽略
+    ---------
+    @result:
+    """
+
+    keys = ["`{}`".format(key) for key in data.keys()]
+    keys = list2str(keys).replace("'", "")
+
+    values = [format_sql_value(value) for value in data.values()]
+    values = list2str(values)
+
+    if update_columns:
+        if not isinstance(update_columns, (tuple, list)):
+            update_columns = [update_columns]
+        update_columns_ = ", ".join(
+            ["{key}=values({key})".format(key=key) for key in update_columns]
+        )
+        sql = (
+            "insert%s into `{table}` {keys} values {values} on duplicate key update %s"
+            % (" ignore" if insert_ignore else "", update_columns_)
+        )
+
+    elif auto_update:
+        sql = "replace into `{table}` {keys} values {values}"
+    else:
+        sql = "insert%s into `{table}` {keys} values {values}" % (
+            " ignore" if insert_ignore else ""
+        )
+
+    sql = sql.format(table=table, keys=keys, values=values).replace("None", "null")
+    return sql
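+
+# Example (illustrative):
+# make_insert_sql("user", {"id": 1, "name": "feapder"})
+# -> "insert into `user` (`id`, `name`) values (1, 'feapder')"
+# make_insert_sql("user", {"id": 1, "name": "feapder"}, update_columns=["name"])
+# -> "insert into `user` (`id`, `name`) values (1, 'feapder') on duplicate key update name=values(name)"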
+
+
+def make_update_sql(table, data, condition):
+    """
+    @summary: 适用于mysql, oracle数据库时间需要to_date 处理(TODO)
+    ---------
+    @param table:
+    @param data: 表数据 json格式
+    @param condition: where 条件
+    ---------
+    @result:
+    """
+    key_values = []
+
+    for key, value in data.items():
+        value = format_sql_value(value)
+        if isinstance(value, str):
+            key_values.append("`{}`={}".format(key, repr(value)))
+        elif value is None:
+            key_values.append("`{}`={}".format(key, "null"))
+        else:
+            key_values.append("`{}`={}".format(key, value))
+
+    key_values = ", ".join(key_values)
+
+    sql = "update `{table}` set {key_values} where {condition}"
+    sql = sql.format(table=table, key_values=key_values, condition=condition)
+    return sql
+
+
+def make_batch_sql(
+    table, datas, auto_update=False, update_columns=(), update_columns_value=()
+):
+    """
+    @summary: 生产批量的sql
+    ---------
+    @param table:
+    @param datas: 表数据 [{...}]
+    @param auto_update: 使用的是replace into, 为完全覆盖已存在的数据
+    @param update_columns: 需要更新的列 默认全部,当指定值时,auto_update设置无效,当duplicate key冲突时更新指定的列
+    @param update_columns_value: 需要更新的列的值 默认为datas里边对应的值, 注意 如果值为字符串类型 需要主动加单引号, 如 update_columns_value=("'test'",)
+    ---------
+    @result:
+    """
+    if not datas:
+        return
+
+    keys = list(datas[0].keys())
+    values_placeholder = ["%s"] * len(keys)
+
+    values = []
+    for data in datas:
+        value = []
+        for key in keys:
+            current_data = data.get(key)
+            current_data = format_sql_value(current_data)
+
+            value.append(current_data)
+
+        values.append(value)
+
+    keys = ["`{}`".format(key) for key in keys]
+    keys = list2str(keys).replace("'", "")
+
+    values_placeholder = list2str(values_placeholder).replace("'", "")
+
+    if update_columns:
+        if not isinstance(update_columns, (tuple, list)):
+            update_columns = [update_columns]
+        if update_columns_value:
+            update_columns_ = ", ".join(
+                [
+                    "`{key}`={value}".format(key=key, value=value)
+                    for key, value in zip(update_columns, update_columns_value)
+                ]
+            )
+        else:
+            update_columns_ = ", ".join(
+                ["`{key}`=values(`{key}`)".format(key=key) for key in update_columns]
+            )
+        sql = "insert into `{table}` {keys} values {values_placeholder} on duplicate key update {update_columns}".format(
+            table=table,
+            keys=keys,
+            values_placeholder=values_placeholder,
+            update_columns=update_columns_,
+        )
+    elif auto_update:
+        sql = "replace into `{table}` {keys} values {values_placeholder}".format(
+            table=table, keys=keys, values_placeholder=values_placeholder
+        )
+    else:
+        sql = "insert ignore into `{table}` {keys} values {values_placeholder}".format(
+            table=table, keys=keys, values_placeholder=values_placeholder
+        )
+
+    return sql, values
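+
+# Example (illustrative): returns a parameterized sql plus the value rows, intended for cursor.executemany.
+# sql, values = make_batch_sql("user", [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}])
+# sql    -> "insert ignore into `user` (`id`, `name`) values (%s, %s)"
+# values -> [[1, 'a'], [2, 'b']]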
+
+
+############### json相关 #######################
+
+
+def key2underline(key: str, strict=True):
+    """
+    >>> key2underline("HelloWord")
+    'hello_word'
+    >>> key2underline("SHData", strict=True)
+    's_h_data'
+    >>> key2underline("SHData", strict=False)
+    'sh_data'
+    >>> key2underline("SHDataHi", strict=False)
+    'sh_data_hi'
+    >>> key2underline("SHDataHi", strict=True)
+    's_h_data_hi'
+    >>> key2underline("dataHi", strict=True)
+    'data_hi'
+    """
+    regex = "[A-Z]*" if not strict else "[A-Z]"
+    capitals = re.findall(regex, key)
+
+    if capitals:
+        for capital in capitals:
+            if not capital:
+                continue
+            if key.startswith(capital):
+                if len(capital) > 1:
+                    key = key.replace(
+                        capital, capital[:-1].lower() + "_" + capital[-1].lower(), 1
+                    )
+                else:
+                    key = key.replace(capital, capital.lower(), 1)
+            else:
+                if len(capital) > 1:
+                    key = key.replace(capital, "_" + capital.lower() + "_", 1)
+                else:
+                    key = key.replace(capital, "_" + capital.lower(), 1)
+
+    return key.strip("_")
+
+
+def key2hump(key):
+    """
+    下划线试变成首字母大写
+    """
+    return key.title().replace("_", "")
+
+
+def format_json_key(json_data):
+    json_data_correct = {}
+    for key, value in json_data.items():
+        key = key2underline(key)
+        json_data_correct[key] = value
+
+    return json_data_correct
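+
+# Example (illustrative): camelCase / PascalCase keys are converted to snake_case.
+# format_json_key({"userName": "tom", "UserId": 1})  # -> {'user_name': 'tom', 'user_id': 1}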
+
+
+def quick_to_json(text):
+    """
+    @summary: 可快速将浏览器上的header转为json格式
+    ---------
+    @param text:
+    ---------
+    @result:
+    """
+
+    contents = text.split("\n")
+    json = {}
+    for content in contents:
+        if not content.strip():
+            continue
+
+        content = content.strip()
+        regex = ["(:?.*?):(.*)", "(.*?):? +(.*)", "([^:]*)"]
+
+        result = get_info(content, regex)
+        result = result[0] if isinstance(result[0], tuple) else result
+        try:
+            json[result[0]] = eval(result[1].strip())
+        except:
+            json[result[0]] = result[1].strip()
+
+    return json
+
+
+##############################
+
+
+def print_pretty(object):
+    pprint(object)
+
+
+def print_params2json(url):
+    params_json = {}
+    params = url.split("?")[-1].split("&")
+    for param in params:
+        key_value = param.split("=", 1)
+        params_json[key_value[0]] = key_value[1]
+
+    print(dumps_json(params_json))
+
+
+def print_cookie2json(cookie_str_or_list):
+    if isinstance(cookie_str_or_list, str):
+        cookie_json = {}
+        cookies = cookie_str_or_list.split("; ")
+        for cookie in cookies:
+            name, value = cookie.split("=", 1)
+            cookie_json[name] = value
+    else:
+        cookie_json = get_cookies_from_selenium_cookie(cookie_str_or_list)
+
+    print(dumps_json(cookie_json))
+
+
+###############################
+
+
+def flatten(x):
+    """flatten(sequence) -> list
+    Returns a single, flat list which contains all elements retrieved
+    from the sequence and all recursively contained sub-sequences
+    (iterables).
+    Examples:
+    >>> flatten([1, 2, [3,4], (5,6)])
+    [1, 2, 3, 4, 5, 6]
+    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
+    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
+    >>> flatten(["foo", "bar"])
+    ['foo', 'bar']
+    >>> flatten(["foo", ["baz", 42], "bar"])
+    ['foo', 'baz', 42, 'bar']
+    """
+    return list(iflatten(x))
+
+
+def iflatten(x):
+    """iflatten(sequence) -> iterator
+    Similar to ``.flatten()``, but returns iterator instead"""
+    for el in x:
+        if _is_listlike(el):
+            for el_ in flatten(el):
+                yield el_
+        else:
+            yield el
+
+
+def _is_listlike(x):
+    """
+    >>> _is_listlike("foo")
+    False
+    >>> _is_listlike(5)
+    False
+    >>> _is_listlike(b"foo")
+    False
+    >>> _is_listlike([b"foo"])
+    True
+    >>> _is_listlike((b"foo",))
+    True
+    >>> _is_listlike({})
+    True
+    >>> _is_listlike(set())
+    True
+    >>> _is_listlike((x for x in range(3)))
+    True
+    >>> _is_listlike(six.moves.xrange(5))
+    True
+    """
+    return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
+
+
+###################
+
+
+def re_def_supper_class(obj, supper_class):
+    """
+    重新定义父类
+    @param obj: 类 如 class A: 则obj为A 或者 A的实例 a.__class__
+    @param supper_class: 父类
+    @return:
+    """
+    obj.__bases__ = (supper_class,)
+
+
+###################
+freq_limit_record = {}
+
+
+def reach_freq_limit(rate_limit, *key):
+    """
+    频率限制
+    :param rate_limit: 限制时间 单位秒
+    :param key: 频率限制的key
+    :return: True / False
+    """
+    if rate_limit == 0:
+        return False
+
+    msg_md5 = get_md5(*key)
+    key = "rate_limit:{}".format(msg_md5)
+    try:
+        if get_redisdb().get(key):
+            return True
+
+        get_redisdb().set(key, time.time(), ex=rate_limit)
+    except redis.exceptions.ConnectionError as e:
+        # 使用内存做频率限制
+        global freq_limit_record
+
+        if key not in freq_limit_record:
+            freq_limit_record[key] = time.time()
+            return False
+
+        if time.time() - freq_limit_record.get(key) < rate_limit:
+            return True
+        else:
+            freq_limit_record[key] = time.time()
+
+    return False
+
+
+def dingding_warning(
+    message, message_prefix=None, rate_limit=None, url=None, user_phone=None
+):
+    # 为了加载最新的配置
+    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
+    url = url or setting.DINGDING_WARNING_URL
+    user_phone = user_phone or setting.DINGDING_WARNING_PHONE
+
+    if not all([url, message]):
+        return
+
+    if reach_freq_limit(rate_limit, url, user_phone, message_prefix or message):
+        log.info("报警时间间隔过短,此次报警忽略。 内容 {}".format(message))
+        return
+
+    if isinstance(user_phone, str):
+        user_phone = [user_phone] if user_phone else []
+
+    data = {
+        "msgtype": "text",
+        "text": {"content": message},
+        "at": {"atMobiles": user_phone, "isAtAll": setting.DINGDING_WARNING_ALL},
+    }
+
+    headers = {"Content-Type": "application/json"}
+
+    try:
+        response = requests.post(
+            url, headers=headers, data=json.dumps(data).encode("utf8")
+        )
+        result = response.json()
+        response.close()
+        if result.get("errcode") == 0:
+            return True
+        else:
+            raise Exception(result.get("errmsg"))
+    except Exception as e:
+        log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
+        return False
+
+
+def email_warning(
+    message,
+    title,
+    message_prefix=None,
+    email_sender=None,
+    email_password=None,
+    email_receiver=None,
+    email_smtpserver=None,
+    rate_limit=None,
+):
+    # 为了加载最新的配置
+    email_sender = email_sender or setting.EMAIL_SENDER
+    email_password = email_password or setting.EMAIL_PASSWORD
+    email_receiver = email_receiver or setting.EMAIL_RECEIVER
+    email_smtpserver = email_smtpserver or setting.EMAIL_SMTPSERVER
+    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
+
+    if not all([message, email_sender, email_password, email_receiver]):
+        return
+
+    if reach_freq_limit(
+        rate_limit, email_receiver, email_sender, message_prefix or message
+    ):
+        log.info("报警时间间隔过短,此次报警忽略。 内容 {}".format(message))
+        return
+
+    if isinstance(email_receiver, str):
+        email_receiver = [email_receiver]
+
+    with EmailSender(
+        username=email_sender, password=email_password, smtpserver=email_smtpserver
+    ) as email:
+        return email.send(receivers=email_receiver, title=title, content=message)
+
+
+def linkedsee_warning(message, rate_limit=3600, message_prefix=None, token=None):
+    """
+    灵犀电话报警
+    Args:
+        message:
+        rate_limit:
+        message_prefix:
+        token:
+
+    Returns:
+
+    """
+    if not token:
+        log.info("未设置灵犀token,不支持报警")
+        return
+
+    if reach_freq_limit(rate_limit, token, message_prefix or message):
+        log.info("报警时间间隔过短,此次报警忽略。 内容 {}".format(message))
+        return
+
+    headers = {"servicetoken": token, "Content-Type": "application/json"}
+
+    url = "http://www.linkedsee.com/alarm/zabbix"
+
+    data = {"content": message}
+    response = requests.post(url, data=json.dumps(data), headers=headers)
+    return response
+
+
+def wechat_warning(
+    message,
+    message_prefix=None,
+    rate_limit=None,
+    url=None,
+    user_phone=None,
+    all_users: bool = None,
+):
+    """企业微信报警"""
+
+    # 为了加载最新的配置
+    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
+    url = url or setting.WECHAT_WARNING_URL
+    user_phone = user_phone or setting.WECHAT_WARNING_PHONE
+    all_users = all_users if all_users is not None else setting.WECHAT_WARNING_ALL
+
+    if isinstance(user_phone, str):
+        user_phone = [user_phone] if user_phone else []
+
+    if all_users is True or not user_phone:
+        user_phone = ["@all"]
+
+    if not all([url, message]):
+        return
+
+    if reach_freq_limit(rate_limit, url, user_phone, message_prefix or message):
+        log.info("报警时间间隔过短,此次报警忽略。 内容 {}".format(message))
+        return
+
+    data = {
+        "msgtype": "text",
+        "text": {"content": message, "mentioned_mobile_list": user_phone},
+    }
+
+    headers = {"Content-Type": "application/json"}
+
+    try:
+        response = requests.post(
+            url, headers=headers, data=json.dumps(data).encode("utf8")
+        )
+        result = response.json()
+        response.close()
+        if result.get("errcode") == 0:
+            return True
+        else:
+            raise Exception(result.get("errmsg"))
+    except Exception as e:
+        log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
+        return False
+
+
+def send_msg(msg, level="debug", message_prefix=""):
+    if setting.WARNING_LEVEL == "ERROR":
+        if level != "error":
+            return
+
+    if setting.DINGDING_WARNING_URL:
+        keyword = "feapder报警系统\n"
+        dingding_warning(keyword + msg, message_prefix=message_prefix)
+
+    if setting.EMAIL_RECEIVER:
+        title = message_prefix or msg
+        if len(title) > 50:
+            title = title[:50] + "..."
+        email_warning(msg, message_prefix=message_prefix, title=title)
+
+    if setting.WECHAT_WARNING_URL:
+        keyword = "feapder报警系统\n"
+        wechat_warning(keyword + msg, message_prefix=message_prefix)
+
+
+###################
+
+
+def make_item(cls, data: dict):
+    """提供Item类与原数据,快速构建Item实例
+    :param cls: Item类
+    :param data: 字典格式的数据
+    """
+    item = cls()
+    for key, val in data.items():
+        setattr(item, key, val)
+    return item
+
+
+###################
+
+
+def aio_wrap(loop=None, executor=None):
+    """
+    wrap a normal sync version of a function to an async version
+    """
+    outer_loop = loop
+    outer_executor = executor
+
+    def wrap(fn):
+        @wraps(fn)
+        async def run(*args, loop=None, executor=None, **kwargs):
+            if loop is None:
+                if outer_loop is None:
+                    loop = asyncio.get_event_loop()
+                else:
+                    loop = outer_loop
+            if executor is None:
+                executor = outer_executor
+            pfunc = partial(fn, *args, **kwargs)
+            return await loop.run_in_executor(executor, pfunc)
+
+        return run
+
+    return wrap
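+
+# Example (illustrative, assumes it runs inside a coroutine):
+# aio_get = aio_wrap()(requests.get)
+# response = await aio_get("https://www.baidu.com", timeout=10)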
+
+
+######### number ##########
+
+
+def ensure_int(n):
+    """
+    >>> ensure_int(None)
+    0
+    >>> ensure_int(False)
+    0
+    >>> ensure_int(12)
+    12
+    >>> ensure_int("72")
+    72
+    >>> ensure_int('')
+    0
+    >>> ensure_int('1')
+    1
+    """
+    if not n:
+        return 0
+    return int(n)
+
+
+def ensure_float(n):
+    """
+    >>> ensure_float(None)
+    0.0
+    >>> ensure_float(False)
+    0.0
+    >>> ensure_float(12)
+    12.0
+    >>> ensure_float("72")
+    72.0
+    """
+    if not n:
+        return 0.0
+    return float(n)

+ 334 - 0
FworkSpider/feapder/utils/webdriver.py

@@ -0,0 +1,334 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021/3/18 4:59 下午
+---------
+@summary:
+---------
+@author: Boris
+@email: boris_liu@foxmail.com
+"""
+
+import queue
+import threading
+import os
+from selenium import webdriver
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
+
+from feapder.utils.log import log
+from feapder.utils.tools import Singleton
+
+DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
+
+
+class WebDriver(RemoteWebDriver):
+    CHROME = "CHROME"
+    PHANTOMJS = "PHANTOMJS"
+    FIREFOX = "FIREFOX"
+
+    def __init__(
+        self,
+        load_images=True,
+        user_agent=None,
+        proxy=None,
+        headless=False,
+        driver_type=CHROME,
+        timeout=16,
+        window_size=(1024, 800),
+        executable_path=None,
+        custom_argument=None,
+        **kwargs
+    ):
+        """
+        webdirver 封装,支持chrome、phantomjs 和 firefox
+        Args:
+            load_images: 是否加载图片
+            user_agent: 字符串 或 无参函数,返回值为user_agent
+            proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
+            headless: 是否启用无头模式
+            driver_type: CHROME 或 PHANTOMJS,FIREFOX
+            timeout: 请求超时时间
+            window_size: 窗口大小
+            executable_path: 浏览器路径,默认为默认路径
+            **kwargs:
+        """
+        self._load_images = load_images
+        self._user_agent = user_agent or DEFAULT_USERAGENT
+        self._proxy = proxy
+        self._headless = headless
+        self._timeout = timeout
+        self._window_size = window_size
+        self._executable_path = executable_path
+        self._custom_argument = custom_argument
+
+        self.proxies = {}
+        self.user_agent = None
+
+        if driver_type == WebDriver.CHROME:
+            self.driver = self.chrome_driver()
+
+        elif driver_type == WebDriver.PHANTOMJS:
+            self.driver = self.phantomjs_driver()
+
+        elif driver_type == WebDriver.FIREFOX:
+            self.driver = self.firefox_driver()
+
+        else:
+            raise TypeError(
+                "driver_type must be one of CHROME, PHANTOMJS or FIREFOX, but received {}".format(
+                    driver_type
+                )
+            )
+
+        # driver.get(url) can hang without returning or raising; setting a
+        # page-load timeout keeps the program from getting stuck in that case.
+        self.driver.set_page_load_timeout(self._timeout)
+        # apply the same timeout to script execution
+        self.driver.set_script_timeout(self._timeout)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_val:
+            log.error(exc_val)
+
+        self.quit()
+        return True
+
+    def get_driver(self):
+        return self.driver
+
+    def firefox_driver(self):
+        firefox_profile = webdriver.FirefoxProfile()
+        firefox_options = webdriver.FirefoxOptions()
+        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
+        firefox_profile.set_preference("dom.webdriver.enabled", False)
+        if self._proxy:
+            proxy = self._proxy() if callable(self._proxy) else self._proxy
+            proxy = proxy.replace("socks5://", "")
+            # route traffic through a socks5 proxy
+            firefox_profile.set_preference('network.proxy.type', 1)  # 0: no proxy, 1: manual proxy
+            firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
+            firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
+            # firefox_capabilities["marionette"] = True  # http代理的使用
+            # firefox_capabilities["proxy"] = {
+            #     "proxyType": "MANUAL",
+            #     "httpProxy": proxy,
+            #     "ftpProxy": proxy,
+            #     "sslProxy": proxy,
+            # }
+
+        if self._user_agent:
+            firefox_profile.set_preference(
+                "general.useragent.override",
+                self._user_agent() if callable(self._user_agent) else self._user_agent,
+            )
+
+        if not self._load_images:
+            firefox_profile.set_preference("permissions.default.image", 2)
+
+        if self._headless:
+            firefox_options.add_argument("--headless")
+            firefox_options.add_argument("--disable-gpu")
+
+        # add any custom browser arguments
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                firefox_options.add_argument(arg)
+
+        if self._executable_path:
+            driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+                executable_path=self._executable_path,
+            )
+        else:
+            driver = webdriver.Firefox(
+                capabilities=firefox_capabilities,
+                options=firefox_options,
+                firefox_profile=firefox_profile,
+            )
+
+        if self._window_size:
+            driver.set_window_size(*self._window_size)
+
+        return driver
+
+    def chrome_driver(self):
+        chrome_options = webdriver.ChromeOptions()
+        # important: hide the automation switches so that sites are less likely
+        # to detect that Selenium is driving the browser
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option("useAutomationExtension", False)
+        # required when running inside docker
+        chrome_options.add_argument("--no-sandbox")
+
+        if self._proxy:
+            chrome_options.add_argument(
+                "--proxy-server={}".format(
+                    self._proxy() if callable(self._proxy) else self._proxy
+                )
+            )
+        if self._user_agent:
+            chrome_options.add_argument(
+                "user-agent={}".format(
+                    self._user_agent()
+                    if callable(self._user_agent)
+                    else self._user_agent
+                )
+            )
+        if not self._load_images:
+            chrome_options.add_experimental_option(
+                "prefs", {"profile.managed_default_content_settings.images": 2}
+            )
+
+        if self._headless:
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--disable-gpu")
+
+        if self._window_size:
+            chrome_options.add_argument(
+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
+            )
+
+        # add any custom browser arguments
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                chrome_options.add_argument(arg)
+
+        if self._executable_path:
+            driver = webdriver.Chrome(
+                chrome_options=chrome_options, executable_path=self._executable_path
+            )
+        else:
+            driver = webdriver.Chrome(chrome_options=chrome_options)
+
+        # hide webdriver fingerprints: inject stealth.min.js into every new document
+        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
+            js = f.read()
+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
+
+        return driver
+
+    def phantomjs_driver(self):
+        import warnings
+
+        warnings.filterwarnings("ignore")
+
+        service_args = []
+        dcap = DesiredCapabilities.PHANTOMJS
+
+        if self._proxy:
+            service_args.append(
+                # parenthesise the conditional so the "--proxy=" prefix is applied
+                # whether the proxy is a callable or a plain string
+                "--proxy=%s"
+                % (self._proxy() if callable(self._proxy) else self._proxy)
+            )
+        if self._user_agent:
+            dcap["phantomjs.page.settings.userAgent"] = (
+                self._user_agent() if callable(self._user_agent) else self._user_agent
+            )
+        if not self._load_images:
+            service_args.append("--load-images=no")
+
+        # add any custom service arguments
+        if self._custom_argument:
+            for arg in self._custom_argument:
+                service_args.append(arg)
+
+        if self._executable_path:
+            driver = webdriver.PhantomJS(
+                service_args=service_args,
+                desired_capabilities=dcap,
+                executable_path=self._executable_path,
+            )
+        else:
+            driver = webdriver.PhantomJS(
+                service_args=service_args, desired_capabilities=dcap
+            )
+
+        if self._window_size:
+            driver.set_window_size(self._window_size[0], self._window_size[1])
+
+        del warnings
+
+        return driver
+
+    @property
+    def cookies(self):
+        cookies_json = {}
+        for cookie in self.driver.get_cookies():
+            cookies_json[cookie["name"]] = cookie["value"]
+
+        return cookies_json
+
+    @cookies.setter
+    def cookies(self, val: dict):
+        """
+        Set cookies on the current driver session
+        Args:
+            val: {"key":"value", "key2":"value2"}
+
+        Returns:
+
+        """
+        for key, value in val.items():
+            self.driver.add_cookie({"name": key, "value": value})
+
+    def __getattr__(self, name):
+        if self.driver:
+            return getattr(self.driver, name)
+        else:
+            raise AttributeError
+
+    # def __del__(self):
+    #     self.quit()
+
+
+@Singleton
+class WebDriverPool:
+    def __init__(self, pool_size=5, **kwargs):
+        self.queue = queue.Queue(maxsize=pool_size)
+        self.kwargs = kwargs
+        self.lock = threading.RLock()
+        self.driver_count = 0
+
+    @property
+    def is_full(self):
+        return self.driver_count >= self.queue.maxsize
+
+    def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
+        """
+        Get a webdriver from the pool.
+        When a new instance has to be created, the user_agent and proxy arguments are used to build it.
+        Args:
+            user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
+            proxy: xxx.xxx.xxx.xxx
+        Returns:
+
+        """
+        if not self.is_full:
+            with self.lock:
+                if not self.is_full:
+                    kwargs = self.kwargs.copy()
+                    if user_agent:
+                        kwargs["user_agent"] = user_agent
+                    if proxy:
+                        kwargs["proxy"] = proxy
+                    driver = WebDriver(**kwargs)
+                    self.queue.put(driver)
+                    self.driver_count += 1
+
+        driver = self.queue.get()
+        return driver
+
+    def put(self, driver):
+        self.queue.put(driver)
+
+    def remove(self, driver):
+        driver.quit()
+        self.driver_count -= 1
+
+    def close(self):
+        while not self.queue.empty():
+            driver = self.queue.get()
+            driver.quit()
+            self.driver_count -= 1
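+
+
+# Usage sketch (illustrative, not part of the original module). WebDriverPool
+# is a Singleton, so every caller shares the same pool instance; the proxy
+# value below is a placeholder.
+#
+#   pool = WebDriverPool(pool_size=2, driver_type=WebDriver.CHROME, headless=True)
+#   driver = pool.get(proxy="127.0.0.1:1080")
+#   try:
+#       driver.get("https://www.example.com")
+#   finally:
+#       pool.put(driver)      # return it for reuse; pool.remove(driver) would discard it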

+ 0 - 0
FworkSpider/items/__init__.py


+ 125 - 0
FworkSpider/items/spider_item.py

@@ -0,0 +1,125 @@
+from feapder import Item
+from untils.tools import int2long,substitute,text_search
+import time
+from feapder.utils.log import log
+from feapder.utils.tools import get_current_date
+from crawlab import save_item
+from datetime import datetime
+import os
+from feapder import setting
+class DataBakItem(Item):
+
+    def __init__(self):
+        self.title = ""  # article title
+        self.publishtime = ""  # publish time of the article (date format xxxx-xx-xx)
+        self.spidercode = ""  # spider code (defined on the spider platform)
+        self.site = ""  # site being collected (defined on the spider platform)
+        self.channel = ""  # channel being collected (defined on the spider platform)
+        self.area = "全国"  # province
+        self.city = ""  # city
+        self.competehref = None  # snapshot page url for competitor sources
+        self.href = ""  # snapshot page url for non-competitor sources
+        self.publishdept = ""
+        self.iscompete = True
+        self.type = ""
+        self.T = "bidding"
+        self.l_np_publishtime = ""  # publish time as a timestamp (seconds), stored as long
+        self.comeintime = ""  # insert time as a timestamp (seconds), stored as long
+        self.sendflag = "false"
+        self._d = "comeintime"
+        self.contenthtml = ""  # raw html of the snapshot page
+        self.detail = ""  # bidding text cleaned from the snapshot page html
+        self.projectinfo = None  # bidding text cleaned from the snapshot page html
+
+    def stop(self):
+        # cc_err is never assigned on this item; accessing it raises
+        # AttributeError, which aborts further processing of this record
+        print(self.cc_err)
+
+    def pre_to_db(self):
+        # insert timestamp (seconds), stored as long
+        self.comeintime = int2long(time.time())
+        # build the publish timestamp (seconds, stored as long) from the publish date;
+        # if no publish time can be parsed from the page, consider filling in a fallback value
+        if ":" in self.publishtime:
+            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
+        else:
+            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
+
+        # failure handling: log an error when no page content was captured
+        if self.contenthtml is None and self.projectinfo is None:
+            log.error(f"{self.href},此链接数据正文抓取失败")
+            # self.sendflag = "true"
+            self.stop()
+        if not self.title or not self.publishtime or not self.href:
+            # self.sendflag = "true"
+            log.error(f"部分数据抓取失败,数据详情:\n 链接:{self.href}\n 发布时间:{self.publishtime}\n标题:{self.title}")
+            self.stop()
+        # derive the cleaned detail text from the html body
+        if self.contenthtml is not None and self.detail == '':
+            # substitute() strips header/footer boilerplate from the snapshot html
+            self.detail = substitute(self.contenthtml)
+            if text_search(self.detail).total == 0:
+                # no readable body text: mark the item as sent so it is not counted
+                self.sendflag = "true"
+        save_item({"site": self.site, "title": self.title,"error":False,
+                   "spidercode":self.spidercode,"channel":self.channel,
+                   })
+
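+
+# Illustrative sketch only: how a parser would typically fill a DataBakItem
+# before it is exported (all field values below are placeholders).
+#
+#   data_item = DataBakItem()
+#   data_item.title = "某某项目招标公告"
+#   data_item.publishtime = "2022-01-20"
+#   data_item.href = "http://example.com/notice/1.html"
+#   data_item.site = "示例站点"
+#   data_item.channel = "招标公告"
+#   data_item.spidercode = "a_demo_spidercode"
+#   data_item.contenthtml = "<div>正文...</div>"
+#   # pre_to_db() then fills comeintime / l_np_publishtime / detail before export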
+
+class MgpListItem(Item):
+    def __init__(self):
+        # self.__table_name__='ggg_list'
+
+        self.parse = ""  # name of the parse method to call
+        self.item = ""  # data dict passed along to the detail parser
+        self.parser_name = ""  # name of the spider that handles the detail page
+        self.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # current date and time
+        self.deal_detail = []  # how the detail page is parsed: a list of xpaths for detail_get, a code snippet for detail_post
+        self.create_time = None  # xpath for the publish time on the detail page, used when the list page has none
+        self.parse_url = ""  # url of the detail page to request
+        self.request_params = {}  # parameters needed by the callback, e.g. render, headers, method, data, params;
+                                  # names must match the requests arguments, otherwise they are ignored
+        self.failed = 0  # number of failed attempts for this task
+        self.author = "开发及维护人员"  # developer / maintainer
+        self.ex_js = ''  # js needed when executing custom python code: a js string or a path to a js file
+        self.ex_python = None  # python code to execute to build params/data; headers and cookies are special cases and are best defined explicitly
+        self.pri = 1  # alert priority, levels 1-9
+        self.proxies = True  # whether to use proxies for this task
+        self.files = False  # attachment collection config
+        self.error = None
+        # self.error_info =
+    def pre_to_db(self):
+        # 生成入库时间戳(秒级), 定义为long型
+        self.author = setting.author.get(os.path.basename(os.getcwd()))
+        save_item({"site": self.item.get("site"),"error":True,"author":self.author,
+                   "spidercode":self.item.get("spidercode"),"channel":self.item.get("channel"),"state_code":"code",
+                   "href":self.item.get("href"),"error_info":self.error})
+        '''
+        "site": "站点名", "error_type": "错误类型(detail/list/content/)", "author": "负责人",
+         "spidercode": "", "channel": "", error_count:"错误数量"
+         '''
+
+class ListItem(Item):
+    def __init__(self):
+        self.spidercode = ""  # 爬虫代码(编辑器爬虫平台定义)
+        self.site = ""  # 采集的站点(编辑器爬虫平台定义)
+        self.channel = ""  # 采集的版块(编辑器爬虫平台定义)
+        self.url = ''
+        self.count=0
+        self.rel_count = 0
+
+    def pre_to_db(self):
+        self.author = setting.author.get(os.path.basename(os.getcwd()))
+        if self.author is None:
+            self.author = os.path.basename(os.getcwd())
+        self.runtime = get_current_date(date_format="%Y-%m-%d")
+
+
+
+class ErrorInfoItem(Item):
+    def __init__(self):
+        self.parmars = ""
+        self.item = "111"
+        self.parser_name = "111"  # name of the spider that handles the detail page
+        self.date = time.time()

+ 0 - 0
FworkSpider/login_pool/__init__.py


+ 95 - 0
FworkSpider/login_pool/zglbw.py

@@ -0,0 +1,95 @@
+
+import sys
+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
+from untils.cookie_pool import LoginCookiePool
+import requests
+class ZglbwPool(LoginCookiePool):
+
+    def create_cookie(self, username, password):
+        print(username,password)
+        '''
+        https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
+        2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
+        
+        https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
+        2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
+        '''
+        session = requests.Session()
+        headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0"}
+        url = 'https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=9d424669-5af6-4b3d-bed5-56cc06bd5ca6'
+        data = {
+            "clear": "",
+            "BackURL": "null",
+            "username": username,
+            "password": password,
+            "jcaptchaCode": "shmt"
+        }
+        session.get(url,headers=headers)
+        session.post(url, data=data)
+        # print(res.headers)
+        ss = session.get(url='https://eproport.crecgec.com/getAuthentication')
+        print(ss.text)
+        cookies = requests.utils.dict_from_cookiejar(session.cookies)
+        print(cookies)
+        return cookies
+
+
+
+
+# cookie_pool = ZglbwPool(username_key='username', password_key="password", table_userbase='zglbw',
+#                               redis_key='zglbw')
+# # cookie_pool.create_cookie('zuoshang123',"123qwe!A")
+# # # res = requests.get('https://eproport.crecgec.com/getAuthentication',cookies=cookie)
+# # # print(res.text)
+# cookie_pool.del_cookie(cookie_pool.get_cookie())
+
+
+# def create_cookie():
+#     '''
+#     https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
+#     2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
+#
+#     https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
+#     2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
+#     '''
+#     session = requests.Session()
+#     url = 'https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&response_type=code'
+#     data = {
+#         "clear": "",
+#         "BackURL": "null",
+#         "username": "zuoshang123",
+#         "password": "123qwe!A",
+#         "jcaptchaCode": "shmt"
+#     }
+#     session.get(url)
+#     res = session.post(url, data=data)
+#
+# create_cookie()
+# # import requests
+#
+#
+#
+# # cookies = {
+# #     "srv_id": "53069e9fd596ee2f1c7cf21d24bd170e",
+# #     "uid": "e423da7f-1d30-4571-a011-429326f1cfd1",
+# #     "Hm_lvt_89c053c39b2269b8a37c5881ca224223": "1642647201",
+# #     "JSESSIONID": "752173C3FF0C519DB45BBF781CEC76CB",
+# #     "Hm_lpvt_89c053c39b2269b8a37c5881ca224223": "1642661696"
+# # }
+# # url = "https://passport.crecgec.com/authorize"
+# # params = {
+# #     "type": "cas",
+# #     "client_id": "10000000`53",
+# #     "response_type": "code"
+# # }
+# # data = {
+# #     "clear": "",
+# #     "BackURL": "null",
+# #     "username": "zuoshang123",
+# #     "password": "123qwe!A",
+# #     "jcaptchaCode": "shmt"
+# # }
+# # response = requests.post(url, headers=headers, cookies=cookies, params=params, data=data)
+# #
+# # print(response.text)
+# # print(response)

+ 96 - 0
FworkSpider/mongo_pipeline.py

@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-04-18 14:12:21
+---------
+@summary: 导出数据
+---------
+@author: 马国鹏
+@email:  305021384@qq.com
+"""
+from typing import Dict, List, Tuple
+import time
+from feapder.db.mongodb import MongoDB
+from feapder.dedup import Dedup
+from feapder.pipelines import BasePipeline
+from feapder.utils.log import log
+from untils.tools import *
+from crawlab import save_item
+
+
+
+class MongoPipeline(BasePipeline):
+    def __init__(self):
+        self._to_db = None
+
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+
+        return self._to_db
+
+    def save_items(self, table, items: List[Dict]) -> bool:
+        """
+        保存数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+
+        Returns: 是否保存成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+        """
+        try:
+            add_count = self.to_db.add_batch(coll_name=table, datas=items)
+            for item in items:
+                dedup = Dedup(Dedup.BloomFilter)
+                dedup.add([item.get("href")])
+                # save_item({'count':item.get("href")})
+            datas_size = len(items)
+            log.info(
+                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
+                % (datas_size, table, add_count, datas_size - add_count)
+            )
+            # wechat_warning(f"{site}  数据导报\n共插入 {datas_size} 条数据到 {table}")
+            # for i in range(add_count):
+            if table == "mgp_list":
+                save_item({"site": "失败回填", "title": add_count})
+
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False
+
+    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
+        """
+        更新数据
+        Args:
+            table: 表名
+            items: 数据,[{},{},...]
+            update_keys: 更新的字段, 如 ("title", "publish_time")
+
+        Returns: 是否更新成功 True / False
+                 若False,不会将本批数据入到去重库,以便再次入库
+
+        """
+        try:
+            add_count = self.to_db.add_batch(
+                coll_name=table,
+                datas=items,
+                update_columns=update_keys or list(items[0].keys()),
+            )
+            datas_size = len(items)
+            update_count = datas_size - add_count
+            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
+                datas_size,
+                table,
+                add_count,
+                update_count,
+            )
+            if update_keys:
+                msg += " 更新字段为 {}".format(update_keys)
+            log.info(msg)
+
+            return True
+        except Exception as e:
+            log.exception(e)
+            return False
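+
+
+# Wiring sketch (for illustration): this pipeline is activated by listing it in
+# setting.ITEM_PIPELINES, e.g.
+#
+#   ITEM_PIPELINES = ["mongo_pipeline.MongoPipeline"]
+#
+# feapder's item buffer then calls save_items / update_items with the collection
+# name and a batch of item dicts.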

+ 163 - 0
FworkSpider/setting.py

@@ -0,0 +1,163 @@
+# -*- coding: utf-8 -*-
+"""爬虫配置文件"""
+import os
+import time
+import sys
+# from scoket_proxy import Socks5Proxy
+#
+# # MYSQL
+# MYSQL_IP = "localhost"
+# MYSQL_PORT = 3306
+# MYSQL_DB = ""
+# MYSQL_USER_NAME = ""
+# MYSQL_USER_PASS = ""
+#
+# MONGODB
+# MONGO_IP = "192.168.20.51"  # 本地 docker 环境
+MONGO_IP = "127.0.0.1"  # 本地环境
+# MONGO_PORT = 27017
+MONGO_PORT = 27001
+
+#
+
+# MONGO_IP = "192.168.3.71"  # 本地环境
+# MONGO_PORT = 27027
+
+
+MONGO_DB = "py_spider"
+# MONGO_USER_NAME = ""
+# MONGO_USER_PASS = ""
+#
+# # REDIS
+# # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
+# REDISDB_IP_PORTS = "192.168.20.51:6379"  # 本地 docker 环境
+REDISDB_IP_PORTS = "127.0.0.1:6379"  # 本地环境
+# REDISDB_IP_PORTS = "192.168.3.71:6379"  # 本地环境
+# REDISDB_USER_PASS = ""
+REDISDB_DB = 10
+# # 适用于redis哨兵模式
+REDISDB_SERVICE_NAME = "quchoong"
+#
+# # 数据入库的pipeline,可自定义,默认MysqlPipeline
+ITEM_PIPELINES = [
+    # "feapder.pipelines.mysql_pipeline.MysqlPipeline",
+    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
+    "mongo_pipeline.MongoPipeline"
+]
+EXPORT_DATA_MAX_FAILED_TIMES = 5 # 导出数据时最大的失败次数,包括保存和更新,超过这个次数报警
+EXPORT_DATA_MAX_RETRY_TIMES = 5 # 导出数据时最大的重试次数,包括保存和更新,超过这个次数则放弃重试
+#
+# # 爬虫相关
+# # COLLECTOR
+# COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
+# COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
+#
+REDIS_KEY = "fwork"
+# # SPIDER
+# SPIDER_THREAD_COUNT = 10  # 爬虫并发数
+# SPIDER_SLEEP_TIME = [2, 5] # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数,包含2和5
+# SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
+SPIDER_MAX_RETRY_TIMES = 2  # 每个请求最大重试次数
+# KEEP_ALIVE = False  # 爬虫是否常驻
+#
+# # 浏览器渲染
+WEBDRIVER = dict(
+    pool_size=1,  # 浏览器的数量
+    load_images=False,  # 是否加载图片
+    # user_agent=None,  # 字符串 或 无参函数,返回值为user_agent
+    proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数,返回值为代理地址
+    headless=False,  # 是否为无头浏览器
+    driver_type="FIREFOX",  # CHROME、PHANTOMJS、FIREFOX
+    timeout=30,  # 请求超时时间
+    window_size=(1280, 800),  # 窗口大小
+    executable_path="D:\\geckodriver.exe",  # 浏览器路径,默认为默认路径
+    render_time=0,  # 渲染时长,即打开网页等待指定时间后再获取源码
+    custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
+)
+#wget https://github.com/mozilla/geckodriver/releases/download/v0.25.0/geckodriver-v0.25.0-linux64.tar.gz
+# # 爬虫启动时,重新抓取失败的requests
+# RETRY_FAILED_REQUESTS = False
+# # 保存失败的request
+# SAVE_FAILED_REQUEST = True
+# # request防丢机制。(指定的REQUEST_LOST_TIMEOUT时间内request还没做完,会重新下发 重做)
+# REQUEST_LOST_TIMEOUT = 600  # 10分钟
+# # request网络请求超时时间
+# REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间,浮点数,或(connect timeout, read timeout)元组
+#
+# # 下载缓存 利用redis缓存,但由于内存大小限制,所以建议仅供开发调试代码时使用,防止每次debug都需要网络请求
+# RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
+# RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
+# RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
+#
+# # 设置代理
+PROXY_EXTRACT_API = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"  # 代理提取API ,返回的代理分割符为\r\n
+PROXY_ENABLE = True
+#
+# # 随机headers
+# RANDOM_HEADERS = True
+# # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari','mobile' 若不指定则随机类型
+# USER_AGENT_TYPE = "chrome"
+# # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
+# DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
+# # requests 使用session
+# USE_SESSION = False
+#
+# # 去重
+# ITEM_FILTER_ENABLE = False  # item 去重
+# REQUEST_FILTER_ENABLE = False  # request 去重
+# ITEM_FILTER_SETTING = dict(
+#     filter_type=1  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+# )
+# REQUEST_FILTER_ENABLE = True  # request 去重
+# REQUEST_FILTER_SETTING = dict(
+#     filter_type=3,  # 永久去重(BloomFilter) = 1 、内存去重(MemoryFilter) = 2、 临时去重(ExpireFilter)= 3
+#     expire_time=2592000,  # 过期时间1个月
+# )
+#
+# # 报警 支持钉钉、企业微信、邮件
+# # 钉钉报警
+# DINGDING_WARNING_URL = ""  # 钉钉机器人api
+# DINGDING_WARNING_PHONE = ""  # 报警人 支持列表,可指定多个
+# DINGDING_WARNING_ALL = False # 是否提示所有人, 默认为False
+# # 邮件报警
+# EMAIL_SENDER = ""  # 发件人
+# EMAIL_PASSWORD = ""  # 授权码
+# EMAIL_RECEIVER = ""  # 收件人 支持列表,可指定多个
+# EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱
+# # 企业微信报警
+# WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=89f0b1e9-8d08-4e26-a563-cd6b07b9db14"  # 企业微信机器人api
+WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=-4e26-a563-cd6b07b9db14"  # 企业微信机器人api
+WECHAT_WARNING_PHONE = "马国鹏"  # 报警人 将会在群内@此人, 支持列表,可指定多人
+WECHAT_WARNING_ALL = True  # 是否提示所有人, 默认为False
+# # 时间间隔
+WARNING_INTERVAL = 360  # 相同报警的报警时间间隔,防止刷屏; 0表示不去重
+WARNING_LEVEL = "DEBUG"  # 报警级别, DEBUG / ERROR
+WARNING_FAILED_COUNT = 2  # 任务失败数 超过WARNING_FAILED_COUNT则报警
+#
+LOG_NAME = os.path.basename(os.getcwd())
+
+DTIME = time.strftime("%Y-%m-%d", time.localtime(time.time()))
+# LOG_NAME = os.path.split(sys.argv[0])[-1].split('.')[0]
+# LOG_PATH = "log/%s/%s.log" %(DTIME,LOG_NAME)  # log存储路径
+LOG_PATH = LOG_NAME  # log存储路径
+LOG_LEVEL = "DEBUG"
+LOG_COLOR = True  # 是否带有颜色
+LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
+LOG_IS_WRITE_TO_FILE = True  # 是否写文件
+LOG_MODE = "w"  # 写文件的模式
+LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
+LOG_BACKUP_COUNT = 20  # 日志文件保留数量
+LOG_ENCODING = "utf8"  # 日志文件编码
+OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
+#
+# # 切换工作路径为当前项目路径
+# project_path = os.path.abspath(os.path.dirname(__file__))
+# os.chdir(project_path)  # 切换工作路经
+# sys.path.insert(0, project_path)
+# print('当前工作路径为 ' + os.getcwd())
+jy_proxy = {'socks5': {'url': 'http://socks.spdata.jianyu360.com/socks/getips?limit=100', 'decrypt': 'ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/'}}
+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', 'Accept': '*/*'}
+oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing.aliyuncs.com', 'bucket_name': 'jy-datafile'}
+# oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing-internal.aliyuncs.com', 'bucket_name': 'jy-editor'}
+
+author = {"dzr":"董钊瑞",'mgp':"马国鹏","lzz":"李宗泽"}

+ 22 - 0
FworkSpider/untils/__init__.py

@@ -0,0 +1,22 @@
+
+
+
+'''
+    Time has flown by; in the blink of an eye I have been with the company for nearly three months, working in the Jianyu product department. In this short time, with the guidance of the company leadership and the warm help of my colleagues, I quickly became familiar with the company environment
+and adapted to my new position. A brief summary of my probation period:
+    1. Strictly observed the company's rules and regulations. From the start I carefully studied the Employee Handbook and the management policies, followed them strictly, and had no violations.
+    2. Learned proactively and adapted quickly to the environment and the work. I familiarised myself with the working environment and job content, and by actively and humbly asking my supervisor and colleagues for guidance I mastered the daily work content, workflow and methods.
+    3. Worked actively, carefully and responsibly; through continuous learning, asking questions and summarising, I completed the tasks assigned by the leadership, including:
+        1. Developed the crawler management platform
+        2. Built a customised crawler framework and developed common modules and a pseudo-code generator to improve development efficiency
+        3. Deployed the management platform online with basic testing; 15 crawlers are currently deployed and running normally
+        4. Wrote documentation and trained the team so that everyone could evaluate the framework and the management platform
+        5. Day-to-day data collection: crawlers for about thirty platforms covering more than a hundred channels, collecting over two hundred thousand records
+    4. Got along well with colleagues, strengthened communication and teamwork, and integrated into the team.
+    5. Problems and how to address them:
+        1. Communication with colleagues was limited; I will communicate more in the future
+        2. My ability to deal with js anti-crawling is not strong enough; I will study more js to improve it
+        3. My logic is not rigorous enough; be careful, careful and careful again
+
+
+'''

+ 24 - 0
FworkSpider/untils/aliyun.py

@@ -0,0 +1,24 @@
+import oss2
+
+# from config.load import oss_conf
+from feapder.setting import oss_ as oss_conf
+
+
+class AliYunService:
+
+    def __init__(self):
+        self.__acc_key_id = oss_conf['key_id']
+        self.__acc_key_secret = oss_conf['key_secret']
+        self.__endpoint = oss_conf['endpoint']
+        self.__bucket_name = oss_conf['bucket_name']
+
+    def push_oss_from_local(self, key, filename):
+        """
+        上传一个本地文件到OSS的普通文件
+
+        :param str key: 上传到OSS的文件名
+        :param str filename: 本地文件名,需要有可读权限
+        """
+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
+        bucket.put_object_from_file(key, filename)
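+
+
+# Usage sketch (object key and file name are placeholders): upload a downloaded
+# attachment to OSS under its fid-based object key, as attachment.py does.
+#
+#   AliYunService().push_oss_from_local("3f2a...c9.pdf", "file/3f2a...c9.pdf")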

+ 198 - 0
FworkSpider/untils/attachment.py

@@ -0,0 +1,198 @@
+import hashlib
+import os
+import re
+import traceback
+import uuid
+from urllib.parse import urlparse, unquote
+
+import requests
+import urllib3
+
+from feapder.setting import headers
+from untils.execptions import AttachmentNullError
+from untils.aliyun import AliYunService
+from untils.proxy_pool import ProxyPool
+
+urllib3.disable_warnings()
+
+
+def hex_sha1(val):
+    sha1 = hashlib.sha1()
+    if isinstance(val, bytes):
+        sha1.update(str(val).encode("utf-8"))
+    elif isinstance(val, str):
+        sha1.update(val.encode("utf-8"))
+    res = sha1.hexdigest()
+    return res
+
+
+def extract_file_type(text):
+    if text is None:
+        return None
+
+    file_types = {
+        'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png'
+    }
+    for file_type in file_types:
+        tmp = [file_type, file_type.upper()]
+        for t in tmp:
+            result = re.match(f'.*{t}$', text, re.S)
+            if result is not None:
+                return t
+    else:
+        return None
+
+
+def extract_file_name(href: str, file_type: str):
+    # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
+    # 中文字符:[\u4e00 -\u9fa5]
+    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
+    parser = urlparse(href)
+    query = (parser.query or parser.path)
+    result = re.search(f'.*\\.{file_type}', query, re.S)
+    if result is not None:
+        encode_str = unquote(result.group())
+        name = re.search(zh_char_pattern, encode_str)
+        if name is not None:
+            return unquote(name.group())
+    return None
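+
+# Expected behaviour, sketched with illustrative values:
+#
+#   extract_file_type("附件.pdf")   -> "pdf"
+#   extract_file_type("公告详情")    -> None
+#   extract_file_name("http://example.com/down?f=%E6%8B%9B%E6%A0%87%E6%96%87%E4%BB%B6.pdf", "pdf")
+#                                  -> "招标文件"  (the URL-decoded Chinese part of the file name)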
+
+
+def verify_file_name(name):
+    if extract_file_type(name) is None:
+        raise ValueError
+
+
+class AttachmentDownloader:
+
+    def __init__(self):
+        self.dir_name = 'file'
+
+    def create_dir(self):
+        if not os.path.exists(self.dir_name):
+            os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
+
+    def create_file_path(self, filename, file_type):
+        self.create_dir()
+        sign = hex_sha1("{}_{}".format(filename, uuid.uuid4()))
+        tmp_name = "{}.{}".format(sign, file_type)
+        return "{}/{}".format(self.dir_name, tmp_name)
+
+    @staticmethod
+    def create_fid(file_stream: bytes):
+        return hex_sha1(file_stream)
+
+    @staticmethod
+    def _fetch_attachment(
+            url: str,
+            file_path: str,
+            enable_proxy=False,
+            allow_show_exception=False,
+            **kwargs
+    ):
+        request_params = {}
+        request_params.setdefault('headers', kwargs.get('headers') or headers)
+        request_params.setdefault('proxies', kwargs.get('proxies'))
+        request_params.setdefault('timeout', kwargs.get('timeout') or 60)
+        request_params.setdefault('stream', kwargs.get('stream') or True)
+        request_params.setdefault('verify', kwargs.get('verify') or False)
+        if enable_proxy:
+            proxy = ProxyPool()
+        else:
+            proxy = {}
+        retries = 0
+        while retries < 3:
+            try:
+                with requests.get(url, **request_params) as req:
+                    if req.status_code == 200:
+                        stream = req.content
+                        with open(file_path, 'wb') as f:
+                            f.write(stream)
+                        return stream
+                    else:
+                        retries += 1
+            except requests.RequestException:
+                if allow_show_exception:
+                    traceback.print_exc()
+                if enable_proxy:
+                    request_params.update({'proxies': proxy.get()})
+                retries += 1
+        return b''
+
+    @staticmethod
+    def clean_attachment(file_path):
+        os.remove(file_path)
+
+    @staticmethod
+    def getsize(file_path: str):
+        def _getsize(filename):
+            try:
+                return os.path.getsize(filename)
+            except:
+                return 0
+
+        _kb = float(_getsize(file_path)) / 1024
+        if _kb >= 1024:
+            _M = _kb / 1024
+            if _M >= 1024:
+                _G = _M / 1024
+                return "{:.1f} G".format(_G)
+            else:
+                return "{:.1f} M".format(_M)
+        else:
+            return "{:.1f} kb".format(_kb)
+
+    def fetch_attachment(
+            self,
+            file_name: str,
+            file_type: str,
+            download_url: str,
+            enable_proxy=False,
+            allow_request_exception=False,
+            **kwargs
+    ):
+        if not file_name or not file_type or not download_url:
+            raise AttachmentNullError
+
+        file_path = self.create_file_path(file_name, file_type)
+        file_stream = self._fetch_attachment(
+            download_url,
+            file_path,
+            enable_proxy,
+            allow_request_exception,
+            **kwargs
+        )
+        if len(file_stream) > 0:
+            fid = self.create_fid(file_stream)
+            '''上传/下载,无论失败成功都需要给出文件基础信息'''
+            try:
+                result = {
+                    'filename': file_name,
+                    'ftype': file_type,
+                    'fid': "{}.{}".format(fid, file_type),
+                    'org_url': download_url,
+                    'size': self.getsize(file_path),
+                    'url': 'oss',
+                }
+                AliYunService().push_oss_from_local(result['fid'], file_path)
+            except Exception:
+                result = {
+                    'filename': file_name,
+                    'org_url': download_url,
+                }
+            self.clean_attachment(file_path)
+        else:
+            result = {
+                'filename': file_name,
+                'org_url': download_url,
+            }
+        return result
+
+
+# if __name__ == '__main__':
+    # a = AttachmentDownloader().fetch_attachment(
+    #     file_name='成建制移民村(五标段)合同',
+    #     file_type='pdf',
+    #     download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
+    # )
+    # print(a)

+ 61 - 0
FworkSpider/untils/chaojiying.py

@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# coding:utf-8
+
+import requests
+from hashlib import md5
+
+class Chaojiying_Client(object):
+
+    def __init__(self, username, password, soft_id):
+        self.username = username
+        password =  password.encode('utf8')
+        self.password = md5(password).hexdigest()
+        self.soft_id = soft_id
+        self.base_params = {
+            'user': self.username,
+            'pass2': self.password,
+            'softid': self.soft_id,
+        }
+        self.headers = {
+            'Connection': 'Keep-Alive',
+            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
+        }
+
+    def PostPic(self, im, codetype):
+        """
+        im: 图片字节
+        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
+        """
+        params = {
+            'codetype': codetype,
+        }
+        params.update(self.base_params)
+        files = {'userfile': ('ccc.jpg', im)}
+        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
+        return r.json()
+
+    def ReportError(self, im_id):
+        """
+        im_id:报错题目的图片ID
+        """
+        params = {
+            'id': im_id,
+        }
+        params.update(self.base_params)
+        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
+        return r.json()
+
+
+if __name__ == '__main__':
+    # chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '超级鹰')	#用户中心>>软件ID 生成一个替换 96001
+    # im = open('a.jpg', 'rb').read()													#本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
+    # # print(chaojiying.PostPic(im, 1902))
+    # res = chaojiying.PostPic(im, 2004)
+    # print(res)
+    # if res.get("err_no") != 0:
+    #     chaojiying.ReportError(res.get("pic_id"))
+    # if res.get("")
+    code = "haoho"
+    url = 'http://www.ccgp-fujian.gov.cn/3500/noticelist/e8d2cd51915e4c338dc1c6ee2f02b127/?page={page}&verifycode=胡吃海喝'[:-4]+code
+
+    print(url)

+ 788 - 0
FworkSpider/untils/cookie_pool.py

@@ -0,0 +1,788 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2018/12/27 11:32 AM
+---------
+@summary: cookie池
+---------
+@author: Boris
+@email:  boris_liu@foxmail.com
+"""
+
+import abc
+import datetime
+import random
+import time
+import warnings
+from collections.abc import Iterable  # moved from `collections` to `collections.abc` in newer Python versions
+from enum import Enum, unique
+import requests
+from feapder.db.mongodb import MongoDB
+
+import feapder.utils.tools as tools
+from feapder import setting
+from feapder.network import user_agent
+
+from feapder.db.mysqldb import MysqlDB
+from feapder.db.redisdb import RedisDB
+from feapder.utils import metrics
+from feapder.utils.log import log
+from feapder.utils.redis_lock import RedisLock
+from feapder.utils.tools import send_msg
+from feapder.utils.webdriver import WebDriver
+
+
+class CookiePoolInterface(metaclass=abc.ABCMeta):
+    """
+    cookie pool interface
+    """
+
+    @abc.abstractmethod
+    def create_cookie(self, *args, **kwargs):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_cookie(self, *args, **kwargs):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def del_cookie(self, *args, **kwargs):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def run(self):
+        raise NotImplementedError
+
+
+class PageCookiePool(CookiePoolInterface):
+    """
+    由页面产生的cookie 不需要用户登陆
+    """
+
+    def __init__(
+        self,
+        redis_key,
+        page_url=None,
+        min_cookies=10000,
+        must_contained_keys=(),
+        keep_alive=False,
+        **kwargs,
+    ):
+        """
+        @param redis_key: 项目名
+        @param page_url: 生产cookie的url
+        @param min_cookies: 最小cookie数
+        @param must_contained_keys: cookie 必须包含的key
+        @param keep_alive: 当cookie数量足够是是否保持随时待命,生产cookie的状态。False为否,满足则退出
+        ---
+        @param kwargs: WebDriver的一些参数
+            load_images: 是否加载图片
+            user_agent_pool: user-agent池 为None时不使用
+            proxies_pool: ;代理池 为None时不使用
+            headless: 是否启用无头模式
+            driver_type: web driver 类型
+            timeout: 请求超时时间 默认16s
+            window_size: 屏幕分辨率 (width, height)
+
+        """
+
+        self._redisdb = RedisDB()
+
+        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
+        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
+            redis_key
+        )  # 存储上一次统计cookie 数量的时间,格式为 时间戳:数量
+        self._page_url = page_url
+        self._min_cookies = min_cookies
+        self._must_contained_keys = must_contained_keys
+        self._keep_alive = keep_alive
+
+        self._kwargs = kwargs
+        self._kwargs.setdefault("load_images", False)
+        self._kwargs.setdefault("headless", True)
+
+    def create_cookie(self):
+        """
+        Default implementation; subclasses are expected to override this.
+        @return: cookie dict
+        """
+        log.debug(self._kwargs)
+        url = 'https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do'
+        header = {
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": user_agent.get()
+        }
+        res = requests.get(url, headers=header)
+        cookies = requests.utils.dict_from_cookiejar(res.cookies)
+        return cookies
+
+
+    def add_cookies(self, cookies):
+        log.info("添加cookie {}".format(cookies))
+        self._redisdb.lpush(self._tab_cookie_pool, cookies)
+    def run(self):
+        while True:
+            try:
+                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
+                need_cookie_count = self._min_cookies - now_cookie_count
+
+                if need_cookie_count > 0:
+                    log.info(
+                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
+                            now_cookie_count, self._min_cookies
+                        )
+                    )
+                    try:
+                        cookies = self.create_cookie()
+                        if cookies:
+                            self.add_cookies(cookies)
+                    except Exception as e:
+                        log.exception(e)
+                else:
+                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
+
+                    # 判断cookie池近一分钟数量是否有变化,无变化则认为爬虫不再用了,退出
+                    last_count_info = self._redisdb.strget(
+                        self._tab_cookie_pool_last_count
+                    )
+                    if not last_count_info:
+                        self._redisdb.strset(
+                            self._tab_cookie_pool_last_count,
+                            "{}:{}".format(time.time(), now_cookie_count),
+                        )
+                    else:
+                        last_time, last_count = last_count_info.split(":")
+                        last_time = float(last_time)
+                        last_count = int(last_count)
+
+                        if time.time() - last_time > 60:
+                            if now_cookie_count == last_count:
+                                log.info("近一分钟,cookie池数量无变化,判定爬虫未使用,退出生产")
+                                break
+                            else:
+                                self._redisdb.strset(
+                                    self._tab_cookie_pool_last_count,
+                                    "{}:{}".format(time.time(), now_cookie_count),
+                                )
+
+                    if self._keep_alive:
+                        log.info("sleep 10")
+                        tools.delay_time(10)
+                    else:
+                        break
+
+            except Exception as e:
+                log.exception(e)
+                tools.delay_time(1)
+
+    def get_cookie(self, wait_when_null=True):
+        while True:
+            try:
+                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
+                if not cookie_info and wait_when_null:
+                    log.info("暂无cookie 生产中...")
+                    self._keep_alive = False
+                    self._min_cookies = 1
+                    with RedisLock(
+                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
+                    ) as _lock:
+                        if _lock.locked:
+                            self.run()
+                    continue
+                return eval(cookie_info) if cookie_info else {}
+            except Exception as e:
+                log.exception(e)
+                tools.delay_time(1)
+
+    def del_cookie(self, cookies):
+        self._redisdb.lrem(self._tab_cookie_pool, cookies)
+
+# PageCookiePool('cookie_1',page_url="https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do").create_cookie()
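+
+# Usage sketch (illustrative): the crawler side only pulls cookies from Redis;
+# run()/create_cookie() are triggered automatically when the pool is empty.
+#
+#   pool = PageCookiePool("demo_site", min_cookies=5, keep_alive=False)
+#   cookies = pool.get_cookie()     # dict, e.g. {"JSESSIONID": "..."}
+#   ...                             # use it with requests
+#   pool.del_cookie(cookies)        # drop it once the target site rejects it
+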
+class User:
+    def __init__(self, username, cookie):
+        self.username = username
+        self.cookie = cookie
+
+
+class LoginCookiePool(CookiePoolInterface):
+    """
+    Cookie pool that requires login; account credentials and login state are kept in MongoDB (table_userbase)
+    """
+
+    def __init__(
+        self,
+        redis_key,
+        *,
+        table_userbase,
+        login_state_key="login_state",
+        lock_state_key="lock_state",
+        username_key="username",
+        password_key="password",
+        login_retry_times=10,
+    ):
+        """
+        @param redis_key: 项目名
+        @param table_userbase: 用户表名
+        @param login_state_key: 登录状态列名
+        @param lock_state_key: 封锁状态列名
+        @param username_key: 登陆名列名
+        @param password_key: 密码列名
+        @param login_retry_times: 登陆失败重试次数
+        """
+
+        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
+        self._login_retry_times = login_retry_times
+        self._table_userbase = table_userbase
+        self._login_state_key = login_state_key
+        self._lock_state_key = lock_state_key
+        self._username_key = username_key
+        self._password_key = password_key
+
+        self._redisdb = RedisDB()
+        self._mongo = MongoDB(db='user_login')
+
+
+    def create_cookie(self, username, password):
+
+        """
+        创建cookie
+        @param username: 用户名
+        @param password: 密码
+        @return: return cookie / None
+        """
+        raise NotImplementedError
+
+    def get_user_info(self):
+        """
+        返回用户信息
+        @return: yield username, password
+        """
+
+        return self._mongo.find(self._table_userbase,{self._lock_state_key:0,self._login_state_key:0})
+
+    def handle_login_failed_user(self, username, password):
+        """
+        处理登录失败的user
+        @param username:
+        @param password:
+        @return:
+        """
+
+        pass
+
+    def handel_exception(self, e):
+        """
+        处理异常
+        @param e:
+        @return:
+        """
+        log.exception(e)
+
+    def save_cookie(self, username, cookie):
+        user_cookie = {"username": username, "cookie": cookie}
+
+        self._redisdb.lpush(self._tab_cookie_pool, user_cookie)
+        self._mongo.add(
+                coll_name=self._table_userbase,
+                data={self._login_state_key:1},
+                update_columns=self._username_key,
+                update_columns_value=username)
+
+    def get_cookie(self, wait_when_null=True) -> User:
+        while True:
+            try:
+                user_cookie = self._redisdb.rpoplpush(self._tab_cookie_pool)
+                if not user_cookie and wait_when_null:
+                    log.info("暂无cookie 生产中...")
+                    self.login()
+                    continue
+
+                if user_cookie:
+                    user_cookie = eval(user_cookie)
+                    return User(**user_cookie)
+
+                return None
+            except Exception as e:
+                log.exception(e)
+                tools.delay_time(1)
+
+    def del_cookie(self, user: User):
+        """
+        删除失效的cookie
+        @param user:
+        @return:
+        """
+        user_info = {"username": user.username, "cookie": user.cookie}
+        self._redisdb.lrem(self._tab_cookie_pool, user_info)
+
+        self._mongo.add(
+            coll_name=self._table_userbase,
+            data={self._login_state_key: 1},
+            update_columns=self._username_key,
+            update_columns_value=user.username)
+
+    def user_is_locked(self, user: User):
+
+        self._mongo.add(
+            coll_name=self._table_userbase,
+            data={self._lock_state_key: 1},
+            update_columns=self._username_key,
+            update_columns_value=user.username)
+
+    def run(self):
+        with RedisLock(
+            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
+        ) as _lock:
+            if _lock.locked:
+                user_infos = self.get_user_info()
+                if not isinstance(user_infos, Iterable):
+                    raise ValueError("get_user_info 返回值必须可迭代")
+
+                if not user_infos:
+                    log.info("无可用用户")
+
+                for info in user_infos:
+                    username = info.get("username")
+                    password = info.get("password")
+                    for i in range(self._login_retry_times):
+                        try:
+                            cookie = self.create_cookie(username, password)
+                            if cookie:
+                                self.save_cookie(username, cookie)
+                            else:
+                                self.handle_login_failed_user(username, password)
+
+                            break
+                        except Exception as e:
+                            self.handel_exception(e)
+
+                    else:
+                        self.handle_login_failed_user(username, password)
+
+    login = run
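+
+
+# Subclass sketch (illustrative): a site-specific pool only needs to implement
+# create_cookie(); run()/login() iterates the user table and stores the result,
+# in the same way login_pool/zglbw.py does for its site.
+#
+#   class DemoLoginPool(LoginCookiePool):
+#       def create_cookie(self, username, password):
+#           session = requests.Session()
+#           session.post("https://example.com/login",
+#                        data={"username": username, "password": password})
+#           return requests.utils.dict_from_cookiejar(session.cookies)
+#
+#   pool = DemoLoginPool("demo_site", table_userbase="demo_user")
+#   user = pool.get_cookie()        # returns a User(username, cookie)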
+
+
+@unique
+class LimitTimesUserStatus(Enum):
+    # 使用状态
+    USED = "used"
+    SUCCESS = "success"
+    OVERDUE = "overdue"  # cookie 过期
+    SLEEP = "sleep"
+    EXCEPTION = "exception"
+    # 登陆状态
+    LOGIN_SUCCESS = "login_success"
+    LOGIN_FALIED = "login_failed"
+
+
+class LimitTimesUser:
+    """
+    有次数限制的账户
+    基于本地做的缓存,不支持多进程调用
+    """
+
+    ACCOUNT_INFO_KEY = "accounts:h_account_info"  # 存储cookie的redis key
+    SITE_NAME = ""  # 网站名
+
+    redisdb = None
+
+    def __init__(
+        self,
+        username,
+        password,
+        max_search_times,
+        proxies=None,
+        search_interval=0,
+        **kwargs,
+    ):
+        """
+        @param username:
+        @param password:
+        @param max_search_times:
+        @param proxies:
+        @param search_interval: 调用时间间隔。 支持元组 指定间隔的时间范围 如(5,10)即5到10秒;或直接传整数
+        """
+        self.__dict__.update(kwargs)
+        self.username = username
+        self.password = password
+        self.max_search_times = max_search_times
+        self.proxies = proxies
+        self.search_interval = search_interval
+        self.delay_use = 0  # 延时使用,用于等待解封的用户
+
+        if isinstance(search_interval, (tuple, list)):
+            if len(search_interval) != 2:
+                raise ValueError("search_interval 需传递两个值的元组或列表。如(5,10)即5到10秒")
+
+            self.used_for_time_length = (
+                search_interval[1] * 5
+            )  # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
+        else:
+            self.used_for_time_length = (
+                search_interval * 5
+            )  # 抢占式爬虫独享cookie时间,这段时间内其他爬虫不可抢占
+
+        self.account_info = {
+            "login_time": 0,
+            "cookies": {},
+            "search_times": 0,
+            "last_search_time": 0,
+            "used_for_spider_name": None,  # 只被某个爬虫使用 其他爬虫不可使用
+            "init_search_times_time": 0,  # 初始化搜索次数的时间
+        }
+
+        if not self.__class__.redisdb:
+            self.__class__.redisdb = RedisDB()
+
+        self.sync_account_info_from_redis()
+
+        self.__init_metrics()
+
+    def __init_metrics(self):
+        """
+        初始化打点系统
+        @return:
+        """
+        metrics.init(**setting.METRICS_OTHER_ARGS)
+
+    def record_user_status(self, status: LimitTimesUserStatus):
+        metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
+
+    def __repr__(self):
+        return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
+
+    def __eq__(self, other):
+        return self.username == other.username
+
+    def sync_account_info_from_redis(self):
+        account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
+        if account_info:
+            account_info = eval(account_info)
+            self.account_info.update(account_info)
+
+    @property
+    def cookies(self):
+        cookies = self.account_info.get("cookies")
+        return cookies
+
+    def set_cookies(self, cookies):
+        self.account_info["cookies"] = cookies
+        return self.redisdb.hset(
+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
+        )
+
+    def set_login_time(self, login_time=None):
+        self.account_info["login_time"] = login_time or time.time()
+        return self.redisdb.hset(
+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
+        )
+
+    def get_login_time(self):
+        return self.account_info.get("login_time")
+
+    def is_time_to_login(self):
+        return time.time() - self.get_login_time() > 40 * 60
+
+    def get_last_search_time(self):
+        return self.account_info.get("last_search_time", 0)
+
+    def is_time_to_search(self):
+        if self.delay_use:
+            is_time = time.time() - self.get_last_search_time() > self.delay_use
+            if is_time:
+                self.delay_use = 0
+
+        else:
+            is_time = time.time() - self.get_last_search_time() > (
+                random.randint(*self.search_interval)
+                if isinstance(self.search_interval, (tuple, list))
+                else self.search_interval
+            )
+
+        return is_time
+
+    @property
+    def used_for_spider_name(self):
+        return self.account_info.get("used_for_spider_name")
+
+    @used_for_spider_name.setter
+    def used_for_spider_name(self, spider_name):
+        self.account_info["used_for_spider_name"] = spider_name
+
+    def update_status(self):
+        """
+        更新search的一些状态
+        @return:
+        """
+        self.account_info["search_times"] += 1
+        self.account_info["last_search_time"] = time.time()
+
+        return self.redisdb.hset(
+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
+        )
+
+    @property
+    def search_times(self):
+        init_search_times_time = self.account_info.get("init_search_times_time")
+        current_time = time.time()
+        if (
+            current_time - init_search_times_time >= 86400
+        ):  # reset the search count if more than a day has passed since it was last initialised
+            self.account_info["search_times"] = 0
+            self.account_info["init_search_times_time"] = current_time
+
+            self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
+
+        return self.account_info["search_times"]
+
+    def is_overwork(self):
+        if self.search_times > self.max_search_times:
+            log.warning("账号 {} 请求次数超限制".format(self.username))
+            return True
+
+        return False
+
+    def is_at_work_time(self):
+        if datetime.datetime.now().hour in list(range(7, 23)):
+            return True
+
+        log.warning("账号 {} 不在工作时间内".format(self.username))
+        return False
+
+    def del_cookie(self):
+        self.account_info["cookies"] = {}
+        return self.redisdb.hset(
+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
+        )
+
+    def create_cookie(self):
+        """
+        生产cookie 有异常需要抛出
+        @return: cookie_dict
+        """
+
+        raise NotImplementedError
+
+    def login(self):
+        """
+        @return: 1 成功 0 失败
+        """
+
+        try:
+            # 预检查
+            if not self.is_time_to_login():
+                log.info("此账号尚未到登陆时间: {}".format(self.username))
+                time.sleep(5)
+                return 0
+
+            cookies = self.create_cookie()
+            if not cookies:
+                raise Exception("登陆失败 未获取到合法cookie")
+
+            if not isinstance(cookies, dict):
+                raise Exception("cookie 必须为字典格式")
+
+            # 保存cookie
+            self.set_login_time()
+            self.set_cookies(cookies)
+            log.info("登录成功 {}".format(self.username))
+            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
+            return 1
+
+        except Exception as e:
+            log.exception(e)
+            send_msg(
+                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
+                level="error",
+                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
+            )
+
+        log.info("登录失败 {}".format(self.username))
+        self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
+        return 0
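+
+
+# Subclass sketch (illustrative): a concrete account type implements
+# create_cookie(); LimitTimesUserPool below cycles such accounts and enforces
+# the per-day search limits.
+#
+#   class DemoUser(LimitTimesUser):
+#       SITE_NAME = "demo"
+#
+#       def create_cookie(self):
+#           return {"sessionid": "placeholder"}   # real login logic goes here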
+
+
+class LimitTimesUserPool:
+    """
+    限制查询次数的用户的User pool
+    基于本地做的缓存,不支持多进程调用
+    """
+
+    LOAD_USER_INTERVAL = 60
+
+    def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
+        """
+        @param accounts_dict: account info dict
+            {
+                "15011300228": {
+                    "password": "300228",
+                    "proxies": {},
+                    "max_search_times": 500,
+                    "search_interval": 1,  # minimum interval between uses (seconds)
+                    # any other extra info to carry along
+                }
+            }
+        @param limit_user_class: the user's LimitTimesUser subclass
+        @param support_more_client: whether multiple clients (multi-thread / multi-process mode) are supported; counts and usage frequency may then be slightly inaccurate
+        """
+        self.accounts_dict = accounts_dict
+        self.limit_user_class = limit_user_class
+
+        self.limit_times_users = []
+        self.current_user_index = -1
+
+        self.support_more_client = support_more_client
+
+        self.last_load_user_time = 0
+
+    def __load_users(self, username=None):
+        # load users
+        log.info("Refreshing available users")
+
+        for _username, detail in self.accounts_dict.items():
+            if username and username != _username:
+                continue
+
+            limit_times_users = self.limit_user_class(username=_username, **detail)
+            if limit_times_users in self.limit_times_users:
+                continue
+
+            if limit_times_users.is_overwork():
+                continue
+            else:
+                if (
+                    limit_times_users.cookies or limit_times_users.login()
+                ):  # add to the available user queue if it has a cookie or logs in successfully
+                    self.limit_times_users.append(limit_times_users)
+
+        self.last_load_user_time = time.time()
+
+    def get_user(
+        self,
+        username=None,
+        used_for_spider_name=None,
+        wait_when_null=True,
+        not_limit_frequence=False,
+    ) -> LimitTimesUser:
+        """
+        @params username: get the specified user
+        @params used_for_spider_name: exclusive use; the name of the spider holding the user, which other spiders cannot preempt
+        @params wait_when_null: whether to wait when no user is available
+        @params not_limit_frequence: do not limit the usage frequency
+        @return: LimitTimesUser
+        """
+        if not self.support_more_client and not getattr(self, "_is_show_warning", False):
+            warnings.warn(
+                "LimitTimesUserCookiePool caches search counts locally and does not support multi-process or multi-thread use",
+                category=Warning,
+            )
+            self._is_show_warning = True
+
+        while True:
+            if (
+                not self.limit_times_users
+                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
+            ):
+                self.__load_users(username)
+                if not self.limit_times_users:
+                    log.warning("无可用的用户")
+                    if wait_when_null:
+                        time.sleep(1)
+                        continue
+                    else:
+                        return None
+
+            self.current_user_index += 1
+            self.current_user_index = self.current_user_index % len(
+                self.limit_times_users
+            )
+
+            limit_times_user = self.limit_times_users[self.current_user_index]
+            if self.support_more_client:  # need to sync the latest data from redis first
+                limit_times_user.sync_account_info_from_redis()
+
+            if username and limit_times_user.username != username:
+                log.info(
+                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
+                )
+                time.sleep(1)
+                continue
+
+            # exclusive use: if the user is held by another spider, check whether the wait has exceeded the exclusive period; if so, it can be used
+            if (
+                limit_times_user.used_for_spider_name
+                and limit_times_user.used_for_spider_name != used_for_spider_name
+            ):
+                wait_time = time.time() - limit_times_user.get_last_search_time()
+                if wait_time < limit_times_user.used_for_time_length:
+                    log.info(
+                        "用户{} 被 {} 爬虫独占,需等待 {} 秒后才可使用".format(
+                            limit_times_user.username,
+                            limit_times_user.used_for_spider_name,
+                            limit_times_user.used_for_time_length - wait_time,
+                        )
+                    )
+                    time.sleep(1)
+                    continue
+
+            if (
+                not limit_times_user.is_overwork()
+                and limit_times_user.is_at_work_time()
+            ):
+                if not limit_times_user.cookies:
+                    self.limit_times_users.remove(limit_times_user)
+                    continue
+
+                if not_limit_frequence or limit_times_user.is_time_to_search():
+                    limit_times_user.used_for_spider_name = used_for_spider_name
+
+                    limit_times_user.update_status()
+                    log.info("使用用户 {}".format(limit_times_user.username))
+                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
+                    return limit_times_user
+                else:
+                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
+                    time.sleep(1)
+                    continue
+            else:
+                self.limit_times_users.remove(limit_times_user)
+                self.current_user_index -= 1
+
+                if not limit_times_user.is_at_work_time():
+                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
+                    if wait_when_null:
+                        time.sleep(30)
+                        continue
+                    else:
+                        return None
+
+    def del_user(self, username):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.del_cookie()
+                self.limit_times_users.remove(limit_times_user)
+                limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
+                self.__load_users(username)
+                break
+
+    def update_cookies(self, username, cookies):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.set_cookies(cookies)
+                break
+
+    def delay_use(self, username, delay_seconds):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.delay_use = delay_seconds
+                limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
+                break
+
+    def record_success_user(self, username):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
+
+    def record_exception_user(self, username):
+        for limit_times_user in self.limit_times_users:
+            if limit_times_user.username == username:
+                limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)
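
A minimal usage sketch for the pool above (illustrative, not part of the diff). It assumes the base class keeps username/password from accounts_dict, which __load_users passes through unchanged; the login endpoint and response handling are placeholders:

    import requests
    from feapder.network.cookie_pool import LimitTimesUser, LimitTimesUserPool

    class ExampleSiteUser(LimitTimesUser):
        def create_cookie(self):
            # hypothetical login flow; replace with the target site's real one
            session = requests.Session()
            session.post(
                "https://example.com/login",
                data={"username": self.username, "password": self.password},
                timeout=10,
            )
            cookies = session.cookies.get_dict()
            if not cookies:
                raise Exception("Login failed: no cookie returned")
            return cookies

    accounts_dict = {
        "15011300228": {
            "password": "300228",
            "proxies": {},
            "max_search_times": 500,
            "search_interval": 1,
        }
    }

    pool = LimitTimesUserPool(
        accounts_dict=accounts_dict,
        limit_user_class=ExampleSiteUser,
        support_more_client=True,
    )

    user = pool.get_user(used_for_spider_name="example_spider")
    if user:
        # use user.cookies for the search request, then report the outcome back to the pool
        pool.record_success_user(user.username)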

+ 33 - 0
FworkSpider/untils/create_menus.py

@@ -0,0 +1,33 @@
+from feapder.db.mongodb import MongoDB
+
+
+class Details:
+    _to_db = None
+    _to_db_xs = None
+    db_name = 'mgp_list'
+
+    # lazily create the MongoDB connection
+    @property
+    def to_db(self):
+        if not self._to_db:
+            self._to_db = MongoDB()
+        return self._to_db
+
+    @property
+    def to_db_xs(self):
+        if not self._to_db_xs:
+            self._to_db_xs = MongoDB(port=27001, db='editor')
+        return self._to_db_xs
+
+    def main(self, page):
+        menus_list = []
+        data = self.to_db_xs.find("luaconfig",{"modifyuser":"maguopeng","param_common":{"$elemMatch": {"$regex": "广东省政府采购网", "$options": "$i"}}})
+        # print(data)
+        for item in data:
+            # print(item)
+            channls = item.get("param_common")[2]
+            code = item.get("code")
+            href = item.get("param_common")[11]
+            print("Menu"+"(",f"'{channls}',",f"'{code}',\n",f"'{href}',",page,"),")
+        #     menus_list.append(f'''Menu({channls},{code},{href},{page})''')
+        # print(menus_list)
+
+Details().main(2)
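
For reference, a sketch of the same lookup that collects entries instead of printing them; Menu here is a hypothetical namedtuple whose fields follow the order printed above (channel, code, href, page):

    from collections import namedtuple

    from feapder.db.mongodb import MongoDB

    Menu = namedtuple("Menu", ["channel", "code", "href", "crawl_page"])

    def build_menus(page, site="广东省政府采购网"):
        # same query as above: lua configs for the site, matched case-insensitively
        db = MongoDB(port=27001, db="editor")
        query = {
            "modifyuser": "maguopeng",
            "param_common": {"$elemMatch": {"$regex": site, "$options": "i"}},
        }
        menus = []
        for item in db.find("luaconfig", query):
            common = item.get("param_common")
            menus.append(Menu(common[2], item.get("code"), common[11], page))
        return menus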

+ 19 - 0
FworkSpider/untils/execptions.py

@@ -0,0 +1,19 @@
+
+class CustomCheckError(Exception):
+
+    def __init__(self, code: int = 10002, reason: str = 'feature condition check failed'):
+        self.code = code
+        self.reason = reason
+
+
+class AttachmentNullError(Exception):
+
+    def __init__(self, code: int = 10004, reason: str = 'attachment download failed'):
+        self.code = code
+        self.reason = reason
+
+
+class CustomAccountPrivilegeError(Exception):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args)
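
A minimal sketch of how these exceptions might be used by a detail parser (the checks and the import path are illustrative assumptions, not part of the diff):

    from untils.execptions import AttachmentNullError, CustomCheckError

    def check_detail(html, attachments):
        # raise the custom errors when a scraped detail page looks wrong
        if not html or len(html) < 10:
            raise CustomCheckError(reason="detail page content is empty or too short")
        if attachments is not None and not attachments:
            # attachments were expected but none were downloaded
            raise AttachmentNullError()

    try:
        check_detail(html="", attachments={})
    except (CustomCheckError, AttachmentNullError) as e:
        print(e.code, e.reason)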

Too many files were changed in this diff, so some files are not shown.