3 gadi atpakaļ · 8ba6e05988
--- a/Crawlb/docker-compose.yml
+++ b/Crawlb/docker-compose.yml
@@ -0,0 +1,47 @@
 
				+version: '3.3'
			
 
				+services:
			
 
				+  master:
			
 
				+    image: swordfish:v1
			
 
				+    container_name: master_new
			
 
				+    environment:
			
 
				+       CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址，在 docker compose 网络中，直接引用服务名称
			
 
				+       CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
			
 
				+       CRAWLAB_SERVER_MASTER: "Y"
			
 
				+       CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
			
 
				+       CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
			
 
				+       CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
			
 
				+       CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
			
 
				+       CRAWLAB_REDIS_ADDRESS: "redis"  #
			
 
				+#       CRAWLAB_REDIS_ADDRESS: "172.19.0.2"  # Redis host address Redis 的地址，在 docker compose 网络中，直接引用服务名称
			
 
				+       CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
			
 
				+       CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
			
 
				+       CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
			
 
				+       CRAWLAB_SERVER_REGISTER_TYPE: "mac"
			
 
				+    volumes: # 目录挂载，宿主机在前，容器在后
			
 
				+      - /mnt/magp:/magp
			
 
				+    ports:
			
 
				+        - "8998:8080"
			
 
				+
			
 
				+
			
 
				+#    depends_on:
			
 
				+#          - redis
			
 
				+
			
 
				+#    deploy:
			
 
				+#      resources:
			
 
				+#        limits:
			
 
				+#          memory: 15G
			
 
				+#        reservations:
			
 
				+#          memory: 1G
			
 
				+
			
 
				+#  mongo:
			
 
				+#    image: mongo:latest
			
 
				+#    restart: always
			
 
				+#    ports:
			
 
				+#      - "27027:27017"
			
 
				+#  redis:
			
 
				+#    image: redis:latest
			
 
				+#    container_name: master_redis
			
 
				+#    restart: always
			
 
				+#    ports:
			
 
				+#      - "6379:6379"
			
 
				+#  wget http://download.firefox.com.cn/releases/firefox/78.14/zh-CN/Firefox-latest-x86_64.tar.bz2
			
--- a/Crawlb/docker-compose_work.yml
+++ b/Crawlb/docker-compose_work.yml
@@ -0,0 +1,54 @@
 
				+version: '3.3'
			
 
				+services:
			
 
				+  worker01:
			
 
				+    image: swordfish:v1
			
 
				+    container_name: crawlab_worker01
			
 
				+    environment:
			
 
				+      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址，在 docker compose 网络中，直接引用服务名称
			
 
				+      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
			
 
				+      CRAWLAB_SERVER_MASTER: "N"
			
 
				+      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
			
 
				+      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
			
 
				+      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
			
 
				+      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
			
 
				+      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址，在 docker compose 网络中，直接引用服务名称
			
 
				+      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
			
 
				+      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
			
 
				+      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
			
 
				+      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
			
 
				+      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
			
 
				+
			
 
				+  worker02:
			
 
				+    image: swordfish:v1
			
 
				+    container_name: crawlab_worker02
			
 
				+    environment:
			
 
				+      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址，在 docker compose 网络中，直接引用服务名称
			
 
				+      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
			
 
				+      CRAWLAB_SERVER_MASTER: "N"
			
 
				+      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
			
 
				+      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
			
 
				+      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
			
 
				+      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
			
 
				+      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址，在 docker compose 网络中，直接引用服务名称
			
 
				+      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
			
 
				+      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
			
 
				+      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
			
 
				+      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
			
 
				+      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
			
 
				+  worker03:
			
 
				+    image: swordfish:v1
			
 
				+    container_name: crawlab_worker03
			
 
				+    environment:
			
 
				+      CRAWLAB_MONGO_HOST: "172.17.4.87"  # MongoDB host address MongoDB 的地址，在 docker compose 网络中，直接引用服务名称
			
 
				+      CRAWLAB_MONGO_PORT: "27080"  # MongoDB port MongoDB 的端口
			
 
				+      CRAWLAB_SERVER_MASTER: "N"
			
 
				+      CRAWLAB_MONGO_DB: "jianyu_manage"  # MongoDB database MongoDB 的数据库
			
 
				+      CRAWLAB_MONGO_USERNAME: ""  # MongoDB username MongoDB 的用户名
			
 
				+      CRAWLAB_MONGO_PASSWORD: ""  # MongoDB password MongoDB 的密码
			
 
				+      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # MongoDB auth source MongoDB 的验证源
			
 
				+      CRAWLAB_REDIS_ADDRESS: "redis"  # Redis host address Redis 的地址，在 docker compose 网络中，直接引用服务名称
			
 
				+      CRAWLAB_REDIS_PORT: "6379"  # Redis port Redis 的端口
			
 
				+      CRAWLAB_REDIS_DATABASE: "8"  # Redis database Redis 的数据库
			
 
				+      CRAWLAB_REDIS_PASSWORD: ""  # Redis password Redis 的密码
			
 
				+      CRAWLAB_SERVER_REGISTER_TYPE: "ip"
			
 
				+      CRAWLAB_FS_FILER_URL: "http://101.200.210.94:8998/api/filer"
			
--- a/FworkSpider/details/__init__.py
+++ b/FworkSpider/details/__init__.py
@@ -0,0 +1,15 @@
 
				+import requests
			
 
				+
			
 
				+
			
 
				+headers = {
			
 
				+
			
 
				+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
			
 
				+}
			
 
				+cookies = {
			
 
				+    "__jsluid_h": "018c23a4fee58c26aa118512640f8022"
			
 
				+}
			
 
				+url = "http://www.snszgh.gov.cn/gsgg/index.html"
			
 
				+response = requests.get(url, headers=headers,verify=False)
			
 
				+
			
 
				+print(response.text)
			
 
				+print(response)
			
--- a/FworkSpider/details/detail_ztlbw.py
+++ b/FworkSpider/details/detail_ztlbw.py
@@ -0,0 +1,134 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021-12-13 13:25:15
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: 马国鹏
			
 
				+"""
			
 
				+
			
 
				+import feapder
			
 
				+from feapder.utils.log import Log
			
 
				+from feapder.utils.tools import wechat_warning
			
 
				+from items.spider_item import DataBakItem, MgpListItem
			
 
				+from feapder.db.mongodb import MongoDB
			
 
				+from login_pool.zglbw import ZglbwPool
			
 
				+from untils.attachment import AttachmentDownloader
			
 
				+
			
 
				+Log().info("")
			
 
				+
			
 
				+
			
 
				+class FirefoxDetails(feapder.Spider):
			
 
				+    _to_db = None
			
 
				+    db_name = 'mgp_list'
			
 
				+    send_list = []
			
 
				+
			
 
				+    # 定义mongo链接
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = MongoDB()
			
 
				+        return self._to_db
			
 
				+
			
 
				+    def start_requests(self):
			
 
				+        while True:
			
 
				+            data_lsit = self.to_db.find(self.db_name, {"parser_name": "details_ztlbw", "item.spidercode": "a_ztlbsww_jzxtp"},
			
 
				+                                        sort={"date": -1}, limit=1)
			
 
				+            print(data_lsit)
			
 
				+            for item in data_lsit:
			
 
				+                url = item.get("parse_url")
			
 
				+                url = "https://eproport.crecgec.com/#/notice/notice-detail?projectId=1484412339522916354&tenantId=1&indexnumber=0"
			
 
				+                cookie = ZglbwPool(table_userbase='zglbw', redis_key='zglbw')
			
 
				+                cookie = cookie.get_cookie().cookie
			
 
				+                yield feapder.Request(url=url, item=item.get("item"),
			
 
				+                                      callback=self.detail_get, base_info=item, render=True,
			
 
				+                                      render_time=3, proxies=False, cookies=cookie)
			
 
				+                self.to_db.delete(self.db_name, item)
			
 
				+            break
			
 
				+
			
 
				+    def detail_get(self, request, response):
			
 
				+        items = request.item
			
 
				+        # print(items)
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key, items[key])
			
 
				+        html = ''
			
 
				+        xpath_list = ['//div[@class="ant-col ant-col-xs-6 ant-col-sm-6 ant-col-lg-12"][1]',
			
 
				+                      '//div[@class="luban-bid-details ant-row ng-star-inserted"][2]',
			
 
				+                      '//div[@class="login ng-star-inserted"]']
			
 
				+        for xpath in xpath_list:
			
 
				+            # import pdb
			
 
				+            # pdb.set_trace()
			
 
				+            html_one = response.xpath(xpath).extract_first()
			
 
				+            if html_one is not None:
			
 
				+                html += '\n'  # 标书详细内容
			
 
				+                html += html_one  # 拼接html
			
 
				+        print(html)
			
 
				+        list_item.contenthtml = html
			
 
				+        files_list = response.xpath("//iframe/@src").extract_first()
			
 
				+        file_url = files_list.split("file=")[-1]
			
 
				+        file_url = file_url.replace("%3A", ":").replace("%2F", "/").replace("%3F", "?").replace("%3D", "=")
			
 
				+        attachments = {}
			
 
				+        file_name = list_item.title
			
 
				+
			
 
				+        attachment = AttachmentDownloader().fetch_attachment(
			
 
				+            file_name=file_name, file_type='pdf', download_url=file_url,
			
 
				+            enable_proxy=False)
			
 
				+        attachments["0"] = attachment
			
 
				+        list_item.projectinfo = {"attachments": attachments}
			
 
				+        yield list_item
			
 
				+
			
 
				+    def failed_request(self, request, response):
			
 
				+        '''请求、解析次数超过上限后，将原信息重新保存至mongo，并修改failed字段'''
			
 
				+        if response is None:
			
 
				+            code = 0
			
 
				+        code = response.status_code
			
 
				+        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
			
 
				+        if 200 <= code < 300:
			
 
				+            err = 'analysis'
			
 
				+        elif 300 <= code < 400:
			
 
				+            err = 'download'
			
 
				+        elif 400 <= code < 500:
			
 
				+            err = 'download'
			
 
				+        elif 500 <= code:
			
 
				+            err = "servers"
			
 
				+        else:
			
 
				+            err = "timeout"
			
 
				+        mgp = MgpListItem()
			
 
				+        mgp.code = code
			
 
				+        mgp.error = err
			
 
				+        items = request.base_info
			
 
				+        for key in items:
			
 
				+            mgp.__setitem__(key, items[key])
			
 
				+        mgp.failed += 1
			
 
				+        if mgp.pri is None:
			
 
				+            mgp.pri = 0
			
 
				+
			
 
				+        if mgp.pri > 5:
			
 
				+            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
			
 
				+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
			
 
				+                    '''
			
 
				+                    根据爬虫优先级报警'''
			
 
				+                    info = f'''`
			
 
				+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
			
 
				+        > **爬虫名称:** {mgp.item.get("site")}
			
 
				+        > **栏目名称:** {mgp.item.get("channel")}
			
 
				+        > **爬虫代码:** {mgp.item.get("spidercode")}
			
 
				+        > **爬虫等级:** {mgp.pri}
			
 
				+        > **所属管理人员:** {mgp.author}
			
 
				+        请登录剑鱼爬虫管理平台查看详情。
			
 
				+        `'''
			
 
				+                    wechat_warning(info)
			
 
				+                    self.send_list.append(mgp.item.get("site"))
			
 
				+        yield mgp
			
 
				+
			
 
				+    def end_callback(self):
			
 
				+        print("爬虫结束")
			
 
				+        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
			
 
				+    # def download_midware(self, request):
			
 
				+    #     request.proxies = self.prox_pool.get()
			
 
				+    #     return request
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    FirefoxDetails(redis_key="magp:details:ztlbw").start()
			
--- a/FworkSpider/details/details.py
+++ b/FworkSpider/details/details.py
@@ -0,0 +1,170 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021-12-13 13:25:15
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: 马国鹏
			
 
				+"""
			
 
				+import json
			
 
				+import sys
			
 
				+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
			
 
				+import time
			
 
				+from urllib.parse import urljoin
			
 
				+
			
 
				+import feapder
			
 
				+from feapder.utils.tools import wechat_warning
			
 
				+import execjs
			
 
				+from items.spider_item import DataBakItem, MgpListItem
			
 
				+from feapder.db.mongodb import MongoDB
			
 
				+from untils.attachment import AttachmentDownloader
			
 
				+
			
 
				+
			
 
				+class Details(feapder.Spider):
			
 
				+    _to_db = None
			
 
				+    db_name = 'mgp_list'
			
 
				+    send_list = []
			
 
				+    # 定义mongo链接
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = MongoDB()
			
 
				+        return self._to_db
			
 
				+
			
 
				+    def start_requests(self):
			
 
				+        while True:
			
 
				+            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details"},sort={"item.publishtime":-1},limit=50)
			
 
				+            for item in data_lsit:
			
 
				+                print(11111)
			
 
				+                request_params = item.get("request_params")
			
 
				+                if item.get("js"):
			
 
				+                    eval(item.get("js"))
			
 
				+                if item.get("ex_python"):
			
 
				+                    exec(item.get("ex_python"))
			
 
				+                if item.get("proxies"):
			
 
				+
			
 
				+                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files=item.get("files"),
			
 
				+                                          deal_detail=item.get("deal_detail"),
			
 
				+                                          callback=eval(item.get("parse")),base_info=item,**request_params)
			
 
				+                else:
			
 
				+                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files=item.get("files"),
			
 
				+                                          deal_detail=item.get("deal_detail"),
			
 
				+                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
			
 
				+
			
 
				+                self.to_db.delete(self.db_name,item)
			
 
				+            break
			
 
				+
			
 
				+    def detail_get(self,request,response):
			
 
				+        items = request.item
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        html = ''
			
 
				+        for xpath in request.deal_detail:
			
 
				+            html = response.xpath(xpath).extract_first()  # 标书详细内容
			
 
				+            if html is not None:
			
 
				+                break
			
 
				+
			
 
				+        list_item.contenthtml = html
			
 
				+        if request.files:
			
 
				+            files_info = request.files
			
 
				+            files =  response.xpath(files_info.get("list_xpath"))
			
 
				+            if request.files_info:
			
 
				+                files_info = request.files_info
			
 
				+                files = response.xpath(files_info.get("list_xpath"))
			
 
				+                if request.files_info:
			
 
				+                    files_info = request.files_info
			
 
				+                    files = response.xpath(files_info.get("list_xpath"))
			
 
				+                    if len(files) > 0:
			
 
				+                        attachments = {}
			
 
				+                        for index, info in enumerate(files):
			
 
				+                            file_url = info.xpath(files_info.get("url_xpath")).extract_first()
			
 
				+                            file_name = info.xpath(files_info.get("name_xpath")).extract_first()
			
 
				+                            if files_info.get("host"):
			
 
				+                                file_url = urljoin(files_info.get("host"), file_url)
			
 
				+                            if not files_info.get("file_type"):
			
 
				+                                file_type = file_url.split("?")[0].split(".")[-1].lower()
			
 
				+                            else:
			
 
				+                                file_type = files_info.get("file_type")
			
 
				+                            if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
			
 
				+                                attachment = AttachmentDownloader().fetch_attachment(
			
 
				+                                    file_name=file_name, file_type=file_type, download_url=file_url,
			
 
				+                                    enable_proxy=False)
			
 
				+                                attachments[len(attachments) + 1] = attachment
			
 
				+                        if len(attachments) == 0:
			
 
				+                            pass
			
 
				+                        else:
			
 
				+                            list_item.projectinfo = {"attachment": attachments}
			
 
				+
			
 
				+
			
 
				+        yield list_item
			
 
				+
			
 
				+    def detail_json(self,request,response):
			
 
				+        items = request.item
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        exec(request.deal_detail)
			
 
				+
			
 
				+        yield list_item
			
 
				+    def detail_post(self,request,response):
			
 
				+        items = request.item
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        exec(request.deal_detail)
			
 
				+
			
 
				+        yield list_item
			
 
				+
			
 
				+    def failed_request(self, request, response):
			
 
				+        '''请求、解析次数超过上限后，将原信息重新保存至mongo，并修改failed字段'''
			
 
				+        if response is None:
			
 
				+            code = 0
			
 
				+        else:
			
 
				+            code = response.status_code
			
 
				+        err_dic = {"200":"analysis","400":"download","500":"servers","300":"download"}
			
 
				+        if 200<=code<300:
			
 
				+            err = 'analysis'
			
 
				+        elif 300<=code<400:
			
 
				+            err = 'download'
			
 
				+        elif 400<=code<500:
			
 
				+            err = 'download'
			
 
				+        elif 500<=code:
			
 
				+            err = "servers"
			
 
				+        else:
			
 
				+            err = "timeout"
			
 
				+        mgp = MgpListItem()
			
 
				+        mgp.code=code
			
 
				+        mgp.error=err
			
 
				+        items = request.base_info
			
 
				+        for key in items:
			
 
				+            mgp.__setitem__(key,items[key])
			
 
				+        mgp.failed +=1
			
 
				+        if mgp.pri is None:
			
 
				+            mgp.pri = 0
			
 
				+
			
 
				+        if mgp.pri > 5:
			
 
				+            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
			
 
				+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
			
 
				+                    '''
			
 
				+                    根据爬虫优先级报警'''
			
 
				+                    info= f'''`
			
 
				+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
			
 
				+        > **爬虫名称:** {mgp.item.get("site")}
			
 
				+        > **栏目名称:** {mgp.item.get("channel")}
			
 
				+        > **爬虫代码:** {mgp.item.get("spidercode")}
			
 
				+        > **爬虫等级:** {mgp.pri}
			
 
				+        > **所属管理人员:** {mgp.author}
			
 
				+        请登录剑鱼爬虫管理平台查看详情。
			
 
				+        `'''
			
 
				+                    wechat_warning(info)
			
 
				+                    self.send_list.append(mgp.item.get("site"))
			
 
				+        yield mgp
			
 
				+
			
 
				+    def end_callback(self):
			
 
				+        print("爬虫结束")
			
 
				+
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    Details(redis_key="magp:details1").start()
			
--- a/FworkSpider/details/details_cookie.py
+++ b/FworkSpider/details/details_cookie.py
@@ -0,0 +1,165 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021-12-13 13:25:15
			
 
				+---------
			
 
				+@summary:  生成一定有效期cookie，并使用的detail 详情处理方案，默认不限制ip
			
 
				+---------
			
 
				+@author: 马国鹏
			
 
				+"""
			
 
				+import sys
			
 
				+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
			
 
				+import feapder
			
 
				+from feapder.utils.tools import wechat_warning
			
 
				+import execjs
			
 
				+from items.spider_item import DataBakItem, MgpListItem
			
 
				+from feapder.db.mongodb import MongoDB
			
 
				+
			
 
				+from untils.cookie_pool import PageCookiePool
			
 
				+import copy
			
 
				+
			
 
				+class Details(feapder.Spider):
			
 
				+    _to_db = None
			
 
				+    db_name = 'mgp_list'
			
 
				+    send_list = []
			
 
				+    # 定义mongo链接
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = MongoDB()
			
 
				+        return self._to_db
			
 
				+
			
 
				+    def start_requests(self):
			
 
				+        while True:
			
 
				+            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
			
 
				+            for item in data_lsit:
			
 
				+                request_params = item.get("request_params")
			
 
				+
			
 
				+                if item.get("ex_python"):
			
 
				+                    exec(item.get("ex_python"))
			
 
				+
			
 
				+                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
			
 
				+                                      deal_detail=item.get("deal_detail"),**request_params,
			
 
				+                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
			
 
				+                self.to_db.delete(self.db_name,item)
			
 
				+            break
			
 
				+
			
 
				+
			
 
				+
			
 
				+    def detail_get(self,request,response):
			
 
				+        '''处理html格式的返回结果'''
			
 
				+        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
			
 
				+            '''失败处理，当text设置不为None，且在resposne.text中时，删除当前cookie并重新生产cookie'''
			
 
				+            down_mid = copy.copy(request.down_mid)
			
 
				+            key = down_mid.get("key")
			
 
				+            page_url = down_mid.get("page_url")
			
 
				+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+            cookie_pool.del_cookie(request.cookies)
			
 
				+            yield request
			
 
				+        if response.code in (request.down_mid.get("code")):
			
 
				+            '''失败处理，response——code不为正确的状态码时，删除当前cookie并重新生产cookie'''
			
 
				+            down_mid = copy.copy(request.down_mid)
			
 
				+            key = down_mid.get("key")
			
 
				+            page_url = down_mid.get("page_url")
			
 
				+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+            cookie_pool.del_cookie(request.cookies)
			
 
				+            yield request
			
 
				+        items = request.item
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        html = ''
			
 
				+        for xpath in request.deal_detail:
			
 
				+            html = response.xpath(xpath).extract_first()  # 标书详细内容
			
 
				+            if html is not None:
			
 
				+                break
			
 
				+
			
 
				+        list_item.contenthtml = html
			
 
				+        yield list_item
			
 
				+
			
 
				+    def detail_json(self,request,response):
			
 
				+        '''处理json串及其他格式的返回结果'''
			
 
				+        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
			
 
				+            '''失败处理，当text设置不为None，且在resposne.text中时，删除当前cookie并重新生产cookie'''
			
 
				+            down_mid = copy.copy(request.down_mid)
			
 
				+            key = down_mid.get("key")
			
 
				+            page_url = down_mid.get("page_url")
			
 
				+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+            cookie_pool.del_cookie(request.cookies)
			
 
				+            yield request
			
 
				+        if response.code in (request.down_mid.get("code")):
			
 
				+            '''失败处理，response——code不为正确的状态码时，删除当前cookie并重新生产cookie'''
			
 
				+            down_mid = copy.copy(request.down_mid)
			
 
				+            key = down_mid.get("key")
			
 
				+            page_url = down_mid.get("page_url")
			
 
				+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+            cookie_pool.del_cookie(request.cookies)
			
 
				+            yield request
			
 
				+        items = request.item
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        html = ''
			
 
				+        exec(request.deal_detail)
			
 
				+
			
 
				+        list_item.contenthtml = html
			
 
				+        yield list_item
			
 
				+
			
 
				+    def failed_request(self, request, response):
			
 
				+        '''请求、解析次数超过上限后，将原信息重新保存至mongo，并修改failed字段'''
			
 
				+        if response is None:
			
 
				+            code = 0
			
 
				+        else:
			
 
				+            code = response.status_code
			
 
				+        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
			
 
				+        if 200 <= code < 300:
			
 
				+            err = 'analysis'
			
 
				+        elif 300 <= code < 400:
			
 
				+            err = 'download'
			
 
				+        elif 400 <= code < 500:
			
 
				+            err = 'download'
			
 
				+        elif 500 <= code:
			
 
				+            err = "servers"
			
 
				+        else:
			
 
				+            err = "timeout"
			
 
				+        mgp = MgpListItem()
			
 
				+        mgp.code = code
			
 
				+        mgp.error = err
			
 
				+        items = request.base_info
			
 
				+        for key in items:
			
 
				+            mgp.__setitem__(key, items[key])
			
 
				+        mgp.failed += 1
			
 
				+        if mgp.pri is None:
			
 
				+            mgp.pri = 0
			
 
				+
			
 
				+        if mgp.pri > 5:
			
 
				+            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
			
 
				+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
			
 
				+                    '''
			
 
				+                    根据爬虫优先级报警'''
			
 
				+                    info = f'''`
			
 
				+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
			
 
				+        > **爬虫名称:** {mgp.item.get("site")}
			
 
				+        > **栏目名称:** {mgp.item.get("channel")}
			
 
				+        > **爬虫代码:** {mgp.item.get("spidercode")}
			
 
				+        > **爬虫等级:** {mgp.pri}
			
 
				+        > **所属管理人员:** {mgp.author}
			
 
				+        请登录剑鱼爬虫管理平台查看详情。
			
 
				+        `'''
			
 
				+                    wechat_warning(info)
			
 
				+                    self.send_list.append(mgp.item.get("site"))
			
 
				+        yield mgp
			
 
				+
			
 
				+
			
 
				+    def end_callback(self):
			
 
				+        print("爬虫结束")
			
 
				+    def download_midware(self, request):
			
 
				+        down_mid = request.down_mid
			
 
				+        key = down_mid.get("key")
			
 
				+        page_url = down_mid.get("page_url")
			
 
				+        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+        request.cookies = cookie_pool.get_cookie()
			
 
				+        return request
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    Details(redis_key="magp:details1").start()
			
--- a/FworkSpider/details/details_firefox.py
+++ b/FworkSpider/details/details_firefox.py
@@ -0,0 +1,115 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021-12-13 13:25:15
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: 马国鹏
			
 
				+"""
			
 
				+
			
 
				+import feapder
			
 
				+from feapder.utils.tools import wechat_warning
			
 
				+import execjs
			
 
				+from items.spider_item import DataBakItem, MgpListItem
			
 
				+from feapder.db.mongodb import MongoDB
			
 
				+
			
 
				+
			
 
				+
			
 
				+class FirefoxDetails(feapder.Spider):
			
 
				+    _to_db = None
			
 
				+    db_name = 'mgp_list'
			
 
				+    send_list = []
			
 
				+    # 定义mongo链接
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = MongoDB()
			
 
				+        return self._to_db
			
 
				+
			
 
				+    def start_requests(self):
			
 
				+        while True:
			
 
				+            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_firefox"},sort={"date":-1})
			
 
				+            print(data_lsit)
			
 
				+            for item in data_lsit:
			
 
				+                print(item)
			
 
				+                request_params = item.get("request_params")
			
 
				+                if item.get("ex_python"):
			
 
				+                    exec(item.get("ex_python"))
			
 
				+
			
 
				+                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
			
 
				+                                      deal_detail=item.get("deal_detail"),**request_params,
			
 
				+                                      callback=eval(item.get("parse")),base_info=item,render=True,
			
 
				+                                      render_time=item.get("render_time"))
			
 
				+                self.to_db.delete(self.db_name,item)
			
 
				+            break
			
 
				+
			
 
				+    def detail_get(self,request,response):
			
 
				+        print(response.text)
			
 
				+        items = request.item
			
 
				+        # print(items)
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        html = ''
			
 
				+        for xpath in request.deal_detail:
			
 
				+            html = response.xpath(xpath).extract_first()  # 标书详细内容
			
 
				+            if html is not None:
			
 
				+                break
			
 
				+        list_item.contenthtml = html
			
 
				+        yield list_item
			
 
				+
			
 
				+    def failed_request(self, request, response):
			
 
				+        '''请求、解析次数超过上限后，将原信息重新保存至mongo，并修改failed字段'''
			
 
				+        if response is None:
			
 
				+            code = 0
			
 
				+        code = response.status_code
			
 
				+        err_dic = {"200": "analysis", "400": "download", "500": "servers", "300": "download"}
			
 
				+        if 200 <= code < 300:
			
 
				+            err = 'analysis'
			
 
				+        elif 300 <= code < 400:
			
 
				+            err = 'download'
			
 
				+        elif 400 <= code < 500:
			
 
				+            err = 'download'
			
 
				+        elif 500 <= code:
			
 
				+            err = "servers"
			
 
				+        else:
			
 
				+            err = "timeout"
			
 
				+        mgp = MgpListItem()
			
 
				+        mgp.code = code
			
 
				+        mgp.error = err
			
 
				+        items = request.base_info
			
 
				+        for key in items:
			
 
				+            mgp.__setitem__(key, items[key])
			
 
				+        mgp.failed += 1
			
 
				+        if mgp.pri is None:
			
 
				+            mgp.pri = 0
			
 
				+
			
 
				+        if mgp.pri > 5:
			
 
				+            if mgp.failed in (10, 30, 50, 100, 200) or mgp.failed > 200:
			
 
				+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
			
 
				+                    '''
			
 
				+                    根据爬虫优先级报警'''
			
 
				+                    info = f'''`
			
 
				+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
			
 
				+        > **爬虫名称:** {mgp.item.get("site")}
			
 
				+        > **栏目名称:** {mgp.item.get("channel")}
			
 
				+        > **爬虫代码:** {mgp.item.get("spidercode")}
			
 
				+        > **爬虫等级:** {mgp.pri}
			
 
				+        > **所属管理人员:** {mgp.author}
			
 
				+        请登录剑鱼爬虫管理平台查看详情。
			
 
				+        `'''
			
 
				+                    wechat_warning(info)
			
 
				+                    self.send_list.append(mgp.item.get("site"))
			
 
				+        yield mgp
			
 
				+
			
 
				+
			
 
				+    def end_callback(self):
			
 
				+        print("爬虫结束")
			
 
				+        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")
			
 
				+    # def download_midware(self, request):
			
 
				+    #     request.proxies = self.prox_pool.get()
			
 
				+    #     return request
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    FirefoxDetails(redis_key="magp:details:firefox").start()
			
--- a/FworkSpider/details/details_login.py
+++ b/FworkSpider/details/details_login.py
@@ -0,0 +1,150 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021-12-13 13:25:15
			
 
				+---------
			
 
				+@summary:  生成一定有效期cookie，并使用的detail 详情处理方案，默认不限制ip
			
 
				+---------
			
 
				+@author: 马国鹏
			
 
				+"""
			
 
				+
			
 
import copy

import execjs
import feapder
from feapder.db.mongodb import MongoDB
from feapder.utils.tools import wechat_warning

from items.spider_item import DataBakItem, MgpListItem
from untils.cookie_pool import LoginCookiePool, PageCookiePool
			
 
				+
			
 
				+class Details(feapder.Spider):
			
 
				+    _to_db = None
			
 
				+    db_name = 'mgp_list'
			
 
				+    send_list = []
			
 
				+    # 定义mongo链接
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = MongoDB()
			
 
				+        return self._to_db
			
 
				+
			
 
				+    def start_requests(self):
			
 
				+        while True:
			
 
				+            data_lsit = self.to_db.find(self.db_name,{"parser_name":"details_cookie"},sort={"date":-1})
			
 
				+            for item in data_lsit:
			
 
				+                request_params = item.get("request_params")
			
 
				+                down_mid = copy.copy(item.get("down_mid"))
			
 
				+                key = down_mid.get("key")
			
 
				+                page_url = down_mid.get("page_url")
			
 
				+                cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+                down_mid["cookie_pool"] = cookie_pool
			
 
				+                print(down_mid)
			
 
				+
			
 
				+                if item.get("ex_python"):
			
 
				+                    exec(item.get("ex_python"))
			
 
				+
			
 
				+                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
			
 
				+                                      deal_detail=item.get("deal_detail"),**request_params,
			
 
				+                                      callback=eval(item.get("parse")),base_info=item,down_mid=item.get("down_mid"))
			
 
				+                self.to_db.delete(self.db_name,item)
			
 
				+            break
			
 
				+
			
 
				+
			
 
				+
			
 
				+    def detail_get(self,request,response):
			
 
				+        '''处理html格式的返回结果'''
			
 
				+        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
			
 
				+            '''失败处理，当text设置不为None，且在resposne.text中时，删除当前cookie并重新生产cookie'''
			
 
				+            down_mid = copy.copy(request.get("down_mid"))
			
 
				+            key = down_mid.get("key")
			
 
				+            page_url = down_mid.get("page_url")
			
 
				+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+            cookie_pool.del_cookie(request.cookies)
			
 
				+            yield request
			
 
				+        if response.code in (request.down_mid.get("code")):
			
 
				+            '''失败处理，response——code不为正确的状态码时，删除当前cookie并重新生产cookie'''
			
 
				+            down_mid = copy.copy(request.get("down_mid"))
			
 
				+            key = down_mid.get("key")
			
 
				+            page_url = down_mid.get("page_url")
			
 
				+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+            cookie_pool.del_cookie(request.cookies)
			
 
				+            yield request
			
 
				+        items = request.item
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        html = ''
			
 
				+        for xpath in request.deal_detail:
			
 
				+            html = response.xpath(xpath).extract_first()  # 标书详细内容
			
 
				+            if html is not None:
			
 
				+                break
			
 
				+
			
 
				+        list_item.contenthtml = html
			
 
				+        yield list_item
			
 
				+
			
 
				+    def detail_json(self,request,response):
			
 
				+        '''处理json串及其他格式的返回结果'''
			
 
				+        if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
			
 
				+            '''失败处理，当text设置不为None，且在resposne.text中时，删除当前cookie并重新生产cookie'''
			
 
				+            down_mid = copy.copy(request.get("down_mid"))
			
 
				+            key = down_mid.get("key")
			
 
				+            page_url = down_mid.get("page_url")
			
 
				+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+            cookie_pool.del_cookie(request.cookies)
			
 
				+            yield request
			
 
				+        if response.code in (request.down_mid.get("code")):
			
 
				+            '''失败处理，response——code不为正确的状态码时，删除当前cookie并重新生产cookie'''
			
 
				+            down_mid = copy.copy(request.get("down_mid"))
			
 
				+            key = down_mid.get("key")
			
 
				+            page_url = down_mid.get("page_url")
			
 
				+            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+            cookie_pool.del_cookie(request.cookies)
			
 
				+            yield request
			
 
				+        items = request.item
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        html = ''
			
 
				+        exec(request.deal_detail)
			
 
				+
			
 
				+        list_item.contenthtml = html
			
 
				+        yield list_item
			
 
				+
			
 
				+    def failed_request(self, request, response):
			
 
				+        '''请求、解析次数超过上限后，将原信息重新保存至mongo，并修改failed字段'''
			
 
				+        mgp = MgpListItem()
			
 
				+        items = request.base_info
			
 
				+        for key in items:
			
 
				+            mgp.__setitem__(key,items[key])
			
 
				+        mgp.failed +=1
			
 
				+        print(f'......{mgp.failed}')
			
 
				+        if mgp.pri > 5:
			
 
				+            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
			
 
				+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
			
 
				+                    '''
			
 
				+                    根据爬虫优先级报警'''
			
 
				+                    info= f'''`
			
 
				+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
			
 
				+        > **爬虫名称:** {mgp.item.get("site")}
			
 
				+        > **栏目名称:** {mgp.item.get("channel")}
			
 
				+        > **爬虫代码:** {mgp.item.get("spidercode")}
			
 
				+        > **所属管理人员:** {mgp.author}
			
 
				+        请登录剑鱼爬虫管理平台查看详情。
			
 
				+        `'''
			
 
				+                    wechat_warning(info)
			
 
				+                    self.send_list.append(mgp.item.get("site"))
			
 
				+        yield mgp
			
 
				+
			
 
				+
			
 
				+    def end_callback(self):
			
 
				+        print("爬虫结束")
			
 
				+    def download_midware(self, request):
			
 
				+        down_mid = request.down_mid
			
 
				+        key = down_mid.get("key")
			
 
				+        page_url = down_mid.get("page_url")
			
 
				+        cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
			
 
				+        request.cookies = cookie_pool.get_cookie()
			
 
				+        return request
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    Details(redis_key="magp:details1").start()
			
--- a/FworkSpider/details/dtcookie_pool.py
+++ b/FworkSpider/details/dtcookie_pool.py
@@ -0,0 +1,88 @@
 
				+import json
			
 
				+import re
			
 
				+import sys
			
 
				+
			
 
				+import execjs
			
 
				+
			
 
				+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
			
 
				+from untils.cookie_pool import PageCookiePool
			
 
				+import requests
			
 
				+
			
 
				+
			
 
				+class DTCookiePool(PageCookiePool):
			
 
				+    def __init__(self,redis_key,header,page_url=None,
			
 
				+        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs):
			
 
				+        super(DTCookiePool, self).__init__(redis_key,page_url=None,
			
 
				+        min_cookies=10000,must_contained_keys=(),keep_alive=False,**kwargs)
			
 
				+        self.headers=header
			
 
				+        self.page_url = page_url
			
 
				+
			
 
				+    def create_cookie(self,):
			
 
				+        session = requests.Session()
			
 
				+        start_url = self.page_url
			
 
				+        print(self.headers)
			
 
				+        res = session.get(start_url, headers=self.headers,verify=False)
			
 
				+        js_func = re.findall("document.cookie=(.*?)location.href", res.text)[0]
			
 
				+        js_func = 'function sd() { return ' + js_func + "}"
			
 
				+        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
			
 
				+        ss = ctx.call("sd")
			
 
				+        cookies = {}
			
 
				+
			
 
				+        for item in ss.split(";"):
			
 
				+            if '=' in item:
			
 
				+                cookies[item.split("=")[0]] = item.split("=")[-1]
			
 
				+        res = session.get(start_url, cookies=cookies, headers=self.headers)
			
 
				+        js_do_data = re.findall('};go\((.*?)\)', res.text)[0]
			
 
				+        js_func = re.sub("<(/*?)script>", "", res.text)
			
 
				+        location = re.compile('location(.*?)}else')
			
 
				+        setTimeout = re.compile('_(.{37})setTimeout(.*?)document')
			
 
				+        setTimeout2 = re.compile('setTimeout(.*?)document')
			
 
				+        gox = re.compile('};go(.*?)\)')
			
 
				+        js_func = re.sub(location, "}else", js_func)
			
 
				+        js_func = re.sub(setTimeout, "       document", js_func)
			
 
				+        js_func = re.sub(setTimeout2, "       document", js_func)
			
 
				+        js_func = re.sub(gox, "   return document['cookie']\n};", js_func)
			
 
				+        js_func = '''const jsdom = require("jsdom");
			
 
				+        const {JSDOM} = jsdom;
			
 
				+        const dom = new JSDOM(`<!DOCTYPE html><p>Hello world</p>`);
			
 
				+        window = dom.window;
			
 
				+        document = window.document;''' + js_func
			
 
				+        ctx = execjs.compile(js_func,cwd="C:/Users/topnet/Desktop/文件夹整理包/craler2/node_modules")
			
 
				+        with open('ex_js.js', 'w+', encoding='utf-8') as f:
			
 
				+            f.write(js_func)
			
 
				+        print(js_do_data)
			
 
				+        ss = ctx.call("go", json.loads(js_do_data))
			
 
				+
			
 
				+        for item in ss.split(";"):
			
 
				+            if '=' in item:
			
 
				+                cookies[item.split("=")[0]] = item.split("=")[-1]
			
 
				+                session.cookies.setdefault(item.split("=")[0], item.split("=")[-1])
			
 
				+        res = session.get(start_url, headers=self.headers, cookies=cookies)
			
 
				+        cookies = requests.utils.dict_from_cookiejar(session.cookies)
			
 
				+        return cookies
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    headers = {
			
 
				+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
			
 
				+    "Accept-Encoding": "gzip, deflate, br",
			
 
				+    "Accept-Language": "zh-CN,zh;q=0.9",
			
 
				+    "Cache-Control": "max-age=0",
			
 
				+    "Connection": "keep-alive",
			
 
				+    "Host": "www.hefei.gov.cn",
			
 
				+    "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"96\", \"Google Chrome\";v=\"96\"",
			
 
				+    "sec-ch-ua-mobile": "?0",
			
 
				+    "sec-ch-ua-platform": "\"Windows\"",
			
 
				+    "Sec-Fetch-Dest": "document",
			
 
				+    "Sec-Fetch-Mode": "navigate",
			
 
				+    "Sec-Fetch-Site": "none",
			
 
				+    "Sec-Fetch-User": "?1",
			
 
				+    "Upgrade-Insecure-Requests": "1",
			
 
				+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
			
 
				+}
			
 
				+
			
 
				+    cookie_pool = DTCookiePool(
			
 
				+        page_url='https://www.hefei.gov.cn/public/column/5921?catId=6721141&nav=3&action=list&type=4&pageIndex=2',
			
 
				+        header=headers, redis_key="dongtaices")
			
 
				+    cookie = cookie_pool.get_cookie()
			
 
				+    print(cookie)
			
 
				+    # cookie_pool.del_cookie(cookie)
			
--- a/FworkSpider/details/file/sj.js
+++ b/FworkSpider/details/file/sj.js
--- a/FworkSpider/feapder/VERSION
+++ b/FworkSpider/feapder/VERSION
@@ -0,0 +1 @@
 
				+1.6.9
			
--- a/FworkSpider/feapder/__init__.py
+++ b/FworkSpider/feapder/__init__.py
@@ -0,0 +1,33 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/4/21 10:41 PM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+import os, sys
			
 
				+import re
			
 
				+
			
 
				+sys.path.insert(0, re.sub(r"([\\/]items$)|([\\/]spiders$)", "", os.getcwd()))
			
 
				+
			
 
				+__all__ = [
			
 
				+    "AirSpider",
			
 
				+    "Spider",
			
 
				+    "BatchSpider",
			
 
				+    "BaseParser",
			
 
				+    "BatchParser",
			
 
				+    "Request",
			
 
				+    "Response",
			
 
				+    "Item",
			
 
				+    "UpdateItem",
			
 
				+    "ArgumentParser",
			
 
				+]
			
 
				+
			
 
				+from feapder.core.spiders import Spider, BatchSpider, AirSpider
			
 
				+from feapder.core.base_parser import BaseParser, BatchParser
			
 
				+from feapder.network.request import Request
			
 
				+from feapder.network.response import Response
			
 
				+from feapder.network.item import Item, UpdateItem
			
 
				+from feapder.utils.custom_argparse import ArgumentParser
			
--- a/FworkSpider/feapder/buffer/__init__.py
+++ b/FworkSpider/feapder/buffer/__init__.py
@@ -0,0 +1,9 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+'''
			
 
				+Created on 2020/4/23 12:09 AM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+'''
			
--- a/FworkSpider/feapder/buffer/item_buffer.py
+++ b/FworkSpider/feapder/buffer/item_buffer.py
@@ -0,0 +1,426 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-06-19 17:17
			
 
				+---------
			
 
				+@summary: item 管理器， 负责缓冲添加到数据库中的item， 由该manager统一添加。防止多线程同时访问数据库
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import importlib
			
 
				+import threading
			
 
				+from queue import Queue
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.dedup import Dedup
			
 
				+from feapder.network.item import Item, UpdateItem
			
 
				+from feapder.pipelines import BasePipeline
			
 
				+from feapder.pipelines.mysql_pipeline import MysqlPipeline
			
 
				+from feapder.utils import metrics
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+MAX_ITEM_COUNT = 5000  # 缓存中最大item数
			
 
				+UPLOAD_BATCH_MAX_SIZE = 1000
			
 
				+
			
 
				+MYSQL_PIPELINE_PATH = "feapder.pipelines.mysql_pipeline.MysqlPipeline"
			
 
				+
			
 
				+
			
 
				+class ItemBuffer(threading.Thread):
			
 
				+    dedup = None
			
 
				+    __redis_db = None
			
 
				+
			
 
				+    def __init__(self, redis_key, task_table=None):
			
 
				+        if not hasattr(self, "_table_item"):
			
 
				+            super(ItemBuffer, self).__init__()
			
 
				+
			
 
				+            self._thread_stop = False
			
 
				+            self._is_adding_to_db = False
			
 
				+            self._redis_key = redis_key
			
 
				+            self._task_table = task_table
			
 
				+
			
 
				+            self._items_queue = Queue(maxsize=MAX_ITEM_COUNT)
			
 
				+
			
 
				+            self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
			
 
				+            self._table_failed_items = setting.TAB_FAILED_ITEMS.format(
			
 
				+                redis_key=redis_key
			
 
				+            )
			
 
				+
			
 
				+            self._item_tables = {
			
 
				+                # 'item_name': 'table_name' # 缓存item名与表名对应关系
			
 
				+            }
			
 
				+
			
 
				+            self._item_update_keys = {
			
 
				+                # 'table_name': ['id', 'name'...] # 缓存table_name与__update_key__的关系
			
 
				+            }
			
 
				+
			
 
				+            self._pipelines = self.load_pipelines()
			
 
				+
			
 
				+            self._have_mysql_pipeline = MYSQL_PIPELINE_PATH in setting.ITEM_PIPELINES
			
 
				+            self._mysql_pipeline = None
			
 
				+
			
 
				+            if setting.ITEM_FILTER_ENABLE and not self.__class__.dedup:
			
 
				+                self.__class__.dedup = Dedup(
			
 
				+                    to_md5=False, **setting.ITEM_FILTER_SETTING
			
 
				+                )
			
 
				+
			
 
				+            # 导出重试的次数
			
 
				+            self.export_retry_times = 0
			
 
				+            # 导出失败的次数 TODO 非air爬虫使用redis统计
			
 
				+            self.export_falied_times = 0
			
 
				+
			
 
				+    @property
			
 
				+    def redis_db(self):
			
 
				+        if self.__class__.__redis_db is None:
			
 
				+            self.__class__.__redis_db = RedisDB()
			
 
				+
			
 
				+        return self.__class__.__redis_db
			
 
				+
			
 
				+    def load_pipelines(self):
			
 
				+        pipelines = []
			
 
				+        for pipeline_path in setting.ITEM_PIPELINES:
			
 
				+            module, class_name = pipeline_path.rsplit(".", 1)
			
 
				+            pipeline_cls = importlib.import_module(module).__getattribute__(class_name)
			
 
				+            pipeline = pipeline_cls()
			
 
				+            if not isinstance(pipeline, BasePipeline):
			
 
				+                raise ValueError(f"{pipeline_path} 需继承 feapder.pipelines.BasePipeline")
			
 
				+            pipelines.append(pipeline)
			
 
				+
			
 
				+        return pipelines
			
 
				+
			
 
				+    @property
			
 
				+    def mysql_pipeline(self):
			
 
				+        if not self._mysql_pipeline:
			
 
				+            module, class_name = MYSQL_PIPELINE_PATH.rsplit(".", 1)
			
 
				+            pipeline_cls = importlib.import_module(module).__getattribute__(class_name)
			
 
				+            self._mysql_pipeline = pipeline_cls()
			
 
				+
			
 
				+        return self._mysql_pipeline
			
 
				+
			
 
				+    def run(self):
			
 
				+        self._thread_stop = False
			
 
				+        while not self._thread_stop:
			
 
				+            self.flush()
			
 
				+            tools.delay_time(1)
			
 
				+
			
 
				+        self.close()
			
 
				+
			
 
				+    def stop(self):
			
 
				+        self._thread_stop = True
			
 
				+        self._started.clear()
			
 
				+
			
 
				+    def put_item(self, item):
			
 
				+        if isinstance(item, Item):
			
 
				+            # 入库前的回调
			
 
				+            item.pre_to_db()
			
 
				+
			
 
				+        self._items_queue.put(item)
			
 
				+
			
 
				+    def flush(self):
			
 
				+        try:
			
 
				+            items = []
			
 
				+            update_items = []
			
 
				+            requests = []
			
 
				+            callbacks = []
			
 
				+            items_fingerprints = []
			
 
				+            data_count = 0
			
 
				+
			
 
				+            while not self._items_queue.empty():
			
 
				+                data = self._items_queue.get_nowait()
			
 
				+                data_count += 1
			
 
				+
			
 
				+                # data 分类
			
 
				+                if callable(data):
			
 
				+                    callbacks.append(data)
			
 
				+
			
 
				+                elif isinstance(data, UpdateItem):
			
 
				+                    update_items.append(data)
			
 
				+
			
 
				+                elif isinstance(data, Item):
			
 
				+                    items.append(data)
			
 
				+                    if setting.ITEM_FILTER_ENABLE:
			
 
				+                        items_fingerprints.append(data.fingerprint)
			
 
				+
			
 
				+                else:  # request-redis
			
 
				+                    requests.append(data)
			
 
				+
			
 
				+                if data_count >= UPLOAD_BATCH_MAX_SIZE:
			
 
				+                    self.__add_item_to_db(
			
 
				+                        items, update_items, requests, callbacks, items_fingerprints
			
 
				+                    )
			
 
				+
			
 
				+                    items = []
			
 
				+                    update_items = []
			
 
				+                    requests = []
			
 
				+                    callbacks = []
			
 
				+                    items_fingerprints = []
			
 
				+                    data_count = 0
			
 
				+
			
 
				+            if data_count:
			
 
				+                self.__add_item_to_db(
			
 
				+                    items, update_items, requests, callbacks, items_fingerprints
			
 
				+                )
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            log.exception(e)
			
 
				+
			
 
				+    def get_items_count(self):
			
 
				+        return self._items_queue.qsize()
			
 
				+
			
 
				+    def is_adding_to_db(self):
			
 
				+        return self._is_adding_to_db
			
 
				+
			
 
				+    def __dedup_items(self, items, items_fingerprints):
			
 
				+        """
			
 
				+        去重
			
 
				+        @param items:
			
 
				+        @param items_fingerprints:
			
 
				+        @return: 返回去重后的items, items_fingerprints
			
 
				+        """
			
 
				+        if not items:
			
 
				+            return items, items_fingerprints
			
 
				+
			
 
				+        is_exists = self.__class__.dedup.get(items_fingerprints)
			
 
				+        is_exists = is_exists if isinstance(is_exists, list) else [is_exists]
			
 
				+
			
 
				+        dedup_items = []
			
 
				+        dedup_items_fingerprints = []
			
 
				+        items_count = dedup_items_count = dup_items_count = 0
			
 
				+
			
 
				+        while is_exists:
			
 
				+            item = items.pop(0)
			
 
				+            items_fingerprint = items_fingerprints.pop(0)
			
 
				+            is_exist = is_exists.pop(0)
			
 
				+
			
 
				+            items_count += 1
			
 
				+
			
 
				+            if not is_exist:
			
 
				+                dedup_items.append(item)
			
 
				+                dedup_items_fingerprints.append(items_fingerprint)
			
 
				+                dedup_items_count += 1
			
 
				+            else:
			
 
				+                dup_items_count += 1
			
 
				+
			
 
				+        log.info(
			
 
				+            "待入库数据 {} 条， 重复 {} 条，实际待入库数据 {} 条".format(
			
 
				+                items_count, dup_items_count, dedup_items_count
			
 
				+            )
			
 
				+        )
			
 
				+
			
 
				+        return dedup_items, dedup_items_fingerprints
			
 
				+
			
 
				+    def __pick_items(self, items, is_update_item=False):
			
 
				+        """
			
 
				+        将每个表之间的数据分开 拆分后 原items为空
			
 
				+        @param items:
			
 
				+        @param is_update_item:
			
 
				+        @return:
			
 
				+        """
			
 
				+        datas_dict = {
			
 
				+            # 'table_name': [{}, {}]
			
 
				+        }
			
 
				+
			
 
				+        while items:
			
 
				+            item = items.pop(0)
			
 
				+            # 取item下划线格式的名
			
 
				+            # 下划线类的名先从dict中取，没有则现取，然后存入dict。加快下次取的速度
			
 
				+            item_name = item.item_name
			
 
				+            table_name = self._item_tables.get(item_name)
			
 
				+            if not table_name:
			
 
				+                table_name = item.table_name
			
 
				+                self._item_tables[item_name] = table_name
			
 
				+
			
 
				+            if table_name not in datas_dict:
			
 
				+                datas_dict[table_name] = []
			
 
				+
			
 
				+            datas_dict[table_name].append(item.to_dict)
			
 
				+
			
 
				+            if is_update_item and table_name not in self._item_update_keys:
			
 
				+                self._item_update_keys[table_name] = item.update_key
			
 
				+
			
 
				+        return datas_dict
			
 
				+
			
 
				+    def __export_to_db(self, table, datas, is_update=False, update_keys=()):
			
 
				+        # 打点 校验
			
 
				+        self.check_datas(table=table, datas=datas)
			
 
				+
			
 
				+        for pipeline in self._pipelines:
			
 
				+            if is_update:
			
 
				+                if table == self._task_table and not isinstance(
			
 
				+                    pipeline, MysqlPipeline
			
 
				+                ):
			
 
				+                    continue
			
 
				+
			
 
				+                if not pipeline.update_items(table, datas, update_keys=update_keys):
			
 
				+                    log.error(
			
 
				+                        f"{pipeline.__class__.__name__} 更新数据失败. table: {table}  items: {datas}"
			
 
				+                    )
			
 
				+                    return False
			
 
				+
			
 
				+            else:
			
 
				+                if not pipeline.save_items(table, datas):
			
 
				+                    log.error(
			
 
				+                        f"{pipeline.__class__.__name__} 保存数据失败. table: {table}  items: {datas}"
			
 
				+                    )
			
 
				+                    return False
			
 
				+
			
 
				+        # 若是任务表, 且上面的pipeline里没mysql，则需调用mysql更新任务
			
 
				+        if not self._have_mysql_pipeline and is_update and table == self._task_table:
			
 
				+            if not self.mysql_pipeline.update_items(
			
 
				+                table, datas, update_keys=update_keys
			
 
				+            ):
			
 
				+                log.error(
			
 
				+                    f"{self.mysql_pipeline.__class__.__name__} 更新数据失败. table: {table}  items: {datas}"
			
 
				+                )
			
 
				+                return False
			
 
				+
			
 
				+        return True
			
 
				+
			
 
				+    def __add_item_to_db(
			
 
				+        self, items, update_items, requests, callbacks, items_fingerprints
			
 
				+    ):
			
 
				+        export_success = True
			
 
				+        self._is_adding_to_db = True
			
 
				+
			
 
				+        # 去重
			
 
				+        if setting.ITEM_FILTER_ENABLE:
			
 
				+            items, items_fingerprints = self.__dedup_items(items, items_fingerprints)
			
 
				+
			
 
				+        # 分捡
			
 
				+        items_dict = self.__pick_items(items)
			
 
				+        update_items_dict = self.__pick_items(update_items, is_update_item=True)
			
 
				+
			
 
				+        # item批量入库
			
 
				+        failed_items = {"add": [], "update": [], "requests": []}
			
 
				+        while items_dict:
			
 
				+            table, datas = items_dict.popitem()
			
 
				+
			
 
				+            log.debug(
			
 
				+                """
			
 
				+                -------------- item 批量入库 --------------
			
 
				+                表名: %s
			
 
				+                datas: %s
			
 
				+                    """
			
 
				+                % (table, tools.dumps_json(datas, indent=16))
			
 
				+            )
			
 
				+
			
 
				+            if not self.__export_to_db(table, datas):
			
 
				+                export_success = False
			
 
				+                failed_items["add"].append({"table": table, "datas": datas})
			
 
				+
			
 
				+        # 执行批量update
			
 
				+        while update_items_dict:
			
 
				+            table, datas = update_items_dict.popitem()
			
 
				+
			
 
				+            log.debug(
			
 
				+                """
			
 
				+                -------------- item 批量更新 --------------
			
 
				+                表名: %s
			
 
				+                datas: %s
			
 
				+                    """
			
 
				+                % (table, tools.dumps_json(datas, indent=16))
			
 
				+            )
			
 
				+
			
 
				+            update_keys = self._item_update_keys.get(table)
			
 
				+            if not self.__export_to_db(
			
 
				+                table, datas, is_update=True, update_keys=update_keys
			
 
				+            ):
			
 
				+                export_success = False
			
 
				+                failed_items["update"].append({"table": table, "datas": datas})
			
 
				+
			
 
				+        if export_success:
			
 
				+            # 执行回调
			
 
				+            while callbacks:
			
 
				+                try:
			
 
				+                    callback = callbacks.pop(0)
			
 
				+                    callback()
			
 
				+                except Exception as e:
			
 
				+                    log.exception(e)
			
 
				+
			
 
				+            # 删除做过的request
			
 
				+            if requests:
			
 
				+                self.redis_db.zrem(self._table_request, requests)
			
 
				+
			
 
				+            # 去重入库
			
 
				+            if setting.ITEM_FILTER_ENABLE:
			
 
				+                if items_fingerprints:
			
 
				+                    self.__class__.dedup.add(items_fingerprints, skip_check=True)
			
 
				+        else:
			
 
				+            failed_items["requests"] = requests
			
 
				+
			
 
				+            if self.export_retry_times > setting.EXPORT_DATA_MAX_RETRY_TIMES:
			
 
				+                if self._redis_key != "air_spider":
			
 
				+                    # 失败的item记录到redis
			
 
				+                    self.redis_db.sadd(self._table_failed_items, failed_items)
			
 
				+
			
 
				+                    # 删除做过的request
			
 
				+                    if requests:
			
 
				+                        self.redis_db.zrem(self._table_request, requests)
			
 
				+
			
 
				+                    log.error(
			
 
				+                        "入库超过最大重试次数，不再重试，数据记录到redis，items:\n {}".format(
			
 
				+                            tools.dumps_json(failed_items)
			
 
				+                        )
			
 
				+                    )
			
 
				+                self.export_retry_times = 0
			
 
				+
			
 
				+            else:
			
 
				+                tip = ["入库不成功"]
			
 
				+                if callbacks:
			
 
				+                    tip.append("不执行回调")
			
 
				+                if requests:
			
 
				+                    tip.append("不删除任务")
			
 
				+                    exists = self.redis_db.zexists(self._table_request, requests)
			
 
				+                    for exist, request in zip(exists, requests):
			
 
				+                        if exist:
			
 
				+                            self.redis_db.zadd(self._table_request, requests, 300)
			
 
				+
			
 
				+                if setting.ITEM_FILTER_ENABLE:
			
 
				+                    tip.append("数据不入去重库")
			
 
				+
			
 
				+                if self._redis_key != "air_spider":
			
 
				+                    tip.append("将自动重试")
			
 
				+
			
 
				+                tip.append("失败items:\n {}".format(tools.dumps_json(failed_items)))
			
 
				+                log.error("，".join(tip))
			
 
				+
			
 
				+                self.export_falied_times += 1
			
 
				+
			
 
				+                if self._redis_key != "air_spider":
			
 
				+                    self.export_retry_times += 1
			
 
				+
			
 
				+            if self.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
			
 
				+                # 报警
			
 
				+                msg = "《{}》爬虫导出数据失败，失败次数：{}，请检查爬虫是否正常".format(
			
 
				+                    self._redis_key, self.export_falied_times
			
 
				+                )
			
 
				+                log.error(msg)
			
 
				+                tools.send_msg(
			
 
				+                    msg=msg,
			
 
				+                    level="error",
			
 
				+                    message_prefix="《%s》爬虫导出数据失败" % (self._redis_key),
			
 
				+                )
			
 
				+
			
 
				+        self._is_adding_to_db = False
			
 
				+
			
 
				+    def check_datas(self, table, datas):
			
 
				+        """
			
 
				+        打点 记录总条数及每个key情况
			
 
				+        @param table: 表名
			
 
				+        @param datas: 数据 列表
			
 
				+        @return:
			
 
				+        """
			
 
				+        metrics.emit_counter("total count", len(datas), classify=table)
			
 
				+        for data in datas:
			
 
				+            for k, v in data.items():
			
 
				+                metrics.emit_counter(k, int(bool(v)), classify=table)
			
 
				+
			
 
				+    def close(self):
			
 
				+        # 调用pipeline的close方法
			
 
				+        for pipeline in self._pipelines:
			
 
				+            try:
			
 
				+                pipeline.close()
			
 
				+            except:
			
 
				+                pass
			
--- a/FworkSpider/feapder/buffer/request_buffer.py
+++ b/FworkSpider/feapder/buffer/request_buffer.py
@@ -0,0 +1,151 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-06-19 17:17
			
 
				+---------
			
 
				+@summary: request 管理器， 负责缓冲添加到数据库中的request
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import collections
			
 
				+import threading
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.dedup import Dedup
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
MAX_URL_COUNT = 1000  # maximum number of requests kept in the in-memory buffer
			
 
				+
			
 
				+
			
 
				+class RequestBuffer(threading.Thread):
			
 
				+    dedup = None
			
 
				+
			
 
				+    def __init__(self, redis_key):
			
 
				+        if not hasattr(self, "_requests_deque"):
			
 
				+            super(RequestBuffer, self).__init__()
			
 
				+
			
 
				+            self._thread_stop = False
			
 
				+            self._is_adding_to_db = False
			
 
				+
			
 
				+            self._requests_deque = collections.deque()
			
 
				+            self._del_requests_deque = collections.deque()
			
 
				+            self._db = RedisDB()
			
 
				+
			
 
				+            self._table_request = setting.TAB_REQUSETS.format(redis_key=redis_key)
			
 
				+            self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
			
 
				+                redis_key=redis_key
			
 
				+            )
			
 
				+
			
 
				+            if not self.__class__.dedup and setting.REQUEST_FILTER_ENABLE:
			
 
				+                self.__class__.dedup = Dedup(
			
 
				+                    name=redis_key, to_md5=False, **setting.REQUEST_FILTER_SETTING
			
 
				+                )  # 默认过期时间为一个月
			
 
				+
			
 
				+    def run(self):
			
 
				+        self._thread_stop = False
			
 
				+        while not self._thread_stop:
			
 
				+            try:
			
 
				+                self.__add_request_to_db()
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+
			
 
				+            tools.delay_time(1)
			
 
				+
			
 
				+    def stop(self):
			
 
				+        self._thread_stop = True
			
 
				+        self._started.clear()
			
 
				+
			
 
				+    def put_request(self, request):
			
 
				+        self._requests_deque.append(request)
			
 
				+
			
 
				+        if self.get_requests_count() > MAX_URL_COUNT:  # 超过最大缓存，主动调用
			
 
				+            self.flush()
			
 
				+
			
 
				+    def put_del_request(self, request):
			
 
				+        self._del_requests_deque.append(request)
			
 
				+
			
 
				+    def put_failed_request(self, request, table=None):
			
 
				+        try:
			
 
				+            request_dict = request.to_dict
			
 
				+            self._db.zadd(
			
 
				+                table or self._table_failed_request, request_dict, request.priority
			
 
				+            )
			
 
				+        except Exception as e:
			
 
				+            log.exception(e)
			
 
				+
			
 
				+    def flush(self):
			
 
				+        try:
			
 
				+            self.__add_request_to_db()
			
 
				+        except Exception as e:
			
 
				+            log.exception(e)
			
 
				+
			
 
				+    def get_requests_count(self):
			
 
				+        return len(self._requests_deque)
			
 
				+
			
 
				+    def is_adding_to_db(self):
			
 
				+        return self._is_adding_to_db
			
 
				+
			
 
				+    def __add_request_to_db(self):
			
 
				+        request_list = []
			
 
				+        prioritys = []
			
 
				+        callbacks = []
			
 
				+
			
 
				+        while self._requests_deque:
			
 
				+            request = self._requests_deque.popleft()
			
 
				+            self._is_adding_to_db = True
			
 
				+
			
 
				+            if callable(request):
			
 
				+                # 函数
			
 
				+                # 注意：应该考虑闭包情况。闭包情况可写成
			
 
				+                # def test(xxx = xxx):
			
 
				+                #     # TODO 业务逻辑 使用 xxx
			
 
				+                # 这么写不会导致xxx为循环结束后的最后一个值
			
 
				+                callbacks.append(request)
			
 
				+                continue
			
 
				+
			
 
				+            priority = request.priority
			
 
				+
			
 
				+            # 如果需要去重并且库中已重复 则continue
			
 
				+            if (
			
 
				+                request.filter_repeat
			
 
				+                and setting.REQUEST_FILTER_ENABLE
			
 
				+                and not self.__class__.dedup.add(request.fingerprint)
			
 
				+            ):
			
 
				+                log.debug("request已存在  url = %s" % request.url)
			
 
				+                continue
			
 
				+            else:
			
 
				+                request_list.append(str(request.to_dict))
			
 
				+                prioritys.append(priority)
			
 
				+
			
 
				+            if len(request_list) > MAX_URL_COUNT:
			
 
				+                self._db.zadd(self._table_request, request_list, prioritys)
			
 
				+                request_list = []
			
 
				+                prioritys = []
			
 
				+
			
 
				+        # 入库
			
 
				+        if request_list:
			
 
				+            self._db.zadd(self._table_request, request_list, prioritys)
			
 
				+
			
 
				+        # 执行回调
			
 
				+        for callback in callbacks:
			
 
				+            try:
			
 
				+                callback()
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+
			
 
				+        # 删除已做任务
			
 
				+        if self._del_requests_deque:
			
 
				+            request_done_list = []
			
 
				+            while self._del_requests_deque:
			
 
				+                request_done_list.append(self._del_requests_deque.popleft())
			
 
				+
			
 
				+            # 去掉request_list中的requests， 否则可能会将刚添加的request删除
			
 
				+            request_done_list = list(set(request_done_list) - set(request_list))
			
 
				+
			
 
				+            if request_done_list:
			
 
				+                self._db.zrem(self._table_request, request_done_list)
			
 
				+
			
 
				+        self._is_adding_to_db = False
			
--- a/FworkSpider/feapder/commands/__init__.py
+++ b/FworkSpider/feapder/commands/__init__.py
--- a/FworkSpider/feapder/commands/cmdline.py
+++ b/FworkSpider/feapder/commands/cmdline.py
@@ -0,0 +1,45 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/5/8 2:24 PM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+from os.path import dirname, join
			
 
				+
			
 
				+from feapder.commands import create_builder
			
 
				+from feapder.commands import shell
			
 
				+
			
 
				+
			
 
				+def _print_commands():
			
 
				+    with open(join(dirname(dirname(__file__)), "VERSION"), "rb") as f:
			
 
				+        version = f.read().decode("ascii").strip()
			
 
				+
			
 
				+    print("feapder {}".format(version))
			
 
				+    print("\nUsage:")
			
 
				+    print("  feapder <command> [options] [args]\n")
			
 
				+    print("Available commands:")
			
 
				+    cmds = {"create": "create project、spider、item and so on", "shell": "debug response"}
			
 
				+    for cmdname, cmdclass in sorted(cmds.items()):
			
 
				+        print("  %-13s %s" % (cmdname, cmdclass))
			
 
				+
			
 
				+    print('\nUse "feapder <command> -h" to see more info about a command')
			
 
				+
			
 
				+
			
 
				+def execute():
			
 
				+    args = sys.argv
			
 
				+    if len(args) < 2:
			
 
				+        _print_commands()
			
 
				+        return
			
 
				+
			
 
				+    command = args.pop(1)
			
 
				+    if command == "create":
			
 
				+        create_builder.main()
			
 
				+    elif command == "shell":
			
 
				+        shell.main()
			
 
				+    else:
			
 
				+        _print_commands()
			
--- a/FworkSpider/feapder/commands/create/__init__.py
+++ b/FworkSpider/feapder/commands/create/__init__.py
@@ -0,0 +1,21 @@
 
				+__all__ = [
			
 
				+    "CreateProject",
			
 
				+    "CreateSpider",
			
 
				+    "CreateItem",
			
 
				+    "CreateInit",
			
 
				+    "CreateJson",
			
 
				+    "CreateTable",
			
 
				+    "CreateCookies",
			
 
				+    "CreateSetting",
			
 
				+    "CreateParams",
			
 
				+]
			
 
				+
			
 
				+from .create_table import CreateTable
			
 
				+from .create_json import CreateJson
			
 
				+from .create_spider import CreateSpider
			
 
				+from .create_init import CreateInit
			
 
				+from .create_item import CreateItem
			
 
				+from .create_project import CreateProject
			
 
				+from .create_cookies import CreateCookies
			
 
				+from .create_setting import CreateSetting
			
 
				+from .create_params import CreateParams
			
--- a/FworkSpider/feapder/commands/create/create_cookies.py
+++ b/FworkSpider/feapder/commands/create/create_cookies.py
@@ -0,0 +1,48 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021/4/25 10:22 上午
			
 
				+---------
			
 
				+@summary: 将浏览器的cookie转为request的cookie
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+import sys
			
 
				+
			
 
				+from feapder.utils.tools import get_cookies_from_str, print_pretty
			
 
				+
			
 
				+
			
 
				+class CreateCookies:
			
 
				+    def get_data(self):
			
 
				+        """
			
 
				+        @summary: 从控制台读取多行
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+        print("请输入浏览器cookie (列表或字符串格式)")
			
 
				+        data = []
			
 
				+        while True:
			
 
				+            line = sys.stdin.readline().strip()
			
 
				+            if not line:
			
 
				+                break
			
 
				+
			
 
				+            data.append(line)
			
 
				+
			
 
				+        return "".join(data)
			
 
				+
			
 
				+    def create(self):
			
 
				+        data = self.get_data()
			
 
				+        cookies = {}
			
 
				+        try:
			
 
				+            data_json = json.loads(data)
			
 
				+
			
 
				+            for data in data_json:
			
 
				+                cookies[data.get("name")] = data.get("value")
			
 
				+
			
 
				+        except:
			
 
				+            cookies = get_cookies_from_str(data)
			
 
				+
			
 
				+        print_pretty(cookies)
			
--- a/FworkSpider/feapder/commands/create/create_init.py
+++ b/FworkSpider/feapder/commands/create/create_init.py
@@ -0,0 +1,30 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-08-28 17:38:43
			
 
				+---------
			
 
				+@summary: 创建__init__.py
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+from feapder.utils.tools import dumps_json
			
 
				+
			
 
				+
			
 
				+class CreateInit:
			
 
				+    def create(self):
			
 
				+        __all__ = []
			
 
				+
			
 
				+        import os
			
 
				+
			
 
				+        path = os.getcwd()
			
 
				+        for file in os.listdir(path):
			
 
				+            if file.endswith(".py") and not file.startswith("__init__"):
			
 
				+                model = file.split(".")[0]
			
 
				+                __all__.append(model)
			
 
				+
			
 
				+        del os
			
 
				+
			
 
				+        with open("__init__.py", "w", encoding="utf-8") as file:
			
 
				+            text = "__all__ = %s" % dumps_json(__all__)
			
 
				+            file.write(text)
			
--- a/FworkSpider/feapder/commands/create/create_item.py
+++ b/FworkSpider/feapder/commands/create/create_item.py
@@ -0,0 +1,165 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-08-28 17:38:43
			
 
				+---------
			
 
				+@summary: 创建item
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import getpass
			
 
				+import os
			
 
				+
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder import setting
			
 
				+from feapder.db.mysqldb import MysqlDB
			
 
				+from .create_init import CreateInit
			
 
				+
			
 
				+
			
 
				+def deal_file_info(file):
			
 
				+    file = file.replace("{DATE}", tools.get_current_date())
			
 
				+    file = file.replace("{USER}", getpass.getuser())
			
 
				+
			
 
				+    return file
			
 
				+
			
 
				+
			
 
				+class CreateItem:
			
 
				+    def __init__(self):
			
 
				+        self._db = MysqlDB()
			
 
				+        self._create_init = CreateInit()
			
 
				+
			
 
				+    def select_columns(self, table_name):
			
 
				+        # sql = 'SHOW COLUMNS FROM ' + table_name
			
 
				+        sql = f"SELECT COLUMN_NAME, COLUMN_TYPE, IS_NULLABLE, COLUMN_DEFAULT, EXTRA, COLUMN_KEY, COLUMN_COMMENT FROM INFORMATION_SCHEMA.Columns WHERE table_name = '{table_name}' and table_schema = '{setting.MYSQL_DB}'"
			
 
				+        columns = self._db.find(sql)
			
 
				+
			
 
				+        return columns
			
 
				+
			
 
				+    def select_tables_name(self, tables_name):
			
 
				+        """
			
 
				+        @summary:
			
 
				+        ---------
			
 
				+        @param tables_name: 一类tables 如 qidian*
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+        sql = f"select table_name from information_schema.tables where table_name like '{tables_name}' and table_schema = '{setting.MYSQL_DB}'"
			
 
				+        tables_name = self._db.find(sql)
			
 
				+
			
 
				+        return tables_name
			
 
				+
			
 
				+    def convert_table_name_to_hump(self, table_name):
			
 
				+        """
			
 
				+        @summary: 格式化表明为驼峰格式
			
 
				+        ---------
			
 
				+        @param table:
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+        table_hump_format = ""
			
 
				+
			
 
				+        words = table_name.split("_")
			
 
				+        for word in words:
			
 
				+            table_hump_format += word.capitalize()  # 首字母大写
			
 
				+
			
 
				+        return table_hump_format
			
 
				+
			
 
				+    def get_item_template(self):
			
 
				+        template_path = os.path.abspath(
			
 
				+            os.path.join(__file__, "../../../templates/item_template.tmpl")
			
 
				+        )
			
 
				+        with open(template_path, "r", encoding="utf-8") as file:
			
 
				+            item_template = file.read()
			
 
				+
			
 
				+        return item_template
			
 
				+
			
 
				+    def create_item(self, item_template, columns, table_name, support_dict):
			
 
				+        table_name_hump_format = self.convert_table_name_to_hump(table_name)
			
 
				+        # 组装 类名
			
 
				+        item_template = item_template.replace("${item_name}", table_name_hump_format)
			
 
				+        if support_dict:
			
 
				+            item_template = item_template.replace("${table_name}", table_name + " 1")
			
 
				+        else:
			
 
				+            item_template = item_template.replace("${table_name}", table_name)
			
 
				+
			
 
				+        # 组装 属性
			
 
				+        propertys = ""
			
 
				+        for column in columns:
			
 
				+            column_name = column[0]
			
 
				+            column_type = column[1]
			
 
				+            is_nullable = column[2]
			
 
				+            column_default = column[3]
			
 
				+            column_extra = column[4]
			
 
				+            column_key = column[5]
			
 
				+            column_comment = column[6]
			
 
				+
			
 
				+            try:
			
 
				+                value = (
			
 
				+                    "kwargs.get('{column_name}')".format(column_name=column_name)
			
 
				+                    if support_dict
			
 
				+                    else (
			
 
				+                        column_default != "CURRENT_TIMESTAMP" and column_default or None
			
 
				+                    )
			
 
				+                    and eval(column_default)
			
 
				+                )
			
 
				+            except:
			
 
				+                value = (
			
 
				+                    "kwargs.get('{column_name}')".format(column_name=column_name)
			
 
				+                    if support_dict
			
 
				+                    else (
			
 
				+                        column_default != "CURRENT_TIMESTAMP" and column_default or None
			
 
				+                    )
			
 
				+                    and column_default
			
 
				+                )
			
 
				+
			
 
				+            if column_extra == "auto_increment" or column_default is not None:
			
 
				+                propertys += f"# self.{column_name} = {value}"
			
 
				+
			
 
				+            else:
			
 
				+                if value is None or isinstance(value, (float, int)) or support_dict:
			
 
				+                    propertys += f"self.{column_name} = {value}"
			
 
				+                else:
			
 
				+                    propertys += f"self.{column_name} = '{value}'"
			
 
				+
			
 
				+            if column_comment:
			
 
				+                propertys += f"  # {column_comment}"
			
 
				+            propertys += "\n" + " " * 8
			
 
				+
			
 
				+        item_template = item_template.replace("${propertys}", propertys.strip())
			
 
				+        item_template = deal_file_info(item_template)
			
 
				+
			
 
				+        return item_template
			
 
				+
			
 
				+    def save_template_to_file(self, item_template, table_name):
			
 
				+        item_file = table_name + "_item.py"
			
 
				+        if os.path.exists(item_file):
			
 
				+            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % item_file)
			
 
				+            if confirm != "y":
			
 
				+                print("取消覆盖  退出")
			
 
				+                return
			
 
				+
			
 
				+        with open(item_file, "w", encoding="utf-8") as file:
			
 
				+            file.write(item_template)
			
 
				+            print("\n%s 生成成功" % item_file)
			
 
				+
			
 
				+        self._create_init.create()
			
 
				+
			
 
				+    def create(self, tables_name, support_dict):
			
 
				+        input_tables_name = tables_name
			
 
				+
			
 
				+        tables_name = self.select_tables_name(tables_name)
			
 
				+        if not tables_name:
			
 
				+            print(tables_name)
			
 
				+            tip = "mysql数据库中无 %s 表 " % input_tables_name
			
 
				+            raise KeyError(tip)
			
 
				+
			
 
				+        for table_name in tables_name:
			
 
				+            table_name = table_name[0]
			
 
				+
			
 
				+            columns = self.select_columns(table_name)
			
 
				+            item_template = self.get_item_template()
			
 
				+            item_template = self.create_item(
			
 
				+                item_template, columns, table_name, support_dict
			
 
				+            )
			
 
				+            self.save_template_to_file(item_template, table_name)
			
--- a/FworkSpider/feapder/commands/create/create_json.py
+++ b/FworkSpider/feapder/commands/create/create_json.py
@@ -0,0 +1,52 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-08-28 17:38:43
			
 
				+---------
			
 
				+@summary: 字符串转json
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+
			
 
				+import feapder.utils.tools as tools
			
 
				+
			
 
				+
			
 
				+class CreateJson:
			
 
				+    def get_data(self):
			
 
				+        """
			
 
				+        @summary: 从控制台读取多行
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+        print("请输入需要转换的内容： （xxx:xxx格式，支持多行）")
			
 
				+        data = []
			
 
				+        while True:
			
 
				+            line = sys.stdin.readline().strip().replace("\t", " " * 4)
			
 
				+            if not line:
			
 
				+                break
			
 
				+
			
 
				+            data.append(line)
			
 
				+
			
 
				+        return data
			
 
				+
			
 
				+    def create(self, sort_keys=False):
			
 
				+        contents = self.get_data()
			
 
				+
			
 
				+        json = {}
			
 
				+        for content in contents:
			
 
				+            content = content.strip()
			
 
				+            if not content or content.startswith(":"):
			
 
				+                continue
			
 
				+
			
 
				+            regex = "([^:\s]*)[:|\s]*(.*)"
			
 
				+
			
 
				+            result = tools.get_info(content, regex, fetch_one=True)
			
 
				+            if result[0] in json:
			
 
				+                json[result[0]] = json[result[0]] + "&" + result[1]
			
 
				+            else:
			
 
				+                json[result[0]] = result[1].strip()
			
 
				+
			
 
				+        print(tools.dumps_json(json, sort_keys=sort_keys))
			
--- a/FworkSpider/feapder/commands/create/create_params.py
+++ b/FworkSpider/feapder/commands/create/create_params.py
@@ -0,0 +1,51 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021/4/25 10:22 上午
			
 
				+---------
			
 
				+@summary: 将浏览器的cookie转为request的cookie
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+
			
 
				+from feapder.utils.tools import dumps_json
			
 
				+
			
 
				+
			
 
				+class CreateParams:
			
 
				+    def get_data(self):
			
 
				+        """
			
 
				+        @summary: 从控制台读取多行
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+        print("请输入请求地址")
			
 
				+        data = []
			
 
				+        while True:
			
 
				+            line = sys.stdin.readline().strip()
			
 
				+            if not line:
			
 
				+                break
			
 
				+
			
 
				+            data.append(line)
			
 
				+
			
 
				+        return "".join(data)
			
 
				+
			
 
				+    def get_params(self, url):
			
 
				+        params_json = {}
			
 
				+        params = url.split("?")[-1].split("&")
			
 
				+        for param in params:
			
 
				+            key_value = param.split("=", 1)
			
 
				+            params_json[key_value[0]] = key_value[1]
			
 
				+
			
 
				+        return params_json
			
 
				+
			
 
				+    def create(self):
			
 
				+        data = self.get_data()
			
 
				+
			
 
				+        params = self.get_params(data)
			
 
				+        url = data.split("?")[0]
			
 
				+
			
 
				+        print(f'url = "{url}"')
			
 
				+        print(f"params = {dumps_json(params)}")
			
--- a/FworkSpider/feapder/commands/create/create_project.py
+++ b/FworkSpider/feapder/commands/create/create_project.py
@@ -0,0 +1,52 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-08-28 17:38:43
			
 
				+---------
			
 
				+@summary: 创建项目
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import getpass
			
 
				+import os
			
 
				+import shutil
			
 
				+
			
 
				+import feapder.utils.tools as tools
			
 
				+
			
 
				+
			
 
				+def deal_file_info(file):
			
 
				+    file = file.replace("{DATE}", tools.get_current_date())
			
 
				+    file = file.replace("{USER}", getpass.getuser())
			
 
				+
			
 
				+    return file
			
 
				+
			
 
				+
			
 
				+class CreateProject:
			
 
				+    def copy_callback(self, src, dst, *, follow_symlinks=True):
			
 
				+        if src.endswith(".py"):
			
 
				+            with open(src, "r", encoding="utf-8") as src_file, open(
			
 
				+                dst, "w", encoding="utf8"
			
 
				+            ) as dst_file:
			
 
				+                content = src_file.read()
			
 
				+                content = deal_file_info(content)
			
 
				+                dst_file.write(content)
			
 
				+
			
 
				+        else:
			
 
				+            shutil.copy2(src, dst, follow_symlinks=follow_symlinks)
			
 
				+
			
 
				+    def create(self, project_name):
			
 
				+        if os.path.exists(project_name):
			
 
				+            print("%s 项目已经存在" % project_name)
			
 
				+        else:
			
 
				+            template_path = os.path.abspath(
			
 
				+                os.path.join(__file__, "../../../templates/project_template")
			
 
				+            )
			
 
				+            shutil.copytree(
			
 
				+                template_path, project_name, copy_function=self.copy_callback
			
 
				+            )
			
 
				+
			
 
				+            print("\n%s 项目生成成功" % project_name)
			
 
				+
			
 
				+
			
 
				+
			
--- a/FworkSpider/feapder/commands/create/create_setting.py
+++ b/FworkSpider/feapder/commands/create/create_setting.py
@@ -0,0 +1,27 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021/4/23 13:20
			
 
				+---------
			
 
				+@summary: 生成配置文件
			
 
				+---------
			
 
				+@author: mkdir700
			
 
				+@email:  mkdir700@gmail.com
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+import shutil
			
 
				+
			
 
				+
			
 
				+class CreateSetting:
			
 
				+    def create(self):
			
 
				+        if os.path.exists("setting.py"):
			
 
				+            confirm = input("配置文件已存在 是否覆盖 (y/n).  ")
			
 
				+            if confirm != "y":
			
 
				+                print("取消覆盖  退出")
			
 
				+                return
			
 
				+
			
 
				+        template_file_path = os.path.abspath(
			
 
				+            os.path.join(__file__, "../../../templates/project_template/setting.py")
			
 
				+        )
			
 
				+        shutil.copy(template_file_path, "./", follow_symlinks=False)
			
 
				+        print("配置文件生成成功")
			
--- a/FworkSpider/feapder/commands/create/create_spider.py
+++ b/FworkSpider/feapder/commands/create/create_spider.py
@@ -0,0 +1,102 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-08-28 17:38:43
			
 
				+---------
			
 
				+@summary: 创建spider
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import getpass
			
 
				+import os
			
 
				+import re
			
 
				+
			
 
				+import feapder.utils.tools as tools
			
 
				+from .create_init import CreateInit
			
 
				+
			
 
				+
			
 
				+def deal_file_info(file):
			
 
				+    file = file.replace("{DATE}", tools.get_current_date())
			
 
				+    # file = file.replace("{USER}", getpass.getuser())
			
 
				+    file = file.replace("{USER}", os.path.basename(os.getcwd()))
			
 
				+
			
 
				+    return file
			
 
				+
			
 
				+
			
 
				+class CreateSpider:
			
 
				+    def __init__(self):
			
 
				+        self._create_init = CreateInit()
			
 
				+
			
 
				+    def cover_to_underline(self, key):
			
 
				+        regex = "[A-Z]*"
			
 
				+        capitals = re.findall(regex, key)
			
 
				+
			
 
				+        if capitals:
			
 
				+            for pos, capital in enumerate(capitals):
			
 
				+                if not capital:
			
 
				+                    continue
			
 
				+                if pos == 0:
			
 
				+                    if len(capital) > 1:
			
 
				+                        key = key.replace(capital, capital.lower() + "_", 1)
			
 
				+                    else:
			
 
				+                        key = key.replace(capital, capital.lower(), 1)
			
 
				+                else:
			
 
				+                    if len(capital) > 1:
			
 
				+                        key = key.replace(capital, "_" + capital.lower() + "_", 1)
			
 
				+                    else:
			
 
				+                        key = key.replace(capital, "_" + capital.lower(), 1)
			
 
				+
			
 
				+        return key
			
 
				+
			
 
				+    def get_spider_template(self, spider_type):
			
 
				+        if spider_type == 1:
			
 
				+            template_path = "air_spider_template.tmpl"
			
 
				+        elif spider_type == 2:
			
 
				+            template_path = "spider_template.tmpl"
			
 
				+        elif spider_type == 3:
			
 
				+            template_path = "batch_spider_template.tmpl"
			
 
				+        elif spider_type == 4:
			
 
				+            template_path = "spider_list_template.tmpl"
			
 
				+        else:
			
 
				+            raise ValueError("spider type error, support 1 2 3")
			
 
				+
			
 
				+        template_path = os.path.abspath(
			
 
				+            os.path.join(__file__, "../../../templates", template_path)
			
 
				+        )
			
 
				+        with open(template_path, "r", encoding="utf-8") as file:
			
 
				+            spider_template = file.read()
			
 
				+
			
 
				+        return spider_template
			
 
				+
			
 
				+    def create_spider(self, spider_template, spider_name):
			
 
				+        spider_template = spider_template.replace("${spider_name}", spider_name)
			
 
				+        spider_template = deal_file_info(spider_template)
			
 
				+        return spider_template
			
 
				+
			
 
				+    def save_spider_to_file(self, spider, spider_name):
			
 
				+        spider_underline = self.cover_to_underline(spider_name)
			
 
				+        spider_file = spider_underline + ".py"
			
 
				+
			
 
				+        if os.path.exists(spider_file):
			
 
				+            confirm = input("%s 文件已存在 是否覆盖 (y/n).  " % spider_file)
			
 
				+            if confirm != "y":
			
 
				+                print("取消覆盖  退出")
			
 
				+                return
			
 
				+
			
 
				+        with open(spider_file, "w", encoding="utf-8") as file:
			
 
				+            file.write(spider)
			
 
				+            print("\n%s 生成成功" % spider_name)
			
 
				+
			
 
				+        self._create_init.create()
			
 
				+
			
 
				+    def create(self, spider_name, spider_type):
			
 
				+        # 检查spider_name
			
 
				+        if not re.search("^[a-zA-Z][a-zA-Z0-9_]*$", spider_name):
			
 
				+            raise Exception("爬虫名不符合命名规范，请用下划线命名或驼峰命名方式")
			
 
				+
			
 
				+        if spider_name.islower():
			
 
				+            spider_name = tools.key2hump(spider_name)
			
 
				+        spider_template = self.get_spider_template(spider_type)
			
 
				+        spider = self.create_spider(spider_template, spider_name)
			
 
				+        self.save_spider_to_file(spider, spider_name)
			
--- a/FworkSpider/feapder/commands/create/create_table.py
+++ b/FworkSpider/feapder/commands/create/create_table.py
@@ -0,0 +1,135 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-08-28 17:38:43
			
 
				+---------
			
 
				+@summary: 根据json生成表
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+import time
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.db.mysqldb import MysqlDB
			
 
				+from feapder.utils.tools import key2underline
			
 
				+
			
 
				+
			
 
				+class CreateTable:
			
 
				+    def __init__(self):
			
 
				+        self._db = MysqlDB()
			
 
				+
			
 
				+    def is_vaild_date(self, date):
			
 
				+        try:
			
 
				+            if ":" in date:
			
 
				+                time.strptime(date, "%Y-%m-%d %H:%M:%S")
			
 
				+            else:
			
 
				+                time.strptime(date, "%Y-%m-%d")
			
 
				+            return True
			
 
				+        except:
			
 
				+            return False
			
 
				+
			
 
				+    def get_key_type(self, value):
			
 
				+        try:
			
 
				+            value = eval(value)
			
 
				+        except:
			
 
				+            value = value
			
 
				+
			
 
				+        key_type = "varchar(255)"
			
 
				+        if isinstance(value, int):
			
 
				+            key_type = "int"
			
 
				+        elif isinstance(value, float):
			
 
				+            key_type = "double"
			
 
				+        elif isinstance(value, str):
			
 
				+            if self.is_vaild_date(value):
			
 
				+                if ":" in value:
			
 
				+                    key_type = "datetime"
			
 
				+                else:
			
 
				+                    key_type = "date"
			
 
				+            elif len(value) > 255:
			
 
				+                key_type = "text"
			
 
				+            else:
			
 
				+                key_type = "varchar(255)"
			
 
				+
			
 
				+        return key_type
			
 
				+
			
 
				+    def get_data(self):
			
 
				+        """
			
 
				+        @summary: 从控制台读取多行
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+        data = ""
			
 
				+        while True:
			
 
				+            line = sys.stdin.readline().strip()
			
 
				+            if not line:
			
 
				+                break
			
 
				+            data += line
			
 
				+
			
 
				+        return tools.get_json(data)
			
 
				+
			
 
				+    def create(self, table_name):
			
 
				+        # 输入表字段
			
 
				+        print('请输入表数据 json格式 如 {"name":"张三"}\n等待输入：\n')
			
 
				+        data = self.get_data()
			
 
				+
			
 
				+        if not isinstance(data, dict):
			
 
				+            raise Exception("表数据格式不正确")
			
 
				+
			
 
				+        # 拼接表结构
			
 
				+        sql = """
			
 
				+            CREATE TABLE `{db}`.`{table_name}` (
			
 
				+                `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT 'id 自动递增',
			
 
				+                {other_key}
			
 
				+                `gtime` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '抓取时间',
			
 
				+                PRIMARY KEY (`id`),
			
 
				+                {unique}
			
 
				+            ) COMMENT='';
			
 
				+        """
			
 
				+
			
 
				+        print("请设置注释 回车跳过")
			
 
				+        other_key = ""
			
 
				+        for key, value in data.items():
			
 
				+            key = key2underline(key)
			
 
				+            key_type = self.get_key_type(value)
			
 
				+
			
 
				+            comment = input("%s : %s  -> comment：" % (key, key_type))
			
 
				+
			
 
				+            other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
			
 
				+                key=key, key_type=key_type, comment=comment
			
 
				+            )
			
 
				+
			
 
				+        print("\n")
			
 
				+
			
 
				+        while True:
			
 
				+            is_need_batch_date = input("是否添加batch_date 字段 （y/n）:")
			
 
				+            if is_need_batch_date == "y":
			
 
				+                other_key += "`{key}` {key_type} COMMENT '{comment}',\n                ".format(
			
 
				+                    key="batch_date", key_type="date", comment="批次时间"
			
 
				+                )
			
 
				+                break
			
 
				+            elif is_need_batch_date == "n":
			
 
				+                break
			
 
				+
			
 
				+        print("\n")
			
 
				+
			
 
				+        while True:
			
 
				+            unique = input("请设置唯一索引, 多个逗号间隔\n等待输入：\n").replace("，", ",")
			
 
				+            if unique:
			
 
				+                break
			
 
				+        unique = "UNIQUE `idx` USING BTREE (`%s`) comment ''" % "`,`".join(
			
 
				+            unique.split(",")
			
 
				+        )
			
 
				+
			
 
				+        sql = sql.format(
			
 
				+            db=setting.MYSQL_DB,
			
 
				+            table_name=table_name,
			
 
				+            other_key=other_key,
			
 
				+            unique=unique,
			
 
				+        )
			
 
				+        print(sql)
			
 
				+        self._db.execute(sql)
			
 
				+        print("\n%s 创建成功" % table_name)
			
--- a/FworkSpider/feapder/commands/create_builder.py
+++ b/FworkSpider/feapder/commands/create_builder.py
@@ -0,0 +1,118 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021/2/8 11:21 上午
			
 
				+---------
			
 
				+@summary: 生成器
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+import argparse
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+from feapder.commands.create import *
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    spider = argparse.ArgumentParser(description="生成器")
			
 
				+
			
 
				+    spider.add_argument(
			
 
				+        "-p", "--project", help="创建项目 如 feapder create -p <project_name>", metavar=""
			
 
				+    )
			
 
				+    spider.add_argument(
			
 
				+        "-s",
			
 
				+        "--spider",
			
 
				+        nargs="+",
			
 
				+        help="创建爬虫\n"
			
 
				+        "如 feapder create -s <spider_name> <spider_type> "
			
 
				+        "spider_type=1  AirSpider; "
			
 
				+        "spider_type=2  Spider; "
			
 
				+        "spider_type=3  BatchSpider;",
			
 
				+        metavar="",
			
 
				+    )
			
 
				+    spider.add_argument(
			
 
				+        "-i",
			
 
				+        "--item",
			
 
				+        nargs="+",
			
 
				+        help="创建item 如 feapder create -i test 则生成test表对应的item。 "
			
 
				+        "支持like语法模糊匹配所要生产的表。 "
			
 
				+        "若想生成支持字典方式赋值的item，则create -item test 1",
			
 
				+        metavar="",
			
 
				+    )
			
 
				+    spider.add_argument(
			
 
				+        "-t", "--table", help="根据json创建表 如 feapder create -t <table_name>", metavar=""
			
 
				+    )
			
 
				+    spider.add_argument(
			
 
				+        "-init", help="创建__init__.py 如 feapder create -init", action="store_true"
			
 
				+    )
			
 
				+    spider.add_argument("-j", "--json", help="创建json", action="store_true")
			
 
				+    spider.add_argument("-sj", "--sort_json", help="创建有序json", action="store_true")
			
 
				+    spider.add_argument("-c", "--cookies", help="创建cookie", action="store_true")
			
 
				+    spider.add_argument("--params", help="解析地址中的参数", action="store_true")
			
 
				+    spider.add_argument(
			
 
				+        "--setting", help="创建全局配置文件" "feapder create --setting", action="store_true"
			
 
				+    )
			
 
				+
			
 
				+    # 指定数据库
			
 
				+    spider.add_argument("--host", type=str, help="mysql 连接地址", metavar="")
			
 
				+    spider.add_argument("--port", type=str, help="mysql 端口", metavar="")
			
 
				+    spider.add_argument("--username", type=str, help="mysql 用户名", metavar="")
			
 
				+    spider.add_argument("--password", type=str, help="mysql 密码", metavar="")
			
 
				+    spider.add_argument("--db", type=str, help="mysql 数据库名", metavar="")
			
 
				+    args = spider.parse_args()
			
 
				+
			
 
				+    if args.host:
			
 
				+        setting.MYSQL_IP = args.host
			
 
				+    if args.port:
			
 
				+        setting.MYSQL_PORT = int(args.port)
			
 
				+    if args.username:
			
 
				+        setting.MYSQL_USER_NAME = args.username
			
 
				+    if args.password:
			
 
				+        setting.MYSQL_USER_PASS = args.password
			
 
				+    if args.db:
			
 
				+        setting.MYSQL_DB = args.db
			
 
				+
			
 
				+    if args.item:
			
 
				+        item_name, *support_dict = args.item
			
 
				+        support_dict = bool(support_dict)
			
 
				+        CreateItem().create(item_name, support_dict)
			
 
				+
			
 
				+    elif args.spider:
			
 
				+        spider_name, *spider_type = args.spider
			
 
				+        if not spider_type:
			
 
				+            spider_type = 1
			
 
				+        else:
			
 
				+            spider_type = spider_type[0]
			
 
				+        try:
			
 
				+            spider_type = int(spider_type)
			
 
				+        except:
			
 
				+            raise ValueError("spider_type error, support 1, 2, 3")
			
 
				+        CreateSpider().create(spider_name, spider_type)
			
 
				+
			
 
				+    elif args.project:
			
 
				+        CreateProject().create(args.project)
			
 
				+
			
 
				+    elif args.table:
			
 
				+        CreateTable().create(args.table)
			
 
				+
			
 
				+    elif args.init:
			
 
				+        CreateInit().create()
			
 
				+
			
 
				+    elif args.json:
			
 
				+        CreateJson().create()
			
 
				+
			
 
				+    elif args.sort_json:
			
 
				+        CreateJson().create(sort_keys=True)
			
 
				+
			
 
				+    elif args.cookies:
			
 
				+        CreateCookies().create()
			
 
				+
			
 
				+    elif args.setting:
			
 
				+        CreateSetting().create()
			
 
				+
			
 
				+    elif args.params:
			
 
				+        CreateParams().create()
			
 
				+
			
 
				+
			
 
# Run the generator command line only when this file is executed as a script.
if __name__ == "__main__":
    main()
			
--- a/FworkSpider/feapder/commands/shell.py
+++ b/FworkSpider/feapder/commands/shell.py
@@ -0,0 +1,93 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/5/9 12:37 AM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+import re
			
 
				+import sys
			
 
				+
			
 
				+import IPython
			
 
				+
			
 
				+from feapder import Request
			
 
				+
			
 
				+
			
 
				+def request(**kwargs):
			
 
				+    kwargs.setdefault("proxies", None)
			
 
				+    response = Request(**kwargs).get_response()
			
 
				+    print(response)
			
 
				+
			
 
				+    IPython.embed(header="now you can use response")
			
 
				+
			
 
				+
			
 
def fetch_url(url):
    """
    Fetch ``url`` with default request settings via ``request``.

    :param url: address to download
    """
    request(url=url)
			
 
				+
			
 
				+
			
 
				+def fetch_curl(curl_args):
			
 
				+    """
			
 
				+    解析及抓取curl请求
			
 
				+    :param curl_args:
			
 
				+    [url, '-H', 'xxx', '-H', 'xxx', '--data-binary', '{"xxx":"xxx"}', '--compressed']
			
 
				+    :return:
			
 
				+    """
			
 
				+    url = curl_args[0]
			
 
				+    curl_args.pop(0)
			
 
				+
			
 
				+    headers = {}
			
 
				+    data = {}
			
 
				+    for i in range(0, len(curl_args), 2):
			
 
				+        if curl_args[i] == "-H":
			
 
				+            regex = "([^:\s]*)[:|\s]*(.*)"
			
 
				+            result = re.search(regex, curl_args[i + 1], re.S).groups()
			
 
				+            if result[0] in headers:
			
 
				+                headers[result[0]] = headers[result[0]] + "&" + result[1]
			
 
				+            else:
			
 
				+                headers[result[0]] = result[1].strip()
			
 
				+
			
 
				+        elif curl_args[i] == "--data-binary":
			
 
				+            data = json.loads(curl_args[i + 1])
			
 
				+
			
 
				+    request(url=url, data=data, headers=headers)
			
 
				+
			
 
				+
			
 
def usage():
    """
下载调试器

usage: feapder shell [options] [args]

optional arguments:
  -u, --url     抓取指定url
  -c, --curl    抓取curl格式的请求

    """
    # The docstring above is the help text itself: it is printed verbatim, so
    # it is runtime output and must not be reformatted or re-indented.
    print(usage.__doc__)
    # sys.exit() raises SystemExit, so this function never returns.
    sys.exit()
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    args = sys.argv
			
 
				+    if len(args) < 3:
			
 
				+        usage()
			
 
				+
			
 
				+    elif args[1] in ("-h", "--help"):
			
 
				+        usage()
			
 
				+
			
 
				+    elif args[1] in ("-u", "--url"):
			
 
				+        fetch_url(args[2])
			
 
				+
			
 
				+    elif args[1] in ("-c", "--curl"):
			
 
				+        fetch_curl(args[2:])
			
 
				+
			
 
				+    else:
			
 
				+        usage()
			
 
				+
			
 
				+
			
 
# Run the download debugger only when this file is executed as a script.
if __name__ == "__main__":
    main()
			
--- a/FworkSpider/feapder/core/__init__.py
+++ b/FworkSpider/feapder/core/__init__.py
@@ -0,0 +1,9 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+'''
			
 
				+Created on 2020/4/23 12:09 AM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+'''
			
--- a/FworkSpider/feapder/core/base_parser.py
+++ b/FworkSpider/feapder/core/base_parser.py
@@ -0,0 +1,252 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-07-25 11:41:57
			
 
				+---------
			
 
				+@summary: parser 的基类
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+import os
			
 
				+import traceback
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.db.mysqldb import MysqlDB
			
 
				+from feapder.network.item import UpdateItem
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.aliyun import UploadOSS
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+
			
 
				+
			
 
				+class BaseParser(object):
			
 
				+    def start_requests(self):
			
 
				+        """
			
 
				+        @summary: 添加初始url
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result: yield Request()
			
 
				+        """
			
 
				+
			
 
				+        pass
			
 
				+
			
 
				+    def download_midware(self, request):
			
 
				+        """
			
 
				+        @summary: 下载中间件 可修改请求的一些参数, 或可自定义下载，然后返回 request, response
			
 
				+        ---------
			
 
				+        @param request:
			
 
				+        ---------
			
 
				+        @result: return request / request, response
			
 
				+        """
			
 
				+
			
 
				+        pass
			
 
				+
			
 
				+    def validate(self, request, response):
			
 
				+        """
			
 
				+        @summary: 校验函数, 可用于校验response是否正确
			
 
				+        若函数内抛出异常，则重试请求
			
 
				+        若返回True 或 None，则进入解析函数
			
 
				+        若返回False，则抛弃当前请求
			
 
				+        可通过request.callback_name 区分不同的回调函数，编写不同的校验逻辑
			
 
				+        ---------
			
 
				+        @param request:
			
 
				+        @param response:
			
 
				+        ---------
			
 
				+        @result: True / None / False
			
 
				+        """
			
 
				+        pass
			
 
				+
			
 
				+    def parse(self, request, response):
			
 
				+        """
			
 
				+        @summary: 默认的解析函数
			
 
				+        ---------
			
 
				+        @param request:
			
 
				+        @param response:
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+
			
 
				+        pass
			
 
				+
			
 
				+    def exception_request(self, request, response):
			
 
				+        """
			
 
				+        @summary: 请求或者parser里解析出异常的request
			
 
				+        ---------
			
 
				+        @param request:
			
 
				+        @param response:
			
 
				+        ---------
			
 
				+        @result: request / callback / None (返回值必须可迭代)
			
 
				+        """
			
 
				+
			
 
				+        pass
			
 
				+
			
 
				+    def failed_request(self, request, response):
			
 
				+        """
			
 
				+        @summary: 超过最大重试次数的request
			
 
				+        可返回修改后的request  若不返回request，则将传进来的request直接人redis的failed表。否则将修改后的request入failed表
			
 
				+        ---------
			
 
				+        @param request:
			
 
				+        ---------
			
 
				+        @result: request / item / callback / None (返回值必须可迭代)
			
 
				+        """
			
 
				+
			
 
				+        pass
			
 
				+    def push_files(self, request, response):
			
 
				+        """
			
 
				+        @summary: 下载 并上传附件文件，传进来的request的auto_request必须为False，否则可能会因为响应失败而无法下载文件
			
 
				+        ---------
			
 
				+        @param request:  request.url 为文件下载地址， 该方法需要自行调用
			
 
				+        request.INFO  为上传文件时所需要提供的部分参数  必传
			
 
				+         info = {
			
 
				+            "org_url": "http://www...",  # 文件下载连接
			
 
				+            "filename": f"{list_item.title}.docx",  # 文件名
			
 
				+            "channel": list_item.channel,
			
 
				+            "ftype": 'docx,zip,ftp', # 文件类型
			
 
				+        }
			
 
				+        request.headers 则存放请求的必要参数，如：parmas，headers  必传
			
 
				+        ---------
			
 
				+        @result: request / item / callback / None (返回值必须可迭代)，正常处理为 None 即可
			
 
				+        """
			
 
				+        list_item = request.item
			
 
				+        res = None
			
 
				+        for i in range(5):
			
 
				+            try:
			
 
				+                parameter = request.parameter
			
 
				+                res = UploadOSS().get_state(request.info,**parameter)
			
 
				+            except:
			
 
				+                log.error(traceback.format_exc())
			
 
				+            if res is not None:
			
 
				+                list_item.projectinfo = res
			
 
				+                yield list_item
			
 
				+                log.info(f"{res.get('filename')}附件下载完成，大小为：{res.get('size')},fid为：{res.get('fid')}")
			
 
				+                return
			
 
				+            else:
			
 
				+                log.error(f"{res.get('filename')}附件下载失败，失败连接为：{res.get('org_url')}")
			
 
				+        if res is None:
			
 
				+            _db = RedisDB()
			
 
				+            request_dict = request.to_dict
			
 
				+            _db.zadd("forwork:files_failed", request_dict)
			
 
				+
			
 
				+    def start_callback(self):
			
 
				+        """
			
 
				+        @summary: 程序开始的回调
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result: None
			
 
				+        """
			
 
				+
			
 
				+        pass
			
 
				+
			
 
				+    def end_callback(self):
			
 
				+        """
			
 
				+        @summary: 程序结束的回调
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result: None
			
 
				+        """
			
 
				+
			
 
				+        pass
			
 
				+
			
 
				+    @property
			
 
				+    def name(self):
			
 
				+        return self.__class__.__name__
			
 
				+
			
 
				+    def close(self):
			
 
				+        pass
			
 
				+
			
 
				+
			
 
				+class BatchParser(BaseParser):
			
 
				+    """
			
 
				+    @summary: 批次爬虫模版
			
 
				+    ---------
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self, task_table, batch_record_table, task_state, date_format, mysqldb=None
			
 
				+    ):
			
 
				+        self._mysqldb = mysqldb or MysqlDB()  # mysqldb
			
 
				+
			
 
				+        self._task_table = task_table  # mysql中的任务表
			
 
				+        self._batch_record_table = batch_record_table  # mysql 中的批次记录表
			
 
				+        self._task_state = task_state  # mysql中任务表的state字段名
			
 
				+        self._date_format = date_format  # 批次日期格式
			
 
				+
			
 
				+    def add_task(self):
			
 
				+        """
			
 
				+        @summary: 添加任务, 每次启动start_monitor 都会调用，且在init_task之前调用
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+
			
 
				+    def start_requests(self, task):
			
 
				+        """
			
 
				+        @summary:
			
 
				+        ---------
			
 
				+        @param task: 任务信息 list
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+
			
 
				+    def update_task_state(self, task_id, state=1, **kwargs):
			
 
				+        """
			
 
				+        @summary: 更新任务表中任务状态，做完每个任务时代码逻辑中要主动调用。可能会重写
			
 
				+        调用方法为 yield lambda : self.update_task_state(task_id, state)
			
 
				+        ---------
			
 
				+        @param task_id:
			
 
				+        @param state:
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+
			
 
				+        kwargs["id"] = task_id
			
 
				+        kwargs[self._task_state] = state
			
 
				+
			
 
				+        sql = tools.make_update_sql(
			
 
				+            self._task_table, kwargs, condition="id = {task_id}".format(task_id=task_id)
			
 
				+        )
			
 
				+
			
 
				+        if self._mysqldb.update(sql):
			
 
				+            log.debug("置任务%s状态成功" % task_id)
			
 
				+        else:
			
 
				+            log.error("置任务%s状态失败  sql=%s" % (task_id, sql))
			
 
				+
			
 
				+    def update_task_batch(self, task_id, state=1, **kwargs):
			
 
				+        """
			
 
				+        批量更新任务 多处调用，更新的字段必须一致
			
 
				+        注意：需要 写成 yield update_task_batch(...) 否则不会更新
			
 
				+        @param task_id:
			
 
				+        @param state:
			
 
				+        @param kwargs:
			
 
				+        @return:
			
 
				+        """
			
 
				+        kwargs["id"] = task_id
			
 
				+        kwargs[self._task_state] = state
			
 
				+
			
 
				+        update_item = UpdateItem(**kwargs)
			
 
				+        update_item.table_name = self._task_table
			
 
				+        update_item.name_underline = self._task_table + "_item"
			
 
				+
			
 
				+        return update_item
			
 
				+
			
 
				+    @property
			
 
				+    def batch_date(self):
			
 
				+        """
			
 
				+        @summary: 获取批次时间
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+
			
 
				+        batch_date = os.environ.get("batch_date")
			
 
				+        if not batch_date:
			
 
				+            sql = 'select date_format(batch_date, "{date_format}") from {batch_record_table} order by id desc limit 1'.format(
			
 
				+                date_format=self._date_format.replace(":%M", ":%i"),
			
 
				+                batch_record_table=self._batch_record_table,
			
 
				+            )
			
 
				+            batch_info = MysqlDB().find(sql)  # (('2018-08-19'),)
			
 
				+            if batch_info:
			
 
				+                os.environ["batch_date"] = batch_date = batch_info[0][0]
			
 
				+            else:
			
 
				+                log.error("需先运行 start_monitor_task()")
			
 
				+                os._exit(137)  # 使退出码为35072 方便爬虫管理器重启
			
 
				+
			
 
				+        return batch_date
			
--- a/FworkSpider/feapder/core/collector.py
+++ b/FworkSpider/feapder/core/collector.py
@@ -0,0 +1,176 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2016-12-23 11:24
			
 
				+---------
			
 
				+@summary: request 管理
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import collections
			
 
				+import threading
			
 
				+import time
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.network.request import Request
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				+class Collector(threading.Thread):
			
 
				+    def __init__(self, redis_key):
			
 
				+        """
			
 
				+        @summary:
			
 
				+        ---------
			
 
				+        @param redis_key:
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+
			
 
				+        super(Collector, self).__init__()
			
 
				+        self._db = RedisDB()
			
 
				+
			
 
				+        self._thread_stop = False
			
 
				+
			
 
				+        self._todo_requests = collections.deque()
			
 
				+
			
 
				+        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=redis_key)
			
 
				+        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(redis_key=redis_key)
			
 
				+
			
 
				+        self._spider_mark = tools.get_localhost_ip() + f"-{time.time()}"
			
 
				+
			
 
				+        self._interval = setting.COLLECTOR_SLEEP_TIME
			
 
				+        self._request_count = setting.COLLECTOR_TASK_COUNT
			
 
				+        self._is_collector_task = False
			
 
				+        self._first_get_task = True
			
 
				+
			
 
				+        self.__delete_dead_node()
			
 
				+
			
 
				+    def run(self):
			
 
				+        self._thread_stop = False
			
 
				+        while not self._thread_stop:
			
 
				+            try:
			
 
				+                self.__report_node_heartbeat()
			
 
				+                self.__input_data()
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+
			
 
				+            self._is_collector_task = False
			
 
				+
			
 
				+            time.sleep(self._interval)
			
 
				+
			
 
				+    def stop(self):
			
 
				+        self._thread_stop = True
			
 
				+        self._started.clear()
			
 
				+
			
 
				+    def __input_data(self):
			
 
				+        current_timestamp = tools.get_current_timestamp()
			
 
				+        if len(self._todo_requests) >= self._request_count:
			
 
				+            return
			
 
				+
			
 
				+        request_count = self._request_count  # 先赋值
			
 
				+        # 查询最近有心跳的节点数量
			
 
				+        spider_count = self._db.zget_count(
			
 
				+            self._tab_spider_status,
			
 
				+            priority_min=current_timestamp - (self._interval + 10),
			
 
				+            priority_max=current_timestamp,
			
 
				+        )
			
 
				+        # 根据等待节点数量，动态分配request
			
 
				+        if spider_count:
			
 
				+            # 任务数量
			
 
				+            task_count = self._db.zget_count(self._tab_requests)
			
 
				+            # 动态分配的数量 = 任务数量 / 休息的节点数量 + 1
			
 
				+            request_count = task_count // spider_count + 1
			
 
				+
			
 
				+        request_count = (
			
 
				+            request_count
			
 
				+            if request_count <= self._request_count
			
 
				+            else self._request_count
			
 
				+        )
			
 
				+
			
 
				+        if not request_count:
			
 
				+            return
			
 
				+
			
 
				+        # 当前无其他节点，并且是首次取任务，则重置丢失的任务
			
 
				+        if self._first_get_task and spider_count <= 1:
			
 
				+            datas = self._db.zrangebyscore_set_score(
			
 
				+                self._tab_requests,
			
 
				+                priority_min=current_timestamp,
			
 
				+                priority_max=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
			
 
				+                score=300,
			
 
				+                count=None,
			
 
				+            )
			
 
				+            self._first_get_task = False
			
 
				+            lose_count = len(datas)
			
 
				+            if lose_count:
			
 
				+                log.info("重置丢失任务完毕，共{}条".format(len(datas)))
			
 
				+
			
 
				+        # 取任务，只取当前时间搓以内的任务，同时将任务分数修改为 current_timestamp + setting.REQUEST_LOST_TIMEOUT
			
 
				+        requests_list = self._db.zrangebyscore_set_score(
			
 
				+            self._tab_requests,
			
 
				+            priority_min="-inf",
			
 
				+            priority_max=current_timestamp,
			
 
				+            score=current_timestamp + setting.REQUEST_LOST_TIMEOUT,
			
 
				+            count=request_count,
			
 
				+        )
			
 
				+
			
 
				+        if requests_list:
			
 
				+            self._is_collector_task = True
			
 
				+            # 存request
			
 
				+            self.__put_requests(requests_list)
			
 
				+
			
 
				+    def __report_node_heartbeat(self):
			
 
				+        """
			
 
				+        汇报节点心跳，以便任务平均分配
			
 
				+        """
			
 
				+        self._db.zadd(
			
 
				+            self._tab_spider_status, self._spider_mark, tools.get_current_timestamp()
			
 
				+        )
			
 
				+
			
 
				+    def __delete_dead_node(self):
			
 
				+        """
			
 
				+        删除没有心跳的节点信息
			
 
				+        """
			
 
				+        self._db.zremrangebyscore(
			
 
				+            self._tab_spider_status,
			
 
				+            "-inf",
			
 
				+            tools.get_current_timestamp() - (self._interval + 10),
			
 
				+        )
			
 
				+
			
 
				+    def __put_requests(self, requests_list):
			
 
				+        for request in requests_list:
			
 
				+            try:
			
 
				+                request_dict = {
			
 
				+                    "request_obj": Request.from_dict(eval(request)),
			
 
				+                    "request_redis": request,
			
 
				+                }
			
 
				+            except Exception as e:
			
 
				+                log.exception(
			
 
				+                    """
			
 
				+                error %s
			
 
				+                request %s
			
 
				+                """
			
 
				+                    % (e, request)
			
 
				+                )
			
 
				+
			
 
				+                request_dict = None
			
 
				+
			
 
				+            if request_dict:
			
 
				+                self._todo_requests.append(request_dict)
			
 
				+
			
 
				+    def get_requests(self, count):
			
 
				+        requests = []
			
 
				+        count = count if count <= len(self._todo_requests) else len(self._todo_requests)
			
 
				+        while count:
			
 
				+            requests.append(self._todo_requests.popleft())
			
 
				+            count -= 1
			
 
				+
			
 
				+        return requests
			
 
				+
			
 
				+    def get_requests_count(self):
			
 
				+        return len(self._todo_requests) or self._db.zget_count(self._tab_requests) or 0
			
 
				+
			
 
				+    def is_collector_task(self):
			
 
				+        return self._is_collector_task
			
--- a/FworkSpider/feapder/core/handle_failed_requests.py
+++ b/FworkSpider/feapder/core/handle_failed_requests.py
@@ -0,0 +1,56 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-08-13 11:43:01
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+import feapder.setting as setting
			
 
				+from feapder.buffer.request_buffer import RequestBuffer
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.network.request import Request
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				+class HandleFailedRequests(object):
			
 
				+    """docstring for HandleFailedRequests"""
			
 
				+
			
 
				+    def __init__(self, redis_key):
			
 
				+        super(HandleFailedRequests, self).__init__()
			
 
				+        self._redis_key = redis_key
			
 
				+
			
 
				+        self._redisdb = RedisDB()
			
 
				+        self._request_buffer = RequestBuffer(self._redis_key)
			
 
				+
			
 
				+        self._table_failed_request = setting.TAB_FAILED_REQUSETS.format(
			
 
				+            redis_key=redis_key
			
 
				+        )
			
 
				+
			
 
				+    def get_failed_requests(self, count=10000):
			
 
				+        failed_requests = self._redisdb.zget(self._table_failed_request, count=count)
			
 
				+        failed_requests = [eval(failed_request) for failed_request in failed_requests]
			
 
				+        return failed_requests
			
 
				+
			
 
				+    def reput_failed_requests_to_requests(self):
			
 
				+        log.debug("正在重置失败的requests...")
			
 
				+        total_count = 0
			
 
				+        while True:
			
 
				+            try:
			
 
				+                failed_requests = self.get_failed_requests()
			
 
				+                if not failed_requests:
			
 
				+                    break
			
 
				+
			
 
				+                for request in failed_requests:
			
 
				+                    request["retry_times"] = 0
			
 
				+                    request_obj = Request.from_dict(request)
			
 
				+                    self._request_buffer.put_request(request_obj)
			
 
				+
			
 
				+                    total_count += 1
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+
			
 
				+        self._request_buffer.flush()
			
 
				+
			
 
				+        log.debug("重置%s条失败requests为待抓取requests" % total_count)
			
--- a/FworkSpider/feapder/core/parser_control.py
+++ b/FworkSpider/feapder/core/parser_control.py
@@ -0,0 +1,724 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2017-01-03 16:06
			
 
				+---------
			
 
				+@summary: parser 控制类
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
import random
import threading
import time
from collections.abc import Iterable

import feapder.setting as setting
import feapder.utils.tools as tools
from feapder.buffer.item_buffer import ItemBuffer
from feapder.db.memory_db import MemoryDB
from feapder.network.item import Item
from feapder.network.request import Request
from feapder.utils import metrics
from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				+class PaserControl(threading.Thread):
			
 
				+    DOWNLOAD_EXCEPTION = "download_exception"
			
 
				+    DOWNLOAD_SUCCESS = "download_success"
			
 
				+    DOWNLOAD_TOTAL = "download_total"
			
 
				+    PAESERS_EXCEPTION = "parser_exception"
			
 
				+
			
 
				+    is_show_tip = False
			
 
				+
			
 
				+    # 实时统计已做任务数及失败任务数，若失败任务数/已做任务数>0.5 则报警
			
 
				+    _success_task_count = 0
			
 
				+    _failed_task_count = 0
			
 
				+
			
 
				+    def __init__(self, collector, redis_key, request_buffer, item_buffer):
			
 
				+        super(PaserControl, self).__init__()
			
 
				+        self._parsers = []
			
 
				+        self._collector = collector
			
 
				+        self._redis_key = redis_key
			
 
				+        self._request_buffer = request_buffer
			
 
				+        self._item_buffer = item_buffer
			
 
				+
			
 
				+        self._thread_stop = False
			
 
				+
			
 
				+        self._wait_task_time = 0
			
 
				+
			
 
				+    def run(self):
			
 
				+        self._thread_stop = False
			
 
				+        while not self._thread_stop:
			
 
				+            try:
			
 
				+                requests = self._collector.get_requests(setting.SPIDER_TASK_COUNT)
			
 
				+                if not requests:
			
 
				+                    if not self.is_show_tip:
			
 
				+                        log.debug("parser 等待任务...")
			
 
				+                        self.is_show_tip = True
			
 
				+
			
 
				+                    # log.debug('parser 等待任务{}...'.format(tools.format_seconds(self._wait_task_time)))
			
 
				+
			
 
				+                    time.sleep(1)
			
 
				+                    self._wait_task_time += 1
			
 
				+                    continue
			
 
				+
			
 
				+                self.is_show_tip = False
			
 
				+                self.deal_requests(requests)
			
 
				+
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+                time.sleep(3)
			
 
				+
			
 
				+    def is_not_task(self):
			
 
				+        return self.is_show_tip
			
 
				+
			
 
				+    @classmethod
			
 
				+    def get_task_status_count(cls):
			
 
				+        return cls._failed_task_count, cls._success_task_count
			
 
				+
			
 
				+    def deal_requests(self, requests):
			
 
				+        for request in requests:
			
 
				+
			
 
				+            response = None
			
 
				+            request_redis = request["request_redis"]
			
 
				+            request = request["request_obj"]
			
 
				+
			
 
				+            del_request_redis_after_item_to_db = False
			
 
				+            del_request_redis_after_request_to_db = False
			
 
				+
			
 
				+            for parser in self._parsers:
			
 
				+                if parser.name == request.parser_name:
			
 
				+                    used_download_midware_enable = False
			
 
				+                    try:
			
 
				+                        # 记录需下载的文档
			
 
				+                        self.record_download_status(
			
 
				+                            PaserControl.DOWNLOAD_TOTAL, parser.name
			
 
				+                        )
			
 
				+
			
 
				+                        # 解析request
			
 
				+                        if request.auto_request:
			
 
				+                            request_temp = None
			
 
				+                            response = None
			
 
				+
			
 
				+                            # 下载中间件
			
 
				+                            if request.download_midware:
			
 
				+                                if isinstance(request.download_midware, (list, tuple)):
			
 
				+                                    request_temp = request
			
 
				+                                    for download_midware in request.download_midware:
			
 
				+                                        download_midware = (
			
 
				+                                            download_midware
			
 
				+                                            if callable(download_midware)
			
 
				+                                            else tools.get_method(
			
 
				+                                                parser, download_midware
			
 
				+                                            )
			
 
				+                                        )
			
 
				+                                        request_temp = download_midware(request_temp)
			
 
				+                                else:
			
 
				+                                    download_midware = (
			
 
				+                                        request.download_midware
			
 
				+                                        if callable(request.download_midware)
			
 
				+                                        else tools.get_method(
			
 
				+                                            parser, request.download_midware
			
 
				+                                        )
			
 
				+                                    )
			
 
				+                                    request_temp = download_midware(request)
			
 
				+                            elif request.download_midware != False:
			
 
				+                                request_temp = parser.download_midware(request)
			
 
				+
			
 
				+                            # 请求
			
 
				+                            if request_temp:
			
 
				+                                if (
			
 
				+                                    isinstance(request_temp, (tuple, list))
			
 
				+                                    and len(request_temp) == 2
			
 
				+                                ):
			
 
				+                                    request_temp, response = request_temp
			
 
				+
			
 
				+                                if not isinstance(request_temp, Request):
			
 
				+                                    raise Exception(
			
 
				+                                        "download_midware need return a request, but received type: {}".format(
			
 
				+                                            type(request_temp)
			
 
				+                                        )
			
 
				+                                    )
			
 
				+                                used_download_midware_enable = True
			
 
				+                                if not response:
			
 
				+                                    response = (
			
 
				+                                        request_temp.get_response()
			
 
				+                                        if not setting.RESPONSE_CACHED_USED
			
 
				+                                        else request_temp.get_response_from_cached(
			
 
				+                                            save_cached=False
			
 
				+                                        )
			
 
				+                                    )
			
 
				+                            else:
			
 
				+                                response = (
			
 
				+                                    request.get_response()
			
 
				+                                    if not setting.RESPONSE_CACHED_USED
			
 
				+                                    else request.get_response_from_cached(
			
 
				+                                        save_cached=False
			
 
				+                                    )
			
 
				+                                )
			
 
				+
			
 
				+                            if response == None:
			
 
				+                                raise Exception(
			
 
				+                                    "连接超时 url: %s" % (request.url or request_temp.url)
			
 
				+                                )
			
 
				+
			
 
				+                        else:
			
 
				+                            response = None
			
 
				+
			
 
				+                        # 校验
			
 
				+                        if parser.validate(request, response) == False:
			
 
				+                            continue
			
 
				+
			
 
				+                        if request.callback:  # 如果有parser的回调函数，则用回调处理
			
 
				+                            callback_parser = (
			
 
				+                                request.callback
			
 
				+                                if callable(request.callback)
			
 
				+                                else tools.get_method(parser, request.callback)
			
 
				+                            )
			
 
				+                            results = callback_parser(request, response)
			
 
				+                        else:  # 否则默认用parser处理
			
 
				+                            results = parser.parse(request, response)
			
 
				+
			
 
				+                        if results and not isinstance(results, Iterable):
			
 
				+                            raise Exception(
			
 
				+                                "%s.%s返回值必须可迭代"
			
 
				+                                % (parser.name, request.callback or "parse")
			
 
				+                            )
			
 
				+
			
 
				+                        # 标识上一个result是什么
			
 
				+                        result_type = 0  # 0\1\2 (初始值\request\item)
			
 
				+                        # 此处判断是request 还是 item
			
 
				+                        for result in results or []:
			
 
				+                            if isinstance(result, Request):
			
 
				+                                result_type = 1
			
 
				+                                # 给request的 parser_name 赋值
			
 
				+                                result.parser_name = result.parser_name or parser.name
			
 
				+
			
 
				+                                # 判断是同步的callback还是异步的
			
 
				+                                if result.request_sync:  # 同步
			
 
				+                                    request_dict = {
			
 
				+                                        "request_obj": result,
			
 
				+                                        "request_redis": None,
			
 
				+                                    }
			
 
				+                                    requests.append(request_dict)
			
 
				+                                else:  # 异步
			
 
				+                                    # 将next_request 入库
			
 
				+                                    self._request_buffer.put_request(result)
			
 
				+                                    del_request_redis_after_request_to_db = True
			
 
				+
			
 
				+                            elif isinstance(result, Item):
			
 
				+                                result_type = 2
			
 
				+                                # 将item入库
			
 
				+                                self._item_buffer.put_item(result)
			
 
				+                                # 需删除正在做的request
			
 
				+                                del_request_redis_after_item_to_db = True
			
 
				+
			
 
				+                            elif callable(result):  # result为可执行的无参函数
			
 
				+                                if (
			
 
				+                                    result_type == 2
			
 
				+                                ):  # item 的 callback，buffer里的item均入库后再执行
			
 
				+                                    self._item_buffer.put_item(result)
			
 
				+                                    del_request_redis_after_item_to_db = True
			
 
				+
			
 
				+                                else:  # result_type == 1: # request 的 callback，buffer里的request均入库后再执行。可能有的parser直接返回callback
			
 
				+                                    self._request_buffer.put_request(result)
			
 
				+                                    del_request_redis_after_request_to_db = True
			
 
				+
			
 
				+                            # else:
			
 
				+                            #     raise TypeError('Expect Request、Item、callback func, bug get type: {}'.format(type(result)))
			
 
				+
			
 
				+                    except Exception as e:
			
 
				+                        exception_type = (
			
 
				+                            str(type(e)).replace("<class '", "").replace("'>", "")
			
 
				+                        )
			
 
				+                        if exception_type.startswith("requests"):
			
 
				+                            # 记录下载失败的文档
			
 
				+                            self.record_download_status(
			
 
				+                                PaserControl.DOWNLOAD_EXCEPTION, parser.name
			
 
				+                            )
			
 
				+
			
 
				+                        else:
			
 
				+                            # 记录解析程序异常
			
 
				+                            self.record_download_status(
			
 
				+                                PaserControl.PAESERS_EXCEPTION, parser.name
			
 
				+                            )
			
 
				+
			
 
				+                        if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印， 超时的异常篇幅太多
			
 
				+                            log.exception(e)
			
 
				+
			
 
				+                        log.error(
			
 
				+                            """
			
 
				+                            -------------- %s.%s error -------------
			
 
				+                            error          %s
			
 
				+                            response       %s
			
 
				+                            deal request   %s
			
 
				+                            """
			
 
				+                            % (
			
 
				+                                parser.name,
			
 
				+                                (
			
 
				+                                    request.callback
			
 
				+                                    and callable(request.callback)
			
 
				+                                    and getattr(request.callback, "__name__")
			
 
				+                                    or request.callback
			
 
				+                                )
			
 
				+                                or "parse",
			
 
				+                                str(e),
			
 
				+                                response,
			
 
				+                                tools.dumps_json(request.to_dict, indent=28)
			
 
				+                                if setting.LOG_LEVEL == "DEBUG"
			
 
				+                                else request,
			
 
				+                            )
			
 
				+                        )
			
 
				+
			
 
				+                        request.error_msg = "%s: %s" % (exception_type, e)
			
 
				+                        request.response = str(response)
			
 
				+
			
 
				+                        if "Invalid URL" in str(e):
			
 
				+                            request.is_abandoned = True
			
 
				+
			
 
				+                        requests = parser.exception_request(request, response) or [
			
 
				+                            request
			
 
				+                        ]
			
 
				+                        if not isinstance(requests, Iterable):
			
 
				+                            raise Exception(
			
 
				+                                "%s.%s返回值必须可迭代" % (parser.name, "exception_request")
			
 
				+                            )
			
 
				+                        for request in requests:
			
 
				+                            if callable(request):
			
 
				+                                self._request_buffer.put_request(request)
			
 
				+                                continue
			
 
				+
			
 
				+                            if not isinstance(request, Request):
			
 
				+                                raise Exception("exception_request 需 yield request")
			
 
				+
			
 
				+                            if (
			
 
				+                                request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
			
 
				+                                or request.is_abandoned
			
 
				+                            ):
			
 
				+                                self.__class__._failed_task_count += 1  # 记录失败任务数
			
 
				+
			
 
				+                                # 处理failed_request的返回值 request 或 func
			
 
				+                                results = parser.failed_request(request, response) or [
			
 
				+                                    request
			
 
				+                                ]
			
 
				+                                if not isinstance(results, Iterable):
			
 
				+                                    raise Exception(
			
 
				+                                        "%s.%s返回值必须可迭代"
			
 
				+                                        % (parser.name, "failed_request")
			
 
				+                                    )
			
 
				+
			
 
				+                                for result in results:
			
 
				+                                    if isinstance(result, Request):
			
 
				+                                        if setting.SAVE_FAILED_REQUEST:
			
 
				+                                            if used_download_midware_enable:
			
 
				+                                                # 去掉download_midware 添加的属性
			
 
				+                                                original_request = (
			
 
				+                                                    Request.from_dict(
			
 
				+                                                        eval(request_redis)
			
 
				+                                                    )
			
 
				+                                                    if request_redis
			
 
				+                                                    else result
			
 
				+                                                )
			
 
				+                                                original_request.error_msg = (
			
 
				+                                                    request.error_msg
			
 
				+                                                )
			
 
				+                                                original_request.response = (
			
 
				+                                                    request.response
			
 
				+                                                )
			
 
				+
			
 
				+                                                self._request_buffer.put_failed_request(
			
 
				+                                                    original_request
			
 
				+                                                )
			
 
				+                                            else:
			
 
				+                                                self._request_buffer.put_failed_request(
			
 
				+                                                    result
			
 
				+                                                )
			
 
				+
			
 
				+                                    elif callable(result):
			
 
				+                                        self._request_buffer.put_request(result)
			
 
				+
			
 
				+                                    elif isinstance(result, Item):
			
 
				+                                        self._item_buffer.put_item(result)
			
 
				+
			
 
				+                                del_request_redis_after_request_to_db = True
			
 
				+
			
 
				+                            else:
			
 
				+                                # 将 requests 重新入库 爬取
			
 
				+                                request.retry_times += 1
			
 
				+                                request.filter_repeat = False
			
 
				+                                log.info(
			
 
				+                                    """
			
 
				+                                    入库 等待重试
			
 
				+                                    url     %s
			
 
				+                                    重试次数 %s
			
 
				+                                    最大允许重试次数 %s"""
			
 
				+                                    % (
			
 
				+                                        request.url,
			
 
				+                                        request.retry_times,
			
 
				+                                        setting.SPIDER_MAX_RETRY_TIMES,
			
 
				+                                    )
			
 
				+                                )
			
 
				+                                if used_download_midware_enable:
			
 
				+                                    # 去掉download_midware 添加的属性 使用原来的requests
			
 
				+                                    original_request = (
			
 
				+                                        Request.from_dict(eval(request_redis))
			
 
				+                                        if request_redis
			
 
				+                                        else request
			
 
				+                                    )
			
 
				+                                    if hasattr(request, "error_msg"):
			
 
				+                                        original_request.error_msg = request.error_msg
			
 
				+                                    if hasattr(request, "response"):
			
 
				+                                        original_request.response = request.response
			
 
				+                                    original_request.retry_times = request.retry_times
			
 
				+                                    original_request.filter_repeat = (
			
 
				+                                        request.filter_repeat
			
 
				+                                    )
			
 
				+
			
 
				+                                    self._request_buffer.put_request(original_request)
			
 
				+                                else:
			
 
				+                                    self._request_buffer.put_request(request)
			
 
				+                                del_request_redis_after_request_to_db = True
			
 
				+
			
 
				+                    else:
			
 
				+                        # 记录下载成功的文档
			
 
				+                        self.record_download_status(
			
 
				+                            PaserControl.DOWNLOAD_SUCCESS, parser.name
			
 
				+                        )
			
 
				+                        # 记录成功任务数
			
 
				+                        self.__class__._success_task_count += 1
			
 
				+
			
 
				+                        # 缓存下载成功的文档
			
 
				+                        if setting.RESPONSE_CACHED_ENABLE:
			
 
				+                            request.save_cached(
			
 
				+                                response=response,
			
 
				+                                expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
			
 
				+                            )
			
 
				+
			
 
				+                    finally:
			
 
				+                        # 释放浏览器
			
 
				+                        if response and hasattr(response, "browser"):
			
 
				+                            request._webdriver_pool.put(response.browser)
			
 
				+
			
 
				+                    break
			
 
				+
			
 
				+            # 删除正在做的request 跟随item优先
			
 
				+            if request_redis:
			
 
				+                if del_request_redis_after_item_to_db:
			
 
				+                    self._item_buffer.put_item(request_redis)
			
 
				+
			
 
				+                elif del_request_redis_after_request_to_db:
			
 
				+                    self._request_buffer.put_del_request(request_redis)
			
 
				+
			
 
				+                else:
			
 
				+                    self._request_buffer.put_del_request(request_redis)
			
 
				+
			
 
				+        if setting.SPIDER_SLEEP_TIME:
			
 
				+            if (
			
 
				+                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
			
 
				+                and len(setting.SPIDER_SLEEP_TIME) == 2
			
 
				+            ):
			
 
				+                sleep_time = random.randint(
			
 
				+                    int(setting.SPIDER_SLEEP_TIME[0]), int(setting.SPIDER_SLEEP_TIME[1])
			
 
				+                )
			
 
				+                time.sleep(sleep_time)
			
 
				+            else:
			
 
				+                time.sleep(setting.SPIDER_SLEEP_TIME)
			
 
				+
			
 
				+    def record_download_status(self, status, spider):
			
 
				+        """
			
 
				+        记录html等文档下载状态
			
 
				+        @return:
			
 
				+        """
			
 
				+
			
 
				+        metrics.emit_counter(f"{spider}:{status}", 1, classify="document")
			
 
				+
			
 
				+    def stop(self):
			
 
				+        self._thread_stop = True
			
 
				+        self._started.clear()
			
 
				+
			
 
				+    def add_parser(self, parser):
			
 
				+        self._parsers.append(parser)
			
 
				+
			
 
				+
			
 
				+class AirSpiderParserControl(PaserControl):
			
 
				+    is_show_tip = False
			
 
				+
			
 
				+    # 实时统计已做任务数及失败任务数，若失败任务数/已做任务数>0.5 则报警
			
 
				+    _success_task_count = 0
			
 
				+    _failed_task_count = 0
			
 
				+
			
 
    def __init__(self, memory_db: MemoryDB, item_buffer: ItemBuffer):
        """
        @param memory_db: in-memory queue the thread reads requests from
        @param item_buffer: buffer receiving parsed items
        """
        # Start the lookup after PaserControl on purpose: PaserControl.__init__
        # expects collector / redis arguments that this variant does not have,
        # so only the next initializer in the MRO is run.
        super(PaserControl, self).__init__()

        self._memory_db = memory_db
        self._item_buffer = item_buffer

        self._parsers = []
        self._wait_task_time = 0
        self._thread_stop = False
			
 
				+
			
 
    def run(self):
        """Thread main loop: take one request at a time from the memory queue.

        When the queue yields nothing, the waiting tip is logged once, the
        thread sleeps for a second and the idle counter is incremented. Any
        exception is logged and followed by a three second pause.
        """
        while not self._thread_stop:
            try:
                request = self._memory_db.get()

                if request:
                    self.is_show_tip = False
                    # deal_requests expects a list, so wrap the single request.
                    self.deal_requests([request])
                    continue

                # Nothing to do: announce the idle state only once.
                if not self.is_show_tip:
                    log.debug("parser 等待任务...")
                    self.is_show_tip = True

                time.sleep(1)
                self._wait_task_time += 1

            except Exception as exc:
                log.exception(exc)
                time.sleep(3)
			
 
				+
			
 
				+    def deal_requests(self, requests):
			
 
				+        for request in requests:
			
 
				+
			
 
				+            response = None
			
 
				+
			
 
				+            for parser in self._parsers:
			
 
				+                if parser.name == request.parser_name:
			
 
				+                    try:
			
 
				+                        # 记录需下载的文档
			
 
				+                        self.record_download_status(
			
 
				+                            PaserControl.DOWNLOAD_TOTAL, parser.name
			
 
				+                        )
			
 
				+
			
 
				+                        # 解析request
			
 
				+                        if request.auto_request:
			
 
				+                            request_temp = None
			
 
				+                            response = None
			
 
				+
			
 
				+                            # 下载中间件
			
 
				+                            if request.download_midware:
			
 
				+                                if isinstance(request.download_midware, (list, tuple)):
			
 
				+                                    request_temp = request
			
 
				+                                    for download_midware in request.download_midware:
			
 
				+                                        download_midware = (
			
 
				+                                            download_midware
			
 
				+                                            if callable(download_midware)
			
 
				+                                            else tools.get_method(
			
 
				+                                                parser, download_midware
			
 
				+                                            )
			
 
				+                                        )
			
 
				+                                        request_temp = download_midware(request_temp)
			
 
				+                                else:
			
 
				+                                    download_midware = (
			
 
				+                                        request.download_midware
			
 
				+                                        if callable(request.download_midware)
			
 
				+                                        else tools.get_method(
			
 
				+                                            parser, request.download_midware
			
 
				+                                        )
			
 
				+                                    )
			
 
				+                                    request_temp = download_midware(request)
			
 
				+                            elif request.download_midware != False:
			
 
				+                                request_temp = parser.download_midware(request)
			
 
				+
			
 
				+                            # 请求
			
 
				+                            if request_temp:
			
 
				+                                if (
			
 
				+                                    isinstance(request_temp, (tuple, list))
			
 
				+                                    and len(request_temp) == 2
			
 
				+                                ):
			
 
				+                                    request_temp, response = request_temp
			
 
				+
			
 
				+                                if not isinstance(request_temp, Request):
			
 
				+                                    raise Exception(
			
 
				+                                        "download_midware need return a request, but received type: {}".format(
			
 
				+                                            type(request_temp)
			
 
				+                                        )
			
 
				+                                    )
			
 
				+                                request = request_temp
			
 
				+
			
 
				+                            if not response:
			
 
				+                                response = (
			
 
				+                                    request.get_response()
			
 
				+                                    if not setting.RESPONSE_CACHED_USED
			
 
				+                                    else request.get_response_from_cached(
			
 
				+                                        save_cached=False
			
 
				+                                    )
			
 
				+                                )
			
 
				+
			
 
				+                        else:
			
 
				+                            response = None
			
 
				+
			
 
				+                        # 校验
			
 
				+                        if parser.validate(request, response) == False:
			
 
				+                            continue
			
 
				+
			
 
				+                        if request.callback:  # 如果有parser的回调函数，则用回调处理
			
 
				+                            callback_parser = (
			
 
				+                                request.callback
			
 
				+                                if callable(request.callback)
			
 
				+                                else tools.get_method(parser, request.callback)
			
 
				+                            )
			
 
				+                            results = callback_parser(request, response)
			
 
				+                        else:  # 否则默认用parser处理
			
 
				+                            results = parser.parse(request, response)
			
 
				+
			
 
				+                        if results and not isinstance(results, Iterable):
			
 
				+                            raise Exception(
			
 
				+                                "%s.%s返回值必须可迭代"
			
 
				+                                % (parser.name, request.callback or "parse")
			
 
				+                            )
			
 
				+
			
 
				+                        # 此处判断是request 还是 item
			
 
				+                        for result in results or []:
			
 
				+                            if isinstance(result, Request):
			
 
				+                                # 给request的 parser_name 赋值
			
 
				+                                result.parser_name = result.parser_name or parser.name
			
 
				+
			
 
				+                                # 判断是同步的callback还是异步的
			
 
				+                                if result.request_sync:  # 同步
			
 
				+                                    requests.append(result)
			
 
				+                                else:  # 异步
			
 
				+                                    # 将next_request 入库
			
 
				+                                    self._memory_db.add(result)
			
 
				+
			
 
				+                            elif isinstance(result, Item):
			
 
				+                                self._item_buffer.put_item(result)
			
 
				+
			
 
				+                    except Exception as e:
			
 
				+                        exception_type = (
			
 
				+                            str(type(e)).replace("<class '", "").replace("'>", "")
			
 
				+                        )
			
 
				+                        if exception_type.startswith("requests"):
			
 
				+                            # 记录下载失败的文档
			
 
				+                            self.record_download_status(
			
 
				+                                PaserControl.DOWNLOAD_EXCEPTION, parser.name
			
 
				+                            )
			
 
				+
			
 
				+                        else:
			
 
				+                            # 记录解析程序异常
			
 
				+                            self.record_download_status(
			
 
				+                                PaserControl.PAESERS_EXCEPTION, parser.name
			
 
				+                            )
			
 
				+
			
 
				+                        if setting.LOG_LEVEL == "DEBUG":  # 只有debug模式下打印， 超时的异常篇幅太多
			
 
				+                            log.exception(e)
			
 
				+
			
 
				+                        log.error(
			
 
				+                            """
			
 
				+                                -------------- %s.%s error -------------
			
 
				+                                error          %s
			
 
				+                                response       %s
			
 
				+                                deal request   %s
			
 
				+                                """
			
 
				+                            % (
			
 
				+                                parser.name,
			
 
				+                                (
			
 
				+                                    request.callback
			
 
				+                                    and callable(request.callback)
			
 
				+                                    and getattr(request.callback, "__name__")
			
 
				+                                    or request.callback
			
 
				+                                )
			
 
				+                                or "parse",
			
 
				+                                str(e),
			
 
				+                                response,
			
 
				+                                tools.dumps_json(request.to_dict, indent=28)
			
 
				+                                if setting.LOG_LEVEL == "DEBUG"
			
 
				+                                else request,
			
 
				+                            )
			
 
				+                        )
			
 
				+
			
 
				+                        request.error_msg = "%s: %s" % (exception_type, e)
			
 
				+                        request.response = str(response)
			
 
				+
			
 
				+                        if "Invalid URL" in str(e):
			
 
				+                            request.is_abandoned = True
			
 
				+
			
 
				+                        requests = parser.exception_request(request, response) or [
			
 
				+                            request
			
 
				+                        ]
			
 
				+                        if not isinstance(requests, Iterable):
			
 
				+                            raise Exception(
			
 
				+                                "%s.%s返回值必须可迭代" % (parser.name, "exception_request")
			
 
				+                            )
			
 
				+                        for request in requests:
			
 
				+                            if not isinstance(request, Request):
			
 
				+                                raise Exception("exception_request 需 yield request")
			
 
				+
			
 
				+                            if (
			
 
				+                                request.retry_times + 1 > setting.SPIDER_MAX_RETRY_TIMES
			
 
				+                                or request.is_abandoned
			
 
				+                            ):
			
 
				+                                self.__class__._failed_task_count += 1  # 记录失败任务数
			
 
				+
			
 
				+                                # 处理failed_request的返回值 request 或 func
			
 
				+                                results = parser.failed_request(request, response) or [
			
 
				+                                    request
			
 
				+                                ]
			
 
				+                                if not isinstance(results, Iterable):
			
 
				+                                    raise Exception(
			
 
				+                                        "%s.%s返回值必须可迭代"
			
 
				+                                        % (parser.name, "failed_request")
			
 
				+                                    )
			
 
				+
			
 
				+                                log.info(
			
 
				+                                    """
			
 
				+                                    任务超过最大重试次数，丢弃
			
 
				+                                    url     %s
			
 
				+                                    重试次数 %s
			
 
				+                                    最大允许重试次数 %s"""
			
 
				+                                    % (
			
 
				+                                        request.url,
			
 
				+                                        request.retry_times,
			
 
				+                                        setting.SPIDER_MAX_RETRY_TIMES,
			
 
				+                                    )
			
 
				+                                )
			
 
				+
			
 
				+                            else:
			
 
				+                                # 将 requests 重新入库 爬取
			
 
				+                                request.retry_times += 1
			
 
				+                                request.filter_repeat = False
			
 
				+                                log.info(
			
 
				+                                    """
			
 
				+                                        入库 等待重试
			
 
				+                                        url     %s
			
 
				+                                        重试次数 %s
			
 
				+                                        最大允许重试次数 %s"""
			
 
				+                                    % (
			
 
				+                                        request.url,
			
 
				+                                        request.retry_times,
			
 
				+                                        setting.SPIDER_MAX_RETRY_TIMES,
			
 
				+                                    )
			
 
				+                                )
			
 
				+                                self._memory_db.add(request)
			
 
				+
			
 
				+                    else:
			
 
				+                        # 记录下载成功的文档
			
 
				+                        self.record_download_status(
			
 
				+                            PaserControl.DOWNLOAD_SUCCESS, parser.name
			
 
				+                        )
			
 
				+                        # 记录成功任务数
			
 
				+                        self.__class__._success_task_count += 1
			
 
				+
			
 
				+                        # 缓存下载成功的文档
			
 
				+                        if setting.RESPONSE_CACHED_ENABLE:
			
 
				+                            request.save_cached(
			
 
				+                                response=response,
			
 
				+                                expire_time=setting.RESPONSE_CACHED_EXPIRE_TIME,
			
 
				+                            )
			
 
				+
			
 
				+                    finally:
			
 
				+                        # 释放浏览器
			
 
				+                        if response and hasattr(response, "browser"):
			
 
				+                            request._webdriver_pool.put(response.browser)
			
 
				+
			
 
				+                    break
			
 
				+
			
 
				+        if setting.SPIDER_SLEEP_TIME:
			
 
				+            if (
			
 
				+                isinstance(setting.SPIDER_SLEEP_TIME, (tuple, list))
			
 
				+                and len(setting.SPIDER_SLEEP_TIME) == 2
			
 
				+            ):
			
 
				+                sleep_time = random.randint(
			
 
				+                    int(setting.SPIDER_SLEEP_TIME[0]), int(setting.SPIDER_SLEEP_TIME[1])
			
 
				+                )
			
 
				+                time.sleep(sleep_time)
			
 
				+            else:
			
 
				+                time.sleep(setting.SPIDER_SLEEP_TIME)
			
--- a/FworkSpider/feapder/core/scheduler.py
+++ b/FworkSpider/feapder/core/scheduler.py
@@ -0,0 +1,579 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2017-01-09 10:38
			
 
				+---------
			
 
				+@summary: 组装parser、 parser_control 和 collector
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+import threading
			
 
				+import time
			
 
				+from collections import Iterable
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.buffer.item_buffer import ItemBuffer
			
 
				+from feapder.buffer.request_buffer import RequestBuffer
			
 
				+from feapder.core.base_parser import BaseParser
			
 
				+from feapder.core.collector import Collector
			
 
				+from feapder.core.handle_failed_requests import HandleFailedRequests
			
 
				+from feapder.core.parser_control import PaserControl
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.network.item import Item
			
 
				+from feapder.network.request import Request
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.redis_lock import RedisLock
			
 
				+from feapder.utils import metrics
			
 
				+
			
 
				+SPIDER_START_TIME_KEY = "spider_start_time"
			
 
				+SPIDER_END_TIME_KEY = "spider_end_time"
			
 
				+SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY = "last_task_count_record_time"
			
 
				+
			
 
				+
			
 
				+class Scheduler(threading.Thread):
			
 
				+    __custom_setting__ = {}
			
 
				+
			
 
    def __init__(
        self,
        redis_key=None,
        thread_count=None,
        begin_callback=None,
        end_callback=None,
        delete_keys=(),
        keep_alive=None,
        auto_start_requests=None,
        batch_interval=0,
        wait_lock=True,
        task_table=None,
        **kwargs
    ):
        """
        @summary: Scheduler that wires together the parsers, parser controls and collector
        ---------
        @param redis_key: redis "folder" (key prefix) holding the spider's requests and items;
            falls back to setting.REDIS_KEY when empty
        @param thread_count: number of parser-control threads; falls back to setting.SPIDER_THREAD_COUNT
        @param begin_callback: callable invoked when the spider begins
        @param end_callback: callable invoked when the spider ends
        @param delete_keys: keys deleted at start-up; tuple / bool / string, patterns supported;
            falls back to setting.DELETE_KEYS when empty
        @param keep_alive: whether the spider stays resident; falls back to setting.KEEP_ALIVE when None
        @param auto_start_requests: whether start tasks are dispatched automatically;
            falls back to setting.SPIDER_AUTO_START_REQUESTS when None
        @param batch_interval: crawl interval in days (default 0). On repeated launches the spider
            only starts when the time since the previous run ended exceeds this interval
        @param wait_lock: whether to wait for the lock when dispatching tasks. Without the lock
            several processes may dispatch the same tasks, so keep it True in distributed setups
        @param task_table: task table, passed by batch spiders
        @param kwargs: legacy arguments; only ``auto_stop_when_spider_done`` is read
        ---------
        @result:
        @raise Exception: when neither redis_key nor setting.REDIS_KEY provides a key
        """

        super(Scheduler, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            if key == "AUTO_STOP_WHEN_SPIDER_DONE":  # compatibility with the legacy setting name
                setattr(setting, "KEEP_ALIVE", not value)
            else:
                setattr(setting, key, value)

        self._redis_key = redis_key or setting.REDIS_KEY
        if not self._redis_key:
            raise Exception(
                """
                redis_key 为redis中存放request与item的目录。不能为空，
                可在setting中配置，如 REDIS_KEY = 'test'
                或spider初始化时传参, 如 TestSpider(redis_key='test')
                """
            )

        # From here on always use the resolved key. The raw ``redis_key`` argument
        # may be None when the key comes from setting.REDIS_KEY, which previously
        # crashed on ``redis_key.split`` and handed None to the buffers.
        resolved_key = self._redis_key

        self._request_buffer = RequestBuffer(resolved_key)
        self._item_buffer = ItemBuffer(resolved_key, task_table)

        self._collector = Collector(resolved_key)
        self._parsers = []
        self._parser_controls = []
        self._parser_control_obj = PaserControl

        # compatibility with the legacy keyword argument
        if "auto_stop_when_spider_done" in kwargs:
            self._keep_alive = not kwargs.get("auto_stop_when_spider_done")
        else:
            self._keep_alive = (
                keep_alive if keep_alive is not None else setting.KEEP_ALIVE
            )
        self._auto_start_requests = (
            auto_start_requests
            if auto_start_requests is not None
            else setting.SPIDER_AUTO_START_REQUESTS
        )
        self._batch_interval = batch_interval

        self._begin_callback = (
            begin_callback
            if begin_callback
            else lambda: log.info("\n********** feapder begin **********")
        )
        self._end_callback = (
            end_callback
            if end_callback
            else lambda: log.info("\n********** feapder end **********")
        )

        self._thread_count = (
            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
        )

        self._spider_name = resolved_key
        # the part before the first ":" is treated as the project name
        self._project_name = resolved_key.split(":")[0]

        self._tab_spider_time = setting.TAB_SPIDER_TIME.format(redis_key=resolved_key)
        self._tab_spider_status = setting.TAB_SPIDER_STATUS.format(
            redis_key=resolved_key
        )
        self._tab_requests = setting.TAB_REQUSETS.format(redis_key=resolved_key)
        self._tab_failed_requests = setting.TAB_FAILED_REQUSETS.format(
            redis_key=resolved_key
        )

        self._is_notify_end = False  # whether the end of the round was already announced
        self._last_task_count = 0  # task count seen at the last status check
        self._redisdb = RedisDB()

        self._project_total_state_table = "{}_total_state".format(self._project_name)
        self._is_exist_project_total_state_table = False

        # Request cache settings (class-level, shared by all Request objects)
        Request.cached_redis_key = resolved_key
        Request.cached_expire_time = setting.RESPONSE_CACHED_EXPIRE_TIME

        delete_keys = delete_keys or setting.DELETE_KEYS
        if delete_keys:
            self.delete_tables(delete_keys)

        self._last_check_task_status_time = 0
        self.wait_lock = wait_lock

        self.init_metrics()
			
 
				+
			
 
    def init_metrics(self):
        """
        Initialise the metrics ("打点") system with the extra arguments
        configured in setting.METRICS_OTHER_ARGS.
        """
        extra_args = setting.METRICS_OTHER_ARGS
        metrics.init(**extra_args)
			
 
				+
			
 
    def add_parser(self, parser):
        """
        Instantiate ``parser`` (called with no arguments) and register the instance.

        :param parser: callable returning a parser object, typically a parser class
        :raises ValueError: when the created object is not a BaseParser
        """
        instance = parser()
        if not isinstance(instance, BaseParser):
            raise ValueError("类型错误，爬虫需继承feapder.BaseParser或feapder.BatchParser")
        self._parsers.append(instance)
			
 
				+
			
 
    def run(self):
        """
        Thread entry point: start all workers, then poll once per second until
        the round is finished (and, unless keep-alive is set, stop everything).
        """
        if not self.is_reach_next_spider_time():
            return

        self._start()

        while True:
            try:
                round_finished = self.all_thread_is_done()
                if not round_finished:
                    # work is still pending, so a later finish must be announced again
                    self._is_notify_end = False
                else:
                    if not self._is_notify_end:
                        # one full round completed
                        self.spider_end()
                        finished_at = tools.get_current_date()
                        self.record_spider_state(
                            spider_type=1,
                            state=1,
                            spider_end_time=finished_at,
                            batch_interval=self._batch_interval,
                        )
                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break

                self.check_task_status()

            except Exception as err:
                log.exception(err)

            # check the spider state once per second
            tools.delay_time(1)
			
 
				+
			
 
    def __add_task(self):
        """
        Announce the spider start and, when the task pool is empty, push every
        parser's start_requests output into the request / item buffers.
        """
        # for spiders that do not stop automatically this runs only once
        self.spider_begin()
        self.record_spider_state(
            spider_type=1,
            state=0,
            batch_date=tools.get_current_date(),
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        # if tasks are still waiting in the pool, resume them instead of dispatching new ones
        pending_count = self._collector.get_requests_count()
        if pending_count:
            log.info("检查到有待做任务 %s 条，不重下发新任务，将接着上回异常终止处继续抓取" % pending_count)
            return

        for parser in self._parsers:
            produced = parser.start_requests()
            if produced and not isinstance(produced, Iterable):
                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))

            # 1 -> last yielded object was a Request, 2 -> it was an Item;
            # a yielded callable is queued in the same buffer as its predecessor
            last_kind = 1
            for entry in produced or []:
                if isinstance(entry, Request):
                    entry.parser_name = entry.parser_name or parser.name
                    self._request_buffer.put_request(entry)
                    last_kind = 1

                elif isinstance(entry, Item):
                    self._item_buffer.put_item(entry)
                    last_kind = 2

                elif callable(entry):
                    # a callable may be e.g. a function that updates the database
                    if last_kind == 1:
                        self._request_buffer.put_request(entry)
                    else:
                        self._item_buffer.put_item(entry)

                else:
                    raise TypeError(
                        "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
                            type(entry)
                        )
                    )

            # flush after each parser so its output is stored before the next one runs
            self._request_buffer.flush()
            self._item_buffer.flush()
			
 
				+
			
 
    def _start(self):
        """
        Start the buffers, the collector and the parser-control threads, then
        optionally re-queue failed requests and dispatch the start tasks.
        """
        # request buffer, item buffer and collector, in that order
        for worker in (self._request_buffer, self._item_buffer, self._collector):
            worker.start()

        # one parser control per configured thread, each knowing every parser
        for _ in range(self._thread_count):
            control = self._parser_control_obj(
                self._collector,
                self._redis_key,
                self._request_buffer,
                self._item_buffer,
            )
            for parser in self._parsers:
                control.add_parser(parser)

            control.start()
            self._parser_controls.append(control)

        # dispatching can take a while, so it is done last
        if setting.RETRY_FAILED_REQUESTS:
            # re-queue failed requests (the original author notes no lock is needed here)
            HandleFailedRequests(self._redis_key).reput_failed_requests_to_requests()

        if not self._auto_start_requests:
            return

        if not self.wait_lock:
            self.__add_task()
            return

        # lock the dispatch so several processes do not add the same tasks
        with RedisLock(key=self._spider_name) as lock:
            if lock.locked:
                self.__add_task()
			
 
				+
			
 
    def all_thread_is_done(self):
        """
        Return True only when collector, parser controls, item buffer and request
        buffer all look idle on three consecutive checks taken one second apart.

        The stages do not run in lock-step, so a single idle snapshot could be a
        coincidence; repeating the check reduces that chance.
        """
        for _ in range(3):
            collector_busy = (
                self._collector.is_collector_task()
                or self._collector.get_requests_count() > 0
            )
            if collector_busy:
                return False

            if any(not control.is_not_task() for control in self._parser_controls):
                return False

            items_busy = (
                self._item_buffer.get_items_count() > 0
                or self._item_buffer.is_adding_to_db()
            )
            if items_busy:
                return False

            requests_busy = (
                self._request_buffer.get_requests_count() > 0
                or self._request_buffer.is_adding_to_db()
            )
            if requests_busy:
                return False

            tools.delay_time(1)

        return True
			
 
				+
			
 
    @tools.run_safe_model("check_task_status")
    def check_task_status(self):
        """
        Check the task status and raise alerts.

        Runs at most once every 60 seconds and reports, via log.error and
        send_msg, when:
          * the pending-task count in redis has not changed for 20 minutes,
          * the failed-request count exceeds setting.WARNING_FAILED_COUNT,
          * the success rate counted by PaserControl drops below 0.5,
          * item export failed more than setting.EXPORT_DATA_MAX_FAILED_TIMES times.
        """
        # throttle: check once per minute
        now_time = time.time()
        if now_time - self._last_check_task_status_time <= 60:
            return
        self._last_check_task_status_time = now_time

        # If the task count in redis stays unchanged for 20 minutes the parsers
        # may be stuck, so send an alert.
        task_count = self._redisdb.zget_count(self._tab_requests)

        if task_count:
            if task_count != self._last_task_count:
                self._last_task_count = task_count
                # several processes would otherwise alert repeatedly, so the time
                # of the last change is recorded in redis rather than locally
                self._redisdb.hset(
                    self._tab_spider_time,
                    SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                    tools.get_current_timestamp(),
                )
            else:
                # has the count been unchanged for more than 20 minutes (1200 s)?
                lua = """
                    -- local key = KEYS[1]
                    local field = ARGV[1]
                    local current_timestamp = ARGV[2]

                    -- 取值
                    local last_timestamp = redis.call('hget', KEYS[1], field)
                    if last_timestamp and current_timestamp - last_timestamp >= 1200 then
                        return current_timestamp - last_timestamp -- 返回任务停滞时间 秒
                    end

                    if not last_timestamp then
                        redis.call('hset', KEYS[1], field, current_timestamp)
                    end

                    return 0

                """
                redis_obj = self._redisdb.get_redis_obj()
                cmd = redis_obj.register_script(lua)
                overtime = cmd(
                    keys=[self._tab_spider_time],
                    args=[
                        SPIDER_LAST_TASK_COUNT_RECORD_TIME_KEY,
                        tools.get_current_timestamp(),
                    ],
                )

                if overtime:
                    # send the stall alert
                    msg = "{}  爬虫任务停滞 {}，请检查爬虫是否正常".format(
                        self._spider_name, tools.format_seconds(overtime)
                    )
                    log.error(msg)
                    self.send_msg(
                        msg,
                        level="error",
                        message_prefix="《{}》爬虫任务停滞".format(self._spider_name),
                    )

        else:
            self._last_task_count = 0

        # alert when the number of failed requests exceeds the configured threshold
        failed_count = self._redisdb.zget_count(self._tab_failed_requests)
        # was a bare print() left over from debugging; use the logger so the
        # output honours the configured log level and handlers
        log.debug("失败次数：%s" % failed_count)
        if failed_count > setting.WARNING_FAILED_COUNT:
            msg = "《%s》爬虫当前失败任务 %s, 请检查爬虫是否正常" % (self._spider_name, failed_count)
            log.error(msg)
            self.send_msg(
                msg,
                level="error",
                message_prefix="《%s》爬虫当前失败任务数报警" % (self._spider_name),
            )

        # PaserControl counts finished and failed tasks; alert when success rate < 0.5
        failed_task_count, success_task_count = PaserControl.get_task_status_count()
        total_count = success_task_count + failed_task_count
        if total_count > 0:
            task_success_rate = success_task_count / total_count
            if task_success_rate < 0.5:
                msg = "《%s》爬虫当前任务成功数%s, 失败数%s, 成功率 %.2f, 请检查爬虫是否正常" % (
                    self._spider_name,
                    success_task_count,
                    failed_task_count,
                    task_success_rate,
                )
                log.error(msg)
                self.send_msg(
                    msg,
                    level="error",
                    message_prefix="《%s》爬虫当前任务成功率报警" % (self._spider_name),
                )

        # alert when exporting items failed too many times
        if self._item_buffer.export_falied_times > setting.EXPORT_DATA_MAX_FAILED_TIMES:
            msg = "《{}》爬虫导出数据失败，失败次数：{}， 请检查爬虫是否正常".format(
                self._spider_name, self._item_buffer.export_falied_times
            )
            log.error(msg)
            self.send_msg(
                msg, level="error", message_prefix="《%s》爬虫导出数据失败" % (self._spider_name)
            )
			
 
				+
			
 
    def delete_tables(self, delete_tables_list):
        """
        Clear the redis keys matching the given patterns, never touching the
        spider-time table.

        :param delete_tables_list: a bool (meaning "everything under the redis
            key"), a single pattern, or a list / tuple of patterns. Patterns that
            do not start with the redis key get it prepended.
        """
        if isinstance(delete_tables_list, bool):
            patterns = [self._redis_key + "*"]
        elif isinstance(delete_tables_list, (list, tuple)):
            patterns = delete_tables_list
        else:
            patterns = [delete_tables_list]

        redis_client = RedisDB()
        for pattern in patterns:
            if not pattern.startswith(self._redis_key):
                pattern = self._redis_key + pattern

            for table in redis_client.getkeys(pattern):
                if table == self._tab_spider_time:
                    continue
                log.info("正在删除key %s" % table)
                redis_client.clear(table)
			
 
				+
			
 
				+    def _stop_all_thread(self):
			
 
				+        self._request_buffer.stop()
			
 
				+        self._item_buffer.stop()
			
 
				+        # 停止 collector
			
 
				+        self._collector.stop()
			
 
				+        # 停止 parser_controls
			
 
				+        for parser_control in self._parser_controls:
			
 
				+            parser_control.stop()
			
 
				+
			
 
				+        self._started.clear()
			
 
				+
			
 
    def send_msg(self, msg, level="debug", message_prefix=""):
        """
        Forward an alert to tools.send_msg.

        :param msg: message text
        :param level: level passed through to tools.send_msg
        :param message_prefix: prefix passed through to tools.send_msg
        """
        # log.debug("发送报警 level:{} msg{}".format(level, msg))
        alert = dict(msg=msg, level=level, message_prefix=message_prefix)
        tools.send_msg(**alert)
			
 
				+
			
 
    def spider_begin(self):
        """
        @summary: Run the begin callbacks and record the start time once.
            When launched through start_monitor_task this function and spider_end
            do not run in the same process, so variables cannot be shared.
        ---------
        ---------
        @result:
        """

        if self._begin_callback:
            self._begin_callback()

        for parser in self._parsers:
            parser.start_callback()

        # record the start time only if none is stored yet
        already_recorded = self._redisdb.hexists(
            self._tab_spider_time, SPIDER_START_TIME_KEY
        )
        if already_recorded:
            return

        started_at = tools.get_current_timestamp()
        self._redisdb.hset(self._tab_spider_time, SPIDER_START_TIME_KEY, started_at)

        # start notification is disabled:
        # self.send_msg("《%s》爬虫开始" % self._spider_name)
			
 
				+
			
 
    def spider_end(self):
        """
        Finish a crawl round: record the end time, run the end callbacks, close
        or flush shared resources, log the elapsed time and, unless keep-alive is
        set, delete the spider-status table.
        """
        self.record_end_time()

        if self._end_callback:
            self._end_callback()

        for parser in self._parsers:
            # parsers are closed only when the spider is really shutting down
            if not self._keep_alive:
                parser.close()
            parser.end_callback()

        if self._keep_alive:
            metrics.flush()
        else:
            # close the webdriver pool, if one exists
            if Request.webdriver_pool:
                Request.webdriver_pool.close()

            # close the metrics system
            metrics.close()

        # compute the crawl duration; the stored start time is popped from redis
        stored_start = self._redisdb.hget(
            self._tab_spider_time, SPIDER_START_TIME_KEY, is_pop=True
        )
        if stored_start:
            started_at = int(stored_start)
            elapsed = tools.get_current_timestamp() - started_at

            summary = "《%s》爬虫结束，耗时 %s" % (
                self._spider_name,
                tools.format_seconds(elapsed),
            )
            log.info(summary)

            # end notification is disabled:
            # self.send_msg(msg)

        if not self._keep_alive:
            self.delete_tables(self._tab_spider_status)
        else:
            log.info("爬虫不自动结束， 等待下一轮任务...")
			
 
				+
			
 
    def record_end_time(self):
        """
        Store the current timestamp as the spider end time in redis.
        Does nothing when no batch interval is configured.
        """
        if not self._batch_interval:
            return

        finished_at = tools.get_current_timestamp()
        self._redisdb.hset(self._tab_spider_time, SPIDER_END_TIME_KEY, finished_at)
			
 
				+
			
 
    def is_reach_next_spider_time(self):
        """
        Tell whether the spider may run now.

        :return: True when no batch interval is set, when no previous end time
            is stored, or when at least ``batch_interval`` days (86400 s each)
            have passed since the stored end time; False otherwise.
        """
        if not self._batch_interval:
            return True

        stored_end = self._redisdb.hget(self._tab_spider_time, SPIDER_END_TIME_KEY)
        if not stored_end:
            return True

        last_end = int(stored_end)
        elapsed = tools.get_current_timestamp() - last_end

        if elapsed < self._batch_interval * 86400:
            log.info(
                "上次运行结束时间为 {} 与当前时间间隔 为 {}, 小于规定的抓取时间间隔 {}。爬虫不执行，退出～".format(
                    tools.timestamp_to_date(last_end),
                    tools.format_seconds(elapsed),
                    tools.format_seconds(self._batch_interval * 86400),
                )
            )
            return False

        return True
			
 
				+
			
 
    def record_spider_state(
        self,
        spider_type,
        state,
        batch_date=None,
        spider_start_time=None,
        spider_end_time=None,
        batch_interval=None,
    ):
        """
        No-op in this class: accepts the spider state arguments and does nothing.
        Called with state=0 when tasks are dispatched and state=1 when a round ends.
        """
        return None
			
 
				+
			
 
    def join(self, timeout=None):
        """
        Override of Thread.join.

        Returns immediately when the ``_started`` flag is not set (the thread was
        never started, or _stop_all_thread cleared the flag); otherwise it
        delegates to Thread.join.

        :param timeout: seconds to wait, or None to wait until the thread ends.
            It is now forwarded to Thread.join; it used to be dropped, so a
            timed join blocked indefinitely.
        """
        if not self._started.is_set():
            return

        super().join(timeout)
			
--- a/FworkSpider/feapder/core/spiders/__init__.py
+++ b/FworkSpider/feapder/core/spiders/__init__.py
@@ -0,0 +1,15 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/4/22 12:08 AM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+__all__ = ["AirSpider", "Spider", "BatchSpider"]
			
 
				+
			
 
				+from feapder.core.spiders.air_spider import AirSpider
			
 
				+from feapder.core.spiders.spider import Spider
			
 
				+from feapder.core.spiders.batch_spider import BatchSpider
			
--- a/FworkSpider/feapder/core/spiders/air_spider.py
+++ b/FworkSpider/feapder/core/spiders/air_spider.py
@@ -0,0 +1,125 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/4/22 12:05 AM
			
 
				+---------
			
 
				+@summary: 基于内存队列的爬虫，不支持分布式
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+from threading import Thread
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.buffer.item_buffer import ItemBuffer
			
 
				+from feapder.core.base_parser import BaseParser
			
 
				+from feapder.core.parser_control import AirSpiderParserControl
			
 
				+from feapder.db.memory_db import MemoryDB
			
 
				+from feapder.network.request import Request
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils import metrics
			
 
				+
			
 
				+
			
 
				+class AirSpider(BaseParser, Thread):
			
 
				+    __custom_setting__ = {}
			
 
				+
			
 
				+    def __init__(self, thread_count=None):
			
 
				+        """
			
 
				+        基于内存队列的爬虫，不支持分布式
			
 
				+        :param thread_count: 线程数
			
 
				+        """
			
 
				+        super(AirSpider, self).__init__()
			
 
				+
			
 
				+        for key, value in self.__class__.__custom_setting__.items():
			
 
				+            setattr(setting, key, value)
			
 
				+
			
 
				+        self._thread_count = (
			
 
				+            setting.SPIDER_THREAD_COUNT if not thread_count else thread_count
			
 
				+        )
			
 
				+
			
 
				+        self._memory_db = MemoryDB()
			
 
				+        self._parser_controls = []
			
 
				+        self._item_buffer = ItemBuffer(redis_key="air_spider")
			
 
				+
			
 
				+        metrics.init(**setting.METRICS_OTHER_ARGS)
			
 
				+
			
 
				+    def distribute_task(self):
			
 
				+        for request in self.start_requests():
			
 
				+            if not isinstance(request, Request):
			
 
				+                raise ValueError("仅支持 yield Request")
			
 
				+
			
 
				+            request.parser_name = request.parser_name or self.name
			
 
				+            self._memory_db.add(request)
			
 
				+
			
 
				+    def all_thread_is_done(self):
			
 
				+        for i in range(3):  # 降低偶然性, 因为各个环节不是并发的，很有可能当时状态为假，但检测下一条时该状态为真。一次检测很有可能遇到这种偶然性
			
 
				+            # 检测 parser_control 状态
			
 
				+            for parser_control in self._parser_controls:
			
 
				+                if not parser_control.is_not_task():
			
 
				+                    return False
			
 
				+
			
 
				+            # 检测 任务队列 状态
			
 
				+            if not self._memory_db.empty():
			
 
				+                return False
			
 
				+
			
 
				+            # 检测 item_buffer 状态
			
 
				+            if (
			
 
				+                self._item_buffer.get_items_count() > 0
			
 
				+                or self._item_buffer.is_adding_to_db()
			
 
				+            ):
			
 
				+                return False
			
 
				+
			
 
				+            tools.delay_time(1)
			
 
				+
			
 
				+        return True
			
 
				+
			
 
				+    def run(self):
			
 
				+        self.start_callback()
			
 
				+
			
 
				+        for i in range(self._thread_count):
			
 
				+            parser_control = AirSpiderParserControl(self._memory_db, self._item_buffer)
			
 
				+            parser_control.add_parser(self)
			
 
				+            parser_control.start()
			
 
				+            self._parser_controls.append(parser_control)
			
 
				+
			
 
				+        self._item_buffer.start()
			
 
				+
			
 
				+        self.distribute_task()
			
 
				+
			
 
				+        while True:
			
 
				+            try:
			
 
				+                if self.all_thread_is_done():
			
 
				+                    # 停止 parser_controls
			
 
				+                    for parser_control in self._parser_controls:
			
 
				+                        parser_control.stop()
			
 
				+
			
 
				+                    # 关闭item_buffer
			
 
				+                    self._item_buffer.stop()
			
 
				+
			
 
				+                    # 关闭webdirver
			
 
				+                    if Request.webdriver_pool:
			
 
				+                        Request.webdriver_pool.close()
			
 
				+
			
 
				+                    log.info("无任务，爬虫结束")
			
 
				+                    break
			
 
				+
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+
			
 
				+            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
			
 
				+
			
 
				+        self.end_callback()
			
 
				+        # 为了线程可重复start
			
 
				+        self._started.clear()
			
 
				+        # 关闭打点
			
 
				+        metrics.close()
			
 
				+
			
 
				+    def join(self, timeout=None):
			
 
				+        """
			
 
				+        重写线程的join
			
 
				+        """
			
 
				+        if not self._started.is_set():
			
 
				+            return
			
 
				+
			
 
				+        super().join()
			
--- a/FworkSpider/feapder/core/spiders/batch_spider.py
+++ b/FworkSpider/feapder/core/spiders/batch_spider.py
@@ -0,0 +1,1273 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/4/22 12:06 AM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import datetime
			
 
				+import os
			
 
				+import time
			
 
				+import warnings
			
 
				+from collections import Iterable
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.buffer.item_buffer import MAX_ITEM_COUNT
			
 
				+from feapder.core.base_parser import BatchParser
			
 
				+from feapder.core.scheduler import Scheduler
			
 
				+from feapder.db.mysqldb import MysqlDB
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.network.item import Item
			
 
				+from feapder.network.item import UpdateItem
			
 
				+from feapder.network.request import Request
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.perfect_dict import PerfectDict
			
 
				+from feapder.utils.redis_lock import RedisLock
			
 
				+
			
 
# Dotted import paths of the ConsolePipeline and MysqlPipeline classes.
# NOTE(review): neither constant is referenced in the visible part of this
# file; presumably they are compared against configured pipelines further down.
CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
MYSQL_PIPELINE_PATH = "feapder.pipelines.mysql_pipeline.MysqlPipeline"
			
 
				+
			
 
				+
			
 
				+class BatchSpider(BatchParser, Scheduler):
			
 
    def __init__(
        self,
        task_table,
        batch_record_table,
        batch_name,
        batch_interval,
        task_keys,
        task_state="state",
        min_task_count=10000,
        check_task_interval=5,
        task_limit=10000,
        related_redis_key=None,
        related_batch_record=None,
        task_condition="",
        task_order_by="",
        redis_key=None,
        thread_count=None,
        begin_callback=None,
        end_callback=None,
        delete_keys=(),
        keep_alive=None,
        **kwargs,
    ):
        """
        @summary: Batch spider
        Prerequisites
        1. A task table must exist.
            It must contain an ``id`` column and a task state column (e.g.
            ``state``). If a ``parser_name`` column is selected, each task is
            dispatched to the matching parser, otherwise to all parsers.
            Further columns can be added for whatever parameters the spider
            needs.

            Reference DDL:
            CREATE TABLE `table_name` (
              `id` int(11) NOT NULL AUTO_INCREMENT,
              `param` varchar(1000) DEFAULT NULL COMMENT 'parameters the spider needs to fetch data',
              `state` int(11) DEFAULT NULL COMMENT 'task state',
              `parser_name` varchar(255) DEFAULT NULL COMMENT 'class name of the parser script for this task',
              PRIMARY KEY (`id`),
              UNIQUE KEY `nui` (`param`) USING BTREE
            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;

        2. A batch record table; it is created automatically when missing.
        ---------
        @param task_table: task table in mysql
        @param batch_record_table: batch record table in mysql
        @param batch_name: name of the batch crawling program
        @param batch_interval: batch interval in days; use 1/24 for one batch per hour
        @param task_keys: list of task columns to fetch; include ``parser_name`` to route tasks to a specific parser
        @param task_state: name of the task state column in the task table
        @param min_task_count: minimum number of tasks kept in redis; below it, tasks are fetched from mysql
        @param check_task_interval: seconds between checks for remaining tasks
        @param task_limit: number of tasks fetched from the database at a time
        @param redis_key: key prefix under which tasks and other data are stored in redis
        @param thread_count: number of threads; defaults to the value from the settings
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider ends
        @param delete_keys: keys deleted at startup (tuple/bool/string, regex supported); commonly used to clear the task queue, otherwise a restart resumes from where it stopped
        @param keep_alive: whether the spider stays resident; off by default
        @param related_redis_key: redis task table of a related spider. Avoid cycles such as A -> B & B -> A.
        @param related_batch_record: mysql batch table of a related spider. Avoid cycles such as A -> B & B -> A.
            Configure either related_redis_key or related_batch_record; this spider does not finish while the related spider is unfinished.
            Prefer related_batch_record when the related spider is a batch spider,
            and related_redis_key when it is a plain spider without a batch table.
        @param task_condition: condition (the text after ``where``) selecting this spider's tasks from a larger task table
        @param task_order_by: ordering used when fetching tasks, e.g. ``id desc``
        ---------
        @result:
        """
        # Scheduler is initialised explicitly; start requests are produced by
        # the task monitor, hence auto_start_requests=False.
        Scheduler.__init__(
            self,
            redis_key=redis_key,
            thread_count=thread_count,
            begin_callback=begin_callback,
            end_callback=end_callback,
            delete_keys=delete_keys,
            keep_alive=keep_alive,
            auto_start_requests=False,
            batch_interval=batch_interval,
            task_table=task_table,
            **kwargs,
        )

        self._redisdb = RedisDB()
        self._mysqldb = MysqlDB()

        # Table / column configuration
        self._task_table = task_table
        self._batch_record_table = batch_record_table
        self._batch_name = batch_name
        self._task_keys = task_keys

        self._task_state = task_state
        self._min_task_count = min_task_count
        self._check_task_interval = check_task_interval
        self._task_limit = task_limit

        # Request tables that must be empty before the batch counts as done;
        # this spider's own table always comes first.
        # NOTE(review): this uses the raw redis_key argument (possibly None)
        # while start_monitor_task formats with self._redis_key - confirm intended.
        watched_keys = [redis_key]
        if related_redis_key:
            watched_keys.append(related_redis_key)
        self._related_task_tables = [
            setting.TAB_REQUSETS.format(redis_key=key) for key in watched_keys
        ]

        self._related_batch_record = related_batch_record
        self._task_condition = task_condition

        # Pre-rendered SQL fragments; a falsy input is stored unchanged.
        self._task_condition_prefix_and = (
            " and {}".format(task_condition) if task_condition else task_condition
        )
        self._task_condition_prefix_where = (
            " where {}".format(task_condition) if task_condition else task_condition
        )
        self._task_order_by = (
            " order by {}".format(task_order_by) if task_order_by else task_order_by
        )

        self._batch_date_cache = None
        # Batch date resolution: day, hour or minute depending on the interval.
        if self._batch_interval >= 1:
            self._date_format = "%Y-%m-%d"
        elif self._batch_interval >= 1 / 24:
            self._date_format = "%Y-%m-%d %H"
        else:
            self._date_format = "%Y-%m-%d %H:%M"

        # Alerting
        self._send_msg_interval = datetime.timedelta(hours=1)  # one alert per hour
        self._last_send_msg_time = None

        # Progress bookkeeping
        self._spider_last_done_time = None
        self._spider_last_done_count = 0
        self._spider_deal_speed_cached = None

        self._is_more_parsers = True  # multi-parser spider until proven otherwise
			
 
				+
			
 
    def init_property(self):
        """
        Reset the attributes that must start fresh at the beginning of every batch.

        @return: None
        """
        # Alert timestamp and progress timestamp both start unset.
        self._last_send_msg_time = self._spider_last_done_time = None
        # Completed-task counter at spider start.
        self._spider_last_done_count = 0
			
 
				+
			
 
    def add_parser(self, parser):
        """
        Instantiate ``parser`` with this spider's table configuration and
        register the instance in ``self._parsers``.

        @param parser: callable (parser class) accepting the task table,
            batch record table, task state column, date format and mysql handle.
        """
        constructor_args = (
            self._task_table,
            self._batch_record_table,
            self._task_state,
            self._date_format,
            self._mysqldb,
        )
        parser_instance = parser(*constructor_args)
        self._parsers.append(parser_instance)
			
 
				+
			
 
    def start_monitor_task(self):
        """
        @summary: Monitor the task state and keep the redis request queue fed.

        Creates the batch record table if needed, lets every parser add its
        tasks, then loops: when the redis queue holds fewer than
        ``self._min_task_count`` requests, tasks are pulled from mysql
        (pending ones first, then lost "doing" ones) and distributed. The
        loop ends when ``check_batch`` reports the batch as finished and
        ``self._keep_alive`` is falsy. Exceptions inside an iteration are
        logged and the loop continues after the regular sleep.
        ---------
        ---------
        @result:
        """
        if not self._parsers:  # not multi-parser mode: register this spider itself as the only parser
            self._is_more_parsers = False
            self._parsers.append(self)

        elif len(self._parsers) <= 1:
            self._is_more_parsers = False

        self.create_batch_record_table()

        # Let every parser add its tasks
        for parser in self._parsers:
            parser.add_task()

        is_first_check = True
        while True:
            try:
                if self.check_batch(is_first_check):  # this batch is finished
                    if self._keep_alive:
                        is_first_check = True
                        log.info("爬虫所有任务已做完，不自动结束，等待新任务...")
                        time.sleep(self._check_task_interval)
                        continue
                    else:
                        break

                is_first_check = False

                # Check how many tasks are queued in redis; below _min_task_count, fetch more from mysql
                tab_requests = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
                todo_task_count = self._redisdb.zget_count(tab_requests)

                tasks = []
                if todo_task_count < self._min_task_count:  # fetch tasks from mysql
                    # Refresh the task counters in the batch record table
                    self.update_task_done_count()

                    log.info("redis 中剩余任务%s 数量过小 从mysql中取任务追加" % todo_task_count)
                    tasks = self.get_todo_task_from_mysql()
                    if not tasks:  # no state-0 tasks left; check whether state-2 tasks were lost

                        if (
                            todo_task_count == 0
                        ):  # redis holds no pending tasks, so state-2 tasks in mysql are lost and must be redone
                            lose_task_count = self.get_lose_task_count()

                            if not lose_task_count:
                                time.sleep(self._check_task_interval)
                                continue

                            elif (
                                lose_task_count > self._task_limit * 5
                            ):  # too many lost tasks: reset them in bulk, otherwise refetching batch by batch after redis drains is too slow
                                log.info("正在重置丢失任务为待做 共 {} 条".format(lose_task_count))
                                # Reset tasks marked as doing back to pending
                                if self.reset_lose_task_from_mysql():
                                    log.info("重置丢失任务成功")
                                else:
                                    log.info("重置丢失任务失败")

                                # Skips the trailing sleep so the reset tasks are picked up right away
                                continue

                            else:  # few lost tasks: fetch them directly
                                log.info(
                                    "正在取丢失任务 共 {} 条, 取 {} 条".format(
                                        lose_task_count,
                                        self._task_limit
                                        if self._task_limit <= lose_task_count
                                        else lose_task_count,
                                    )
                                )
                                tasks = self.get_doing_task_from_mysql()

                    else:
                        log.info("mysql 中取到待做任务 %s 条" % len(tasks))

                else:
                    log.info("redis 中尚有%s条积压任务，暂时不派发新任务" % todo_task_count)

                if not tasks:
                    if todo_task_count >= self._min_task_count:
                        # Backlog is still above the threshold; deliberately nothing is logged here
                        pass
                    else:
                        log.info("mysql 中无待做任务 redis中剩余任务 %s" % todo_task_count)
                else:
                    # make start requests
                    self.distribute_task(tasks)
                    log.info("添加任务到redis成功")

            except Exception as e:
                log.exception(e)

            time.sleep(self._check_task_interval)
			
 
				+
			
 
    def create_batch_record_table(self):
        """
        Create the batch record table unless information_schema already
        lists a table with that name.

        The ``batch_date`` column is a ``date`` when the batch date format
        has day resolution and a ``datetime`` otherwise.
        """
        lookup_sql = (
            "select table_name from information_schema.tables where table_name like '%s'"
            % self._batch_record_table
        )
        if self._mysqldb.find(lookup_sql):
            # Table already present, nothing to create.
            return

        if self._date_format == "%Y-%m-%d":
            batch_date_type = "date"
        else:
            batch_date_type = "datetime"

        create_sql = """
                CREATE TABLE `{table_name}` (
                      `id` int(11) UNSIGNED NOT NULL AUTO_INCREMENT,
                      `batch_date` {batch_date} DEFAULT NULL COMMENT '批次时间',
                      `total_count` int(11) DEFAULT NULL COMMENT '任务总数',
                      `done_count` int(11) DEFAULT NULL COMMENT '完成数 (1,-1)',
                      `fail_count` int(11) DEFAULT NULL COMMENT '失败任务数 (-1)',
                      `interval` float(11) DEFAULT NULL COMMENT '批次间隔',
                      `interval_unit` varchar(20) DEFAULT NULL COMMENT '批次间隔单位 day, hour',
                      `create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '批次开始时间',
                      `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '本条记录更新时间',
                      `is_done` int(11) DEFAULT '0' COMMENT '批次是否完成 0 未完成  1 完成',
                      PRIMARY KEY (`id`)
                    ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
            """.format(
            table_name=self._batch_record_table, batch_date=batch_date_type
        )

        self._mysqldb.execute(create_sql)
			
 
				+
			
 
    def distribute_task(self, tasks):
        """
        @summary: Distribute tasks: turn task rows into start requests/items
        and push them into the request and item buffers.

        In multi-parser mode each task goes only to the first parser whose
        name appears among the task's values; an unsupported object yielded
        by ``start_requests`` raises TypeError. Otherwise every task is
        handed to every registered parser and unsupported yielded objects
        are ignored, as before.
        ---------
        @param tasks: iterable of task rows; the values of each row are in
            the order of ``self._task_keys``
        ---------
        @result: None; both buffers are flushed before returning
        """
        if self._is_more_parsers:
            for task in tasks:
                for parser in self._parsers:
                    if parser.name in task:
                        self._dispatch_task_to_parser(parser, task, strict=True)
                        break  # only the first matching parser handles the task
        else:
            for task in tasks:
                for parser in self._parsers:
                    self._dispatch_task_to_parser(parser, task, strict=False)

        self._request_buffer.flush()
        self._item_buffer.flush()

    def _dispatch_task_to_parser(self, parser, task, strict):
        """
        Run ``parser.start_requests`` for one task row and buffer what it yields.

        @param parser: parser whose ``start_requests`` is invoked
        @param task: raw task row (sequence of column values)
        @param strict: when True, a yielded object that is not a Request,
            Item or callable raises TypeError; when False it is skipped
        """
        # Kept in its own variable so the raw row is never shadowed by the
        # wrapped task when several parsers receive the same row.
        task_dict = PerfectDict(
            _dict=dict(zip(self._task_keys, task)), _values=list(task)
        )
        requests = parser.start_requests(task_dict)
        if requests and not isinstance(requests, Iterable):
            raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))

        # A callable follows the kind of the previously yielded object:
        # 1 -> request buffer, 2 -> item buffer.
        result_type = 1
        for request in requests or []:
            if isinstance(request, Request):
                request.parser_name = request.parser_name or parser.name
                self._request_buffer.put_request(request)
                result_type = 1

            elif isinstance(request, Item):
                self._item_buffer.put_item(request)
                result_type = 2

                if self._item_buffer.get_items_count() >= MAX_ITEM_COUNT:
                    self._item_buffer.flush()

            elif callable(request):  # a callable may be a function that updates the database
                if result_type == 1:
                    self._request_buffer.put_request(request)
                else:
                    self._item_buffer.put_item(request)

                    if self._item_buffer.get_items_count() >= MAX_ITEM_COUNT:
                        self._item_buffer.flush()

            elif strict:
                # Report the type of the offending element, not of the iterable.
                raise TypeError(
                    "start_requests yield result type error, expect Request、Item、callback func, but got type: {}".format(
                        type(request)
                    )
                )
			
 
				+
			
 
				+    def __get_task_state_count(self):
			
 
				+        sql = "select {state}, count(1) from {task_table}{task_condition} group by {state}".format(
			
 
				+            state=self._task_state,
			
 
				+            task_table=self._task_table,
			
 
				+            task_condition=self._task_condition_prefix_where,
			
 
				+        )
			
 
				+        task_state_count = self._mysqldb.find(sql)
			
 
				+
			
 
				+        task_state = {
			
 
				+            "total_count": sum(count for state, count in task_state_count),
			
 
				+            "done_count": sum(
			
 
				+                count for state, count in task_state_count if state in (1, -1)
			
 
				+            ),
			
 
				+            "failed_count": sum(
			
 
				+                count for state, count in task_state_count if state == -1
			
 
				+            ),
			
 
				+        }
			
 
				+
			
 
				+        return task_state
			
 
				+
			
 
    def update_task_done_count(self):
        """
        @summary: Write the current task counters into the batch record row
        of the current batch date and mark that row as not done.

        The interval is stored in days when it is at least one day and in
        hours otherwise.
        ---------
        ---------
        @result:
        """
        counters = self.__get_task_state_count()

        # Interval value and unit as stored in the batch record
        if self._batch_interval >= 1:
            interval_value, interval_unit = self._batch_interval, "day"
        else:
            interval_value, interval_unit = self._batch_interval * 24, "hour"

        # Update the batch record table
        statement = "update {} set done_count = {}, total_count = {}, fail_count = {}, update_time = CURRENT_TIME, is_done=0, `interval` = {}, interval_unit = '{}' where batch_date = '{}'".format(
            self._batch_record_table,
            counters.get("done_count"),
            counters.get("total_count"),
            counters.get("failed_count"),
            interval_value,
            interval_unit,
            self.batch_date,
        )
        self._mysqldb.update(statement)
			
 
				+
			
 
    def update_is_done(self):
        """
        Mark the batch record of the current batch date as done, touching
        only rows that are still flagged as not done.
        """
        template = "update {} set is_done = 1, update_time = CURRENT_TIME where batch_date = '{}' and is_done = 0"
        statement = template.format(self._batch_record_table, self.batch_date)
        self._mysqldb.update(statement)
			
 
				+
			
 
    def get_todo_task_from_mysql(self):
        """
        @summary: Fetch pending tasks (state 0) and mark them as doing (state 2).

        The first selected column of every row is used as the task ``id``
        when updating the state, so ``self._task_keys`` must start with the
        id column.
        ---------
        ---------
        @result: the rows returned by the select
        """
        # TODO fetch in batches (at most 1000000 rows each) to limit memory use
        columns = ", ".join(f"`{column}`" for column in self._task_keys)
        select_args = (
            columns,
            self._task_table,
            self._task_state,
            self._task_condition_prefix_and,
            self._task_order_by,
            self._task_limit,
        )
        tasks = self._mysqldb.find(
            "select %s from %s where %s = 0%s%s limit %s" % select_args
        )
        if not tasks:
            return tasks

        # Mark the fetched tasks as doing, 10000 ids per statement
        chunk_size = 10000
        for offset in range(0, len(tasks), chunk_size):
            chunk = tasks[offset : offset + chunk_size]
            # str() of a tuple renders "(1, 2)"; a single-element tuple renders
            # "(1,)", so the trailing comma is removed to keep the SQL valid.
            id_list = str(tuple([row[0] for row in chunk])).replace(",)", ")")
            self._mysqldb.update(
                "update %s set %s = 2 where id in %s"
                % (self._task_table, self._task_state, id_list)
            )

        return tasks
			
 
				+
			
 
    def get_doing_task_from_mysql(self):
        """
        @summary: Fetch tasks currently marked as doing (state 2).
        ---------
        ---------
        @result: the rows returned by the select
        """
        columns = ", ".join(f"`{column}`" for column in self._task_keys)
        select_args = (
            columns,
            self._task_table,
            self._task_state,
            self._task_condition_prefix_and,
            self._task_order_by,
            self._task_limit,
        )
        query = "select %s from %s where %s = 2%s%s limit %s" % select_args
        return self._mysqldb.find(query)
			
 
				+
			
 
    def get_lose_task_count(self):
        """
        Return how many tasks of the latest batch are not yet done, computed
        as ``total_count - done_count`` of the newest batch record row.

        @return: the difference; 0 when the batch record table has no rows.
            NULL counters are treated as 0 (both columns are nullable in the
            table created by ``create_batch_record_table``), instead of
            raising IndexError / TypeError as before.
        """
        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
            # MySQL's date_format uses %i for minutes, where strftime uses %M
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(sql)  # e.g. (('2018-08-19', 49686, 0),)
        if not batch_info:
            # No batch recorded yet, so nothing can be counted as lost.
            return 0

        _batch_date, total_count, done_count = batch_info[0]
        return (total_count or 0) - (done_count or 0)
			
 
				+
			
 
    def reset_lose_task_from_mysql(self):
        """
        @summary: Reset lost tasks (state 2) back to pending (state 0),
        restricted by the configured task condition.
        ---------
        ---------
        @result: whatever ``self._mysqldb.update`` returns for the statement
        """
        template = "update {table} set {state} = 0 where {state} = 2{task_condition}"
        statement = template.format(
            table=self._task_table,
            state=self._task_state,
            task_condition=self._task_condition_prefix_and,
        )
        outcome = self._mysqldb.update(statement)
        return outcome
			
 
				+
			
 
    def get_deal_speed(self, total_count, done_count, last_batch_date):
        """
        Estimate the task processing speed from the progress made since the
        previous call.

        @param total_count: total number of tasks
        @param done_count: number of finished tasks
        @param last_batch_date: batch start time (datetime)
        @return:
            The cached tuple
            (deal_speed [tasks/hour], need_time [s], overflow_time [s], calculate_speed_time [str])
            where overflow_time < 0 means the batch is expected to finish that
            many seconds early; or whatever self._spider_deal_speed_cached
            held before (e.g. None) when no new measurement could be made.
        """
        if not self._spider_last_done_count:
            # No baseline yet: remember the current progress and time.
            now_date = datetime.datetime.now()
            self._spider_last_done_count = done_count
            self._spider_last_done_time = now_date

        if done_count > self._spider_last_done_count:
            now_date = datetime.datetime.now()

            time_interval = (now_date - self._spider_last_done_time).total_seconds()
            if time_interval <= 0:
                # No measurable time elapsed (or the clock moved backwards):
                # dividing would raise ZeroDivisionError or yield a negative
                # speed. Keep the baseline and report the last known value.
                return self._spider_deal_speed_cached

            deal_speed = (
                done_count - self._spider_last_done_count
            ) / time_interval  # tasks per second
            need_time = (total_count - done_count) / deal_speed  # seconds
            overflow_time = (
                (now_date - last_batch_date).total_seconds()
                + need_time
                - datetime.timedelta(days=self._batch_interval).total_seconds()
            )  # seconds beyond the batch interval
            calculate_speed_time = now_date.strftime("%Y-%m-%d %H:%M:%S")  # when the speed was measured

            deal_speed = int(deal_speed * 3600)  # tasks per hour

            # Move the baseline to the current progress and time.
            self._spider_last_done_count = done_count
            self._spider_last_done_time = now_date

            self._spider_deal_speed_cached = (
                deal_speed,
                need_time,
                overflow_time,
                calculate_speed_time,
            )

        return self._spider_deal_speed_cached
			
 
				+
			
 
    def init_task(self):
        """
        @summary: Reset the task table at the start of a new batch: every row whose
                  state is not -1 gets state 0. May be overridden by subclasses.
        ---------
        ---------
        @result: the return value of self._mysqldb.update for the generated statement
        """

        template = "update {task_table} set {state} = 0 where {state} != -1{task_condition}"
        reset_sql = template.format(
            task_condition=self._task_condition_prefix_and,
            state=self._task_state,
            task_table=self._task_table,
        )
        result = self._mysqldb.update(reset_sql)
        return result
			
 
				+
			
 
    def check_batch(self, is_first_check=False):
        """
        @summary: Check whether the current batch has finished, start the next
                  batch when it is due, and log / send progress and timeout messages.
        ---------
        @param: is_first_check whether this is the first check. On the first check,
                if the batch turns out to be finished, the "batch finished" message
                is only logged and not sent again, because it was sent before.
        ---------
        @result: True  - batch finished and the next batch is not due yet, or the
                         next batch is due but inserting its batch record failed.
                 False - still waiting for related spiders, or a new batch record
                         was just created (next batch / very first batch).
                 None  - (implicit) batch still in progress, or init_task() returned
                         False while starting the next batch.
        """

        # Latest batch record; MySQL uses %i for minutes where Python uses %M.
        sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )
        batch_info = self._mysqldb.find(sql)  # (('2018-08-19', 49686, 0),)

        if batch_info:
            batch_date, total_count, done_count = batch_info[0]

            now_date = datetime.datetime.now()
            last_batch_date = datetime.datetime.strptime(batch_date, self._date_format)
            time_difference = now_date - last_batch_date

            if total_count == done_count and time_difference < datetime.timedelta(
                days=self._batch_interval
            ):  # still inside the current batch window: re-check the task table for newly added tasks
                # Query the task table itself to see whether there really are no tasks
                # left, because the counts in the batch_record table may be stale.
                task_count = self.__get_task_state_count()

                total_count = task_count.get("total_count")
                done_count = task_count.get("done_count")

            if total_count == done_count:
                # Check whether the related spiders have finished
                releated_spider_is_done = self.related_spider_is_done()
                if releated_spider_is_done == False:
                    msg = "《{}》本批次未完成, 正在等待依赖爬虫 {} 结束. 批次时间 {} 批次进度 {}/{}".format(
                        self._batch_name,
                        self._related_batch_record or self._related_task_tables,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    log.info(msg)
                    # Check for timeout; send an alert when timed out
                    if time_difference >= datetime.timedelta(
                        days=self._batch_interval
                    ):  # already timed out
                        # Throttle alerts to one per self._send_msg_interval
                        if (
                            not self._last_send_msg_time
                            or now_date - self._last_send_msg_time
                            >= self._send_msg_interval
                        ):
                            self._last_send_msg_time = now_date
                            self.send_msg(
                                msg,
                                level="error",
                                message_prefix="《{}》本批次未完成, 正在等待依赖爬虫 {} 结束".format(
                                    self._batch_name,
                                    self._related_batch_record
                                    or self._related_task_tables,
                                ),
                            )

                    return False

                elif releated_spider_is_done == True:
                    # Update the is_done state
                    self.update_is_done()

                else:
                    # related_spider_is_done() returned None (no usable related batch info)
                    self.update_is_done()

                msg = "《{}》本批次完成 批次时间 {} 共处理 {} 条任务".format(
                    self._batch_name, batch_date, done_count
                )
                log.info(msg)
                if not is_first_check:
                    self.send_msg(msg)

                # Check whether the next batch is due
                if time_difference >= datetime.timedelta(days=self._batch_interval):
                    msg = "《{}》下一批次开始".format(self._batch_name)
                    log.info(msg)
                    self.send_msg(msg)

                    # Reset the task table state
                    if self.init_task() != False:  # a failed update returns False; otherwise True/None
                        # Reset properties
                        self.init_property()

                        # The insert may fail even though the task table has already been
                        # reset. Since the current time already belongs to the next batch,
                        # the batch-finished check will not consult the task table, so the
                        # reset is simply performed again on the next run.
                        is_success = (
                            self.record_batch()
                        )
                        if is_success:
                            # See whether any workers are waiting for tasks; if so, delay
                            # dispatching so the workers have time to refresh their batch date.
                            current_timestamp = tools.get_current_timestamp()
                            spider_count = self._redisdb.zget_count(
                                self._tab_spider_status,
                                priority_min=current_timestamp
                                - (setting.COLLECTOR_SLEEP_TIME + 10),
                                priority_max=current_timestamp,
                            )
                            if spider_count:
                                log.info(
                                    f"插入新批次记录成功，检测到有{spider_count}个爬虫进程在等待任务，本批任务1分钟后开始下发, 防止爬虫端缓存的批次时间没来得及更新"
                                )
                                tools.delay_time(60)
                            else:
                                log.info("插入新批次记录成功")

                            return False  # next batch has started

                        else:
                            return True  # next batch not started: inserting the batch record failed, so do not dispatch tasks until the insert succeeds

                else:
                    log.info("《{}》下次批次时间未到".format(self._batch_name))
                    if not is_first_check:
                        self.send_msg("《{}》下次批次时间未到".format(self._batch_name))
                    return True

            else:
                if time_difference >= datetime.timedelta(
                    days=self._batch_interval
                ):  # already timed out
                    time_out = time_difference - datetime.timedelta(
                        days=self._batch_interval
                    )
                    time_out_pretty = tools.format_seconds(time_out.total_seconds())

                    msg = "《{}》本批次已超时{} 批次时间 {}, 批次进度 {}/{}".format(
                        self._batch_name,
                        time_out_pretty,
                        batch_date,
                        done_count,
                        total_count,
                    )
                    # Intervals below one day are reported in hours
                    if self._batch_interval >= 1:
                        msg += ", 期望时间{}天".format(self._batch_interval)
                    else:
                        msg += ", 期望时间{}小时".format(self._batch_interval * 24)

                    result = self.get_deal_speed(
                        total_count=total_count,
                        done_count=done_count,
                        last_batch_date=last_batch_date,
                    )
                    if result:
                        deal_speed, need_time, overflow_time, calculate_speed_time = (
                            result
                        )
                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                            calculate_speed_time,
                            deal_speed,
                            tools.format_seconds(need_time),
                        )

                        if overflow_time > 0:
                            msg += ", 该批次预计总超时 {}, 请及时处理".format(
                                tools.format_seconds(overflow_time)
                            )

                    log.info(msg)

                    # Throttle alerts to one per self._send_msg_interval
                    if (
                        not self._last_send_msg_time
                        or now_date - self._last_send_msg_time
                        >= self._send_msg_interval
                    ):
                        self._last_send_msg_time = now_date
                        self.send_msg(
                            msg,
                            level="error",
                            message_prefix="《{}》批次超时".format(self._batch_name),
                        )

                else:  # not timed out
                    remaining_time = (
                        datetime.timedelta(days=self._batch_interval) - time_difference
                    )
                    remaining_time_pretty = tools.format_seconds(
                        remaining_time.total_seconds()
                    )

                    # Intervals below one day are reported in hours
                    if self._batch_interval >= 1:
                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}天, 剩余{}".format(
                            self._batch_name,
                            batch_date,
                            done_count,
                            total_count,
                            self._batch_interval,
                            remaining_time_pretty,
                        )
                    else:
                        msg = "《{}》本批次正在进行, 批次时间 {}, 批次进度 {}/{}, 期望时间{}小时, 剩余{}".format(
                            self._batch_name,
                            batch_date,
                            done_count,
                            total_count,
                            self._batch_interval * 24,
                            remaining_time_pretty,
                        )

                    result = self.get_deal_speed(
                        total_count=total_count,
                        done_count=done_count,
                        last_batch_date=last_batch_date,
                    )
                    if result:
                        deal_speed, need_time, overflow_time, calculate_speed_time = (
                            result
                        )
                        msg += ", 任务处理速度于{}统计, 约 {}条/小时, 预计还需 {}".format(
                            calculate_speed_time,
                            deal_speed,
                            tools.format_seconds(need_time),
                        )

                        if overflow_time > 0:
                            msg += ", 该批次可能会超时 {}, 请及时处理".format(
                                tools.format_seconds(overflow_time)
                            )
                            # Send an alert (throttled by self._send_msg_interval)
                            if (
                                not self._last_send_msg_time
                                or now_date - self._last_send_msg_time
                                >= self._send_msg_interval
                            ):
                                self._last_send_msg_time = now_date
                                self.send_msg(
                                    msg,
                                    level="error",
                                    message_prefix="《{}》批次可能超时".format(
                                        self._batch_name
                                    ),
                                )

                        elif overflow_time < 0:
                            msg += ", 该批次预计提前 {} 完成".format(
                                tools.format_seconds(-overflow_time)
                            )

                    log.info(msg)

        else:
            # No batch record exists yet: insert the first batch record
            self.record_batch()

            # Reset the task table state; an overridden init_task may also generate tasks
            self.init_task()

            return False
			
 
				+
			
 
    def related_spider_is_done(self):
        """
        Tell whether the spiders this batch depends on have finished.

        @return: False - a related redis task table still exists, or the latest
                         related batch record is not marked done
                 None  - the related batch record table yields no is_done value
                 True  - otherwise
        """

        # Any related redis task table that still exists means work is pending.
        pending = any(
            self._redisdb.exists_key(table_name)
            for table_name in self._related_task_tables
        )
        if pending:
            return False

        if not self._related_batch_record:
            return True

        query = "select is_done from {} order by id desc limit 1".format(
            self._related_batch_record
        )
        rows = self._mysqldb.find(query)
        flag = rows[0][0] if rows else None

        if flag is None:
            log.warning("相关联的批次表不存在或无批次信息")
            return None

        return True if flag else False
			
 
				+
			
 
    def record_batch(self):
        """
        @summary: Insert a new row into the batch record table (batch initialisation)
        ---------
        ---------
        @result: the value returned by self._mysqldb.add (None / 0 / 1, 1 meaning success)
        """

        # Total number of tasks
        count_sql = "select count(1) from %s%s" % (
            self._task_table,
            self._task_condition_prefix_where,
        )
        total_task_count = self._mysqldb.find(count_sql)[0][0]

        batch_date = tools.get_current_date(self._date_format)

        # Intervals shorter than a day are stored in hours.
        if self._batch_interval >= 1:
            interval_value = self._batch_interval
            interval_unit = "day"
        else:
            interval_value = self._batch_interval * 24
            interval_unit = "hour"

        insert_sql = (
            "insert into %s (batch_date, done_count, total_count, `interval`, interval_unit, create_time) values ('%s', %s, %s, %s, '%s', CURRENT_TIME)"
            % (
                self._batch_record_table,
                batch_date,
                0,
                total_task_count,
                interval_value,
                interval_unit,
            )
        )

        affect_count = self._mysqldb.add(insert_sql)  # None / 0 / 1 (1 means success)
        if not affect_count:
            log.error("插入新批次失败")
            return affect_count

        # Reset the cached batch date
        self._batch_date_cache = batch_date
        # Refresh os.environ['batch_date'] as well, otherwise the date read from
        # there would still be the previous batch's
        os.environ["batch_date"] = self._batch_date_cache

        # The spider begins
        self.spider_begin()
        self.record_spider_state(
            spider_type=2,
            state=0,
            batch_date=batch_date,
            spider_start_time=tools.get_current_date(),
            batch_interval=self._batch_interval,
        )

        return affect_count
			
 
				+
			
 
				+    # -------- 批次结束逻辑 ------------
			
 
				+
			
 
    def task_is_done(self):
        """
        @summary: Check whether the batch's tasks are all done, refreshing the cached
                  batch date on the way (this must not die, otherwise the batch date
                  stops being refreshed)
        ---------
        ---------
        @result: truthy / falsy (done / not done)
        """

        finished = False

        # Read the task state from the batch record table
        record_sql = 'select date_format(batch_date, "{date_format}"), total_count, done_count, is_done from {batch_record_table} order by id desc limit 1'.format(
            date_format=self._date_format.replace(":%M", ":%i"),
            batch_record_table=self._batch_record_table,
        )

        batch_info = self._mysqldb.find(record_sql)
        if batch_info is None:
            raise Exception("查询批次信息失败")

        if batch_info:
            # Refresh self._batch_date_cache so it does not keep the old batch date
            # after a new batch has started
            batch_date, total_count, done_count, finished = batch_info[0]
            self._batch_date_cache = batch_date

            progress = (
                self._batch_name,
                self._batch_date_cache,
                done_count,
                total_count,
                finished,
            )
            log.info("《%s》 批次时间%s 批次进度 %s/%s 完成状态 %d" % progress)
            os.environ["batch_date"] = self._batch_date_cache  # refresh the batch date used by BatchParser

        if not finished:
            return finished

        # The record says "done": verify against the task table. This is costly,
        # so a lock keeps several processes from running the query at once.
        with RedisLock(key=self._spider_name) as lock:
            if not lock.locked:
                log.info("批次表标记已完成，其他爬虫进程正在检查任务表是否有未完成的任务，本进程跳过检查，继续等待")

                finished = False
            else:
                log.info("批次表标记已完成，正在检查任务表是否有未完成的任务")

                pending_sql = "select 1 from %s where (%s = 0 or %s=2)%s limit 1" % (
                    self._task_table,
                    self._task_state,
                    self._task_state,
                    self._task_condition_prefix_and,
                )
                pending = self._mysqldb.find(pending_sql)  # [(1,)]  / []
                if not pending:
                    log.info("任务表中任务均已完成，爬虫结束")
                else:
                    log.info("检测到任务表中有未完成任务，等待任务下发")
                    finished = False

                    # Clear is_done in batch_record to reduce further task-table queries
                    clear_sql = 'update {batch_record_table} set is_done = 0 where batch_date = "{batch_date}"'.format(
                        batch_record_table=self._batch_record_table,
                        batch_date=self._batch_date_cache,
                    )
                    self._mysqldb.update(clear_sql)

        return finished
			
 
				+
			
 
    def run(self):
        """
        @summary: Overrides run. Starts the spider, then polls every 10 seconds
                  whether the MySQL tasks are done and stops once they are.
        ---------
        ---------
        @result: None. On an exception outside the polling loop an error message
                 is sent and the process exits with status 137.
        """
        try:
            self.create_batch_record_table()

            if not self._parsers:  # not in add_parser mode
                self._parsers.append(self)

            self._start()

            while True:
                try:
                    # Both the MySQL tasks and all worker threads must be done.
                    # Checking all_thread_is_done prevents ending the program
                    # while tasks are still being processed.
                    if self.task_is_done() and self.all_thread_is_done():
                        if not self._is_notify_end:
                            self.spider_end()
                            self.record_spider_state(
                                spider_type=2,
                                state=1,
                                batch_date=self._batch_date_cache,
                                spider_end_time=tools.get_current_date(),
                                batch_interval=self._batch_interval,
                            )

                            self._is_notify_end = True

                        if not self._keep_alive:
                            self._stop_all_thread()
                            break
                    else:
                        self._is_notify_end = False

                    self.check_task_status()

                except Exception as e:
                    # Errors inside one polling round are logged; polling continues.
                    log.exception(e)

                tools.delay_time(10)  # check the spider state every 10 seconds

        except Exception as e:
            msg = "《%s》主线程异常 爬虫结束 exception: %s" % (self._batch_name, e)
            log.error(msg)
            # The prefix is a %-style template, so it must be filled with %;
            # str.format would leave the literal "%s" in the alert title.
            self.send_msg(
                msg,
                level="error",
                message_prefix="《%s》爬虫异常结束" % (self._batch_name,),
            )

            # Exit status 137 (seen as wait status 35072 = 137 << 8) so that the
            # spider manager restarts the process.
            os._exit(137)
			
 
				+
			
 
    @classmethod
    def to_DebugBatchSpider(cls, *args, **kwargs):
        """
        Build a DebugBatchSpider that behaves like a debug version of cls.

        Rebinds DebugBatchSpider's base class and __name__ to cls at runtime.
        This mutates the shared DebugBatchSpider class, so the last caller wins.

        @param args: positional arguments forwarded to DebugBatchSpider
        @param kwargs: keyword arguments forwarded to DebugBatchSpider
        @return: a DebugBatchSpider instance
        """
        # Make DebugBatchSpider inherit from cls
        DebugBatchSpider.__bases__ = (cls,)
        DebugBatchSpider.__name__ = cls.__name__
        return DebugBatchSpider(*args, **kwargs)
			
 
				+
			
 
				+
			
 
				+class DebugBatchSpider(BatchSpider):
			
 
				+    """
			
 
				+    Debug批次爬虫
			
 
				+    """
			
 
				+
			
 
				+    __debug_custom_setting__ = dict(
			
 
				+        COLLECTOR_SLEEP_TIME=1,
			
 
				+        COLLECTOR_TASK_COUNT=1,
			
 
				+        # SPIDER
			
 
				+        SPIDER_THREAD_COUNT=1,
			
 
				+        SPIDER_SLEEP_TIME=0,
			
 
				+        SPIDER_TASK_COUNT=1,
			
 
				+        SPIDER_MAX_RETRY_TIMES=10,
			
 
				+        REQUEST_LOST_TIMEOUT=600,  # 10分钟
			
 
				+        PROXY_ENABLE=False,
			
 
				+        RETRY_FAILED_REQUESTS=False,
			
 
				+        # 保存失败的request
			
 
				+        SAVE_FAILED_REQUEST=False,
			
 
				+        # 过滤
			
 
				+        ITEM_FILTER_ENABLE=False,
			
 
				+        REQUEST_FILTER_ENABLE=False,
			
 
				+        OSS_UPLOAD_TABLES=(),
			
 
				+        DELETE_KEYS=True,
			
 
				+        ITEM_PIPELINES=[CONSOLE_PIPELINE_PATH],
			
 
				+    )
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        task_id=None,
			
 
				+        task=None,
			
 
				+        save_to_db=False,
			
 
				+        update_stask=False,
			
 
				+        *args,
			
 
				+        **kwargs,
			
 
				+    ):
			
 
				+        """
			
 
				+        @param task_id:  任务id
			
 
				+        @param task:  任务  task 与 task_id 二者选一即可
			
 
				+        @param save_to_db: 数据是否入库 默认否
			
 
				+        @param update_stask: 是否更新任务 默认否
			
 
				+        @param args:
			
 
				+        @param kwargs:
			
 
				+        """
			
 
				+        warnings.warn(
			
 
				+            "您正处于debug模式下，该模式下不会更新任务状态及数据入库，仅用于调试。正式发布前请更改为正常模式", category=Warning
			
 
				+        )
			
 
				+
			
 
				+        if not task and not task_id:
			
 
				+            raise Exception("task_id 与 task 不能同时为null")
			
 
				+
			
 
				+        kwargs["redis_key"] = kwargs["redis_key"] + "_debug"
			
 
				+        if save_to_db and not self.__class__.__custom_setting__.get("ITEM_PIPELINES"):
			
 
				+            self.__class__.__debug_custom_setting__.update(
			
 
				+                ITEM_PIPELINES=[MYSQL_PIPELINE_PATH]
			
 
				+            )
			
 
				+        self.__class__.__custom_setting__.update(
			
 
				+            self.__class__.__debug_custom_setting__
			
 
				+        )
			
 
				+
			
 
				+        super(DebugBatchSpider, self).__init__(*args, **kwargs)
			
 
				+
			
 
				+        self._task_id = task_id
			
 
				+        self._task = task
			
 
				+        self._update_task = update_stask
			
 
				+
			
 
				+    def start_monitor_task(self):
			
 
				+        """
			
 
				+        @summary: 监控任务状态
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+        if not self._parsers:  # 不是多模版模式， 将自己注入到parsers，自己为模版
			
 
				+            self._is_more_parsers = False
			
 
				+            self._parsers.append(self)
			
 
				+
			
 
				+        elif len(self._parsers) <= 1:
			
 
				+            self._is_more_parsers = False
			
 
				+
			
 
				+        if self._task:
			
 
				+            self.distribute_task([self._task])
			
 
				+        else:
			
 
				+            tasks = self.get_todo_task_from_mysql()
			
 
				+            if not tasks:
			
 
				+                raise Exception("未获取到任务 请检查 task_id: {} 是否存在".format(self._task_id))
			
 
				+            self.distribute_task(tasks)
			
 
				+
			
 
				+        os.environ.setdefault("batch_date", "1970-00-00")
			
 
				+        log.debug("下发任务完毕")
			
 
				+
			
 
				+    def get_todo_task_from_mysql(self):
			
 
				+        """
			
 
				+        @summary: 取待做的任务
			
 
				+        ---------
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+
			
 
				+        # 查询任务
			
 
				+        task_keys = ", ".join([f"`{key}`" for key in self._task_keys])
			
 
				+        sql = "select %s from %s where id=%s" % (
			
 
				+            task_keys,
			
 
				+            self._task_table,
			
 
				+            self._task_id,
			
 
				+        )
			
 
				+        tasks = self._mysqldb.find(sql)
			
 
				+
			
 
				+        return tasks
			
 
				+
			
 
				+    def save_cached(self, request, response, table):
			
 
				+        pass
			
 
				+
			
 
				+    def update_task_state(self, task_id, state=1, *args, **kwargs):
			
 
				+        """
			
 
				+        @summary: 更新任务表中任务状态，做完每个任务时代码逻辑中要主动调用。可能会重写
			
 
				+        调用方法为 yield lambda : self.update_task_state(task_id, state)
			
 
				+        ---------
			
 
				+        @param task_id:
			
 
				+        @param state:
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+        if self._update_task:
			
 
				+            kwargs["id"] = task_id
			
 
				+            kwargs[self._task_state] = state
			
 
				+
			
 
				+            sql = tools.make_update_sql(
			
 
				+                self._task_table,
			
 
				+                kwargs,
			
 
				+                condition="id = {task_id}".format(task_id=task_id),
			
 
				+            )
			
 
				+
			
 
				+            if self._mysqldb.update(sql):
			
 
				+                log.debug("置任务%s状态成功" % task_id)
			
 
				+            else:
			
 
				+                log.error("置任务%s状态失败  sql=%s" % (task_id, sql))
			
 
				+
			
 
				+    def update_task_batch(self, task_id, state=1, *args, **kwargs):
			
 
				+        """
			
 
				+        批量更新任务 多处调用，更新的字段必须一致
			
 
				+        注意：需要 写成 yield update_task_batch(...) 否则不会更新
			
 
				+        @param task_id:
			
 
				+        @param state:
			
 
				+        @param kwargs:
			
 
				+        @return:
			
 
				+        """
			
 
				+        if self._update_task:
			
 
				+            kwargs["id"] = task_id
			
 
				+            kwargs[self._task_state] = state
			
 
				+
			
 
				+            update_item = UpdateItem(**kwargs)
			
 
				+            update_item.table_name = self._task_table
			
 
				+            update_item.name_underline = self._task_table + "_item"
			
 
				+
			
 
				+            return update_item
			
 
				+
			
 
				+    def delete_tables(self, delete_tables_list):
			
 
				+        if isinstance(delete_tables_list, bool):
			
 
				+            delete_tables_list = [self._redis_key + "*"]
			
 
				+        elif not isinstance(delete_tables_list, (list, tuple)):
			
 
				+            delete_tables_list = [delete_tables_list]
			
 
				+
			
 
				+        redis = RedisDB()
			
 
				+        for delete_tab in delete_tables_list:
			
 
				+            if delete_tab == "*":
			
 
				+                delete_tab = self._redis_key + "*"
			
 
				+
			
 
				+            tables = redis.getkeys(delete_tab)
			
 
				+            for table in tables:
			
 
				+                log.info("正在删除表 %s" % table)
			
 
				+                redis.clear(table)
			
 
				+
			
 
				+    def run(self):
			
 
				+        self.start_monitor_task()
			
 
				+
			
 
				+        if not self._parsers:  # 不是add_parser 模式
			
 
				+            self._parsers.append(self)
			
 
				+
			
 
				+        self._start()
			
 
				+
			
 
				+        while True:
			
 
				+            try:
			
 
				+                if self.all_thread_is_done():
			
 
				+                    self._stop_all_thread()
			
 
				+                    break
			
 
				+
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+
			
 
				+            tools.delay_time(1)  # 1秒钟检查一次爬虫状态
			
 
				+
			
 
				+        self.delete_tables([self._redis_key + "*"])
			
 
				+
			
 
				+    def record_spider_state(
			
 
				+        self,
			
 
				+        spider_type,
			
 
				+        state,
			
 
				+        batch_date=None,
			
 
				+        spider_start_time=None,
			
 
				+        spider_end_time=None,
			
 
				+        batch_interval=None,
			
 
				+    ):
			
 
				+        pass
			
--- a/FworkSpider/feapder/core/spiders/spider.py
+++ b/FworkSpider/feapder/core/spiders/spider.py
@@ -0,0 +1,437 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/4/22 12:05 AM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
import time
import warnings

try:
    # The ABC aliases were removed from ``collections`` in Python 3.10.
    from collections.abc import Iterable
except ImportError:  # interpreters without collections.abc
    from collections import Iterable

import feapder.setting as setting
import feapder.utils.tools as tools
from feapder.core.base_parser import BaseParser
from feapder.core.scheduler import Scheduler
from feapder.db.redisdb import RedisDB
from feapder.network.item import Item
from feapder.network.request import Request
from feapder.utils.log import log
			
 
				+
			
 
				+CONSOLE_PIPELINE_PATH = "feapder.pipelines.console_pipeline.ConsolePipeline"
			
 
				+
			
 
				+
			
 
				+class Spider(
			
 
				+    BaseParser, Scheduler
			
 
				+):  # threading 中有name函数， 必须先继承BaseParser 否则其内部的name会被Schedule的基类threading.Thread的name覆盖
			
 
				+    """
			
 
				+    @summary: 为了简化搭建爬虫
			
 
				+    ---------
			
 
				+    """
			
 
				+
			
 
    def __init__(
        self,
        redis_key=None,
        min_task_count=1,
        check_task_interval=5,
        thread_count=None,
        begin_callback=None,
        end_callback=None,
        delete_keys=(),
        keep_alive=None,
        auto_start_requests=None,
        batch_interval=0,
        wait_lock=True,
        **kwargs
    ):
        """
        @summary: Spider
        ---------
        @param redis_key: prefix of the redis keys under which tasks and other data are stored
        @param min_task_count: minimum number of queued tasks; new tasks are added only when
            fewer remain. Default 1. Takes effect in start_monitor_task mode
        @param check_task_interval: interval between checks for remaining tasks; default 5 seconds
        @param thread_count: number of threads; defaults to the value in the settings file
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider ends
        @param delete_keys: keys deleted when the spider starts; tuple / bool / string.
            Regex is supported. Commonly used to empty the task queue, otherwise a
            restart resumes from where the previous run stopped
        @param keep_alive: whether the spider stays resident
        @param auto_start_requests: whether the spider adds tasks automatically
        @param batch_interval: crawl interval in days, default 0. On repeated launches the
            spider only starts when the time since the first crawl ended exceeds this interval
        @param wait_lock: whether to wait for the lock when dispatching tasks. Without the lock
            several processes may dispatch the same tasks, so set it to True when distributed
        ---------
        @result:
        """
        # Everything except the two polling options below is handed to the base initialiser.
        scheduler_options = dict(
            redis_key=redis_key,
            thread_count=thread_count,
            begin_callback=begin_callback,
            end_callback=end_callback,
            delete_keys=delete_keys,
            keep_alive=keep_alive,
            auto_start_requests=auto_start_requests,
            batch_interval=batch_interval,
            wait_lock=wait_lock,
        )
        scheduler_options.update(kwargs)
        super(Spider, self).__init__(**scheduler_options)

        self._min_task_count = min_task_count
        self._check_task_interval = check_task_interval

        self._is_distributed_task = False
        self._is_show_not_task = False
			
 
				+
			
 
    def start_monitor_task(self, *args, **kws):
        """
        @summary: Watch the redis request queue and dispatch new tasks when it runs low
        ---------
        @param args: positional arguments forwarded to distribute_task
        @param kws: keyword arguments forwarded to distribute_task
        ---------
        @result: None. Returns immediately when is_reach_next_spider_time() is false;
            runs a single check unless keep_alive is set
        """
        if not self.is_reach_next_spider_time():
            return

        self._auto_start_requests = False
        redis_client = RedisDB()

        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        while True:
            try:
                # Count the requests still waiting in redis.
                queue_name = setting.TAB_REQUSETS.format(redis_key=self._redis_key)
                pending = redis_client.zget_count(queue_name)

                if pending >= self._min_task_count:
                    log.info("redis 中尚有%s条积压任务，暂时不派发新任务" % pending)
                else:
                    # Queue is below the threshold: make start requests.
                    self.distribute_task(*args, **kws)

            except Exception as err:
                log.exception(err)

            if self._keep_alive:
                time.sleep(self._check_task_interval)
                continue
            break
			
 
				+
			
 
    def distribute_task(self, *args, **kws):
        """
        @summary: Dispatch tasks and push whatever start_requests yields into the buffers
        ---------
        @param args: positional arguments forwarded to each parser's start_requests
        @param kws: keyword arguments forwarded to each parser's start_requests
        ---------
        @result: None. Raises Exception when start_requests returns a non-iterable
            value and TypeError when it yields an unsupported object
        """
        self._is_distributed_task = False

        for parser in self._parsers:
            produced = parser.start_requests(*args, **kws)
            if produced and not isinstance(produced, Iterable):
                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))

            # A callable follows whichever kind (Request or Item) was yielded before it;
            # before anything was yielded it is routed like a Request.
            follows_request = True
            for entry in produced or []:
                if isinstance(entry, Request):
                    entry.parser_name = entry.parser_name or parser.name
                    self._request_buffer.put_request(entry)
                    self._is_distributed_task = True
                    follows_request = True
                elif isinstance(entry, Item):
                    self._item_buffer.put_item(entry)
                    follows_request = False
                elif callable(entry):  # a callable may be a database-update function
                    if follows_request:
                        self._request_buffer.put_request(entry)
                    else:
                        self._item_buffer.put_item(entry)
                else:
                    raise TypeError(
                        "start_requests yield result type error, expect Request、Item、callback func, bug get type: {}".format(
                            type(entry)
                        )
                    )

            self._request_buffer.flush()
            self._item_buffer.flush()

        if self._is_distributed_task:  # announce the start only when tasks were added
            self.spider_begin()
            self.record_spider_state(
                spider_type=1,
                state=0,
                batch_date=tools.get_current_date(),
                spider_start_time=tools.get_current_date(),
                batch_interval=self._batch_interval,
            )
            # Allow the "no task" message to be shown again later.
            self._is_show_not_task = False
            return

        if not self._is_show_not_task:  # no tasks, and not reported yet
            msg = "《%s》start_requests无任务添加" % (self._spider_name)
            log.info(msg)

            # self.send_msg(msg)

            self._is_show_not_task = True
			
 
				+
			
 
    def run(self):
        """
        @summary: Start the spider and supervise it until it is allowed to stop
        ---------
        Does nothing when is_reach_next_spider_time() is false. Otherwise the workers
        are started and the spider state is polled once per second; the end of a
        round is reported once, and the loop exits only when keep_alive is not set.
        ---------
        @result: None
        """
        if not self.is_reach_next_spider_time():
            return

        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        self._start()

        while True:
            try:
                if not self.all_thread_is_done():
                    # Work is in progress again, so the next completion is reported anew.
                    self._is_notify_end = False
                else:
                    if not self._is_notify_end:
                        self.spider_end()  # one round finished
                        self.record_spider_state(
                            spider_type=1,
                            state=1,
                            spider_end_time=tools.get_current_date(),
                            batch_interval=self._batch_interval,
                        )
                        self._is_notify_end = True

                    if not self._keep_alive:
                        self._stop_all_thread()
                        break

                self.check_task_status()
            except Exception as err:
                log.exception(err)

            tools.delay_time(1)  # check the spider status once per second
			
 
				+
			
 
    @classmethod
    def to_DebugSpider(cls, *args, **kwargs):
        """
        @summary: Build a DebugSpider that inherits from this spider class
        ---------
        Rebinds the base class and the name of the module-level DebugSpider class
        in place, then instantiates it.
        ---------
        @param args: positional arguments passed to the DebugSpider constructor
        @param kwargs: keyword arguments passed to the DebugSpider constructor
        ---------
        @result: the new DebugSpider instance
        """
        debug_cls = DebugSpider
        # Make DebugSpider inherit from cls.
        debug_cls.__bases__ = (cls,)
        debug_cls.__name__ = cls.__name__
        return debug_cls(*args, **kwargs)
			
 
				+
			
 
				+
			
 
				+class DebugSpider(Spider):
			
 
				+    """
			
 
				+    Debug爬虫
			
 
				+    """
			
 
				+
			
 
    # Setting overrides for debug mode; __init__ merges them into __custom_setting__.
    __debug_custom_setting__ = {
        "COLLECTOR_SLEEP_TIME": 1,
        "COLLECTOR_TASK_COUNT": 1,
        # SPIDER
        "SPIDER_THREAD_COUNT": 1,
        "SPIDER_SLEEP_TIME": 0,
        "SPIDER_TASK_COUNT": 1,
        "SPIDER_MAX_RETRY_TIMES": 10,
        "REQUEST_LOST_TIMEOUT": 600,  # 10 minutes
        "PROXY_ENABLE": False,
        "RETRY_FAILED_REQUESTS": False,
        # saving of failed requests
        "SAVE_FAILED_REQUEST": False,
        # filtering
        "ITEM_FILTER_ENABLE": False,
        "REQUEST_FILTER_ENABLE": False,
        "OSS_UPLOAD_TABLES": (),
        "DELETE_KEYS": True,
        "ITEM_PIPELINES": [CONSOLE_PIPELINE_PATH],
    }
			
 
				+
			
 
    def __init__(self, request=None, request_dict=None, *args, **kwargs):
        """
        @param request: a Request object
        @param request_dict: a request as a dict. Only one of request and request_dict is needed
        @param kwargs: forwarded to the parent initialiser; "redis_key" is read from here
            and gets the suffix "_debug"
        """
        warnings.warn(
            "您正处于debug模式下，该模式下不会更新任务状态及数据入库，仅用于调试。正式发布前请更改为正常模式", category=Warning
        )

        if not (request or request_dict):
            raise Exception("request 与 request_dict 不能同时为null")

        kwargs["redis_key"] += "_debug"

        # The debug overrides are merged into the class-level settings dict in place.
        spider_cls = self.__class__
        spider_cls.__custom_setting__.update(spider_cls.__debug_custom_setting__)

        super(DebugSpider, self).__init__(*args, **kwargs)

        if request:
            self._request = request
        else:
            self._request = Request.from_dict(request_dict)
			
 
				+
			
 
    def save_cached(self, request, response, table):
        """No-op override: all three arguments are ignored and nothing is saved."""
        return None
			
 
				+
			
 
    def delete_tables(self, delete_tables_list):
        """
        @summary: Clear the redis keys matched by the given pattern(s)
        ---------
        @param delete_tables_list: key pattern(s) to clear.
            A bool (either value) selects every key prefixed with this spider's
            redis key; a list/tuple is used as given; any other value is treated
            as a single pattern. The pattern "*" is narrowed to this spider's keys.
        ---------
        @result: None
        """
        if isinstance(delete_tables_list, bool):
            patterns = [self._redis_key + "*"]
        elif isinstance(delete_tables_list, (list, tuple)):
            patterns = delete_tables_list
        else:
            patterns = [delete_tables_list]

        redis_client = RedisDB()
        for pattern in patterns:
            # A bare "*" is never sent as-is: it is scoped to this spider's key prefix.
            lookup = self._redis_key + "*" if pattern == "*" else pattern
            for key in redis_client.getkeys(lookup):
                log.info("正在删除表 %s" % key)
                redis_client.clear(key)
			
 
				+
			
 
    def __start_requests(self):
        """Yield the single request stored by __init__ as the only start request."""
        debug_request = self._request
        yield debug_request
			
 
				+
			
 
    def distribute_task(self, *args, **kws):
        """
        @summary: Dispatch the debug request and push it into the buffers
        ---------
        @param args: accepted for signature compatibility with Spider.distribute_task
            (the inherited start_monitor_task forwards its arguments here); ignored,
            because debug mode only replays the stored request
        @param kws: accepted and ignored, for the same reason
        ---------
        @result: None. Raises Exception when a parser's start requests are not iterable
        """
        self._is_distributed_task = False

        for parser in self._parsers:
            requests = parser.__start_requests()
            if requests and not isinstance(requests, Iterable):
                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))

            # A callable follows whichever kind (Request=1 / Item=2) was yielded before it.
            result_type = 1
            for request in requests or []:
                if isinstance(request, Request):
                    request.parser_name = request.parser_name or parser.name
                    self._request_buffer.put_request(request)

                    self._is_distributed_task = True
                    result_type = 1

                elif isinstance(request, Item):
                    self._item_buffer.put_item(request)
                    result_type = 2

                elif callable(request):  # a callable may be a database-update function
                    if result_type == 1:
                        self._request_buffer.put_request(request)
                    else:
                        self._item_buffer.put_item(request)

            self._request_buffer.flush()
            self._item_buffer.flush()

        if self._is_distributed_task:  # announce the start only when tasks were added
            # begin
            self.spider_begin()
            self.record_spider_state(
                spider_type=1,
                state=0,
                batch_date=tools.get_current_date(),
                spider_start_time=tools.get_current_date(),
                batch_interval=self._batch_interval,
            )

            # Allow the "no task" message to be shown again later.
            self._is_show_not_task = False

        elif not self._is_show_not_task:  # no tasks, and not reported yet
            msg = "《%s》start_requests无任务添加" % (self._spider_name)
            log.info(msg)

            # self.send_msg(msg)

            self._is_show_not_task = True
			
 
				+
			
 
    def record_spider_state(
        self,
        spider_type,
        state,
        batch_date=None,
        spider_start_time=None,
        spider_end_time=None,
        batch_interval=None,
    ):
        """
        @summary: No-op override; every argument is accepted and ignored
        ---------
        @result: None
        """
        return None
			
 
				+
			
 
    def _start(self):
        """
        @summary: Queue the debug request(s) and start the collector, workers and buffers
        ---------
        @result: None. Raises Exception when a parser's start requests are not iterable
        """
        # Run the parsers' start requests.
        self.spider_begin()  # for spiders that never stop by themselves this runs only once

        for parser in self._parsers:
            produced = parser.__start_requests()
            # Requests go to the request buffer, which stores them in one place.
            if produced and not isinstance(produced, Iterable):
                raise Exception("%s.%s返回值必须可迭代" % (parser.name, "start_requests"))

            # A callable follows whichever kind (Request or Item) was yielded before it.
            follows_request = True
            for entry in produced or []:
                if isinstance(entry, Request):
                    entry.parser_name = entry.parser_name or parser.name
                    self._request_buffer.put_request(entry)
                    follows_request = True
                elif isinstance(entry, Item):
                    self._item_buffer.put_item(entry)
                    follows_request = False
                elif callable(entry):  # a callable may be a database-update function
                    if follows_request:
                        self._request_buffer.put_request(entry)
                    else:
                        self._item_buffer.put_item(entry)

            self._request_buffer.flush()
            self._item_buffer.flush()

        # Start the collector.
        self._collector.start()

        # Start one parser control per configured thread.
        for _ in range(self._thread_count):
            control = self._parser_control_obj(
                self._collector,
                self._redis_key,
                self._request_buffer,
                self._item_buffer,
            )

            for parser in self._parsers:
                control.add_parser(parser)

            control.start()
            self._parser_controls.append(control)

        # Start the request buffer, then the item buffer.
        self._request_buffer.start()
        self._item_buffer.start()
			
 
				+
			
 
    def run(self):
        """
        @summary: Run the debug spider once and clean up its redis keys afterwards
        ---------
        Registers the spider itself as the only parser when none was added, starts
        the workers through _start(), then polls once per second until
        all_thread_is_done() reports completion.
        ---------
        @result: None
        """
        if not self._parsers:  # not in add_parser mode
            self._parsers.append(self)

        self._start()

        finished = False
        while not finished:
            try:
                if self.all_thread_is_done():
                    self._stop_all_thread()
                    finished = True
            except Exception as err:
                # A failing status check is logged and retried on the next tick.
                log.exception(err)

            if not finished:
                tools.delay_time(1)  # check the spider status once per second

        self.delete_tables([self._redis_key + "*"])
			
--- a/FworkSpider/feapder/db/__init__.py
+++ b/FworkSpider/feapder/db/__init__.py
@@ -0,0 +1,9 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/4/23 12:09 AM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
--- a/FworkSpider/feapder/db/memory_db.py
+++ b/FworkSpider/feapder/db/memory_db.py
@@ -0,0 +1,37 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/4/21 11:42 PM
			
 
				+---------
			
 
				+@summary: 基于内存的队列，代替redis
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+from queue import PriorityQueue
			
 
				+
			
 
				+
			
 
				+class MemoryDB:
			
 
				+    def __init__(self):
			
 
				+        self.priority_queue = PriorityQueue()
			
 
				+
			
 
				+    def add(self, item):
			
 
				+        """
			
 
				+        添加任务
			
 
				+        :param item: 数据: 支持小于号比较的类 或者 （priority, item）
			
 
				+        :return:
			
 
				+        """
			
 
				+        self.priority_queue.put(item)
			
 
				+
			
 
				+    def get(self):
			
 
				+        """
			
 
				+        获取任务
			
 
				+        :return:
			
 
				+        """
			
 
				+        try:
			
 
				+            item = self.priority_queue.get_nowait()
			
 
				+            return item
			
 
				+        except:
			
 
				+            return
			
 
				+
			
 
				+    def empty(self):
			
 
				+        return self.priority_queue.empty()
			
--- a/FworkSpider/feapder/db/mongodb.py
+++ b/FworkSpider/feapder/db/mongodb.py
@@ -0,0 +1,426 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021-04-18 14:12:21
			
 
				+---------
			
 
				+@summary: 操作mongo数据库
			
 
				+---------
			
 
				+@author: Mkdir700
			
 
				+@email:  mkdir700@gmail.com
			
 
				+"""
			
 
				+import re
			
 
				+from typing import List, Dict, Optional
			
 
				+from urllib import parse
			
 
				+
			
 
				+import pymongo
			
 
				+from pymongo import MongoClient
			
 
				+from pymongo.collection import Collection
			
 
				+from pymongo.database import Database
			
 
				+from pymongo.errors import DuplicateKeyError, BulkWriteError
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				+class MongoDB:
			
 
    def __init__(
        self,
        ip=None,
        port=None,
        db=None,
        user_name=None,
        user_pass=None,
        url=None,
        **kwargs,
    ):
        """
        Open a MongoDB client and select the working database.

        Args:
            ip: server host; falls back to setting.MONGO_IP when no url is given
            port: server port; falls back to setting.MONGO_PORT when no url is given
            db: database name; falls back to setting.MONGO_DB when no url is given
            user_name: user name; falls back to setting.MONGO_USER_NAME when no url is given
            user_pass: password; falls back to setting.MONGO_USER_PASS when no url is given
            url: connection URI; when given it is used instead of ip/port/user_name/user_pass
            **kwargs: passed to MongoClient only when url is given
        """
        if url:
            self.client = MongoClient(url, **kwargs)
        else:
            ip = ip or setting.MONGO_IP
            port = port or setting.MONGO_PORT
            db = db or setting.MONGO_DB
            user_name = user_name or setting.MONGO_USER_NAME
            user_pass = user_pass or setting.MONGO_USER_PASS
            self.client = MongoClient(
                host=ip, port=port, username=user_name, password=user_pass
            )

        self.db = self.get_database(db)

        # Cache of index information.
        self.__index__cached = {}
			
 
				+
			
 
    @classmethod
    def from_url(cls, url, **kwargs):
        """
        Build an instance from a MongoDB connection URI.

        Args:
            url: mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
                 The DNS seed list form ``mongodb+srv://...`` is accepted as well.
                 Reference: http://mongodb.github.io/mongo-java-driver/3.4/javadoc/com/mongodb/MongoClientURI.html
            **kwargs: forwarded unchanged to the constructor

        Returns: a new instance of ``cls`` constructed with ``url=url``

        Raises:
            Exception: when the URL scheme is not a MongoDB scheme
        """
        scheme = parse.urlparse(url).scheme.strip()
        if scheme not in ("mongodb", "mongodb+srv"):
            raise Exception(
                "url error, expect mongodb://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]], but get {}".format(
                    url
                )
            )

        return cls(url=url, **kwargs)
			
 
				+
			
 
    def get_database(self, database, **kwargs) -> Database:
        """
        Get a database object from the client
        @param database: database name
        @param kwargs: forwarded to the client's get_database
        @return: the database object
        """
        database_handle = self.client.get_database(database, **kwargs)
        return database_handle
			
 
				+
			
 
    def get_collection(self, coll_name, **kwargs) -> Collection:
        """
        Get a collection object of the working database by name
        @param coll_name: collection name
        @param kwargs: forwarded to the database's get_collection
        @return: the collection object
        """
        collection_handle = self.db.get_collection(coll_name, **kwargs)
        return collection_handle
			
 
				+
			
 
    def find(
        self, coll_name: str, condition: Optional[Dict] = None, limit: int = 0, **kwargs
    ) -> List[Dict]:
        """
        @summary:
        no data:  returns []
        has data: [{'_id': 'xx', ...}, ...]
        ---------
        @param coll_name: collection (table) name
        @param condition: query filter; None means an empty filter
        @param limit: number of results
        @param kwargs:
            more fields of the find command, see
            https://docs.mongodb.com/manual/reference/command/find/#command-fields
            "batchSize" is also used for the follow-up getMore commands (default 100)
        ---------
        @result: all documents from the first batch and every following batch
        """
        query = condition if condition is not None else {}
        command = {"find": coll_name, "filter": query, "limit": limit}
        command.update(kwargs)

        reply = self.run_command(command)["cursor"]
        cursor_id = reply["id"]
        rows = reply["firstBatch"]

        # A cursor id of 0 means there is nothing left to fetch.
        while cursor_id != 0:
            reply = self.run_command(
                {
                    "getMore": cursor_id,
                    "collection": coll_name,
                    "batchSize": kwargs.get("batchSize", 100),
                }
            )["cursor"]
            cursor_id = reply["id"]
            rows.extend(reply["nextBatch"])

        return rows
			
 
				+
			
 
    def add(
        self,
        coll_name,
        data: Dict,
        replace=False,
        update_columns=(),
        update_columns_value=(),
        insert_ignore=False,
    ):
        """
        Add a single document
        Args:
            coll_name: collection name
            data: the document
            replace: overwrite the old document on a unique-index conflict, default False
            update_columns: columns to update on a unique-index conflict,
                e.g. update_columns = ["name", "title"]
            update_columns_value: values for the updated columns; when omitted the
                values of the document itself are used
            insert_ignore: whether to ignore an index conflict, default False

        Returns: always 1, also when a conflict was updated, replaced or ignored

        Raises:
            DuplicateKeyError: on a conflict when none of update_columns, replace
                and insert_ignore is set
        """
        affect_count = 1
        collection = self.get_collection(coll_name)
        try:
            collection.insert_one(data)
        except DuplicateKeyError as dup_err:
            # The conflicting document is reused below without its _id.
            data.pop("_id", "")

            if update_columns:
                # Conflict: update only the requested columns.
                if not isinstance(update_columns, (tuple, list)):
                    update_columns = [update_columns]

                condition = self.__get_update_condition(
                    coll_name, data, dup_err.details.get("errmsg")
                )

                if update_columns_value:
                    # Use the explicitly given values.
                    doc = dict(zip(update_columns, update_columns_value))
                else:
                    # Use the values of the document itself.
                    doc = {key: data[key] for key in update_columns}

                collection.update_one(condition, {"$set": doc})

            elif replace:
                # Conflict: replace the stored document as a whole.
                condition = self.__get_update_condition(
                    coll_name, data, dup_err.details.get("errmsg")
                )
                collection.replace_one(condition, data)

            elif not insert_ignore:
                raise dup_err

        return affect_count
			
 
				+
			
 
    def add_batch(
        self,
        coll_name: str,
        datas: List[Dict],
        replace=False,
        update_columns=(),
        update_columns_value=(),
        condition_fields: dict = None,
    ):
        """
        Add documents in bulk
        Args:
            coll_name: collection name
            datas: the documents [{'_id': 'xx'}, ... ]
            replace: overwrite the old document on a unique-index conflict, default False
            update_columns: columns to update when the unique index already exists,
                e.g. update_columns = ["name", "title"]
            update_columns_value: values for the updated columns; when omitted the
                values of the document itself are used
            condition_fields: fields used to look up the existing document; when
                omitted the fields of the conflicting index are used

        Returns: number of added documents, not counting updated ones
        """
        add_count = 0

        if not datas:
            return add_count

        collection = self.get_collection(coll_name)
        # Wrap a single column name into a list. An empty/None value stays falsy so
        # it does not select the "update columns" branch with a bogus key.
        if update_columns and not isinstance(update_columns, (tuple, list)):
            update_columns = [update_columns]

        try:
            add_count = len(datas)
            collection.insert_many(datas, ordered=False)
        except BulkWriteError as e:
            write_errors = e.details.get("writeErrors") or []
            for error in write_errors:
                if error.get("code") != 11000:
                    # Only duplicate-key errors are handled here.
                    continue

                # The duplicated document, reused below without its _id.
                data = error.get("op")
                data.pop("_id", "")

                def get_condition():
                    # Build the filter that locates the already stored document.
                    if condition_fields:
                        condition = {
                            condition_field: data[condition_field]
                            for condition_field in condition_fields
                        }
                    else:
                        # Derive the filter from the duplicated index values.
                        condition = self.__get_update_condition(
                            coll_name, data, error.get("errmsg")
                        )

                    return condition

                if update_columns:
                    # Update only the requested columns.
                    if update_columns_value:
                        # Use the explicitly given values.
                        doc = dict(zip(update_columns, update_columns_value))
                    else:
                        # Use the values of the document itself, for EVERY column
                        # (previously each column overwrote the one before it).
                        doc = {key: data.get(key) for key in update_columns}

                    collection.update_one(get_condition(), {"$set": doc})
                    add_count -= 1

                elif replace:
                    # Replace the stored document as a whole.
                    collection.replace_one(get_condition(), data)
                    add_count -= 1

                else:
                    # log.error(error)
                    add_count -= 1

        return add_count
			
 
				+
			
 
    def count(self, coll_name, condition: Optional[Dict], limit=0, **kwargs):
        """
        Count documents
        @param coll_name: collection name
        @param condition: query filter
        @param limit: maximum number to count
        @param kwargs: further fields of the count command
        ----
        command = {
          count: <collection or view>,
          query: <document>,
          limit: <integer>,
          skip: <integer>,
          hint: <hint>,
          readConcern: <document>,
          collation: <document>,
          comment: <any>
        }
        https://docs.mongodb.com/manual/reference/command/count/#mongodb-dbcommand-dbcmd.count
        @return: number of documents
        """
        command = dict(count=coll_name, query=condition, limit=limit)
        # Extra fields are applied last, so they may override the ones above.
        command.update(kwargs)
        reply = self.run_command(command)
        return reply["n"]
			
 
				+
			
 
    def update(self, coll_name, data: Dict, condition: Dict, upsert: bool = False):
        """
        Apply a ``$set`` update to the first document matching ``condition``.

        Args:
            coll_name: collection name
            data: fields to set, e.g. {"xxx": "xxx"}
            condition: filter selecting the document, e.g. {"_id": "xxxx"}
            upsert: insert the document when nothing matches (default False)

        Returns: True when the call completed, False when it raised
        """
        try:
            self.get_collection(coll_name).update_one(
                condition, {"$set": data}, upsert=upsert
            )
        except Exception as exc:
            # Failures are logged and reported through the return value.
            message = """
                error:{}
                condition: {}
            """.format(exc, condition)
            log.error(message)
            return False
        return True
			
 
				+
			
 
    def delete(self, coll_name, condition: Dict) -> bool:
        """
        Delete the first document matching ``condition``.

        Args:
            coll_name: collection name
            condition: filter selecting the document to remove

        Returns: True when the call completed, False when it raised
        """
        try:
            self.get_collection(coll_name).delete_one(condition)
        except Exception as exc:
            # Failures are logged and reported through the return value.
            message = """
                error:{}
                condition: {}
            """.format(exc, condition)
            log.error(message)
            return False
        return True
			
 
				+
			
 
    def run_command(self, command: Dict):
        """
        Send a raw database command through ``self.db``.

        Command reference:
        https://www.geek-book.com/src/docs/mongodb/mongodb/docs.mongodb.com/manual/reference/command/index.html

        Args:
            command: command document

        Returns: whatever ``self.db.command`` returns
        """
        database = self.db
        return database.command(command)
			
 
				+
			
 
    def create_index(self, coll_name, keys, unique=True):
        """
        Create one ascending index spanning all of ``keys``.

        Args:
            coll_name: collection name
            keys: iterable of field names, in index order
            unique: whether the index enforces uniqueness (default True)
        """
        target = self.get_collection(coll_name)
        index_spec = []
        for field in keys:
            index_spec.append((field, pymongo.ASCENDING))
        target.create_index(index_spec, unique=unique)
			
 
				+
			
 
    def get_index(self, coll_name):
        """Return ``index_information()`` for the collection ``coll_name``."""
        target = self.get_collection(coll_name)
        return target.index_information()
			
 
				+
			
 
    def drop_collection(self, coll_name):
        """Drop the collection ``coll_name`` and return the driver's reply."""
        database = self.db
        return database.drop_collection(coll_name)
			
 
				+
			
 
    def get_index_key(self, coll_name, index_name):
        """
        Return the field names that make up an index, with caching.

        Args:
            coll_name: collection name
            index_name: name of the index to look up

        Returns: list of field names, in index order

        Raises:
            Exception: the collection has no index called ``index_name``
        """
        cache = self.__index__cached
        lookup = f"{coll_name}:{index_name}"
        if lookup in cache:
            return cache[lookup]

        detail = self.get_index(coll_name).get(index_name)
        if not detail:
            raise Exception(
                f"not found index {index_name} in collection {coll_name}"
            )

        # Each entry of "key" is a (field, direction) pair; keep the field.
        fields = [pair[0] for pair in detail.get("key")]
        cache[lookup] = fields
        return fields
			
 
				+
			
 
				+    def __get_update_condition(
			
 
				+        self, coll_name: str, data: dict, duplicate_errmsg: str
			
 
				+    ) -> dict:
			
 
				+        """
			
 
				+        根据索引冲突的报错信息 获取更新条件
			
 
				+        Args:
			
 
				+            duplicate_errmsg: E11000 duplicate key error collection: feapder.test index: a_1_b_1 dup key: { : 1, : "你好" }
			
 
				+            data: {"a": 1, "b": "你好", "c": "嘻嘻"}
			
 
				+
			
 
				+        Returns: {"a": 1, "b": "你好"}
			
 
				+
			
 
				+        """
			
 
				+        index_name = re.search(r"index: (\w+)", duplicate_errmsg).group(1)
			
 
				+        index_keys = self.get_index_key(coll_name, index_name)
			
 
				+
			
 
				+        condition = {key: data.get(key) for key in index_keys}
			
 
				+        return condition
			
 
				+
			
 
				+    def __getattr__(self, name):
			
 
				+        return getattr(self.db, name)
			
--- a/FworkSpider/feapder/db/mysqldb.py
+++ b/FworkSpider/feapder/db/mysqldb.py
@@ -0,0 +1,381 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2016-11-16 16:25
			
 
				+---------
			
 
				+@summary: MySQL database operations
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+import datetime
			
 
				+import json
			
 
				+from urllib import parse
			
 
				+from typing import List, Dict
			
 
				+
			
 
				+import pymysql
			
 
				+from dbutils.pooled_db import PooledDB
			
 
				+from pymysql import cursors
			
 
				+from pymysql import err
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.tools import make_insert_sql, make_batch_sql, make_update_sql
			
 
				+
			
 
				+
			
 
				+def auto_retry(func):
			
 
				+    def wapper(*args, **kwargs):
			
 
				+        for i in range(3):
			
 
				+            try:
			
 
				+                return func(*args, **kwargs)
			
 
				+            except (err.InterfaceError, err.OperationalError) as e:
			
 
				+                log.error(
			
 
				+                    """
			
 
				+                    error:%s
			
 
				+                    sql:  %s
			
 
				+                    """
			
 
				+                    % (e, kwargs.get("sql") or args[1])
			
 
				+                )
			
 
				+
			
 
				+    return wapper
			
 
				+
			
 
				+
			
 
				+class MysqlDB:
			
 
    def __init__(
        self, ip=None, port=None, db=None, user_name=None, user_pass=None, **kwargs
    ):
        """
        Create the pooled MySQL connection.

        Any falsy argument falls back to the matching value in
        ``feapder.setting``. A failure to build the pool is logged and not
        re-raised; in that case ``self.connect_pool`` is never assigned.

        Args:
            ip: server host
            port: server port
            db: database name
            user_name: login user
            user_pass: login password
            **kwargs: accepted for call compatibility; not used here
        """
        # Settings may be modified at runtime, so they are read here rather
        # than bound as default argument values.
        if not ip:
            ip = setting.MYSQL_IP
        if not port:
            port = setting.MYSQL_PORT
        if not db:
            db = setting.MYSQL_DB
        if not user_name:
            user_name = setting.MYSQL_USER_NAME
        if not user_pass:
            user_pass = setting.MYSQL_USER_PASS

        try:
            # A server-side cursor class is used because, per the original
            # note, the default cursor makes memory grow during large
            # multi-threaded bulk inserts.
            self.connect_pool = PooledDB(
                creator=pymysql,
                mincached=1,
                maxcached=100,
                maxconnections=100,
                blocking=True,
                ping=7,
                host=ip,
                port=port,
                user=user_name,
                passwd=user_pass,
                db=db,
                charset="utf8mb4",
                cursorclass=cursors.SSCursor,
            )

        except Exception as e:
            # Never write the real password to the log; only show whether
            # one was supplied.
            masked_pass = "******" if user_pass else user_pass
            log.error(
                """
            连接数据失败：
            ip: {}
            port: {}
            db: {}
            user_name: {}
            user_pass: {}
            exception: {}
            """.format(
                    ip, port, db, user_name, masked_pass, e
                )
            )
        else:
            log.debug("连接到mysql数据库 %s : %s" % (ip, db))
			
 
				+
			
 
				+    @classmethod
			
 
				+    def from_url(cls, url, **kwargs):
			
 
				+        # mysql://username:password@ip:port/db?charset=utf8mb4
			
 
				+        url_parsed = parse.urlparse(url)
			
 
				+
			
 
				+        db_type = url_parsed.scheme.strip()
			
 
				+        if db_type != "mysql":
			
 
				+            raise Exception(
			
 
				+                "url error, expect mysql://username:ip:port/db?charset=utf8mb4, but get {}".format(
			
 
				+                    url
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        connect_params = {}
			
 
				+        connect_params["ip"] = url_parsed.hostname.strip()
			
 
				+        connect_params["port"] = url_parsed.port
			
 
				+        connect_params["user_name"] = url_parsed.username.strip()
			
 
				+        connect_params["user_pass"] = url_parsed.password.strip()
			
 
				+        connect_params["db"] = url_parsed.path.strip("/").strip()
			
 
				+
			
 
				+        connect_params.update(kwargs)
			
 
				+
			
 
				+        return cls(**connect_params)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def unescape_string(value):
			
 
				+        if not isinstance(value, str):
			
 
				+            return value
			
 
				+
			
 
				+        value = value.replace("\\0", "\0")
			
 
				+        value = value.replace("\\\\", "\\")
			
 
				+        value = value.replace("\\n", "\n")
			
 
				+        value = value.replace("\\r", "\r")
			
 
				+        value = value.replace("\\Z", "\032")
			
 
				+        value = value.replace('\\"', '"')
			
 
				+        value = value.replace("\\'", "'")
			
 
				+
			
 
				+        return value
			
 
				+
			
 
    def get_connection(self):
        """
        Take a dedicated (non-shared) connection from the pool.

        Returns: tuple ``(conn, cursor)``; the cursor is opened with the
            pool's configured cursor class
        """
        pooled_conn = self.connect_pool.connection(shareable=False)
        return pooled_conn, pooled_conn.cursor()
			
 
				+
			
 
    def close_connection(self, conn, cursor):
        """
        Close the cursor, then close the connection.

        The connection is closed even when closing the cursor raises, so a
        failing cursor cannot leave the connection open. The cursor's
        exception still propagates to the caller.

        Args:
            conn: connection obtained from ``get_connection``
            cursor: cursor obtained from ``get_connection``
        """
        try:
            cursor.close()
        finally:
            conn.close()
			
 
				+
			
 
    def size_of_connections(self):
        """
        Number of currently active connections.

        Returns: the pool's private ``_connections`` value
        """
        pool = self.connect_pool
        return pool._connections
			
 
				+
			
 
    def size_of_connect_pool(self):
        """
        Number of connections held in the pool.

        Returns: length of the pool's private ``_idle_cache``
            (NOTE(review): the attribute name suggests this counts idle
            connections only — confirm against the pool implementation)
        """
        idle = self.connect_pool._idle_cache
        return len(idle)
			
 
				+
			
 
    @auto_retry
    def find(self, sql, limit=0, to_json=False):
        """
        Run a query and return its rows.

        Without ``to_json``: ``limit == 1`` returns a single row
        ``(data1, data2)`` or None when there is none; ``limit > 1``
        returns at most ``limit`` rows; otherwise all rows are returned.
        The limit is applied while fetching, not in SQL, so prefer a SQL
        LIMIT clause for large results.

        With ``to_json``: rows become dicts keyed by column name. Date and
        time values are converted with ``str()``; strings starting with
        ``{`` or ``[`` are parsed as JSON when possible. ``limit == 1``
        returns one dict (or None when no row matched), otherwise a list
        of dicts.

        Args:
            sql: query text
            limit: 1 for one row, >1 for that many rows, else all rows
            to_json: convert rows to dicts as described above

        Returns: rows in the shape described above
        """
        conn, cursor = self.get_connection()

        try:
            cursor.execute(sql)

            if limit == 1:
                result = cursor.fetchone()
            elif limit > 1:
                result = cursor.fetchmany(limit)
            else:
                result = cursor.fetchall()

            if to_json:
                columns = [i[0] for i in cursor.description]

                def convert(col):
                    if isinstance(col, (datetime.date, datetime.time)):
                        return str(col)
                    if isinstance(col, str) and col.startswith(("{", "[")):
                        try:
                            return json.loads(col)
                        except Exception:
                            # Not valid JSON after all; keep the raw text.
                            return col
                    return col

                if limit == 1:
                    # fetchone() yields None when nothing matched; there
                    # is no row to convert in that case.
                    if result is not None:
                        result = dict(
                            zip(columns, [convert(col) for col in result])
                        )
                else:
                    result = [
                        dict(zip(columns, [convert(col) for col in row]))
                        for row in result
                    ]
        except Exception:
            # Release the connection before the error reaches the retry
            # decorator, otherwise each failed attempt keeps one pooled
            # connection checked out. A cleanup failure must not replace
            # the original error.
            try:
                self.close_connection(conn, cursor)
            except Exception as close_error:
                log.debug("close connection failed: %s" % close_error)
            raise

        self.close_connection(conn, cursor)

        return result
			
 
				+
			
 
    def add(self, sql, exception_callfunc=None):
        """
        Execute an insert statement and commit it.

        Errors are logged and passed to ``exception_callfunc`` when given;
        they are not re-raised.

        Args:
            sql: statement to execute
            exception_callfunc: optional callback invoked with the exception

        Returns: value returned by ``cursor.execute`` (the row count), or
            None when the statement was never executed
        """
        affect_count = None
        # Pre-bind both names so the finally block is safe even when
        # acquiring the connection itself fails.
        conn = cursor = None

        try:
            conn, cursor = self.get_connection()
            affect_count = cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(
                """
                error:%s
                sql:  %s
            """
                % (e, sql)
            )
            if exception_callfunc:
                exception_callfunc(e)
        finally:
            if conn is not None:
                self.close_connection(conn, cursor)

        return affect_count
			
 
				+
			
 
    def add_smart(self, table, data: Dict, **kwargs):
        """
        Insert one row given as a dict; the SQL is generated for you.

        Args:
            table: table name
            data: row as a dict, e.g. {"xxx": "xxx"}
            **kwargs: forwarded to ``make_insert_sql``

        Returns: result of ``add`` for the generated statement
        """
        insert_sql = make_insert_sql(table, data, **kwargs)
        affected = self.add(insert_sql)
        return affected
			
 
				+
			
 
    def add_batch(self, sql, datas: List[Dict]):
        """
        Insert many rows with one ``executemany`` call and commit.

        Errors are logged, not re-raised.

        Args:
            sql: parameterised statement, e.g.
                insert ignore into (xxx,xxx) values (%s, %s, %s)
            datas: parameter sets, one per row

        Returns: value returned by ``cursor.executemany`` (the row count),
            or None when the statement was never executed
        """
        affect_count = None
        # Pre-bind both names so the finally block is safe even when
        # acquiring the connection itself fails.
        conn = cursor = None

        try:
            conn, cursor = self.get_connection()
            affect_count = cursor.executemany(sql, datas)
            conn.commit()

        except Exception as e:
            log.error(
                """
                error:%s
                sql:  %s
                """
                % (e, sql)
            )
        finally:
            if conn is not None:
                self.close_connection(conn, cursor)

        return affect_count
			
 
				+
			
 
    def add_batch_smart(self, table, datas: List[Dict], **kwargs):
        """
        Insert many rows given as dicts; the SQL is generated for you.

        Args:
            table: table name
            datas: rows as a list of dicts, e.g. [{}, {}, {}]
            **kwargs: forwarded to ``make_batch_sql``

        Returns: result of ``add_batch`` for the generated statement
        """
        batch_sql, batch_rows = make_batch_sql(table, datas, **kwargs)
        affected = self.add_batch(batch_sql, batch_rows)
        return affected
			
 
				+
			
 
    def update(self, sql):
        """
        Execute an update statement and commit it.

        Errors are logged, not re-raised.

        Args:
            sql: statement to execute

        Returns: True on success, False when an exception occurred
        """
        # Pre-bind both names so the finally block is safe even when
        # acquiring the connection itself fails.
        conn = cursor = None
        try:
            conn, cursor = self.get_connection()
            cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(
                """
                error:%s
                sql:  %s
            """
                % (e, sql)
            )
            return False
        else:
            return True
        finally:
            if conn is not None:
                self.close_connection(conn, cursor)
			
 
				+
			
 
    def update_smart(self, table, data: Dict, condition):
        """
        Update rows from a dict; the SQL is generated for you.

        Args:
            table: table name
            data: columns to set, e.g. {"xxx": "xxx"}
            condition: text placed after WHERE, e.g. condition='status=1'

        Returns: True / False, as reported by ``update``
        """
        update_sql = make_update_sql(table, data, condition)
        succeeded = self.update(update_sql)
        return succeeded
			
 
				+
			
 
    def delete(self, sql):
        """
        Execute a delete statement and commit it.

        Errors are logged, not re-raised.

        Args:
            sql: statement to execute

        Returns: True on success, False when an exception occurred
        """
        # Pre-bind both names so the finally block is safe even when
        # acquiring the connection itself fails.
        conn = cursor = None
        try:
            conn, cursor = self.get_connection()
            cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(
                """
                error:%s
                sql:  %s
            """
                % (e, sql)
            )
            return False
        else:
            return True
        finally:
            if conn is not None:
                self.close_connection(conn, cursor)
			
 
				+
			
 
    def execute(self, sql):
        """
        Execute an arbitrary statement and commit it.

        Errors are logged, not re-raised.

        Args:
            sql: statement to execute

        Returns: True on success, False when an exception occurred
        """
        # Pre-bind both names so the finally block is safe even when
        # acquiring the connection itself fails.
        conn = cursor = None
        try:
            conn, cursor = self.get_connection()
            cursor.execute(sql)
            conn.commit()

        except Exception as e:
            log.error(
                """
                error:%s
                sql:  %s
            """
                % (e, sql)
            )
            return False
        else:
            return True
        finally:
            if conn is not None:
                self.close_connection(conn, cursor)
			
--- a/FworkSpider/feapder/db/redisdb.py
+++ b/FworkSpider/feapder/db/redisdb.py
@@ -0,0 +1,848 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2016-11-16 16:25
			
 
				+---------
			
 
				+@summary: 操作redis数据库
			
 
				+---------
			
 
				+@author: Boris
			
 
				+"""
			
 
				+
			
 
				+import time
			
 
				+
			
 
				+import redis
			
 
				+from redis._compat import unicode, long, basestring
			
 
				+from redis.connection import Encoder as _Encoder
			
 
				+from redis.exceptions import ConnectionError, TimeoutError
			
 
				+from redis.exceptions import DataError
			
 
				+from redis.sentinel import Sentinel
			
 
				+from rediscluster import RedisCluster
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				+class Encoder(_Encoder):
			
 
				+    def encode(self, value):
			
 
				+        "Return a bytestring or bytes-like representation of the value"
			
 
				+        if isinstance(value, (bytes, memoryview)):
			
 
				+            return value
			
 
				+        # elif isinstance(value, bool):
			
 
				+        #     # special case bool since it is a subclass of int
			
 
				+        #     raise DataError(
			
 
				+        #         "Invalid input of type: 'bool'. Convert to a "
			
 
				+        #         "bytes, string, int or float first."
			
 
				+        #     )
			
 
				+        elif isinstance(value, float):
			
 
				+            value = repr(value).encode()
			
 
				+        elif isinstance(value, (int, long)):
			
 
				+            # python 2 repr() on longs is '123L', so use str() instead
			
 
				+            value = str(value).encode()
			
 
				+        elif isinstance(value, (list, dict, tuple)):
			
 
				+            value = unicode(value)
			
 
				+        elif not isinstance(value, basestring):
			
 
				+            # a value we don't know how to deal with. throw an error
			
 
				+            typename = type(value).__name__
			
 
				+            raise DataError(
			
 
				+                "Invalid input of type: '%s'. Convert to a "
			
 
				+                "bytes, string, int or float first." % typename
			
 
				+            )
			
 
				+        if isinstance(value, unicode):
			
 
				+            value = value.encode(self.encoding, self.encoding_errors)
			
 
				+        return value
			
 
				+
			
 
				+
			
 
				+redis.connection.Encoder = Encoder
			
 
				+
			
 
				+
			
 
				+class RedisDB:
			
 
    def __init__(
        self,
        ip_ports=None,
        db=None,
        user_pass=None,
        url=None,
        decode_responses=True,
        service_name=None,
        max_connections=32,
        **kwargs,
    ):
        """
        Thin wrapper around a redis client.

        Args:
            ip_ports: "ip:port"; several may be given comma separated
                ("ip1:port1,ip2:port2") or as a list
                (["ip1:port1", "ip2:port2"])
            db: database index
            user_pass: password
            url: connection URL; when set it is used instead of ip_ports
            decode_responses: passed to the redis client
            service_name: sentinel service name (sentinel mode)
            max_connections: passed to the redis client
            **kwargs: extra keyword arguments passed to the redis client
        """
        # Settings may be modified at runtime, so they are read here rather
        # than bound as default argument values.
        ip_ports = setting.REDISDB_IP_PORTS if ip_ports is None else ip_ports
        db = setting.REDISDB_DB if db is None else db
        user_pass = setting.REDISDB_USER_PASS if user_pass is None else user_pass
        service_name = (
            setting.REDISDB_SERVICE_NAME if service_name is None else service_name
        )

        self._is_redis_cluster = False
        self.__redis = None

        self._url = url
        self._ip_ports = ip_ports
        self._db = db
        self._user_pass = user_pass
        self._service_name = service_name
        self._decode_responses = decode_responses
        self._max_connections = max_connections
        self._kwargs = kwargs

        self.get_connect()
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        if self._url:
			
 
				+            return "<Redisdb url:{}>".format(self._url)
			
 
				+
			
 
				+        return "<Redisdb ip_ports: {} db:{} user_pass:{}>".format(
			
 
				+            self._ip_ports, self._db, self._user_pass
			
 
				+        )
			
 
				+
			
 
				+    @property
			
 
				+    def _redis(self):
			
 
				+        try:
			
 
				+            if not self.__redis.ping():
			
 
				+                raise ConnectionError("unable to connect to redis")
			
 
				+        except:
			
 
				+            self._reconnect()
			
 
				+
			
 
				+        return self.__redis
			
 
				+
			
 
				+    @_redis.setter
			
 
				+    def _redis(self, val):
			
 
				+        self.__redis = val
			
 
				+
			
 
    def get_connect(self):
        """
        Create the redis client from the stored connection settings.

        Selection order:
          1. ``url`` set: a single client built from the URL.
          2. one ``ip:port``: a single client.
          3. several ``ip:port`` with ``service_name``: sentinel, using the
             master for that service.
          4. several ``ip:port`` without ``service_name``: RedisCluster.

        ``_is_redis_cluster`` is set to True in cases 3 and 4 and to False
        otherwise.

        Returns: result of pinging the new client

        Raises:
            Exception: neither a URL nor ip_ports is configured
        """
        if self._url:
            self._redis = redis.StrictRedis.from_url(
                self._url, decode_responses=self._decode_responses
            )
            self._is_redis_cluster = False
            # Ping via the private attribute: the _redis property itself
            # pings and reconnects, which would recurse.
            return self.__redis.ping()

        if not self._ip_ports:
            raise Exception("未设置 redis 连接信息")

        if isinstance(self._ip_ports, list):
            addresses = self._ip_ports
        else:
            addresses = self._ip_ports.split(",")

        if len(addresses) <= 1:
            host, port = addresses[0].split(":")
            self._redis = redis.StrictRedis(
                host=host,
                port=port,
                db=self._db,
                password=self._user_pass,
                decode_responses=self._decode_responses,
                max_connections=self._max_connections,
                **self._kwargs,
            )
            self._is_redis_cluster = False
            return self.__redis.ping()

        nodes = []
        for address in addresses:
            host, port = address.split(":")
            nodes.append({"host": host, "port": port})

        if self._service_name:
            # sentinel mode
            sentinel = Sentinel(
                [(node["host"], node["port"]) for node in nodes],
                socket_timeout=3,
                **self._kwargs,
            )
            self._redis = sentinel.master_for(
                self._service_name,
                password=self._user_pass,
                db=self._db,
                redis_class=redis.StrictRedis,
                decode_responses=self._decode_responses,
                max_connections=self._max_connections,
                **self._kwargs,
            )
        else:
            # cluster mode
            self._redis = RedisCluster(
                startup_nodes=nodes,
                decode_responses=self._decode_responses,
                password=self._user_pass,
                max_connections=self._max_connections,
                **self._kwargs,
            )
        self._is_redis_cluster = True

        # Ping via the private attribute: the _redis property itself pings
        # and reconnects, which would recurse.
        return self.__redis.ping()
			
 
				+
			
 
				+    @classmethod
			
 
				+    def from_url(cls, url):
			
 
				+        """
			
 
				+
			
 
				+        Args:
			
 
				+            url: redis://[[username]:[password]]@[host]:[port]/[db]
			
 
				+
			
 
				+        Returns:
			
 
				+
			
 
				+        """
			
 
				+        return cls(url=url)
			
 
				+
			
 
    def sadd(self, table, values):
        """
        Add to an unordered set (members are de-duplicated).

        Args:
            table: set key
            values: a single value, or a list of values

        Returns: for a single value, the client's SADD reply (0 if already
            present, 1 if added); for a list, None
        """
        if not isinstance(values, list):
            return self._redis.sadd(table, values)

        batch = self._redis.pipeline()
        if not self._is_redis_cluster:
            batch.multi()
        for member in values:
            batch.sadd(table, member)
        batch.execute()
        return None
			
 
				+
			
 
    def sget(self, table, count=1, is_pop=True):
        """
        Read members from an unordered set.

        Args:
            table: set key
            count: how many members to fetch
            is_pop: True removes the returned members from the set (SPOP);
                False samples them without removal (SRANDMEMBER)

        Returns: list of members, e.g. ['1'] or []
        """
        if not is_pop:
            return self._redis.srandmember(table, count)

        datas = []
        # Query the set size once: asking twice costs an extra round trip
        # and the two answers may differ if the set changes in between.
        count = min(count, self.sget_count(table))
        if count:
            if count > 1:
                pipe = self._redis.pipeline()

                if not self._is_redis_cluster:
                    pipe.multi()
                for _ in range(count):
                    pipe.spop(table)
                datas = pipe.execute()

            else:
                datas.append(self._redis.spop(table))

        return datas
			
 
				+
			
 
    def srem(self, table, values):
        """
        Remove the given member(s) from a set.

        Args:
            table: set key
            values: a single value, or a list of values

        Returns: None
        """
        if not isinstance(values, list):
            self._redis.srem(table, values)
            return

        batch = self._redis.pipeline()
        if not self._is_redis_cluster:
            batch.multi()
        for member in values:
            batch.srem(table, member)
        batch.execute()
			
 
				+
			
 
    def sget_count(self, table):
        """Return the number of members in the set ``table`` (SCARD)."""
        client = self._redis
        return client.scard(table)
			
 
				+
			
 
    def sdelete(self, table):
        """
        Empty a large set incrementally.

        The set is walked with SSCAN (count hint 500) and every returned
        page is removed with a single SREM call. Per the original note, a
        plain DELETE on a very large key can block Redis and lead to
        failovers and application crashes.

        Args:
            table: set key

        Returns: None
        """
        # Resolve the client once; each access to the _redis property pings
        # the server first.
        client = self._redis

        # SSCAN starts a new iteration at cursor 0 and signals the end of
        # the iteration by returning cursor 0.
        cursor = 0
        while True:
            cursor, members = client.sscan(table, cursor=cursor, count=500)
            if members:
                client.srem(table, *members)
            # int() so that a cursor returned as "0" or b"0" also ends the
            # loop.
            if int(cursor) == 0:
                break
			
 
				+
			
 
    def sismember(self, table, key):
        """Return whether ``key`` is a member of the set ``table``."""
        is_member = self._redis.sismember(table, key)
        return is_member
			
 
				+
			
 
    def zadd(self, table, values, prioritys=0):
        """
        @summary: Store data in a sorted set, de-duplicated (an existing value is updated)
        ---------
        @param table: redis key of the sorted set
        @param values: a single value, or a list of values
        @param prioritys: score(s) used for ordering, smaller means higher priority;
                          a single number applied to every value, or a list paired
                          one-to-one with ``values``. Defaults to 0.
        ---------
        @result: reply of the single ZADD, or the list of replies for a batch
        """
        # The raw "ZADD" command is sent via execute_command so that the call
        # works with both the 2.x and the 3.x redis client signatures.
        if not isinstance(values, list):
            return self._redis.execute_command("ZADD", table, prioritys, values)

        if isinstance(prioritys, list):
            assert len(values) == len(prioritys), "values值要与prioritys值一一对应"
            scores = prioritys
        else:
            scores = [prioritys] * len(values)

        batch = self._redis.pipeline()
        if not self._is_redis_cluster:
            batch.multi()
        for member, score in zip(values, scores):
            batch.execute_command("ZADD", table, score, member)
        return batch.execute()
			
 
				+
			
 
    def zget(self, table, count=1, is_pop=True):
        """
        @summary: Read members from a sorted set, lowest score (highest priority) first
        ---------
        @param table: redis key of the sorted set
        @param count: number of members to read; -1 reads everything, 0 reads nothing
        @param is_pop: whether the returned members are also removed from the set
                       (default True)
        ---------
        @result: list of members
        """
        if count == 0:
            # Without this guard count=0 produced the rank range 0..0, which
            # returned (and, with is_pop, removed) one member instead of none.
            return []

        start_pos = 0  # inclusive
        # Ranks are inclusive, so N members is the range 0..N-1. A negative
        # count is passed through unchanged as a from-the-end rank.
        end_pos = count - 1 if count > 0 else count

        pipe = self._redis.pipeline()
        if not self._is_redis_cluster:
            pipe.multi()  # read + remove are queued in one transaction
        pipe.zrange(table, start_pos, end_pos)
        if is_pop:
            pipe.zremrangebyrank(table, start_pos, end_pos)

        # The first reply is the ZRANGE result; the optional second one is the
        # number of removed members, which callers do not need.
        results, *_ = pipe.execute()
        return results
			
 
				+
			
 
    def zremrangebyscore(self, table, priority_min, priority_max):
        """
        Remove members of a sorted set by score, closed interval.
        @param table: redis key of the sorted set
        @param priority_min: lowest score to remove
        @param priority_max: highest score to remove
        @return: number of members removed
        """
        removed_total = self._redis.zremrangebyscore(table, priority_min, priority_max)
        return removed_total
			
 
				+
			
 
    def zrangebyscore(self, table, priority_min, priority_max, count=None, is_pop=True):
        """
        @summary: Return the members whose score lies in a closed interval
        ---------
        @param table: redis key of the sorted set
        @param priority_min: lowest score (smaller score means higher priority)
        @param priority_max: highest score
        @param count: how many members to fetch; empty means everything in the interval
        @param is_pop: whether the fetched members are removed from the set
        ---------
        @result: the script's reply (the fetched members)
        """

        # A Lua script keeps the read and the removal atomic.
        lua = """
            -- local key = KEYS[1]
            local min_score = ARGV[2]
            local max_score = ARGV[3]
            local is_pop = ARGV[4]
            local count = ARGV[5]

            -- 取值
            local datas = nil
            if count then
                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
            else
                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
            end

            -- 删除redis中刚取到的值
            if (is_pop=='True' or is_pop=='1') then
                for i=1, #datas do
                    redis.call('zrem', KEYS[1], datas[i])
                end
            end


            return datas

        """
        cmd = self._redis.register_script(lua)

        # redis-py >= 3.0 refuses to encode a Python bool (DataError), so the
        # flag is sent as "1"/"0"; the script already treats '1' as true.
        pop_flag = "1" if is_pop else "0"

        # ARGV[1] repeats the key because the script reads its values from ARGV[2] on.
        args = [table, priority_min, priority_max, pop_flag]
        if count:
            args.append(count)

        return cmd(keys=[table], args=args)
			
 
				+
			
 
    def zrangebyscore_increase_score(
        self, table, priority_min, priority_max, increase_score, count=None
    ):
        """
        @summary: Return the members whose score lies in a closed interval and
                  shift the score of each returned member
        ---------
        @param table: redis key of the sorted set
        @param priority_min: lowest score
        @param priority_max: highest score
        @param increase_score: score delta; positive adds to the current score,
                               negative subtracts from it
        @param count: how many members to fetch; empty means everything in the interval
        ---------
        @result: the script's reply (the fetched members)
        """

        # A Lua script keeps the read and the score update atomic.
        lua = """
            -- local key = KEYS[1]
            local min_score = ARGV[1]
            local max_score = ARGV[2]
            local increase_score = ARGV[3]
            local count = ARGV[4]

            -- 取值
            local datas = nil
            if count then
                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'limit', 0, count)
            else
                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score)
            end

            --修改优先级
            for i=1, #datas do
                redis.call('zincrby', KEYS[1], increase_score, datas[i])
            end

            return datas

        """
        script = self._redis.register_script(lua)

        # The limit is only appended when given, so the script sees a nil count otherwise.
        script_args = [priority_min, priority_max, increase_score]
        if count:
            script_args.append(count)

        return script(keys=[table], args=script_args)
			
 
				+
			
 
    def zrangebyscore_set_score(
        self, table, priority_min, priority_max, score, count=None
    ):
        """
        @summary: Return the members whose score lies in a closed interval and
                  set the score of each returned member to ``score``
        ---------
        @param table: redis key of the sorted set
        @param priority_min: lowest score
        @param priority_max: highest score
        @param score: the new score
        @param count: how many members to fetch; empty means everything in the interval
        ---------
        @result: the script's reply (the fetched members, without scores)
        """

        # A Lua script keeps the read and the score update atomic. The script
        # reaches the target score by incrementing with (set_score - current).
        lua = """
            -- local key = KEYS[1]
            local min_score = ARGV[1]
            local max_score = ARGV[2]
            local set_score = ARGV[3]
            local count = ARGV[4]

            -- 取值
            local datas = nil
            if count then
                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores','limit', 0, count)
            else
                datas = redis.call('zrangebyscore', KEYS[1], min_score, max_score, 'withscores')
            end

            local real_datas = {} -- 数据
            --修改优先级
            for i=1, #datas, 2 do
               local data = datas[i]
               local score = datas[i+1]

               table.insert(real_datas, data) -- 添加数据

               redis.call('zincrby', KEYS[1], set_score - score, datas[i])
            end

            return real_datas

        """
        script = self._redis.register_script(lua)

        # The limit is only appended when given, so the script sees a nil count otherwise.
        script_args = [priority_min, priority_max, score]
        if count:
            script_args.append(count)

        return script(keys=[table], args=script_args)
			
 
				+
			
 
    def zincrby(self, table, amount, value):
        """Add ``amount`` to the score of ``value`` in the sorted set ``table``."""
        reply = self._redis.zincrby(table, amount, value)
        return reply
			
 
				+
			
 
    def zget_count(self, table, priority_min=None, priority_max=None):
        """
        @summary: Count the members of a sorted set
        ---------
        @param table: redis key of the sorted set
        @param priority_min: lower score bound (inclusive)
        @param priority_max: upper score bound (inclusive)
        ---------
        @result: ZCOUNT within the bounds when both are given, otherwise ZCARD
        """
        # A score filter only applies when both bounds are supplied.
        if priority_min is None or priority_max is None:
            return self._redis.zcard(table)
        return self._redis.zcount(table, priority_min, priority_max)
			
 
				+
			
 
    def zrem(self, table, values):
        """
        @summary: Remove the given member(s) from a sorted set
        ---------
        @param table: redis key of the sorted set
        @param values: a single member, or a list of members
        ---------
        @result: None
        """
        if isinstance(values, list):
            if not values:
                # An empty list would expand to "ZREM key" with no members,
                # which the server rejects; removing nothing is a no-op.
                return
            self._redis.zrem(table, *values)
        else:
            self._redis.zrem(table, values)
			
 
				+
			
 
				+    def zexists(self, table, values):
			
 
				+        """
			
 
				+        利用zscore判断某元素是否存在
			
 
				+        @param values:
			
 
				+        @return:
			
 
				+        """
			
 
				+
			
 
				+        is_exists = []
			
 
				+
			
 
				+        if isinstance(values, list):
			
 
				+            pipe = self._redis.pipeline()
			
 
				+            pipe.multi()
			
 
				+            for value in values:
			
 
				+                pipe.zscore(table, value)
			
 
				+            is_exists_temp = pipe.execute()
			
 
				+            for is_exist in is_exists_temp:
			
 
				+                if is_exist != None:
			
 
				+                    is_exists.append(1)
			
 
				+                else:
			
 
				+                    is_exists.append(0)
			
 
				+
			
 
				+        else:
			
 
				+            is_exists = self._redis.zscore(table, values)
			
 
				+            is_exists = 1 if is_exists != None else 0
			
 
				+
			
 
				+        return is_exists
			
 
				+
			
 
    def lpush(self, table, values):
        """
        Append value(s) to the list ``table``.

        Despite the name, elements are written with RPUSH.
        @param table: redis key of the list
        @param values: a single value, or a list of values
        @return: the RPUSH reply for a single value; None for a list
        """
        if not isinstance(values, list):
            return self._redis.rpush(table, values)

        batch = self._redis.pipeline()
        if not self._is_redis_cluster:
            batch.multi()
        for item in values:
            batch.rpush(table, item)
        batch.execute()
			
 
				+
			
 
    def lpop(self, table, count=1):
        """
        @summary: Pop element(s) from the head of a list
        ---------
        @param table: redis key of the list
        @param count: maximum number of elements to pop
        ---------
        @result: a list when more than one element is popped, a single value
                 when exactly one is popped, None when nothing is popped
        """
        # Clamp to the current length. The length is read once: reading it
        # twice cost an extra round trip and could yield two different
        # answers if the list changed in between.
        count = min(count, self.lget_count(table))
        if not count:
            return None

        if count == 1:
            return self._redis.lpop(table)

        pipe = self._redis.pipeline()
        if not self._is_redis_cluster:
            pipe.multi()
        for _ in range(count):
            pipe.lpop(table)
        return pipe.execute()
			
 
				+
			
 
    def rpoplpush(self, from_table, to_table=None):
        """
        Pop the last (tail) element of ``from_table``, push it onto the head
        of ``to_table`` and return it.

        When ``to_table`` is omitted or equal to ``from_table`` the tail
        element is moved to the head of the same list, i.e. the list is rotated.
        @param from_table: redis key of the source list
        @param to_table: redis key of the destination list; defaults to ``from_table``
        @return: the RPOPLPUSH reply
        """
        destination = to_table or from_table
        return self._redis.rpoplpush(from_table, destination)
			
 
				+
			
 
    def lget_count(self, table):
        """Return the length of the list ``table`` (LLEN)."""
        length = self._redis.llen(table)
        return length
			
 
				+
			
 
    def lrem(self, table, value, num=0):
        """
        @summary: Remove occurrences of ``value`` from a list
        ---------
        @param table: redis key of the list
        @param value: the element to remove
        @param num: occurrence count handed to LREM (default 0)
        ---------
        @result: number of removed elements
        """
        # The client call takes (name, count, value), i.e. count before value.
        removed_total = self._redis.lrem(table, num, value)
        return removed_total
			
 
				+
			
 
    def lrange(self, table, start=0, end=-1):
        """Return the elements of the list ``table`` between ``start`` and ``end`` (LRANGE)."""
        elements = self._redis.lrange(table, start, end)
        return elements
			
 
				+
			
 
    def hset(self, table, key, value):
        """
        @summary: Set one field of a hash
        If the hash does not exist it is created; if the field already exists
        its old value is overwritten.
        ---------
        @param table: redis key of the hash
        @param key: field name
        @param value: field value
        ---------
        @result: 1 when newly inserted; 0 when overwritten
        """
        reply = self._redis.hset(table, key, value)
        return reply
			
 
				+
			
 
    def hset_batch(self, table, datas):
        """
        Set several hash fields in one pipeline.
        Args:
            table: redis key of the hash
            datas: iterable of pairs, [[key, value], ...]
        Returns:
            the pipeline's list of replies, one per pair
        """
        batch = self._redis.pipeline()
        if not self._is_redis_cluster:
            batch.multi()
        for field, field_value in datas:
            batch.hset(table, field, field_value)
        replies = batch.execute()
        return replies
			
 
				+
			
 
    def hincrby(self, table, key, increment):
        """Add ``increment`` to the hash field ``key`` of ``table`` (HINCRBY)."""
        reply = self._redis.hincrby(table, key, increment)
        return reply
			
 
				+
			
 
    def hget(self, table, key, is_pop=False):
        """
        Read one field of a hash, optionally deleting it in the same step.
        @param table: redis key of the hash
        @param key: field name
        @param is_pop: when true the field is read and deleted by one Lua script
        @return: the HGET reply, or the script's reply when popping
        """
        if is_pop:
            lua = """
                -- local key = KEYS[1]
                local field = ARGV[1]

                -- 取值
                local datas = redis.call('hget', KEYS[1], field)
                -- 删除值
                redis.call('hdel', KEYS[1], field)

                return datas

                    """
            pop_script = self._redis.register_script(lua)
            return pop_script(keys=[table], args=[key])

        return self._redis.hget(table, key)
			
 
				+
			
 
    def hgetall(self, table):
        """Return every field and value of the hash ``table`` (HGETALL)."""
        mapping = self._redis.hgetall(table)
        return mapping
			
 
				+
			
 
    def hexists(self, table, key):
        """Return whether the hash ``table`` has the field ``key`` (HEXISTS)."""
        found = self._redis.hexists(table, key)
        return found
			
 
				+
			
 
    def hdel(self, table, *keys):
        """
        @summary: Delete field(s) from a hash; several fields may be passed
        ---------
        @param table: redis key of the hash
        @param *keys: field names to delete
        ---------
        @result: None
        """
        fields = keys
        self._redis.hdel(table, *fields)
			
 
				+
			
 
    def hget_count(self, table):
        """Return the number of fields in the hash ``table`` (HLEN)."""
        field_total = self._redis.hlen(table)
        return field_total
			
 
				+
			
 
    def setbit(self, table, offsets, values):
        """
        Set bit(s) of the string stored at ``table`` and return the previous value(s).
        @param table: redis key of the bit string
        @param offsets: a single offset, or a list of offsets
        @param values: a single bit value (applied to every offset), or a list
                       paired one-to-one with ``offsets``
        @return: list of previous bits for a list of offsets, otherwise a single value
        """
        if not isinstance(offsets, list):
            return self._redis.setbit(table, offsets, values)

        if not isinstance(values, list):
            values = [values] * len(offsets)
        else:
            assert len(offsets) == len(values), "offsets值要与values值一一对应"

        pipe = self._redis.pipeline()
        # Same guard as the other batch helpers of this class: MULTI is only
        # issued on a non-cluster connection.
        if not self._is_redis_cluster:
            pipe.multi()

        for offset, value in zip(offsets, values):
            pipe.setbit(table, offset, value)

        return pipe.execute()
			
 
				+
			
 
    def getbit(self, table, offsets):
        """
        Read bit(s) of the string stored at ``table``.
        @param table: redis key of the bit string
        @param offsets: a single offset, or a list of offsets
        @return: list of bits for a list of offsets, otherwise a single value
        """
        if not isinstance(offsets, list):
            return self._redis.getbit(table, offsets)

        pipe = self._redis.pipeline()
        # Same guard as the other batch helpers of this class: MULTI is only
        # issued on a non-cluster connection.
        if not self._is_redis_cluster:
            pipe.multi()
        for offset in offsets:
            pipe.getbit(table, offset)

        return pipe.execute()
			
 
				+
			
 
    def bitcount(self, table):
        """Return the BITCOUNT reply for the string stored at ``table``."""
        set_bits = self._redis.bitcount(table)
        return set_bits
			
 
				+
			
 
    def strset(self, table, value, **kwargs):
        """Store ``value`` at the string key ``table``; extra keyword options are forwarded to SET."""
        reply = self._redis.set(table, value, **kwargs)
        return reply
			
 
				+
			
 
    def str_incrby(self, table, value):
        """Increment the number stored at the string key ``table`` by ``value`` (INCRBY)."""
        reply = self._redis.incrby(table, value)
        return reply
			
 
				+
			
 
    def strget(self, table):
        """Return the value stored at the string key ``table`` (GET)."""
        stored = self._redis.get(table)
        return stored
			
 
				+
			
 
    def strlen(self, table):
        """Return the length of the value stored at the string key ``table`` (STRLEN)."""
        length = self._redis.strlen(table)
        return length
			
 
				+
			
 
    def getkeys(self, regex):
        """Return the keys matching the pattern ``regex``, as reported by the KEYS command."""
        matching = self._redis.keys(regex)
        return matching
			
 
				+
			
 
    def exists_key(self, key):
        """Return the EXISTS reply for ``key``."""
        reply = self._redis.exists(key)
        return reply
			
 
				+
			
 
    def set_expire(self, key, seconds):
        """
        @summary: Set the time-to-live of a key
        ---------
        @param key: redis key
        @param seconds: time-to-live in seconds
        ---------
        @result: None
        """
        ttl_seconds = seconds
        self._redis.expire(key, ttl_seconds)
			
 
				+
			
 
    def get_expire(self, key):
        """
        @summary: Query the remaining time-to-live of a key
        ---------
        @param key: redis key
        ---------
        @result: the TTL reply
        """
        remaining = self._redis.ttl(key)
        return remaining
			
 
				+
			
 
    def clear(self, table):
        """
        Delete the key ``table``.

        Best effort: any exception raised by the delete is logged, not propagated.
        """
        try:
            self._redis.delete(table)
        except Exception as err:
            log.error(err)
			
 
				+
			
 
    def get_redis_obj(self):
        """Return the underlying redis client object held in ``self._redis``."""
        client = self._redis
        return client
			
 
				+
			
 
    def _reconnect(self):
        """
        Retry ``self.get_connect()`` until it returns a truthy value.

        Loops without an upper bound: each attempt is logged, a
        ConnectionError/TimeoutError is logged and swallowed, and the loop
        sleeps 2 seconds before the next attempt.
        @return: True once a connection attempt succeeds
        """
        # Check the connection state; reconnect automatically when the database
        # restarted or a configured timeout dropped the connection.
        retry_count = 0
        while True:
            try:
                retry_count += 1
                log.error(f"redis 连接断开, 重新连接 {retry_count}")
                if self.get_connect():
                    log.info(f"redis 连接成功")
                    return True
            except (ConnectionError, TimeoutError) as e:
                # NOTE(review): which ConnectionError/TimeoutError these are
                # (builtins or the redis client's) depends on imports not
                # visible here - confirm.
                log.error(f"连接失败 e: {e}")

            time.sleep(2)
			
 
				+
			
 
    def __getattr__(self, name):
        """
        Delegate attributes not found on this wrapper to the redis client.

        Only public names are delegated. ``__getattr__`` runs whenever normal
        lookup fails, so if ``_redis`` itself is not available yet (failed
        init, ``copy``, unpickling) evaluating ``self._redis`` here would
        re-enter this method without bound. Underscore-prefixed names
        therefore raise AttributeError directly.
        @param name: attribute name that normal lookup did not find
        @return: the attribute of the same name on the redis client
        @raise AttributeError: for underscore-prefixed names
        """
        if name.startswith("_"):
            raise AttributeError(name)
        return getattr(self._redis, name)
			
--- a/FworkSpider/feapder/dedup/__init__.py
+++ b/FworkSpider/feapder/dedup/__init__.py
@@ -0,0 +1,178 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-12-13 21:08
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import copy
			
 
				+from typing import Any, List, Union, Optional, Tuple, Callable
			
 
				+
			
 
				+from feapder.utils.tools import get_md5
			
 
				+from .bloomfilter import BloomFilter, ScalableBloomFilter
			
 
				+from .expirefilter import ExpireFilter
			
 
				+
			
 
				+
			
 
				+class Dedup:
			
 
				+    BloomFilter = 1
			
 
				+    MemoryFilter = 2
			
 
				+    ExpireFilter = 3
			
 
				+
			
 
				+    def __init__(self, filter_type: int = BloomFilter, to_md5: bool = True, **kwargs):
			
 
				+        """
			
 
				+        去重过滤器 集成BloomFilter、MemoryFilter、ExpireFilter
			
 
				+        Args:
			
 
				+            filter_type: 过滤器类型 BloomFilter
			
 
				+            name: 过滤器名称 该名称会默认以dedup作为前缀 dedup:expire_set:[name]/dedup:bloomfilter:[name]。 默认ExpireFilter name=过期时间; BloomFilter name=dedup:bloomfilter:bloomfilter
			
 
				+            absolute_name: 过滤器绝对名称 不会加dedup前缀，当此值不为空时name参数无效
			
 
				+            expire_time: ExpireFilter的过期时间 单位为秒，其他两种过滤器不用指定
			
 
				+            error_rate: BloomFilter/MemoryFilter的误判率 默认为0.00001
			
 
				+            to_md5: 去重前是否将数据转为MD5，默认是
			
 
				+            redis_url: redis://[[username]:[password]]@localhost:6379/0
			
 
				+                       BloomFilter 与 ExpireFilter 使用
			
 
				+                       默认会读取setting中的redis配置，若无setting，则需要专递redis_url
			
 
				+            initial_capacity: 单个布隆过滤器去重容量 默认100000000，当布隆过滤器容量满时会扩展下一个布隆过滤器
			
 
				+            error_rate：布隆过滤器的误判率 默认0.00001
			
 
				+            **kwargs:
			
 
				+        """
			
 
				+
			
 
				+        if filter_type == Dedup.ExpireFilter:
			
 
				+            try:
			
 
				+                expire_time = kwargs["expire_time"]
			
 
				+            except:
			
 
				+                raise ValueError("需传参数 expire_time")
			
 
				+
			
 
				+            name = kwargs.get("absolute_name") or "dedup:expire_set:%s" % kwargs.get(
			
 
				+                "name", expire_time
			
 
				+            )
			
 
				+            expire_time_record_key = "dedup:expire_set:expire_time"
			
 
				+
			
 
				+            self.dedup = ExpireFilter(
			
 
				+                name=name,
			
 
				+                expire_time=expire_time,
			
 
				+                expire_time_record_key=expire_time_record_key,
			
 
				+                redis_url=kwargs.get("redis_url"),
			
 
				+            )
			
 
				+
			
 
				+        else:
			
 
				+            initial_capacity = kwargs.get("initial_capacity", 100000000)
			
 
				+            error_rate = kwargs.get("error_rate", 0.00001)
			
 
				+            name = kwargs.get("absolute_name") or "dedup:bloomfilter:" + kwargs.get(
			
 
				+                "name", "bloomfilter"
			
 
				+            )
			
 
				+            if filter_type == Dedup.BloomFilter:
			
 
				+                self.dedup = ScalableBloomFilter(
			
 
				+                    name=name,
			
 
				+                    initial_capacity=initial_capacity,
			
 
				+                    error_rate=error_rate,
			
 
				+                    bitarray_type=ScalableBloomFilter.BASE_REDIS,
			
 
				+                    redis_url=kwargs.get("redis_url"),
			
 
				+                )
			
 
				+            elif filter_type == Dedup.MemoryFilter:
			
 
				+                self.dedup = ScalableBloomFilter(
			
 
				+                    name=name,
			
 
				+                    initial_capacity=initial_capacity,
			
 
				+                    error_rate=error_rate,
			
 
				+                    bitarray_type=ScalableBloomFilter.BASE_MEMORY,
			
 
				+                )
			
 
				+            else:
			
 
				+                raise ValueError(
			
 
				+                    "filter_type 类型错误，仅支持 Dedup.BloomFilter、Dedup.MemoryFilter、Dedup.ExpireFilter"
			
 
				+                )
			
 
				+
			
 
				+        self._to_md5 = to_md5
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return str(self.dedup)
			
 
				+
			
 
				+    def _deal_datas(self, datas):
			
 
				+        if self._to_md5:
			
 
				+            if isinstance(datas, list):
			
 
				+                keys = [get_md5(data) for data in datas]
			
 
				+            else:
			
 
				+                keys = get_md5(datas)
			
 
				+        else:
			
 
				+            keys = copy.deepcopy(datas)
			
 
				+
			
 
				+        return keys
			
 
				+
			
 
				+    def add(
			
 
				+        self, datas: Union[List[Any], Any], skip_check: bool = False
			
 
				+    ) -> Union[List[Any], Any]:
			
 
				+        """
			
 
				+        添加数据
			
 
				+        @param datas: list / 单个值
			
 
				+        @param skip_check: 是否直接添加，不检查是否存在 适用于bloomfilter，加快add速度
			
 
				+        @return: list / 单个值 (如果数据已存在 返回 0 否则返回 1, 可以理解为是否添加成功)
			
 
				+        """
			
 
				+
			
 
				+        keys = self._deal_datas(datas)
			
 
				+        is_added = self.dedup.add(keys, skip_check)
			
 
				+
			
 
				+        return is_added
			
 
				+
			
 
				+    def get(self, datas: Union[List[Any], Any]) -> Union[List[Any], Any]:
			
 
				+        """
			
 
				+        检查数据是否存在
			
 
				+        @param datas: list / 单个值
			
 
				+        @return: list / 单个值 （存在返回1 不存在返回0)
			
 
				+        """
			
 
				+        keys = self._deal_datas(datas)
			
 
				+        is_exists = self.dedup.get(keys)
			
 
				+
			
 
				+        return is_exists
			
 
				+
			
 
				+    def filter_exist_data(
			
 
				+        self,
			
 
				+        datas: List[Any],
			
 
				+        *,
			
 
				+        datas_fingerprints: Optional[List] = None,
			
 
				+        callback: Callable[[Any], None] = None
			
 
				+    ) -> Union[Tuple[List[Any], List[Any]], List[Any]]:
			
 
				+        """
			
 
				+        过滤掉已存在的数据
			
 
				+        *** 直接修改原来的数据 使用完此方法后 datas, datas_fingerprints 里面的值为去重后的数据
			
 
				+        @param datas_fingerprints: 数据的唯一指纹 列表
			
 
				+        @param datas: 数据 列表
			
 
				+        @param callback: 数据已存在时的回调 callback(data)
			
 
				+        @return: None
			
 
				+        """
			
 
				+
			
 
				+        is_exists = self.get(datas_fingerprints or datas)
			
 
				+
			
 
				+        dedup_datas = []
			
 
				+
			
 
				+        if datas_fingerprints:
			
 
				+            dedup_datas_fingerprints = []
			
 
				+            while is_exists:
			
 
				+                data = datas.pop(0)
			
 
				+                is_exist = is_exists.pop(0)
			
 
				+                data_fingerprint = datas_fingerprints.pop(0)
			
 
				+
			
 
				+                if not is_exist:
			
 
				+                    dedup_datas.append(data)
			
 
				+                    dedup_datas_fingerprints.append(data_fingerprint)
			
 
				+                else:
			
 
				+                    if callback:
			
 
				+                        callback(data)
			
 
				+
			
 
				+            datas_fingerprints.extend(dedup_datas_fingerprints)
			
 
				+            datas.extend(dedup_datas)
			
 
				+            return datas, datas_fingerprints
			
 
				+
			
 
				+        else:
			
 
				+            while is_exists:
			
 
				+                data = datas.pop(0)
			
 
				+                is_exist = is_exists.pop(0)
			
 
				+
			
 
				+                if not is_exist:
			
 
				+                    dedup_datas.append(data)
			
 
				+                else:
			
 
				+                    if callback:
			
 
				+                        callback(data)
			
 
				+
			
 
				+            datas.extend(dedup_datas)
			
 
				+            return datas
			
--- a/FworkSpider/feapder/dedup/bitarray.py
+++ b/FworkSpider/feapder/dedup/bitarray.py
@@ -0,0 +1,143 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018/12/14 1:05 PM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+from __future__ import absolute_import
			
 
				+
			
 
				+
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+
			
 
				+
			
 
				+class BitArray:
			
 
				+    def setall(self, value):
			
 
				+        pass
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        raise ImportError("this method mush be implement")
			
 
				+
			
 
				+    def set(self, offsets, values):
			
 
				+        """
			
 
				+        设置字符串数字某一位的值， 返回之前的值
			
 
				+        @param offsets: 支持列表或单个值
			
 
				+        @param values: 支持列表或单个值
			
 
				+        @return: list / 单个值
			
 
				+        """
			
 
				+        raise ImportError("this method mush be implement")
			
 
				+
			
 
				+    def get(self, offsets):
			
 
				+        """
			
 
				+        取字符串数字某一位的值
			
 
				+        @param offsets: 支持列表或单个值
			
 
				+        @return: list / 单个值
			
 
				+        """
			
 
				+        raise ImportError("this method mush be implement")
			
 
				+
			
 
				+    def count(self, value=True):
			
 
				+        raise ImportError("this method mush be implement")
			
 
				+
			
 
				+
			
 
				+class MemoryBitArray(BitArray):
			
 
				+    def __init__(self, num_bits):
			
 
				+        try:
			
 
				+            import bitarray
			
 
				+        except Exception as e:
			
 
				+            raise Exception(
			
 
				+                "需要安装feapder完整版\ncommand: pip install feapder[all]\n若安装出错，参考：https://boris.org.cn/feapder/#/question/%E5%AE%89%E8%A3%85%E9%97%AE%E9%A2%98"
			
 
				+            )
			
 
				+
			
 
				+        self.num_bits = num_bits
			
 
				+        self.bitarray = bitarray.bitarray(num_bits, endian="little")
			
 
				+
			
 
				+        self.setall(0)
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "MemoryBitArray: {}".format(self.num_bits)
			
 
				+
			
 
				+    def setall(self, value):
			
 
				+        self.bitarray.setall(value)
			
 
				+
			
 
				+    def set(self, offsets, values):
			
 
				+        """
			
 
				+        设置字符串数字某一位的值， 返回之前的值
			
 
				+        @param offsets: 支持列表或单个值
			
 
				+        @param values: 支持列表或单个值
			
 
				+        @return: list / 单个值
			
 
				+        """
			
 
				+
			
 
				+        old_values = []
			
 
				+
			
 
				+        if isinstance(offsets, list):
			
 
				+            if not isinstance(values, list):
			
 
				+                values = [values] * len(offsets)
			
 
				+            else:
			
 
				+                assert len(offsets) == len(values), "offsets值要与values值一一对应"
			
 
				+
			
 
				+            for offset, value in zip(offsets, values):
			
 
				+                old_values.append(int(self.bitarray[offset]))
			
 
				+                self.bitarray[offset] = value
			
 
				+
			
 
				+        else:
			
 
				+            old_values = int(self.bitarray[offsets])
			
 
				+            self.bitarray[offsets] = values
			
 
				+
			
 
				+        return old_values
			
 
				+
			
 
				+    def get(self, offsets):
			
 
				+        """
			
 
				+        取字符串数字某一位的值
			
 
				+        @param offsets: 支持列表或单个值
			
 
				+        @return: list / 单个值
			
 
				+        """
			
 
				+        if isinstance(offsets, list):
			
 
				+            return [self.bitarray[offset] for offset in offsets]
			
 
				+        else:
			
 
				+            return self.bitarray[offsets]
			
 
				+
			
 
				+    def count(self, value=True):
			
 
				+        return self.bitarray.count(value)
			
 
				+
			
 
				+
			
 
				+class RedisBitArray(BitArray):
			
 
				+    """
			
 
				+    仿bitarray 基于redis
			
 
				+    """
			
 
				+
			
 
				+    redis_db = None
			
 
				+
			
 
				+    def __init__(self, name, redis_url=None):
			
 
				+        self.name = name
			
 
				+        self.count_cached_name = name + "_count_cached"
			
 
				+
			
 
				+        if not self.__class__.redis_db:
			
 
				+            self.__class__.redis_db = RedisDB(url=redis_url)
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "RedisBitArray: {}".format(self.name)
			
 
				+
			
 
				+    def set(self, offsets, values):
			
 
				+        """
			
 
				+        设置字符串数字某一位的值， 返回之前的值
			
 
				+        @param offsets: 支持列表或单个值
			
 
				+        @param values: 支持列表或单个值
			
 
				+        @return: list / 单个值
			
 
				+        """
			
 
				+        return self.redis_db.setbit(self.name, offsets, values)
			
 
				+
			
 
				+    def get(self, offsets):
			
 
				+        return self.redis_db.getbit(self.name, offsets)
			
 
				+
			
 
				+    def count(self, value=True):
			
 
				+        # 先查redis的缓存，若没有 在统计数量
			
 
				+        count = self.redis_db.strget(self.count_cached_name)
			
 
				+        if count:
			
 
				+            return int(count)
			
 
				+        else:
			
 
				+            count = self.redis_db.bitcount(self.name)
			
 
				+            self.redis_db.strset(self.count_cached_name, count, ex=1800)  # 半小时过期
			
 
				+            return count
			
--- a/FworkSpider/feapder/dedup/bloomfilter.py
+++ b/FworkSpider/feapder/dedup/bloomfilter.py
@@ -0,0 +1,385 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018/12/13 4:11 PM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import hashlib
			
 
				+import math
			
 
				+import threading
			
 
				+import time
			
 
				+from struct import unpack, pack
			
 
				+
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.utils.redis_lock import RedisLock
			
 
				+from . import bitarray
			
 
				+
			
 
				+
			
 
				+def make_hashfuncs(num_slices, num_bits):
			
 
				+    if num_bits >= (1 << 31):
			
 
				+        fmt_code, chunk_size = "Q", 8
			
 
				+    elif num_bits >= (1 << 15):
			
 
				+        fmt_code, chunk_size = "I", 4
			
 
				+    else:
			
 
				+        fmt_code, chunk_size = "H", 2
			
 
				+    total_hash_bits = 8 * num_slices * chunk_size
			
 
				+    if total_hash_bits > 384:
			
 
				+        hashfn = hashlib.sha512
			
 
				+    elif total_hash_bits > 256:
			
 
				+        hashfn = hashlib.sha384
			
 
				+    elif total_hash_bits > 160:
			
 
				+        hashfn = hashlib.sha256
			
 
				+    elif total_hash_bits > 128:
			
 
				+        hashfn = hashlib.sha1
			
 
				+    else:
			
 
				+        hashfn = hashlib.md5
			
 
				+    fmt = fmt_code * (hashfn().digest_size // chunk_size)
			
 
				+    num_salts, extra = divmod(num_slices, len(fmt))
			
 
				+    if extra:
			
 
				+        num_salts += 1
			
 
				+    salts = tuple(hashfn(hashfn(pack("I", i)).digest()) for i in range(num_salts))
			
 
				+
			
 
				+    def _make_hashfuncs(key):
			
 
				+        if isinstance(key, str):
			
 
				+            key = key.encode("utf-8")
			
 
				+        else:
			
 
				+            key = str(key).encode("utf-8")
			
 
				+
			
 
				+        i = 0
			
 
				+        for salt in salts:
			
 
				+            h = salt.copy()
			
 
				+            h.update(key)
			
 
				+            for uint in unpack(fmt, h.digest()):
			
 
				+                yield uint % num_bits
			
 
				+                i += 1
			
 
				+                if i >= num_slices:
			
 
				+                    return
			
 
				+
			
 
				+    return _make_hashfuncs
			
 
				+
			
 
				+
			
 
				+class BloomFilter(object):
			
 
				+    BASE_MEMORY = 1
			
 
				+    BASE_REDIS = 2
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        capacity: int,
			
 
				+        error_rate: float = 0.00001,
			
 
				+        bitarray_type=BASE_REDIS,
			
 
				+        name=None,
			
 
				+        redis_url=None,
			
 
				+    ):
			
 
				+        if not (0 < error_rate < 1):
			
 
				+            raise ValueError("Error_Rate must be between 0 and 1.")
			
 
				+        if not capacity > 0:
			
 
				+            raise ValueError("Capacity must be > 0")
			
 
				+
			
 
				+        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
			
 
				+        # k = log2(1/P)
			
 
				+        # solving for m = bits_per_slice
			
 
				+        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
			
 
				+        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
			
 
				+        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
			
 
				+        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
			
 
				+        bits_per_slice = int(
			
 
				+            math.ceil(
			
 
				+                (capacity * abs(math.log(error_rate)))
			
 
				+                / (num_slices * (math.log(2) ** 2))
			
 
				+            )
			
 
				+        )
			
 
				+        self._setup(error_rate, num_slices, bits_per_slice, capacity)
			
 
				+
			
 
				+        if bitarray_type == BloomFilter.BASE_MEMORY:
			
 
				+            self.bitarray = bitarray.MemoryBitArray(self.num_bits)
			
 
				+            self.bitarray.setall(False)
			
 
				+        elif bitarray_type == BloomFilter.BASE_REDIS:
			
 
				+            assert name, "name can't be None "
			
 
				+            self.bitarray = bitarray.RedisBitArray(name, redis_url)
			
 
				+        else:
			
 
				+            raise ValueError("not support this bitarray type")
			
 
				+
			
 
				+    def _setup(self, error_rate, num_slices, bits_per_slice, capacity):
			
 
				+        self.error_rate = error_rate
			
 
				+        self.num_slices = num_slices
			
 
				+        self.bits_per_slice = bits_per_slice
			
 
				+        self.capacity = capacity
			
 
				+        self.num_bits = num_slices * bits_per_slice
			
 
				+        self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
			
 
				+
			
 
				+        self._is_at_capacity = False
			
 
				+        self._check_capacity_time = 0
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "<BloomFilter: {}>".format(self.bitarray)
			
 
				+
			
 
				+    def get(self, keys, to_list=False):
			
 
				+        is_list = isinstance(keys, list)
			
 
				+        keys = keys if is_list else [keys]
			
 
				+        is_exists = []
			
 
				+
			
 
				+        offsets = []
			
 
				+        for key in keys:
			
 
				+            hashes = self.make_hashes(key)
			
 
				+            offset = 0
			
 
				+            for k in hashes:
			
 
				+                offsets.append(offset + k)
			
 
				+                offset += self.bits_per_slice
			
 
				+
			
 
				+        old_values = self.bitarray.get(offsets)
			
 
				+        for i in range(0, len(old_values), self.num_slices):
			
 
				+            is_exists.append(int(all(old_values[i : i + self.num_slices])))
			
 
				+
			
 
				+        if to_list:
			
 
				+            return is_exists
			
 
				+        else:
			
 
				+            return is_exists if is_list else is_exists[0]
			
 
				+
			
 
				+    @property
			
 
				+    def is_at_capacity(self):
			
 
				+        """
			
 
				+        是否容量已满, 1的个数满位数组的一半的时，则看做已满
			
 
				+        比较耗时 半小时检查一次
			
 
				+        @return:
			
 
				+        """
			
 
				+        # if self._is_at_capacity:
			
 
				+        #     return self._is_at_capacity
			
 
				+        #
			
 
				+        # if not self._check_capacity_time or time.time() - self._check_capacity_time > 1800:
			
 
				+        #     bit_count = self.bitarray.count()
			
 
				+        #     if bit_count and bit_count / self.num_bits > 0.5:
			
 
				+        #         self._is_at_capacity = True
			
 
				+        #
			
 
				+        #     self._check_capacity_time = time.time()
			
 
				+        #
			
 
				+        # return self._is_at_capacity
			
 
				+
			
 
				+        if self._is_at_capacity:
			
 
				+            return self._is_at_capacity
			
 
				+
			
 
				+        bit_count = self.bitarray.count()
			
 
				+        if bit_count and bit_count / self.num_bits > 0.5:
			
 
				+            self._is_at_capacity = True
			
 
				+
			
 
				+        return self._is_at_capacity
			
 
				+
			
 
				+    def add(self, keys):
			
 
				+        """
			
 
				+        Adds a key to this bloom filter. If the key already exists in this
			
 
				+        filter it will return False. Otherwise True. keys support list
			
 
				+        @param keys: list or one key
			
 
				+        @return:
			
 
				+        """
			
 
				+        if self.is_at_capacity:
			
 
				+            raise IndexError("BloomFilter is at capacity")
			
 
				+
			
 
				+        is_list = isinstance(keys, list)
			
 
				+
			
 
				+        keys = keys if is_list else [keys]
			
 
				+        is_added = []
			
 
				+
			
 
				+        offsets = []
			
 
				+        for key in keys:
			
 
				+            hashes = self.make_hashes(key)
			
 
				+            offset = 0
			
 
				+            for k in hashes:
			
 
				+                offsets.append(offset + k)
			
 
				+                offset += self.bits_per_slice
			
 
				+
			
 
				+        old_values = self.bitarray.set(offsets, 1)
			
 
				+        for i in range(0, len(old_values), self.num_slices):
			
 
				+            is_added.append(1 ^ int(all(old_values[i : i + self.num_slices])))
			
 
				+
			
 
				+        return is_added if is_list else is_added[0]
			
 
				+
			
 
				+
			
 
				+class ScalableBloomFilter(object):
			
 
				+    """
			
 
				+    自动扩展空间的bloomfilter, 当一个filter满一半的时候，创建下一个
			
 
				+    """
			
 
				+
			
 
				+    BASE_MEMORY = BloomFilter.BASE_MEMORY
			
 
				+    BASE_REDIS = BloomFilter.BASE_REDIS
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        initial_capacity: int = 100000000,
			
 
				+        error_rate: float = 0.00001,
			
 
				+        bitarray_type=BASE_REDIS,
			
 
				+        name=None,
			
 
				+        redis_url=None,
			
 
				+    ):
			
 
				+
			
 
				+        if not error_rate or error_rate < 0:
			
 
				+            raise ValueError("Error_Rate must be a decimal less than 0.")
			
 
				+
			
 
				+        self._setup(
			
 
				+            initial_capacity, error_rate, name, bitarray_type, redis_url=redis_url
			
 
				+        )
			
 
				+
			
 
				+    def _setup(self, initial_capacity, error_rate, name, bitarray_type, redis_url):
			
 
				+        self.initial_capacity = initial_capacity
			
 
				+        self.error_rate = error_rate
			
 
				+        self.name = name
			
 
				+        self.bitarray_type = bitarray_type
			
 
				+        self.redis_url = redis_url
			
 
				+
			
 
				+        self.filters = []
			
 
				+
			
 
				+        self.filters.append(self.create_filter())
			
 
				+        self._thread_lock = threading.RLock()
			
 
				+        self._check_capacity_time = 0
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "<ScalableBloomFilter: {}>".format(self.filters[-1].bitarray)
			
 
				+
			
 
				+    def create_filter(self):
			
 
				+        filter = BloomFilter(
			
 
				+            capacity=self.initial_capacity,
			
 
				+            error_rate=self.error_rate,
			
 
				+            bitarray_type=self.bitarray_type,
			
 
				+            name=self.name + str(len(self.filters)) if self.name else self.name,
			
 
				+            redis_url=self.redis_url,
			
 
				+        )
			
 
				+
			
 
				+        return filter
			
 
				+
			
 
				+    def check_filter_capacity(self):
			
 
				+        """
			
 
				+        检测filter状态，如果已满，加载新的filter
			
 
				+        @return:
			
 
				+        """
			
 
				+        if (
			
 
				+            not self._check_capacity_time
			
 
				+            or time.time() - self._check_capacity_time > 1800
			
 
				+        ):
			
 
				+            if self.bitarray_type == ScalableBloomFilter.BASE_MEMORY:
			
 
				+                with self._thread_lock:
			
 
				+                    while True:
			
 
				+                        if self.filters[-1].is_at_capacity:
			
 
				+                            self.filters.append(self.create_filter())
			
 
				+                        else:
			
 
				+                            break
			
 
				+
			
 
				+                    self._check_capacity_time = time.time()
			
 
				+            else:
			
 
				+                # 全局锁 同一时间只有一个进程在真正的创建新的filter，等这个进程创建完，其他进程只是把刚创建的filter append进来
			
 
				+                key = (
			
 
				+                    f"ScalableBloomFilter:{self.name}"
			
 
				+                    if self.name
			
 
				+                    else "ScalableBloomFilter"
			
 
				+                )
			
 
				+                with RedisLock(key=key) as lock:
			
 
				+                    if lock.locked:
			
 
				+                        while True:
			
 
				+                            if self.filters[-1].is_at_capacity:
			
 
				+                                self.filters.append(self.create_filter())
			
 
				+                            else:
			
 
				+                                break
			
 
				+
			
 
				+                        self._check_capacity_time = time.time()
			
 
				+
			
 
				+    def add(self, keys, skip_check=False):
			
 
				+        """
			
 
				+        Adds a key to this bloom filter. If the key already exists in this
			
 
				+        filter it will return False. Otherwise True. keys support list
			
 
				+        @param keys: list or one key
			
 
				+        @param skip_check: add directly，not check if is exist in bloomfilters
			
 
				+        @return:
			
 
				+        """
			
 
				+
			
 
				+        self.check_filter_capacity()
			
 
				+
			
 
				+        current_filter = self.filters[-1]
			
 
				+
			
 
				+        if skip_check:
			
 
				+            return current_filter.add(keys)
			
 
				+
			
 
				+        else:
			
 
				+            is_list = isinstance(keys, list)
			
 
				+
			
 
				+            keys = keys if is_list else [keys]
			
 
				+            not_exist_keys = list(set(keys))
			
 
				+
			
 
				+            # 检查之前的bloomfilter是否存在
			
 
				+            # 记录下每级filter存在的key，不存在的key继续向下检查
			
 
				+            for filter in reversed(self.filters):
			
 
				+                current_filter_is_exists = filter.get(
			
 
				+                    not_exist_keys, to_list=True
			
 
				+                )  # 当前的filter是否存在
			
 
				+
			
 
				+                not_exist_keys_temp = []
			
 
				+
			
 
				+                for key, is_exist in zip(not_exist_keys, current_filter_is_exists):
			
 
				+                    if not is_exist:  # 当前filter不存在的key 需要继续向下检查
			
 
				+                        not_exist_keys_temp.append(key)
			
 
				+
			
 
				+                not_exist_keys = not_exist_keys_temp
			
 
				+
			
 
				+                if not not_exist_keys:
			
 
				+                    break
			
 
				+
			
 
				+            # 仍有不存在的关键词，记录该关键词
			
 
				+            if not_exist_keys:
			
 
				+                current_filter.add(not_exist_keys)
			
 
				+
			
 
				+            # 比较key是否已存在, 内部重复的key 若不存在啊则只留其一算为不存在，其他看作已存在
			
 
				+            for i, key in enumerate(keys):
			
 
				+                for j, not_exist_key in enumerate(not_exist_keys):
			
 
				+                    if key == not_exist_key:
			
 
				+                        keys[i] = 1
			
 
				+                        not_exist_keys.pop(j)
			
 
				+                        break
			
 
				+                else:
			
 
				+                    keys[i] = 0
			
 
				+
			
 
				+            is_added = keys
			
 
				+            return is_added if is_list else is_added[0]
			
 
				+
			
 
				+    def get(self, keys):
			
 
				+        self.check_filter_capacity()
			
 
				+
			
 
				+        is_list = isinstance(keys, list)
			
 
				+
			
 
				+        keys = keys if is_list else [keys]  # 最终会修改为 [0, 1, ...] 0表示不存在 1 已存在
			
 
				+        not_exist_keys = list(set(keys))
			
 
				+
			
 
				+        # 检查之前的bloomfilter是否存在
			
 
				+        # 记录下每级filter存在的key，不存在的key继续向下检查
			
 
				+        for filter in reversed(self.filters):
			
 
				+            current_filter_is_exists = filter.get(
			
 
				+                not_exist_keys, to_list=True
			
 
				+            )  # 当前的filter是否存在
			
 
				+
			
 
				+            not_exist_keys_temp = []
			
 
				+
			
 
				+            for checked_key, is_exist in zip(not_exist_keys, current_filter_is_exists):
			
 
				+                if not is_exist:  # 当前filter不存在的key 需要继续向下检查
			
 
				+                    not_exist_keys_temp.append(checked_key)
			
 
				+
			
 
				+            not_exist_keys = not_exist_keys_temp
			
 
				+
			
 
				+            if not not_exist_keys:
			
 
				+                break
			
 
				+
			
 
				+        # 比较key是否已存在, 内部重复的key 若不存在啊则只留其一算为不存在，其他看作已存在
			
 
				+        for i, key in enumerate(keys):
			
 
				+            for j, not_exist_key in enumerate(not_exist_keys):
			
 
				+                if key == not_exist_key:
			
 
				+                    keys[i] = 0
			
 
				+                    not_exist_keys.pop(j)
			
 
				+                    break
			
 
				+            else:
			
 
				+                keys[i] = 1
			
 
				+
			
 
				+        is_exists = keys
			
 
				+        return is_exists if is_list else is_exists[0]
			
 
				+
			
 
				+    @property
			
 
				+    def capacity(self):
			
 
				+        """Returns the total capacity for all filters in this SBF"""
			
 
				+        return sum(f.capacity for f in self.filters)
			
--- a/FworkSpider/feapder/dedup/expirefilter.py
+++ b/FworkSpider/feapder/dedup/expirefilter.py
@@ -0,0 +1,70 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018/12/13 9:44 PM
			
 
				+---------
			
 
				+@summary: 带有有效期的去重集合
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import time
			
 
				+
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+
			
 
				+
			
 
				+class ExpireFilter:
			
 
				+    redis_db = None
			
 
				+
			
 
				+    def __init__(
			
 
				+        self, name: str, expire_time: int, expire_time_record_key=None, redis_url=None
			
 
				+    ):
			
 
				+        if not name:
			
 
				+            raise ValueError("name cant't be None")
			
 
				+        if not expire_time:
			
 
				+            raise ValueError("please set expire time, units is seconds")
			
 
				+
			
 
				+        if not self.__class__.redis_db:
			
 
				+            self.__class__.redis_db = RedisDB(url=redis_url)
			
 
				+
			
 
				+        self.name = name
			
 
				+        self.expire_time = expire_time
			
 
				+        self.expire_time_record_key = expire_time_record_key
			
 
				+        self.del_expire_key_time = None
			
 
				+
			
 
				+        self.record_expire_time()
			
 
				+
			
 
				+        self.del_expire_key()
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "<ExpireSet: {}>".format(self.name)
			
 
				+
			
 
				+    @property
			
 
				+    def current_timestamp(self):
			
 
				+        return int(time.time())
			
 
				+
			
 
				+    def add(self, keys, *args, **kwargs):
			
 
				+        """
			
 
				+        @param keys: 检查关键词在zset中是否存在，支持列表批量
			
 
				+        @return: list / 单个值
			
 
				+        """
			
 
				+        if self.current_timestamp - self.del_expire_key_time > self.expire_time:
			
 
				+            self.del_expire_key()
			
 
				+
			
 
				+        is_added = self.redis_db.zadd(self.name, keys, self.current_timestamp)
			
 
				+        return is_added
			
 
				+
			
 
				+    def get(self, keys):
			
 
				+        return self.redis_db.zexists(self.name, keys)
			
 
				+
			
 
				+    def del_expire_key(self):
			
 
				+        self.redis_db.zremrangebyscore(
			
 
				+            self.name, "-inf", self.current_timestamp - self.expire_time
			
 
				+        )
			
 
				+        self.del_expire_key_time = self.current_timestamp
			
 
				+
			
 
				+    def record_expire_time(self):
			
 
				+        if self.expire_time_record_key:
			
 
				+            self.redis_db.hset(
			
 
				+                self.expire_time_record_key, key=self.name, value=self.expire_time
			
 
				+            )
			
--- a/FworkSpider/feapder/network/__init__.py
+++ b/FworkSpider/feapder/network/__init__.py
--- a/FworkSpider/feapder/network/cookie_pool.py
+++ b/FworkSpider/feapder/network/cookie_pool.py
@@ -0,0 +1,821 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018/12/27 11:32 AM
			
 
				+---------
			
 
				+@summary: cookie池
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import abc
			
 
				+import datetime
			
 
				+import random
			
 
				+import time
			
 
				+import warnings
			
 
				+from collections import Iterable
			
 
				+from enum import Enum, unique
			
 
				+
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder import setting
			
 
				+from feapder.db.mysqldb import MysqlDB
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.utils import metrics
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.redis_lock import RedisLock
			
 
				+from feapder.utils.tools import send_msg
			
 
				+from feapder.utils.webdriver import WebDriver
			
 
				+
			
 
				+
			
 
				+class CookiePoolInterface(metaclass=abc.ABCMeta):
			
 
				+    """
			
 
				+    cookie pool interface
			
 
				+    """
			
 
				+
			
 
				+    @abc.abstractmethod
			
 
				+    def create_cookie(self, *args, **kwargs):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    @abc.abstractmethod
			
 
				+    def get_cookie(self, *args, **kwargs):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    @abc.abstractmethod
			
 
				+    def del_cookie(self, *args, **kwargs):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    @abc.abstractmethod
			
 
				+    def run(self):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+
			
 
class PageCookiePool(CookiePoolInterface):
    """
    Pool of cookies produced by loading a page in a browser; no user login
    is required. Cookies are kept in a Redis list.
    """

    def __init__(
        self,
        redis_key,
        page_url=None,
        min_cookies=10000,
        must_contained_keys=(),
        keep_alive=False,
        **kwargs,
    ):
        """
        @param redis_key: project name, used as the prefix of the Redis keys
        @param page_url: url that is loaded to produce a cookie
        @param min_cookies: minimum number of cookies to keep in the pool
        @param must_contained_keys: keys a cookie must contain to be accepted
        @param keep_alive: when the pool is full, stay on standby and keep producing
                           (True) or exit once the pool is satisfied (False)
        ---
        @param kwargs: WebDriver options
            load_images: whether to load images
            user_agent_pool: user-agent pool; not used when None
            proxies_pool: proxy pool; not used when None
            headless: whether to run headless
            driver_type: web driver type
            timeout: request timeout, 16s by default
            window_size: screen resolution (width, height)

        """

        self._redisdb = RedisDB()

        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
            redis_key
        )  # stores when the cookie count was last sampled, formatted as "timestamp:count"
        self._page_url = page_url
        self._min_cookies = min_cookies
        self._must_contained_keys = must_contained_keys
        self._keep_alive = keep_alive

        self._kwargs = kwargs
        # Defaults applied only when the caller did not pass these options.
        self._kwargs.setdefault("load_images", False)
        self._kwargs.setdefault("headless", True)

    def create_cookie(self):
        """
        Load ``page_url`` in a WebDriver and collect its cookies.
        Intended to be overridden when needed.
        @return: dict of cookie name -> value, or None when a required key is missing
        """
        with WebDriver(**self._kwargs) as driver:
            driver.get(self._page_url)

            cookies = driver.get_cookies()

            cookies_json = {}
            for cookie in cookies:
                cookies_json[cookie["name"]] = cookie["value"]

            # for/else: the cookie is returned only if no required key was missing.
            for key in self._must_contained_keys:
                if key not in cookies_json:
                    break
            else:
                return cookies_json

            log.error("获取cookie失败 cookies = {}".format(cookies_json))
            return None

    def add_cookies(self, cookies):
        """Log the cookie and push it onto the pool list."""
        log.info("添加cookie {}".format(cookies))
        self._redisdb.lpush(self._tab_cookie_pool, cookies)

    def run(self):
        """
        Production loop. While the pool holds fewer than ``min_cookies`` cookies,
        one cookie is produced per iteration. Once the pool is large enough the
        loop exits, unless ``keep_alive`` is set, in which case it sleeps 10s
        and re-checks. Any exception is logged and the loop retries after 1s.
        """
        while True:
            try:
                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
                need_cookie_count = self._min_cookies - now_cookie_count

                if need_cookie_count > 0:
                    log.info(
                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
                            now_cookie_count, self._min_cookies
                        )
                    )
                    try:
                        cookies = self.create_cookie()
                        if cookies:
                            self.add_cookies(cookies)
                    except Exception as e:
                        log.exception(e)
                else:
                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))

                    # Check whether the pool size changed since the last sample taken
                    # more than a minute ago; if not, assume the spider no longer
                    # uses the pool and stop producing.
                    last_count_info = self._redisdb.strget(
                        self._tab_cookie_pool_last_count
                    )
                    if not last_count_info:
                        self._redisdb.strset(
                            self._tab_cookie_pool_last_count,
                            "{}:{}".format(time.time(), now_cookie_count),
                        )
                    else:
                        last_time, last_count = last_count_info.split(":")
                        last_time = float(last_time)
                        last_count = int(last_count)

                        if time.time() - last_time > 60:
                            if now_cookie_count == last_count:
                                log.info("近一分钟，cookie池数量无变化，判定爬虫未使用，退出生产")
                                break
                            else:
                                # Size changed: take a fresh sample.
                                self._redisdb.strset(
                                    self._tab_cookie_pool_last_count,
                                    "{}:{}".format(time.time(), now_cookie_count),
                                )

                    if self._keep_alive:
                        log.info("sleep 10")
                        tools.delay_time(10)
                    else:
                        break

            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def get_cookie(self, wait_when_null=True):
        """
        Fetch a cookie from the pool.

        @param wait_when_null: when the pool is empty, produce a cookie (under a
                               Redis lock) and retry instead of returning
        @return: the cookie evaluated from its stored text, or {} when the pool
                 is empty and wait_when_null is False
        """
        while True:
            try:
                # presumably rotates the list so the cookie stays in the pool —
                # confirm against RedisDB.rpoplpush
                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
                if not cookie_info and wait_when_null:
                    log.info("暂无cookie 生产中...")
                    # These overrides persist on the instance: run() will now stop
                    # as soon as a single cookie exists.
                    self._keep_alive = False
                    self._min_cookies = 1
                    with RedisLock(
                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
                    ) as _lock:
                        if _lock.locked:
                            self.run()
                    continue
                # NOTE(review): eval() executes whatever text is stored in Redis;
                # consider ast.literal_eval if the pool can hold untrusted data.
                return eval(cookie_info) if cookie_info else {}
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)

    def del_cookie(self, cookies):
        """Remove the given cookie from the pool list."""
        self._redisdb.lrem(self._tab_cookie_pool, cookies)
			
 
				+
			
 
				+
			
 
				+class User:
			
 
				+    def __init__(self, username, cookie):
			
 
				+        self.username = username
			
 
				+        self.cookie = cookie
			
 
				+
			
 
				+
			
 
				+class LoginCookiePool(CookiePoolInterface):
			
 
				+    """
			
 
				+    需要登陆的cookie池, 用户账号密码等信息用mysql保存
			
 
				+    """
			
 
				+
			
 
    def __init__(
        self,
        redis_key,
        *,
        table_userbase,
        login_state_key="login_state",
        lock_state_key="lock_state",
        username_key="username",
        password_key="password",
        login_retry_times=10,
    ):
        """
        @param redis_key: project name, used as the prefix of the Redis key
        @param table_userbase: name of the user table
        @param login_state_key: column name of the login state
        @param lock_state_key: column name of the blocked state
        @param username_key: column name of the login name
        @param password_key: column name of the password
        @param login_retry_times: number of retries after a failed login
        """

        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
        self._login_retry_times = login_retry_times
        self._table_userbase = table_userbase
        self._login_state_key = login_state_key
        self._lock_state_key = lock_state_key
        self._username_key = username_key
        self._password_key = password_key

        self._redisdb = RedisDB()
        # Was ``()``: an empty tuple has no execute/find/update, so the
        # create_userbase() call below failed with AttributeError every time.
        self._mysqldb = MysqlDB()

        # Make sure the user table exists before the pool is used.
        self.create_userbase()
			
 
				+
			
 
    def create_userbase(self):
        """
        Create the user table if it does not exist yet, using the configured
        table and column names. The unique index is built on the configured
        username column; it used to hard-code ``username``, which does not
        match the table when ``username_key`` is customised.
        """
        sql = f"""
            CREATE TABLE IF NOT EXISTS `{self._table_userbase}` (
              `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
              `{self._username_key}` varchar(50) DEFAULT NULL COMMENT '用户名',
              `{self._password_key}` varchar(255) DEFAULT NULL COMMENT '密码',
              `{self._login_state_key}` int(11) DEFAULT '0' COMMENT '登录状态（0未登录 1已登录）',
              `{self._lock_state_key}` int(11) DEFAULT '0' COMMENT '账号是否被封（0 未封 1 被封）',
              PRIMARY KEY (`id`),
              UNIQUE KEY `{self._username_key}` (`{self._username_key}`) USING BTREE
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """
        self._mysqldb.execute(sql)
			
 
				+
			
 
    def create_cookie(self, username, password):
        """
        Log in with the given account and produce its cookie.
        This body only raises; an overriding implementation is expected to
        supply the login logic.
        @param username: account name
        @param password: account password
        @return: cookie / None
        """
        raise NotImplementedError()
			
 
				+
			
 
    def get_user_info(self):
        """
        Query the accounts whose lock state and login state are both not 1.
        @return: the result of ``self._mysqldb.find`` for the
                 (username, password) select
        """
        query = (
            f"select {self._username_key}, {self._password_key} "
            f"from {self._table_userbase} "
            f"where {self._lock_state_key} != 1 and {self._login_state_key} != 1"
        )

        return self._mysqldb.find(query)
			
 
				+
			
 
    def handle_login_failed_user(self, username, password):
        """
        Hook for accounts whose login failed. Does nothing in this body.
        @param username: account name
        @param password: account password
        @return: None
        """
        return None
			
 
				+
			
 
    def handel_exception(self, e):
        """
        Handle an exception by logging it through ``log.exception``.
        @param e: the exception to report
        @return:
        """
        log.exception(e)
			
 
				+
			
 
    def save_cookie(self, username, cookie):
        """
        Pool a freshly created cookie and mark its account as logged in.

        @param username: account the cookie belongs to
        @param cookie: cookie value to store
        @return: None
        """
        # First push the username/cookie pair onto the redis cookie-pool list ...
        self._redisdb.lpush(
            self._tab_cookie_pool, {"username": username, "cookie": cookie}
        )

        # ... then set the login-state column of that account to 1.
        # NOTE(review): username is interpolated into the SQL text unescaped; a
        # value containing a quote would break the statement - confirm callers.
        statement = (
            f"update {self._table_userbase} set {self._login_state_key} = 1 "
            f"where {self._username_key} = '{username}'"
        )
        self._mysqldb.update(statement)
			
 
				+
			
 
    def get_cookie(self, wait_when_null=True) -> User:
        """
        Take the next cookie from the pool.

        The entry is fetched with ``rpoplpush`` on the cookie-pool key. When the
        pool is empty and ``wait_when_null`` is true, a login round is started
        and the fetch is retried; any exception is logged and retried after a
        one-second delay.

        @param wait_when_null: keep producing/waiting while the pool is empty
        @return: a User built from the stored entry, or None when the pool is
                 empty and wait_when_null is false
        """
        while True:
            try:
                raw_entry = self._redisdb.rpoplpush(self._tab_cookie_pool)

                if raw_entry:
                    # NOTE(review): eval() executes whatever text is stored in
                    # redis; if the pool can be written by untrusted parties this
                    # is a code-execution risk (ast.literal_eval may be enough).
                    return User(**eval(raw_entry))

                if not wait_when_null:
                    return None

                # Pool is empty: produce cookies, then try again.
                log.info("暂无cookie 生产中...")
                self.login()
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
			
 
				+
			
 
    def del_cookie(self, user: User):
        """
        Remove an invalid cookie from the pool and mark its account as logged out.

        @param user: user whose username/cookie pair should be dropped
        @return: None
        """
        # Drop the matching username/cookie entry from the redis cookie-pool list.
        self._redisdb.lrem(
            self._tab_cookie_pool, {"username": user.username, "cookie": user.cookie}
        )

        # Reset the login-state column of that account to 0 (not logged in).
        statement = (
            f"update {self._table_userbase} set {self._login_state_key} = 0 "
            f"where {self._username_key} = '{user.username}'"
        )
        self._mysqldb.update(statement)
			
 
				+
			
 
    def user_is_locked(self, user: User):
        """
        Flag an account as banned by setting its lock-state column to 1.

        @param user: user whose account should be marked as locked
        @return: None
        """
        statement = (
            f"update {self._table_userbase} set {self._lock_state_key} = 1 "
            f"where {self._username_key} = '{user.username}'"
        )
        self._mysqldb.update(statement)
			
 
				+
			
 
    def run(self):
        """
        Log in every eligible account and pool the resulting cookies.

        The work is done under a RedisLock keyed on the cookie-pool name; when
        the lock is not obtained nothing happens. Each account returned by
        ``get_user_info`` is tried up to ``self._login_retry_times`` times: an
        attempt that raises is reported through ``handel_exception`` and
        retried, and an account whose attempts all raised is passed to
        ``handle_login_failed_user``.

        @raise ValueError: when get_user_info returns a non-iterable value
        """
        with RedisLock(
            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
        ) as _lock:
            if not _lock.locked:
                return

            accounts = self.get_user_info()
            if not isinstance(accounts, Iterable):
                raise ValueError("get_user_info 返回值必须可迭代")

            if not accounts:
                log.info("无可用用户")

            for username, password in accounts:
                attempt_completed = False

                for _ in range(self._login_retry_times):
                    try:
                        cookie = self.create_cookie(username, password)
                        if not cookie:
                            self.handle_login_failed_user(username, password)
                        else:
                            self.save_cookie(username, cookie)

                        # The attempt ran without raising: stop retrying.
                        attempt_completed = True
                        break
                    except Exception as e:
                        self.handel_exception(e)

                if not attempt_completed:
                    # Every attempt raised (or no attempt was allowed at all).
                    self.handle_login_failed_user(username, password)

    login = run
			
 
				+
			
 
				+
			
 
				+@unique
			
 
				+class LimitTimesUserStatus(Enum):
			
 
				+    # 使用状态
			
 
				+    USED = "used"
			
 
				+    SUCCESS = "success"
			
 
				+    OVERDUE = "overdue"  # cookie 过期
			
 
				+    SLEEP = "sleep"
			
 
				+    EXCEPTION = "exception"
			
 
				+    # 登陆状态
			
 
				+    LOGIN_SUCCESS = "login_success"
			
 
				+    LOGIN_FALIED = "login_failed"
			
 
				+
			
 
				+
			
 
				+class LimitTimesUser:
			
 
				+    """
			
 
				+    有次数限制的账户
			
 
				+    基于本地做的缓存，不支持多进程调用
			
 
				+    """
			
 
				+
			
 
				+    ACCOUNT_INFO_KEY = "accounts:h_account_info"  # 存储cookie的redis key
			
 
				+    SITE_NAME = ""  # 网站名
			
 
				+
			
 
				+    redisdb = None
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        username,
			
 
				+        password,
			
 
				+        max_search_times,
			
 
				+        proxies=None,
			
 
				+        search_interval=0,
			
 
				+        **kwargs,
			
 
				+    ):
			
 
				+        """
			
 
				+        @param username:
			
 
				+        @param password:
			
 
				+        @param max_search_times:
			
 
				+        @param proxies:
			
 
				+        @param search_interval: 调用时间间隔。 支持元组 指定间隔的时间范围 如（5，10）即5到10秒；或直接传整数
			
 
				+        """
			
 
				+        self.__dict__.update(kwargs)
			
 
				+        self.username = username
			
 
				+        self.password = password
			
 
				+        self.max_search_times = max_search_times
			
 
				+        self.proxies = proxies
			
 
				+        self.search_interval = search_interval
			
 
				+        self.delay_use = 0  # 延时使用，用于等待解封的用户
			
 
				+
			
 
				+        if isinstance(search_interval, (tuple, list)):
			
 
				+            if len(search_interval) != 2:
			
 
				+                raise ValueError("search_interval 需传递两个值的元组或列表。如（5，10）即5到10秒")
			
 
				+
			
 
				+            self.used_for_time_length = (
			
 
				+                search_interval[1] * 5
			
 
				+            )  # 抢占式爬虫独享cookie时间，这段时间内其他爬虫不可抢占
			
 
				+        else:
			
 
				+            self.used_for_time_length = (
			
 
				+                search_interval * 5
			
 
				+            )  # 抢占式爬虫独享cookie时间，这段时间内其他爬虫不可抢占
			
 
				+
			
 
				+        self.account_info = {
			
 
				+            "login_time": 0,
			
 
				+            "cookies": {},
			
 
				+            "search_times": 0,
			
 
				+            "last_search_time": 0,
			
 
				+            "used_for_spider_name": None,  # 只被某个爬虫使用 其他爬虫不可使用
			
 
				+            "init_search_times_time": 0,  # 初始化搜索次数的时间
			
 
				+        }
			
 
				+
			
 
				+        if not self.__class__.redisdb:
			
 
				+            self.__class__.redisdb = RedisDB()
			
 
				+
			
 
				+        self.sync_account_info_from_redis()
			
 
				+
			
 
				+        self.__init_metrics()
			
 
				+
			
 
				+    def __init_metrics(self):
			
 
				+        """
			
 
				+        初始化打点系统
			
 
				+        @return:
			
 
				+        """
			
 
				+        metrics.init(**setting.METRICS_OTHER_ARGS)
			
 
				+
			
 
				+    def record_user_status(self, status: LimitTimesUserStatus):
			
 
				+        metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
			
 
				+
			
 
				+    def __eq__(self, other):
			
 
				+        return self.username == other.username
			
 
				+
			
 
				+    def sync_account_info_from_redis(self):
			
 
				+        account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
			
 
				+        if account_info:
			
 
				+            account_info = eval(account_info)
			
 
				+            self.account_info.update(account_info)
			
 
				+
			
 
				+    @property
			
 
				+    def cookies(self):
			
 
				+        cookies = self.account_info.get("cookies")
			
 
				+        return cookies
			
 
				+
			
 
				+    def set_cookies(self, cookies):
			
 
				+        self.account_info["cookies"] = cookies
			
 
				+        return self.redisdb.hset(
			
 
				+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
			
 
				+        )
			
 
				+
			
 
				+    def set_login_time(self, login_time=None):
			
 
				+        self.account_info["login_time"] = login_time or time.time()
			
 
				+        return self.redisdb.hset(
			
 
				+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
			
 
				+        )
			
 
				+
			
 
				+    def get_login_time(self):
			
 
				+        return self.account_info.get("login_time")
			
 
				+
			
 
				+    def is_time_to_login(self):
			
 
				+        return time.time() - self.get_login_time() > 40 * 60
			
 
				+
			
 
				+    def get_last_search_time(self):
			
 
				+        return self.account_info.get("last_search_time", 0)
			
 
				+
			
 
				+    def is_time_to_search(self):
			
 
				+        if self.delay_use:
			
 
				+            is_time = time.time() - self.get_last_search_time() > self.delay_use
			
 
				+            if is_time:
			
 
				+                self.delay_use = 0
			
 
				+
			
 
				+        else:
			
 
				+            is_time = time.time() - self.get_last_search_time() > (
			
 
				+                random.randint(*self.search_interval)
			
 
				+                if isinstance(self.search_interval, (tuple, list))
			
 
				+                else self.search_interval
			
 
				+            )
			
 
				+
			
 
				+        return is_time
			
 
				+
			
 
				+    @property
			
 
				+    def used_for_spider_name(self):
			
 
				+        return self.account_info.get("used_for_spider_name")
			
 
				+
			
 
				+    @used_for_spider_name.setter
			
 
				+    def used_for_spider_name(self, spider_name):
			
 
				+        self.account_info["used_for_spider_name"] = spider_name
			
 
				+
			
 
				+    def update_status(self):
			
 
				+        """
			
 
				+        更新search的一些状态
			
 
				+        @return:
			
 
				+        """
			
 
				+        self.account_info["search_times"] += 1
			
 
				+        self.account_info["last_search_time"] = time.time()
			
 
				+
			
 
				+        return self.redisdb.hset(
			
 
				+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
			
 
				+        )
			
 
				+
			
 
				+    @property
			
 
				+    def search_times(self):
			
 
				+        init_search_times_time = self.account_info.get("init_search_times_time")
			
 
				+        current_time = time.time()
			
 
				+        if (
			
 
				+            current_time - init_search_times_time >= 86400
			
 
				+        ):  # 如果距离上次初始化搜索次数时间大于1天，则搜索次数清清零
			
 
				+            self.account_info["search_times"] = 0
			
 
				+            self.account_info["init_search_times_time"] = current_time
			
 
				+
			
 
				+            self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
			
 
				+
			
 
				+        return self.account_info["search_times"]
			
 
				+
			
 
				+    def is_overwork(self):
			
 
				+        if self.search_times > self.max_search_times:
			
 
				+            log.warning("账号 {} 请求次数超限制".format(self.username))
			
 
				+            return True
			
 
				+
			
 
				+        return False
			
 
				+
			
 
				+    def is_at_work_time(self):
			
 
				+        if datetime.datetime.now().hour in list(range(7, 23)):
			
 
				+            return True
			
 
				+
			
 
				+        log.warning("账号 {} 不再工作时间内".format(self.username))
			
 
				+        return False
			
 
				+
			
 
				+    def del_cookie(self):
			
 
				+        self.account_info["cookies"] = {}
			
 
				+        return self.redisdb.hset(
			
 
				+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
			
 
				+        )
			
 
				+
			
 
				+    def create_cookie(self):
			
 
				+        """
			
 
				+        生产cookie 有异常需要抛出
			
 
				+        @return: cookie_dict
			
 
				+        """
			
 
				+
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    def login(self):
			
 
				+        """
			
 
				+        @return: 1 成功 0 失败
			
 
				+        """
			
 
				+
			
 
				+        try:
			
 
				+            # 预检查
			
 
				+            if not self.is_time_to_login():
			
 
				+                log.info("此账号尚未到登陆时间: {}".format(self.username))
			
 
				+                time.sleep(5)
			
 
				+                return 0
			
 
				+
			
 
				+            cookies = self.create_cookie()
			
 
				+            if not cookies:
			
 
				+                raise Exception("登陆失败 未获取到合法cookie")
			
 
				+
			
 
				+            if not isinstance(cookies, dict):
			
 
				+                raise Exception("cookie 必须为字典格式")
			
 
				+
			
 
				+            # 保存cookie
			
 
				+            self.set_login_time()
			
 
				+            self.set_cookies(cookies)
			
 
				+            log.info("登录成功 {}".format(self.username))
			
 
				+            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
			
 
				+            return 1
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            log.exception(e)
			
 
				+            send_msg(
			
 
				+                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
			
 
				+                level="error",
			
 
				+                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
			
 
				+            )
			
 
				+
			
 
				+        log.info("登录失败 {}".format(self.username))
			
 
				+        self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
			
 
				+        return 0
			
 
				+
			
 
				+
			
 
				+class LimitTimesUserPool:
			
 
				+    """
			
 
				+    限制查询次数的用户的User pool
			
 
				+    基于本地做的缓存，不支持多进程调用
			
 
				+    """
			
 
				+
			
 
				+    LOAD_USER_INTERVAL = 60
			
 
				+
			
 
				+    def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
			
 
				+        """
			
 
				+        @param accounts_dic: 账户信息字典
			
 
				+            {
			
 
				+                "15011300228": {
			
 
				+                    "password": "300228",
			
 
				+                    "proxies": {},
			
 
				+                    "max_search_times": 500,
			
 
				+                    "search_interval": 1, # 使用时间间隔
			
 
				+                    # 其他携带信息
			
 
				+                }
			
 
				+            }
			
 
				+        @param limit_user_class: 用户重写的 limit_user_class
			
 
				+        @param support_more_client: 是否支持多客户端 即多线程 多进程模式 (可能在计数上及使用频率上有些误差)
			
 
				+        """
			
 
				+        self.accounts_dict = accounts_dict
			
 
				+        self.limit_user_class = limit_user_class
			
 
				+
			
 
				+        self.limit_times_users = []
			
 
				+        self.current_user_index = -1
			
 
				+
			
 
				+        self.support_more_client = support_more_client
			
 
				+
			
 
				+        self.last_load_user_time = 0
			
 
				+
			
 
				+    def __load_users(self, username=None):
			
 
				+        # 装载user
			
 
				+        log.info("更新可用用户")
			
 
				+
			
 
				+        for _username, detail in self.accounts_dict.items():
			
 
				+            if username and username != _username:
			
 
				+                continue
			
 
				+
			
 
				+            limit_times_users = self.limit_user_class(username=_username, **detail)
			
 
				+            if limit_times_users in self.limit_times_users:
			
 
				+                continue
			
 
				+
			
 
				+            if limit_times_users.is_overwork():
			
 
				+                continue
			
 
				+            else:
			
 
				+                if (
			
 
				+                    limit_times_users.cookies or limit_times_users.login()
			
 
				+                ):  # 如果有cookie 或者登陆成功 则添加到可用的user队列
			
 
				+                    self.limit_times_users.append(limit_times_users)
			
 
				+
			
 
				+        self.last_load_user_time = time.time()
			
 
				+
			
 
				+    def get_user(
			
 
				+        self,
			
 
				+        username=None,
			
 
				+        used_for_spider_name=None,
			
 
				+        wait_when_null=True,
			
 
				+        not_limit_frequence=False,
			
 
				+    ) -> LimitTimesUser:
			
 
				+        """
			
 
				+        @params username: 获取指定的用户
			
 
				+        @params used_for_spider_name: 独享式使用，独享爬虫的名字。其他爬虫不可抢占
			
 
				+        @params wait_when_null: 无用户时是否等待
			
 
				+        @params not_limit_frequence: 不限制使用频率
			
 
				+        @return: LimitTimesUser
			
 
				+        """
			
 
				+        if not self.support_more_client:
			
 
				+            warnings.warn(
			
 
				+                "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存，不支持多进程或多线程",
			
 
				+                category=Warning,
			
 
				+            )
			
 
				+            self._is_show_warning = True
			
 
				+
			
 
				+        while True:
			
 
				+            if (
			
 
				+                not self.limit_times_users
			
 
				+                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
			
 
				+            ):
			
 
				+                self.__load_users(username)
			
 
				+                if not self.limit_times_users:
			
 
				+                    log.warning("无可用的用户")
			
 
				+                    if wait_when_null:
			
 
				+                        time.sleep(1)
			
 
				+                        continue
			
 
				+                    else:
			
 
				+                        return None
			
 
				+
			
 
				+            self.current_user_index += 1
			
 
				+            self.current_user_index = self.current_user_index % len(
			
 
				+                self.limit_times_users
			
 
				+            )
			
 
				+
			
 
				+            limit_times_user = self.limit_times_users[self.current_user_index]
			
 
				+            if self.support_more_client:  # 需要先同步下最新数据
			
 
				+                limit_times_user.sync_account_info_from_redis()
			
 
				+
			
 
				+            if username and limit_times_user.username != username:
			
 
				+                log.info(
			
 
				+                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
			
 
				+                )
			
 
				+                time.sleep(1)
			
 
				+                continue
			
 
				+
			
 
				+            # 独占式使用，若为其他爬虫，检查等待使用时间是否超过独占时间，若超过则可以使用
			
 
				+            if (
			
 
				+                limit_times_user.used_for_spider_name
			
 
				+                and limit_times_user.used_for_spider_name != used_for_spider_name
			
 
				+            ):
			
 
				+                wait_time = time.time() - limit_times_user.get_last_search_time()
			
 
				+                if wait_time < limit_times_user.used_for_time_length:
			
 
				+                    log.info(
			
 
				+                        "用户{} 被 {} 爬虫独占，需等待 {} 秒后才可使用".format(
			
 
				+                            limit_times_user.username,
			
 
				+                            limit_times_user.used_for_spider_name,
			
 
				+                            limit_times_user.used_for_time_length - wait_time,
			
 
				+                        )
			
 
				+                    )
			
 
				+                    time.sleep(1)
			
 
				+                    continue
			
 
				+
			
 
				+            if (
			
 
				+                not limit_times_user.is_overwork()
			
 
				+                and limit_times_user.is_at_work_time()
			
 
				+            ):
			
 
				+                if not limit_times_user.cookies:
			
 
				+                    self.limit_times_users.remove(limit_times_user)
			
 
				+                    continue
			
 
				+
			
 
				+                if not_limit_frequence or limit_times_user.is_time_to_search():
			
 
				+                    limit_times_user.used_for_spider_name = used_for_spider_name
			
 
				+
			
 
				+                    limit_times_user.update_status()
			
 
				+                    log.info("使用用户 {}".format(limit_times_user.username))
			
 
				+                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
			
 
				+                    return limit_times_user
			
 
				+                else:
			
 
				+                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
			
 
				+                    time.sleep(1)
			
 
				+                    continue
			
 
				+            else:
			
 
				+                self.limit_times_users.remove(limit_times_user)
			
 
				+                self.current_user_index -= 1
			
 
				+
			
 
				+                if not limit_times_user.is_at_work_time():
			
 
				+                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
			
 
				+                    if wait_when_null:
			
 
				+                        time.sleep(30)
			
 
				+                        continue
			
 
				+                    else:
			
 
				+                        return None
			
 
				+
			
 
				+    def del_user(self, username):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.del_cookie()
			
 
				+                self.limit_times_users.remove(limit_times_user)
			
 
				+                limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
			
 
				+                self.__load_users(username)
			
 
				+                break
			
 
				+
			
 
				+    def update_cookies(self, username, cookies):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.set_cookies(cookies)
			
 
				+                break
			
 
				+
			
 
				+    def delay_use(self, username, delay_seconds):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.delay_use = delay_seconds
			
 
				+                limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
			
 
				+                break
			
 
				+
			
 
				+    def record_success_user(self, username):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
			
 
				+
			
 
				+    def record_exception_user(self, username):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)
			
--- a/FworkSpider/feapder/network/item.py
+++ b/FworkSpider/feapder/network/item.py
@@ -0,0 +1,145 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-07-26 22:28:10
			
 
				+---------
			
 
				+@summary: 定义实体
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import feapder.utils.tools as tools
			
 
				+
			
 
				+
			
 
				+class ItemMetaclass(type):
			
 
				+    def __new__(cls, name, bases, attrs):
			
 
				+        attrs.setdefault("__name__", None)
			
 
				+        attrs.setdefault("__table_name__", None)
			
 
				+        attrs.setdefault("__name_underline__", None)
			
 
				+        attrs.setdefault("__update_key__", None)
			
 
				+        attrs.setdefault("__unique_key__", None)
			
 
				+
			
 
				+        return type.__new__(cls, name, bases, attrs)
			
 
				+
			
 
				+
			
 
				+class Item(metaclass=ItemMetaclass):
			
 
				+    __unique_key__ = []
			
 
				+
			
 
				+    def __init__(self, **kwargs):
			
 
				+        self.__dict__ = kwargs
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "<{}: {}>".format(self.item_name, tools.dumps_json(self.to_dict))
			
 
				+
			
 
				+    def __getitem__(self, key):
			
 
				+        return self.__dict__[key]
			
 
				+
			
 
				+    def __setitem__(self, key, value):
			
 
				+        self.__dict__[key] = value
			
 
				+
			
 
				+    def pre_to_db(self):
			
 
				+        """
			
 
				+        入库前的处理
			
 
				+        """
			
 
				+        pass
			
 
				+
			
 
				+    @property
			
 
				+    def to_dict(self):
			
 
				+        propertys = {}
			
 
				+        for key, value in self.__dict__.items():
			
 
				+            if key not in (
			
 
				+                "__name__",
			
 
				+                "__table_name__",
			
 
				+                "__name_underline__",
			
 
				+                "__update_key__",
			
 
				+                "__unique_key__",
			
 
				+            ):
			
 
				+                if key.startswith(f"_{self.__class__.__name__}"):
			
 
				+                    key = key.replace(f"_{self.__class__.__name__}", "")
			
 
				+                propertys[key] = value
			
 
				+
			
 
				+        return propertys
			
 
				+
			
 
				+    def to_sql(self, auto_update=False, update_columns=()):
			
 
				+        return tools.make_insert_sql(
			
 
				+            self.table_name, self.to_dict, auto_update, update_columns
			
 
				+        )
			
 
				+
			
 
				+    @property
			
 
				+    def item_name(self):
			
 
				+        return self.__name__ or self.__class__.__name__
			
 
				+
			
 
				+    @item_name.setter
			
 
				+    def item_name(self, name):
			
 
				+        self.__name__ = name
			
 
				+        self.__table_name__ = self.name_underline.replace("_item", "")
			
 
				+
			
 
				+    @property
			
 
				+    def table_name(self):
			
 
				+        if not self.__table_name__:
			
 
				+            self.__table_name__ = self.name_underline.replace("_item", "")
			
 
				+        return self.__table_name__
			
 
				+
			
 
				+    @table_name.setter
			
 
				+    def table_name(self, name):
			
 
				+        self.__table_name__ = name
			
 
				+        self.__name__ = tools.key2hump(name) + "Item"
			
 
				+
			
 
				+    @property
			
 
				+    def name_underline(self):
			
 
				+        if not self.__name_underline__:
			
 
				+            self.__name_underline__ = tools.key2underline(self.item_name)
			
 
				+
			
 
				+        return self.__name_underline__
			
 
				+
			
 
				+    @name_underline.setter
			
 
				+    def name_underline(self, name):
			
 
				+        self.__name_underline__ = name
			
 
				+
			
 
				+    @property
			
 
				+    def unique_key(self):
			
 
				+        return self.__unique_key__ or self.__class__.__unique_key__
			
 
				+
			
 
				+    @unique_key.setter
			
 
				+    def unique_key(self, keys):
			
 
				+        if isinstance(keys, (tuple, list)):
			
 
				+            self.__unique_key__ = keys
			
 
				+        else:
			
 
				+            self.__unique_key__ = (keys,)
			
 
				+
			
 
				+    @property
			
 
				+    def fingerprint(self):
			
 
				+        args = []
			
 
				+        for key, value in self.to_dict.items():
			
 
				+            if value:
			
 
				+                if (self.unique_key and key in self.unique_key) or not self.unique_key:
			
 
				+                    args.append(str(value))
			
 
				+
			
 
				+        if args:
			
 
				+            args = sorted(args)
			
 
				+            return tools.get_md5(*args)
			
 
				+        else:
			
 
				+            return None
			
 
				+
			
 
				+    def to_UpdateItem(self):
			
 
				+        update_item = UpdateItem(**self.__dict__)
			
 
				+        update_item.item_name = self.item_name
			
 
				+        return update_item
			
 
				+
			
 
				+
			
 
				+class UpdateItem(Item):
			
 
				+    __update_key__ = []
			
 
				+
			
 
				+    def __init__(self, **kwargs):
			
 
				+        super(UpdateItem, self).__init__(**kwargs)
			
 
				+
			
 
				+    @property
			
 
				+    def update_key(self):
			
 
				+        return self.__update_key__ or self.__class__.__update_key__
			
 
				+
			
 
				+    @update_key.setter
			
 
				+    def update_key(self, keys):
			
 
				+        if isinstance(keys, (tuple, list)):
			
 
				+            self.__update_key__ = keys
			
 
				+        else:
			
 
				+            self.__update_key__ = (keys,)
			
--- a/FworkSpider/feapder/network/proxy_file/1c718b9e5cc682d4ca7154958d0919c0.txt
+++ b/FworkSpider/feapder/network/proxy_file/1c718b9e5cc682d4ca7154958d0919c0.txt
@@ -0,0 +1,20 @@
 
				+117.88.5.96:8860


			
 
				+111.179.93.27:8861


			
 
				+111.179.93.27:8860


			
 
				+113.226.100.155:8861


			
 
				+113.226.100.155:8860


			
 
				+114.99.103.81:8861


			
 
				+171.13.51.41:8861


			
 
				+114.99.103.81:8860


			
 
				+171.13.51.41:8860


			
 
				+125.41.17.67:8861


			
 
				+125.41.17.67:8860


			
 
				+113.123.0.127:8861


			
 
				+117.88.5.96:8861


			
 
				+182.101.196.230:8861


			
 
				+113.123.0.127:8860


			
 
				+182.101.196.230:8860


			
 
				+182.34.102.234:8861


			
 
				+182.34.102.234:8860


			
 
				+117.88.4.100:8861


			
 
				+117.88.4.100:8860
			
--- a/FworkSpider/feapder/network/proxy_file/a62f3217a0981b7b2117d9d0af64c2db.txt
+++ b/FworkSpider/feapder/network/proxy_file/a62f3217a0981b7b2117d9d0af64c2db.txt
@@ -0,0 +1,20 @@
 
				+175.162.217.157:8860&&1643361380
			
 
				+222.86.85.51:8861&&1643361867
			
 
				+222.86.85.51:8860&&1643361867
			
 
				+182.101.215.123:8861&&1643361013
			
 
				+182.34.32.132:8860&&1643361124
			
 
				+182.101.215.123:8860&&1643361013
			
 
				+182.34.32.132:8861&&1643361124
			
 
				+113.123.0.11:8861&&1643361579
			
 
				+113.123.0.11:8860&&1643361579
			
 
				+117.66.140.217:8860&&1643361016
			
 
				+117.66.140.217:8861&&1643361016
			
 
				+123.10.66.129:8860&&1643361437
			
 
				+123.10.66.129:8861&&1643361437
			
 
				+123.169.34.75:8860&&1643360309
			
 
				+123.169.34.75:8861&&1643360309
			
 
				+175.162.217.157:8861&&1643361379
			
 
				+111.179.73.220:8860&&1643360596
			
 
				+111.179.73.220:8861&&1643360596
			
 
				+36.62.71.201:8861&&1643360585
			
 
				+36.62.71.201:8860&&1643360585
			
--- a/FworkSpider/feapder/network/proxy_pool.py
+++ b/FworkSpider/feapder/network/proxy_pool.py
@@ -0,0 +1,763 @@
 
				+# coding:utf8
			
 
				+"""
			
 
				+代理池
			
 
				+"""
			
 
				+import datetime
			
 
				+import json
			
 
				+import os
			
 
				+import random
			
 
				+import socket
			
 
				+import time
			
 
				+from urllib import parse
			
 
				+
			
 
				+import redis
			
 
				+import requests
			
 
				+
			
 
				+from feapder import setting
			
 
				+from feapder.utils import tools
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				+def decrypt(input_str: str) -> str:
			
 
				+    """
			
 
				+    改写：新增
			
 
				+    定义base64解密函数
			
 
				+
			
 
				+    :param input_str:
			
 
				+    :return:
			
 
				+    """
			
 
				+    key = "ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/"
			
 
				+    ascii_list = ['{:0>6}'.format(str(bin(key.index(i))).replace('0b', '')) for i in input_str if i != '=']
			
 
				+    output_str = ''
			
 
				+    # 对前面不是“=”的字节取索引，然后转换为2进制
			
 
				+    # 补齐“=”的个数
			
 
				+    equal_num = input_str.count('=')
			
 
				+    while ascii_list:
			
 
				+        temp_list = ascii_list[:4]
			
 
				+        # 转换成2进制字符串
			
 
				+        temp_str = ''.join(temp_list)
			
 
				+        # 对没有8位2进制的字符串补够8位2进制
			
 
				+        if len(temp_str) % 8 != 0:
			
 
				+            temp_str = temp_str[0:-1 * equal_num * 2]
			
 
				+        # 4个6字节的二进制  转换  为三个8字节的二进制
			
 
				+        temp_str_list = [temp_str[x:x + 8] for x in [0, 8, 16]]
			
 
				+        # 二进制转为10进制
			
 
				+        temp_str_list = [int(x, 2) for x in temp_str_list if x]
			
 
				+        # 连接成字符串
			
 
				+        output_str += ''.join([chr(x) for x in temp_str_list])
			
 
				+        ascii_list = ascii_list[4:]
			
 
				+    return output_str
			
 
				+
			
 
				+
			
 
				+# 建立本地缓存代理文件夹
			
 
				+proxy_path = os.path.join(os.path.dirname(__file__), "proxy_file")
			
 
				+if not os.path.exists(proxy_path):
			
 
				+    os.mkdir(proxy_path)
			
 
				+
			
 
				+
			
 
				+# def get_proxies_by_host(host, port):
			
 
				+#     proxy_id = "{}:{}".format(host, port)
			
 
				+#     return get_proxies_by_id(proxy_id)
			
 
				+
			
 
				+
			
 
				+# def get_proxies_by_id(proxy_id):
			
 
				+#     proxies = {
			
 
				+#         "http": "http://{}".format(proxy_id),
			
 
				+#         "https": "https://{}".format(proxy_id),
			
 
				+#     }
			
 
				+#     return proxies
			
 
				+
			
 
				+
			
 
				+def get_proxy_from_url(**kwargs):
			
 
				+    """
			
 
				+    获取指定url的代理
			
 
				+    :param kwargs:
			
 
				+    :return:
			
 
				+    """
			
 
				+    proxy_source_url = kwargs.get("proxy_source_url", [])
			
 
				+    # proxy_source_url = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"
			
 
				+
			
 
				+    if not isinstance(proxy_source_url, list):
			
 
				+        proxy_source_url = [proxy_source_url]
			
 
				+        proxy_source_url = [x for x in proxy_source_url if x]
			
 
				+    if not proxy_source_url:
			
 
				+        raise ValueError("no specify proxy_source_url: {}".format(proxy_source_url))
			
 
				+    kwargs = kwargs.copy()
			
 
				+    kwargs.pop("proxy_source_url")
			
 
				+    proxies_list = []
			
 
				+    for url in proxy_source_url:
			
 
				+        if url.startswith("http"):
			
 
				+            proxies_list.extend(get_proxy_from_http(url, **kwargs))
			
 
				+        elif url.startswith("redis"):
			
 
				+            proxies_list.extend(get_proxy_from_redis(url, **kwargs))
			
 
				+
			
 
				+    if proxies_list:
			
 
				+        # 顺序打乱
			
 
				+        random.shuffle(proxies_list)
			
 
				+    return proxies_list
			
 
				+
			
 
				+
			
 
				+def get_proxy_from_http(proxy_source_url, **kwargs):
			
 
				+    """
			
 
				+    从指定 http 地址获取代理
			
 
				+    :param proxy_source_url:
			
 
				+    :param kwargs:
			
 
				+    :return:
			
 
				+    """
			
 
				+    filename = tools.get_md5(proxy_source_url) + ".txt"
			
 
				+    abs_filename = os.path.join(proxy_path, filename)
			
 
				+    update_interval = kwargs.get("local_proxy_file_cache_timeout", 30)
			
 
				+    update_flag = 0
			
 
				+    if not update_interval:
			
 
				+        # 强制更新
			
 
				+        update_flag = 1
			
 
				+    elif not os.path.exists(abs_filename):
			
 
				+        # 文件不存在则更新
			
 
				+        update_flag = 1
			
 
				+    elif time.time() - os.stat(abs_filename).st_mtime > update_interval:
			
 
				+        # 超过更新间隔
			
 
				+        update_flag = 1
			
 
				+    if update_flag:
			
 
				+        pool = []
			
 
				+        response = requests.get(proxy_source_url, timeout=20)
			
 
				+        # 改写：获取scocks代理的response处理
			
 
				+        for proxy in response.json():
			
 
				+            host = decrypt(proxy['host'])
			
 
				+            port = proxy['port']
			
 
				+            endTime = proxy['EndTime']
			
 
				+            pool.append(f"{host}:{port}&&{endTime}")
			
 
				+
			
 
				+        with open(os.path.join(proxy_path, filename), "w") as f:
			
 
				+            f.write('\n'.join(pool))
			
 
				+    return get_proxy_from_file(filename)
			
 
				+
			
 
				+
			
 
				+def get_proxy_from_file(filename, **kwargs):
			
 
				+    """
			
 
				+    从指定本地文件获取代理
			
 
				+        文件格式
			
 
				+        ip:port:https
			
 
				+        ip:port:http
			
 
				+        ip:port
			
 
				+    :param filename:
			
 
				+    :param kwargs:
			
 
				+    :return:
			
 
				+    """
			
 
				+    proxies_list = []
			
 
				+    with open(os.path.join(proxy_path, filename), "r") as f:
			
 
				+        lines = f.readlines()
			
 
				+
			
 
				+    for line in lines:
			
 
				+        line = line.strip()
			
 
				+        if not line:
			
 
				+            continue
			
 
				+        # 解析
			
 
				+        auth = ""
			
 
				+        if "@" in line:
			
 
				+            auth, line = line.split("@")
			
 
				+        # 改写，解析代理有效期结束时间
			
 
				+        line, end = line.split("&&")
			
 
				+
			
 
				+        items = line.split(":")
			
 
				+        if len(items) < 2:
			
 
				+            continue
			
 
				+
			
 
				+        ip, port, *protocol = items
			
 
				+        if not all([port, ip]):
			
 
				+            continue
			
 
				+        if auth:
			
 
				+            ip = "{}@{}".format(auth, ip)
			
 
				+        if not protocol:
			
 
				+            # 改写：判断代理是否在有效期内，并将代理格式重http格式改成socks格式
			
 
				+            if time.time() < int(end):
			
 
				+                proxies = {
			
 
				+                    "https": "socks5://%s:%s" % (ip, port),
			
 
				+                    "http": "socks5://%s:%s" % (ip, port),
			
 
				+                    # "end":end
			
 
				+                }
			
 
				+            else:
			
 
				+                continue
			
 
				+        else:
			
 
				+            proxies = {protocol[0]: "%s://%s:%s" % (protocol[0], ip, port)}
			
 
				+        proxies_list.append(proxies)
			
 
				+
			
 
				+    return proxies_list
			
 
				+
			
 
				+
			
 
				+def get_proxy_from_redis(proxy_source_url, **kwargs):
			
 
				+    """
			
 
				+    从指定 redis 地址获取代理
			
 
				+    @param proxy_source_url: redis://:passwd@host:ip/db
			
 
				+        redis 存储结构 zset
			
 
				+        ip:port ts
			
 
				+    @param kwargs:
			
 
				+        {"redis_proxies_key": "xxx"}
			
 
				+    @return: [{'http':'http://xxx.xxx.xxx:xxx', 'https':'https://xxx.xxx.xxx.xxx:xxx'}]
			
 
				+    """
			
 
				+
			
 
				+    redis_conn = redis.StrictRedis.from_url(proxy_source_url)
			
 
				+    key = kwargs.get("redis_proxies_key")
			
 
				+    assert key, "从redis中获取代理 需要指定 redis_proxies_key"
			
 
				+    proxies = redis_conn.zrange(key, 0, -1)
			
 
				+    proxies_list = []
			
 
				+    for proxy in proxies:
			
 
				+        proxy = proxy.decode()
			
 
				+        proxies_list.append(
			
 
				+            {"https": "https://%s" % proxy, "http": "http://%s" % proxy}
			
 
				+        )
			
 
				+    return proxies_list
			
 
				+
			
 
				+
			
 
				+def check_proxy(
			
 
				+        ip="",
			
 
				+        port="",
			
 
				+        proxies=None,
			
 
				+        type=0,
			
 
				+        timeout=5,
			
 
				+        logger=None,
			
 
				+        show_error_log=True,
			
 
				+        **kwargs,
			
 
				+):
			
 
				+    """
			
 
				+    代理有效性检查
			
 
				+    :param ip:
			
 
				+    :param port:
			
 
				+    :param type: 0:socket  1:requests
			
 
				+    :param timeout:
			
 
				+    :param logger:
			
 
				+    :return:
			
 
				+    """
			
 
				+    if not logger:
			
 
				+        logger = log
			
 
				+    ok = 0
			
 
				+    if type == 0 and ip and port:
			
 
				+        # socket检测成功 不代表代理一定可用 Connection closed by foreign host. 这种情况就不行
			
 
				+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
			
 
				+            sk.settimeout(timeout)
			
 
				+            try:
			
 
				+                # 必须检测 否则代理永远不刷新
			
 
				+                sk.connect((ip, int(port)))
			
 
				+                ok = 1
			
 
				+            except Exception as e:
			
 
				+                if show_error_log:
			
 
				+                    logger.debug("check proxy failed: {} {}:{}".format(e, ip, port))
			
 
				+            sk.close()
			
 
				+    else:
			
 
				+        if not proxies:
			
 
				+            proxies = {
			
 
				+                "http": "socks5://{}:{}".format(ip, port),
			
 
				+                "https": "socks5//{}:{}".format(ip, port),
			
 
				+            }
			
 
				+        try:
			
 
				+            # 改写：代理检测的url
			
 
				+            r = requests.get(
			
 
				+                "https://myip.ipip.net", proxies=proxies, timeout=timeout, stream=True
			
 
				+            )
			
 
				+            ok = 1
			
 
				+            r.close()
			
 
				+        except Exception as e:
			
 
				+            if show_error_log:
			
 
				+                logger.debug(
			
 
				+                    "check proxy failed: {} {}:{} {}".format(e, ip, port, proxies)
			
 
				+                )
			
 
				+    return ok
			
 
				+
			
 
				+
			
 
				+class ProxyItem(object):
			
 
				+    """单个代理对象"""
			
 
				+
			
 
				+    # 代理标记
			
 
				+    proxy_tag_list = (-1, 0, 1)
			
 
				+
			
 
				+    def __init__(
			
 
				+            self,
			
 
				+            proxies=None,
			
 
				+            valid_timeout=20,
			
 
				+            check_interval=180,
			
 
				+            max_proxy_use_num=10000,
			
 
				+            delay=30,
			
 
				+            use_interval=None,
			
 
				+            **kwargs,
			
 
				+    ):
			
 
				+        """
			
 
				+        :param proxies:
			
 
				+        :param valid_timeout:  代理检测超时时间 默认-1    20181008  默认不再监测有效性
			
 
				+        :param check_interval:
			
 
				+        :param max_proxy_use_num:
			
 
				+        :param delay:
			
 
				+        :param use_interval: 使用间隔 单位秒 默认不限制
			
 
				+        :param logger: 日志处理器 默认 log.get_logger()
			
 
				+        :param kwargs:
			
 
				+        """
			
 
				+        # {"http": ..., "https": ...}
			
 
				+        self.proxies = proxies
			
 
				+        # 检测超时时间 秒
			
 
				+        self.valid_timeout = valid_timeout
			
 
				+        # 检测间隔 秒
			
 
				+        self.check_interval = check_interval
			
 
				+
			
 
				+        # 标记  0:正常 -1:丢弃  1: 待会再用 ...
			
 
				+        self.flag = 0
			
 
				+        # 上次状态变化时间
			
 
				+        self.flag_ts = 0
			
 
				+        # 上次更新时间 有效时间
			
 
				+        self.update_ts = 0
			
 
				+        # 最大被使用次数
			
 
				+        self.max_proxy_use_num = max_proxy_use_num
			
 
				+        # 被使用次数记录
			
 
				+        self.use_num = 0
			
 
				+        # 延迟使用时间
			
 
				+        self.delay = delay
			
 
				+        # 使用间隔 单位秒
			
 
				+        self.use_interval = use_interval
			
 
				+        # 使用时间
			
 
				+        self.use_ts = 0
			
 
				+
			
 
				+        self.proxy_args = self.parse_proxies(self.proxies)
			
 
				+        self.proxy_ip = self.proxy_args["ip"]
			
 
				+        self.proxy_port = self.proxy_args["port"]
			
 
				+        self.proxy_ip_port = "{}:{}".format(self.proxy_ip, self.proxy_port)
			
 
				+        if self.proxy_args["user"]:
			
 
				+            self.proxy_id = "{user}:{password}@{ip}:{port}".format(**self.proxy_args)
			
 
				+        else:
			
 
				+            self.proxy_id = self.proxy_ip_port
			
 
				+
			
 
				+        # 日志处理器
			
 
				+        self.logger = log
			
 
				+
			
 
				+    def get_proxies(self):
			
 
				+        self.use_num += 1
			
 
				+        return self.proxies
			
 
				+
			
 
				+    def is_delay(self):
			
 
				+        return self.flag == 1
			
 
				+
			
 
				+    def is_valid(self, force=0, type=0):
			
 
				+        """
			
 
				+        检测代理是否有效
			
 
				+            1 有效
			
 
				+            2 延时使用
			
 
				+            0 无效 直接在代理池删除
			
 
				+        :param force:
			
 
				+        :param type:
			
 
				+        :return:
			
 
				+        """
			
 
				+        if self.use_num > self.max_proxy_use_num > 0:
			
 
				+            self.logger.debug("代理达到最大使用次数: {} {}".format(self.use_num, self.proxies))
			
 
				+            return 0
			
 
				+        if self.flag == -1:
			
 
				+            self.logger.debug("代理被标记 -1 丢弃 %s" % self.proxies)
			
 
				+            return 0
			
 
				+        if self.delay > 0 and self.flag == 1:
			
 
				+            if time.time() - self.flag_ts < self.delay:
			
 
				+                self.logger.debug("代理被标记 1 延迟 %s" % self.proxies)
			
 
				+                return 2
			
 
				+            else:
			
 
				+                self.flag = 0
			
 
				+                self.logger.debug("延迟代理释放: {}".format(self.proxies))
			
 
				+        if self.use_interval:
			
 
				+            if time.time() - self.use_ts < self.use_interval:
			
 
				+                return 2
			
 
				+        if not force:
			
 
				+            if time.time() - self.update_ts < self.check_interval:
			
 
				+                return 1
			
 
				+        if self.valid_timeout > 0:
			
 
				+            ok = check_proxy(
			
 
				+                proxies=self.proxies,
			
 
				+                type=type,
			
 
				+                timeout=self.valid_timeout,
			
 
				+                logger=self.logger,
			
 
				+            )
			
 
				+        else:
			
 
				+            ok = 1
			
 
				+        self.update_ts = time.time()
			
 
				+        return ok
			
 
				+
			
 
				+    @classmethod
			
 
				+    def parse_proxies(self, proxies):
			
 
				+        """
			
 
				+        分解代理组成部分
			
 
				+        :param proxies:
			
 
				+        :return:
			
 
				+        """
			
 
				+        if not proxies:
			
 
				+            return {}
			
 
				+        if isinstance(proxies, (str, bytes)):
			
 
				+            proxies = json.loads(proxies)
			
 
				+        protocol = list(proxies.keys())
			
 
				+        if not protocol:
			
 
				+            return {}
			
 
				+        _url = proxies.get(protocol[0])
			
 
				+        # 改写：注释http代理url的拼接，以正常生成代理池
			
 
				+        # if not _url.startswith("http"):
			
 
				+        #     _url = "http://" + _url
			
 
				+        _url_parse = parse.urlparse(_url)
			
 
				+        netloc = _url_parse.netloc
			
 
				+        if "@" in netloc:
			
 
				+            netloc_auth, netloc_host = netloc.split("@")
			
 
				+        else:
			
 
				+            netloc_auth, netloc_host = "", netloc
			
 
				+        ip, *port = netloc_host.split(":")
			
 
				+        port = port[0] if port else "80"
			
 
				+        user, *password = netloc_auth.split(":")
			
 
				+        password = password[0] if password else ""
			
 
				+        return {
			
 
				+            "protocol": protocol,
			
 
				+            "ip": ip,
			
 
				+            "port": port,
			
 
				+            "user": user,
			
 
				+            "password": password,
			
 
				+            "ip_port": "{}:{}".format(ip, port),
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+class ProxyPoolBase(object):
			
 
				+    def __init__(self, *args, **kwargs):
			
 
				+        pass
			
 
				+
			
 
				+    def get(self, *args, **kwargs):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+
			
 
				+class ProxyPool(ProxyPoolBase):
			
 
				+    """代理池"""
			
 
				+
			
 
				+    def __init__(self, **kwargs):
			
 
				+        """
			
 
				+        :param size: 代理池大小  -1 为不限制
			
 
				+        :param proxy_source_url: 代理文件地址 支持列表
			
 
				+        :param proxy_instance:  提供代理的实例
			
 
				+        :param reset_interval:  代理池重置间隔 最小间隔
			
 
				+        :param reset_interval_max:  代理池重置间隔 最大间隔 默认2分钟
			
 
				+        :param check_valid: 是否在获取代理时进行检测有效性
			
 
				+        :param local_proxy_file_cache_timeout: 本地缓存的代理文件超时时间
			
 
				+        :param logger: 日志处理器 默认 log.get_logger()
			
 
				+        :param kwargs: 其他的参数
			
 
				+        """
			
 
				+        kwargs.setdefault("size", -1)
			
 
				+        kwargs.setdefault("proxy_source_url", setting.PROXY_EXTRACT_API)
			
 
				+
			
 
				+        super(ProxyPool, self).__init__(**kwargs)
			
 
				+        # 队列最大长度
			
 
				+        self.max_queue_size = kwargs.get("size", -1)
			
 
				+        # 实际代理数量
			
 
				+        self.real_max_proxy_count = 1000
			
 
				+        # 代理可用最大次数
			
 
				+        # 代理获取地址 http://localhost/proxy.txt
			
 
				+        self.proxy_source_url = kwargs.get("proxy_source_url", [])
			
 
				+        if not isinstance(self.proxy_source_url, list):
			
 
				+            self.proxy_source_url = [self.proxy_source_url]
			
 
				+            self.proxy_source_url = [x for x in self.proxy_source_url if x]
			
 
				+            self.proxy_source_url = list(set(self.proxy_source_url))
			
 
				+            kwargs.update({"proxy_source_url": self.proxy_source_url})
			
 
				+        # 处理日志
			
 
				+        self.logger = kwargs.get("logger") or log
			
 
				+        kwargs["logger"] = self.logger
			
 
				+        if not self.proxy_source_url:
			
 
				+            self.logger.warn("need set proxy_source_url or proxy_instance")
			
 
				+
			
 
				+        # 代理池重置间隔
			
 
				+        self.reset_interval = kwargs.get("reset_interval", 5)
			
 
				+        # 强制重置一下代理 添加新的代理进来 防止一直使用旧的被封的代理
			
 
				+        self.reset_interval_max = kwargs.get("reset_interval_max", 180)
			
 
				+        # 是否监测代理有效性
			
 
				+        self.check_valid = kwargs.get("check_valid", True)
			
 
				+
			
 
				+        # 代理队列
			
 
				+        self.proxy_queue = None
			
 
				+        # {代理id: ProxyItem, ...}
			
 
				+        self.proxy_dict = {}
			
 
				+        # 失效代理队列
			
 
				+        self.invalid_proxy_dict = {}
			
 
				+
			
 
				+        self.kwargs = kwargs
			
 
				+
			
 
				+        # 重置代理池锁
			
 
				+        self.reset_lock = None
			
 
				+        # 重置时间
			
 
				+        self.last_reset_time = 0
			
 
				+        # 重置的太快了  计数
			
 
				+        self.reset_fast_count = 0
			
 
				+        # 计数 获取代理重试3次仍然失败 次数
			
 
				+        self.no_valid_proxy_times = 0
			
 
				+
			
 
				+        # 上次获取代理时间
			
 
				+        self.last_get_ts = time.time()
			
 
				+
			
 
				+        # 记录ProxyItem的update_ts 防止由于重置太快导致重复检测有效性
			
 
				+        self.proxy_item_update_ts_dict = {}
			
 
				+
			
 
				+        # 警告
			
 
				+        self.warn_flag = False
			
 
				+
			
 
				+    def warn(self):
			
 
				+        if not self.warn_flag:
			
 
				+            for url in self.proxy_source_url:
			
 
				+                if "zhima" in url:
			
 
				+                    continue
			
 
				+            self.warn_flag = True
			
 
				+        return
			
 
				+
			
 
				+    @property
			
 
				+    def queue_size(self):
			
 
				+        """
			
 
				+        当前代理池中代理数量
			
 
				+        :return:
			
 
				+        """
			
 
				+        return self.proxy_queue.qsize() if self.proxy_queue is not None else 0
			
 
				+
			
 
				+    def clear(self):
			
 
				+        """
			
 
				+        清空自己
			
 
				+        :return:
			
 
				+        """
			
 
				+        self.proxy_queue = None
			
 
				+        # {代理ip: ProxyItem, ...}
			
 
				+        self.proxy_dict = {}
			
 
				+        # 清理失效代理集合
			
 
				+        _limit = datetime.datetime.now() - datetime.timedelta(minutes=10)
			
 
				+        self.invalid_proxy_dict = {
			
 
				+            k: v for k, v in self.invalid_proxy_dict.items() if v > _limit
			
 
				+        }
			
 
				+        # 清理超时的update_ts记录
			
 
				+        _limit = time.time() - 600
			
 
				+        self.proxy_item_update_ts_dict = {
			
 
				+            k: v for k, v in self.proxy_item_update_ts_dict.items() if v > _limit
			
 
				+        }
			
 
				+        return
			
 
				+
			
 
				+    def get(self, retry: int = 0) -> dict:
			
 
				+        """
			
 
				+        从代理池中获取代理
			
 
				+        :param retry:
			
 
				+        :return:
			
 
				+        """
			
 
				+        retry += 1
			
 
				+        if retry > 3:
			
 
				+            self.no_valid_proxy_times += 1
			
 
				+            return None
			
 
				+        # if time.time() - self.last_get_ts > 3 * 60:
			
 
				+        #     # 3分钟没有获取过 重置一下
			
 
				+        #     try:
			
 
				+        #         self.reset_proxy_pool()
			
 
				+        #     except Exception as e:
			
 
				+        #         self.logger.exception(e)
			
 
				+        # 记录获取时间
			
 
				+        self.last_get_ts = time.time()
			
 
				+        #
			
 
				+        self.warn()
			
 
				+        proxy_item = self.get_random_proxy()
			
 
				+        if proxy_item:
			
 
				+            # 不检测
			
 
				+            if not self.check_valid:  #
			
 
				+                # 塞回去
			
 
				+                proxies = proxy_item.get_proxies()
			
 
				+                self.put_proxy_item(proxy_item)
			
 
				+                return proxies
			
 
				+            else:
			
 
				+                is_valid = proxy_item.is_valid()
			
 
				+                if is_valid:
			
 
				+                    # 记录update_ts
			
 
				+                    self.proxy_item_update_ts_dict[
			
 
				+                        proxy_item.proxy_id
			
 
				+                    ] = proxy_item.update_ts
			
 
				+                    # 塞回去
			
 
				+                    proxies = proxy_item.get_proxies()
			
 
				+                    self.put_proxy_item(proxy_item)
			
 
				+                    if is_valid == 1:
			
 
				+                        if proxy_item.use_interval:
			
 
				+                            proxy_item.use_ts = time.time()
			
 
				+                        return proxies
			
 
				+                else:
			
 
				+                    # 处理失效代理
			
 
				+                    self.proxy_dict.pop(proxy_item.proxy_id, "")
			
 
				+                    self.invalid_proxy_dict[
			
 
				+                        proxy_item.proxy_id
			
 
				+                    ] = datetime.datetime.now()
			
 
				+        else:
			
 
				+            try:
			
 
				+                time.sleep(3)
			
 
				+                self.reset_proxy_pool()
			
 
				+            except Exception as e:
			
 
				+                self.logger.exception(e)
			
 
				+        if self.no_valid_proxy_times >= 5:
			
 
				+            # 解决bug: 当爬虫仅剩一个任务时 由于只有一个线程检测代理 而不可用代理又刚好很多（时间越长越多） 可能出现一直获取不到代理的情况
			
 
				+            # 导致爬虫烂尾
			
 
				+            try:
			
 
				+                time.sleep(3)
			
 
				+                self.reset_proxy_pool()
			
 
				+            except Exception as e:
			
 
				+                self.logger.exception(e)
			
 
				+        return self.get(retry)
			
 
				+
			
 
				+    get_proxy = get
			
 
				+
			
 
				+    def get_random_proxy(self) -> ProxyItem:
			
 
				+        """
			
 
				+        随机获取代理
			
 
				+        :return:
			
 
				+        """
			
 
				+        if self.proxy_queue is not None:
			
 
				+            if random.random() < 0.5:
			
 
				+                # 一半概率检查 这是个高频操作 优化一下
			
 
				+                if time.time() - self.last_reset_time > self.reset_interval_max:
			
 
				+                    time.sleep(3)
			
 
				+                    self.reset_proxy_pool(force=True)
			
 
				+                else:
			
 
				+                    min_q_size = (
			
 
				+                        min(self.max_queue_size / 2, self.real_max_proxy_count / 2)
			
 
				+                        if self.max_queue_size > 0
			
 
				+                        else self.real_max_proxy_count / 2
			
 
				+                    )
			
 
				+                    if self.proxy_queue.qsize() < min_q_size:
			
 
				+                        time.sleep(3)
			
 
				+                        self.reset_proxy_pool()
			
 
				+            try:
			
 
				+                return self.proxy_queue.get_nowait()
			
 
				+            except Exception:
			
 
				+                pass
			
 
				+        return None
			
 
				+
			
 
				+    def append_proxies(self, proxies_list: list) -> int:
			
 
				+        """
			
 
				+        添加代理到代理池
			
 
				+        :param proxies_list:
			
 
				+        :return:
			
 
				+        """
			
 
				+        count = 0
			
 
				+        if not isinstance(proxies_list, list):
			
 
				+            proxies_list = [proxies_list]
			
 
				+        for proxies in proxies_list:
			
 
				+            if proxies:
			
 
				+                proxy_item = ProxyItem(proxies=proxies, **self.kwargs)
			
 
				+                # 增加失效判断 2018/12/18
			
 
				+                if proxy_item.proxy_id in self.invalid_proxy_dict:
			
 
				+                    continue
			
 
				+                if proxy_item.proxy_id not in self.proxy_dict:
			
 
				+                    # 补充update_ts
			
 
				+                    if not proxy_item.update_ts:
			
 
				+                        proxy_item.update_ts = self.proxy_item_update_ts_dict.get(
			
 
				+                            proxy_item.proxy_id, 0
			
 
				+                        )
			
 
				+                    self.put_proxy_item(proxy_item)
			
 
				+                    self.proxy_dict[proxy_item.proxy_id] = proxy_item
			
 
				+                    count += 1
			
 
				+        return count
			
 
				+
			
 
				+    def put_proxy_item(self, proxy_item: ProxyItem):
			
 
				+        """
			
 
				+        添加 ProxyItem 到代理池
			
 
				+        :param proxy_item:
			
 
				+        :return:
			
 
				+        """
			
 
				+        return self.proxy_queue.put_nowait(proxy_item)
			
 
				+
			
 
				+    def reset_proxy_pool(self, force: bool = False):
			
 
				+        """
			
 
				+        重置代理池
			
 
				+        :param force: 是否强制重置代理池
			
 
				+        :return:
			
 
				+        """
			
 
				+        if not self.reset_lock:
			
 
				+            # 必须用时调用 否则 可能存在 gevent patch前 threading就已经被导入 导致的Rlock patch失效
			
 
				+            import threading
			
 
				+
			
 
				+            self.reset_lock = threading.RLock()
			
 
				+        with self.reset_lock:
			
 
				+            if (
			
 
				+                    force
			
 
				+                    or self.proxy_queue is None
			
 
				+                    or (
			
 
				+                    self.max_queue_size > 0
			
 
				+                    and self.proxy_queue.qsize() < self.max_queue_size / 2
			
 
				+            )
			
 
				+                    or (
			
 
				+                    self.max_queue_size < 0
			
 
				+                    and self.proxy_queue.qsize() < self.real_max_proxy_count / 2
			
 
				+            )
			
 
				+                    or self.no_valid_proxy_times >= 5
			
 
				+            ):
			
 
				+                if time.time() - self.last_reset_time < self.reset_interval:
			
 
				+                    self.reset_fast_count += 1
			
 
				+                    if self.reset_fast_count % 10 == 0:
			
 
				+                        self.logger.debug(
			
 
				+                            "代理池重置的太快了:) {}".format(self.reset_fast_count)
			
 
				+                        )
			
 
				+                        time.sleep(1)
			
 
				+                else:
			
 
				+                    self.clear()
			
 
				+                    if self.proxy_queue is None:
			
 
				+                        import queue
			
 
				+
			
 
				+                        self.proxy_queue = queue.Queue()
			
 
				+                    # TODO 这里获取到的可能重复
			
 
				+                    proxies_list = get_proxy_from_url(**self.kwargs)
			
 
				+                    self.real_max_proxy_count = len(proxies_list)
			
 
				+                    if 0 < self.max_queue_size < self.real_max_proxy_count:
			
 
				+                        proxies_list = random.sample(proxies_list, self.max_queue_size)
			
 
				+                    _valid_count = self.append_proxies(proxies_list)
			
 
				+                    self.last_reset_time = time.time()
			
 
				+                    self.no_valid_proxy_times = 0
			
 
				+                    self.logger.debug(
			
 
				+                        "重置代理池成功: 获取{}, 成功添加{}, 失效{},  当前代理数{},".format(
			
 
				+                            len(proxies_list),
			
 
				+                            _valid_count,
			
 
				+                            len(self.invalid_proxy_dict),
			
 
				+                            len(self.proxy_dict),
			
 
				+                        )
			
 
				+                    )
			
 
				+        return
			
 
				+
			
 
				+    def tag_proxy(self, proxies_list: list, flag: int, *, delay=30) -> bool:
			
 
				+        """
			
 
				+        对代理进行标记
			
 
				+        :param proxies_list:
			
 
				+        :param flag:
			
 
				+                    -1  废弃
			
 
				+                    1 延迟使用
			
 
				+        :param delay: 延迟时间
			
 
				+        :return:
			
 
				+        """
			
 
				+        if int(flag) not in ProxyItem.proxy_tag_list or not proxies_list:
			
 
				+            return False
			
 
				+        if not isinstance(proxies_list, list):
			
 
				+            proxies_list = [proxies_list]
			
 
				+        for proxies in proxies_list:
			
 
				+            if not proxies:
			
 
				+                continue
			
 
				+            proxy_id = ProxyItem(proxies).proxy_id
			
 
				+            if proxy_id not in self.proxy_dict:
			
 
				+                continue
			
 
				+            self.proxy_dict[proxy_id].flag = flag
			
 
				+            self.proxy_dict[proxy_id].flag_ts = time.time()
			
 
				+            self.proxy_dict[proxy_id].delay = delay
			
 
				+
			
 
				+        return True
			
 
				+
			
 
				+    def get_proxy_item(self, proxy_id="", proxies=None):
			
 
				+        """
			
 
				+        获取代理对象
			
 
				+        :param proxy_id:
			
 
				+        :param proxies:
			
 
				+        :return:
			
 
				+        """
			
 
				+        if proxy_id:
			
 
				+            return self.proxy_dict.get(proxy_id)
			
 
				+        if proxies:
			
 
				+            proxy_id = ProxyItem(proxies).proxy_id
			
 
				+            return self.proxy_dict.get(proxy_id)
			
 
				+        return
			
 
				+
			
 
				+    def copy(self):
			
 
				+        return ProxyPool(**self.kwargs)
			
 
				+
			
 
				+    def all(self) -> list:
			
 
				+        """
			
 
				+        获取当前代理池中的全部代理
			
 
				+        :return:
			
 
				+        """
			
 
				+        return get_proxy_from_url(**self.kwargs)
			
 
				+# 
			
 
				+# 
			
 
				+# if __name__ == '__main__':
			
 
				+#     ProxyPool().get()
			
--- a/FworkSpider/feapder/network/request.py
+++ b/FworkSpider/feapder/network/request.py
@@ -0,0 +1,506 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-07-25 11:49:08
			
 
				+---------
			
 
				+@summary: 请求结构体
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import requests
			
 
				+from requests.adapters import HTTPAdapter
			
 
				+from requests.cookies import RequestsCookieJar
			
 
				+from requests.packages.urllib3.exceptions import InsecureRequestWarning
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.network import user_agent
			
 
				+from feapder.network.proxy_pool import ProxyPool
			
 
				+from feapder.network.response import Response
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.webdriver import WebDriverPool
			
 
				+
			
 
# Suppress urllib3's InsecureRequestWarning messages
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
			
 
				+
			
 
				+
			
 
class Request(object):
    """Description of one request to fetch, plus the options passed on to requests."""

    # Shared, lazily created helpers. The `_session`, `_webdriver_pool` and
    # `_proxies_pool` properties create these on first use and store them on
    # the class.
    session = None
    webdriver_pool: WebDriverPool = None
    user_agent_pool = user_agent
    proxies_pool: ProxyPool = None

    cache_db = None  # redis / pika
    cached_redis_key = None  # key segment for cached responses: response_cached:cached_redis_key:md5
    cached_expire_time = 1200  # cache expiry time (passed as `ex=` when saving; presumably seconds - TODO confirm)

    local_filepath = None
    oss_handler = None

    # Keyword names that `__init__` / `__setattr__` copy into `requests_kwargs`.
    __REQUEST_ATTRS__ = {
        # 'method' and 'url' must be passed explicitly; they are not included in **kwargs
        "params",
        "data",
        "headers",
        "cookies",
        "files",
        "auth",
        "timeout",
        "allow_redirects",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "json",
    }

    # Default values of the framework parameters; `to_dict` omits attributes
    # whose value equals the default listed here.
    DEFAULT_KEY_VALUE = dict(
        url="",
        retry_times=0,
        priority=300,
        parser_name=None,
        callback=None,
        filter_repeat=True,
        auto_request=True,
        request_sync=False,
        use_session=None,
        random_user_agent=True,
        download_midware=None,
        is_abandoned=False,
        render=False,
        render_time=0,
    )
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        url="",
			
 
				+        retry_times=0,
			
 
				+        priority=300,
			
 
				+        parser_name=None,
			
 
				+        callback=None,
			
 
				+        filter_repeat=True,
			
 
				+        auto_request=True,
			
 
				+        request_sync=False,
			
 
				+        use_session=None,
			
 
				+        random_user_agent=True,
			
 
				+        download_midware=None,
			
 
				+        is_abandoned=False,
			
 
				+        render=False,
			
 
				+        render_time=0,
			
 
				+        **kwargs,
			
 
				+    ):
			
 
				+        """
			
 
				+        @summary: Request参数
			
 
				+        ---------
			
 
				+        框架参数
			
 
				+        @param url: 待抓取url
			
 
				+        @param retry_times: 当前重试次数
			
 
				+        @param priority: 优先级 越小越优先 默认300
			
 
				+        @param parser_name: 回调函数所在的类名 默认为当前类
			
 
				+        @param callback: 回调函数 可以是函数 也可是函数名（如想跨类回调时，parser_name指定那个类名，callback指定那个类想回调的方法名即可）
			
 
				+        @param filter_repeat: 是否需要去重 (True/False) 当setting中的REQUEST_FILTER_ENABLE设置为True时该参数生效 默认True
			
 
				+        @param auto_request: 是否需要自动请求下载网页 默认是。设置为False时返回的response为空，需要自己去请求网页
			
 
				+        @param request_sync: 是否同步请求下载网页，默认异步。如果该请求url过期时间快，可设置为True，相当于yield的reqeust会立即响应，而不是去排队
			
 
				+        @param use_session: 是否使用session方式
			
 
				+        @param random_user_agent: 是否随机User-Agent (True/False) 当setting中的RANDOM_HEADERS设置为True时该参数生效 默认True
			
 
				+        @param download_midware: 下载中间件。默认为parser中的download_midware
			
 
				+        @param is_abandoned: 当发生异常时是否放弃重试 True/False. 默认False
			
 
				+        @param render: 是否用浏览器渲染
			
 
				+        @param render_time: 渲染时长，即打开网页等待指定时间后再获取源码
			
 
				+        --
			
 
				+        以下参数与requests参数使用方式一致
			
 
				+        @param method: 请求方式，如POST或GET，默认根据data值是否为空来判断
			
 
				+        @param params: 请求参数
			
 
				+        @param data: 请求body
			
 
				+        @param json: 请求json字符串，同 json.dumps(data)
			
 
				+        @param headers:
			
 
				+        @param cookies: 字典 或 CookieJar 对象
			
 
				+        @param files:
			
 
				+        @param auth:
			
 
				+        @param timeout: (浮点或元组)等待服务器数据的超时限制，是一个浮点数，或是一个(connect timeout, read timeout) 元组
			
 
				+        @param allow_redirects : Boolean. True 表示允许跟踪 POST/PUT/DELETE 方法的重定向
			
 
				+        @param proxies: 代理 {"http":"http://xxx", "https":"https://xxx"}
			
 
				+        @param verify: 为 True 时将会验证 SSL 证书
			
 
				+        @param stream: 如果为 False，将会立即下载响应内容
			
 
				+        @param cert:
			
 
				+        --
			
 
				+        @param **kwargs: 其他值: 如 Request(item=item) 则item可直接用 request.item 取出
			
 
				+        ---------
			
 
				+        @result:
			
 
				+        """
			
 
				+
			
 
				+        self.url = url
			
 
				+        self.retry_times = retry_times
			
 
				+        self.priority = priority
			
 
				+        self.parser_name = parser_name
			
 
				+        self.callback = callback
			
 
				+        self.filter_repeat = filter_repeat
			
 
				+        self.auto_request = auto_request
			
 
				+        self.request_sync = request_sync
			
 
				+        self.use_session = use_session
			
 
				+        self.random_user_agent = random_user_agent
			
 
				+        self.download_midware = download_midware
			
 
				+        self.is_abandoned = is_abandoned
			
 
				+        self.render = render
			
 
				+        self.render_time = render_time or setting.WEBDRIVER.get("render_time", 0)
			
 
				+
			
 
				+        self.requests_kwargs = {}
			
 
				+        for key, value in kwargs.items():
			
 
				+            if key in self.__class__.__REQUEST_ATTRS__:  # 取requests参数
			
 
				+                self.requests_kwargs[key] = value
			
 
				+
			
 
				+            self.__dict__[key] = value
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        try:
			
 
				+            return "<Request {}>".format(self.url)
			
 
				+        except:
			
 
				+            return "<Request {}>".format(str(self.to_dict)[:40])
			
 
				+
			
 
				+    def __setattr__(self, key, value):
			
 
				+        """
			
 
				+        针对 request.xxx = xxx 的形式，更新reqeust及内部参数值
			
 
				+        @param key:
			
 
				+        @param value:
			
 
				+        @return:
			
 
				+        """
			
 
				+        self.__dict__[key] = value
			
 
				+
			
 
				+        if key in self.__class__.__REQUEST_ATTRS__:
			
 
				+            self.requests_kwargs[key] = value
			
 
				+
			
 
				+    def __lt__(self, other):
			
 
				+        return self.priority < other.priority
			
 
				+
			
 
				+    @property
			
 
				+    def _session(self):
			
 
				+        use_session = (
			
 
				+            setting.USE_SESSION if self.use_session is None else self.use_session
			
 
				+        )  # self.use_session 优先级高
			
 
				+        if use_session and not self.__class__.session:
			
 
				+            self.__class__.session = requests.Session()
			
 
				+            # pool_connections – 缓存的 urllib3 连接池个数  pool_maxsize – 连接池中保存的最大连接数
			
 
				+            http_adapter = HTTPAdapter(pool_connections=1000, pool_maxsize=1000)
			
 
				+            # 任何使用该session会话的 HTTP 请求，只要其 URL 是以给定的前缀开头，该传输适配器就会被使用到。
			
 
				+            self.__class__.session.mount("http", http_adapter)
			
 
				+
			
 
				+        return self.__class__.session
			
 
				+
			
 
				+    @property
			
 
				+    def _webdriver_pool(self):
			
 
				+        if not self.__class__.webdriver_pool:
			
 
				+            self.__class__.webdriver_pool = WebDriverPool(**setting.WEBDRIVER)
			
 
				+
			
 
				+        return self.__class__.webdriver_pool
			
 
				+
			
 
				+    @property
			
 
				+    def _proxies_pool(self):
			
 
				+        if not self.__class__.proxies_pool:
			
 
				+            self.__class__.proxies_pool = ProxyPool()
			
 
				+
			
 
				+        return self.__class__.proxies_pool
			
 
				+
			
 
				+    @property
			
 
				+    def to_dict(self):
			
 
				+        request_dict = {}
			
 
				+
			
 
				+        self.callback = (
			
 
				+            getattr(self.callback, "__name__")
			
 
				+            if callable(self.callback)
			
 
				+            else self.callback
			
 
				+        )
			
 
				+        self.download_midware = (
			
 
				+            getattr(self.download_midware, "__name__")
			
 
				+            if callable(self.download_midware)
			
 
				+            else self.download_midware
			
 
				+        )
			
 
				+
			
 
				+        for key, value in self.__dict__.items():
			
 
				+            if (
			
 
				+                key in self.__class__.DEFAULT_KEY_VALUE
			
 
				+                and self.__class__.DEFAULT_KEY_VALUE.get(key) == value
			
 
				+                or key == "requests_kwargs"
			
 
				+            ):
			
 
				+                continue
			
 
				+
			
 
				+            if key in self.__class__.__REQUEST_ATTRS__:
			
 
				+                if not isinstance(
			
 
				+                    value, (bytes, bool, float, int, str, tuple, list, dict)
			
 
				+                ):
			
 
				+                    value = tools.dumps_obj(value)
			
 
				+            else:
			
 
				+                if not isinstance(value, (bytes, bool, float, int, str)):
			
 
				+                    value = tools.dumps_obj(value)
			
 
				+
			
 
				+            request_dict[key] = value
			
 
				+
			
 
				+        return request_dict
			
 
				+
			
 
				+    @property
			
 
				+    def callback_name(self):
			
 
				+        return (
			
 
				+            getattr(self.callback, "__name__")
			
 
				+            if callable(self.callback)
			
 
				+            else self.callback
			
 
				+        )
			
 
				+
			
 
    def get_response(self, save_cached=False):
        """
        Download the page for this request and return it as a ``Response``
        (a response with selector support).

        Defaults for ``timeout``, ``stream`` and ``verify`` are filled into
        ``self.requests_kwargs``; the HTTP method comes from the ``method``
        attribute or is inferred from the presence of ``data``; a User-Agent
        and a proxy are chosen when configured. The download then goes through
        the webdriver pool (``render``), the shared session (``use_session``)
        or ``requests.request``.

        @param save_cached: store the response in the cache so that debugging
            runs do not have to download it again
        @return: Response
        """

        def _without_scheme(address):
            # Drop a leading "<scheme>://" and any trailing "/".
            # str.strip("http://") must not be used for this: it removes a
            # *set of characters* from both ends, so a host such as
            # "proxy.example.com" would lose its leading "p".
            return address.split("://", 1)[-1].rstrip("/")

        # Default timeout
        self.requests_kwargs.setdefault(
            "timeout", setting.REQUEST_TIMEOUT
        )  # connect=22 read=22

        # stream
        # By default the response body is downloaded right after the request.
        # With stream=True the body download is deferred until
        # Response.content is accessed; only the headers are fetched up front.
        # Drawback: requests cannot release the connection back to the pool
        # until all data is consumed or Response.close is called, which can
        # hurt connection efficiency.
        self.requests_kwargs.setdefault("stream", True)

        # Certificate verification is off unless requested
        self.requests_kwargs.setdefault("verify", False)

        # Request method: explicit `method` wins, else POST when data is given
        method = self.__dict__.get("method")
        if not method:
            method = "POST" if "data" in self.requests_kwargs else "GET"

        # Random User-Agent
        headers = self.requests_kwargs.get("headers", {})
        if "user-agent" not in headers and "User-Agent" not in headers:
            if self.render:  # when rendering, prefer the ua configured in WEBDRIVER
                ua = setting.WEBDRIVER.get(
                    "user_agent"
                ) or self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)
            else:
                ua = self.__class__.user_agent_pool.get(setting.USER_AGENT_TYPE)

            if self.random_user_agent and setting.RANDOM_HEADERS:
                headers.update({"User-Agent": ua})
                self.requests_kwargs.update(headers=headers)
        else:
            self.requests_kwargs.setdefault(
                "headers", {"User-Agent": setting.DEFAULT_USERAGENT}
            )

        # Proxy: -1 marks "no proxies option was supplied"
        proxies = self.requests_kwargs.get("proxies", -1)
        if proxies == -1 and setting.PROXY_ENABLE and setting.PROXY_EXTRACT_API:
            while True:
                proxies = self._proxies_pool.get()
                if proxies:
                    self.requests_kwargs.update(proxies=proxies)
                    break
                else:
                    log.debug("暂无可用代理 ...")

        request_tag = (
            "%s.%s " % (self.parser_name, self.callback_name or "parse")
            if self.parser_name
            else ""
        )
        log.debug(
            """
                -------------- %srequest for ----------------
                url  = %s
                method = %s
                body = %s
                """
            % (request_tag, self.url, method, self.requests_kwargs)
        )

        # def hooks(response, *args, **kwargs):
        #     print(response.url)
        #
        # self.requests_kwargs.update(hooks={'response': hooks})

        use_session = (
            setting.USE_SESSION if self.use_session is None else self.use_session
        )  # self.use_session takes precedence

        if self.render:
            # Reuse the request's user agent, cookies and proxy in the browser
            user_agent = headers.get("User-Agent") or headers.get("user-agent")
            cookies = self.requests_kwargs.get("cookies")
            if cookies and isinstance(cookies, RequestsCookieJar):
                cookies = cookies.get_dict()

            if not cookies:
                cookie_str = headers.get("Cookie") or headers.get("cookie")
                if cookie_str:
                    cookies = tools.get_cookies_from_str(cookie_str)

            proxy = None
            if proxies and proxies != -1:
                proxy = _without_scheme(proxies.get("http", "")) or _without_scheme(
                    proxies.get("https", "")
                )

            browser = self._webdriver_pool.get(user_agent=user_agent, proxy=proxy)

            try:
                browser.get(self.url)
                if cookies:
                    browser.cookies = cookies
                if self.render_time:
                    tools.delay_time(self.render_time)

                html = browser.page_source
                response = Response.from_dict(
                    {
                        "url": browser.current_url,
                        "cookies": browser.cookies,
                        "_content": html.encode(),
                        "status_code": 200,
                        "elapsed": 666,
                        "headers": {
                            "User-Agent": browser.execute_script(
                                "return navigator.userAgent"
                            ),
                            "Cookie": tools.cookies2str(browser.cookies),
                        },
                    }
                )

                response.browser = browser
            except Exception:
                # A browser that failed is taken out of the pool before the
                # error is passed on to the caller.
                self._webdriver_pool.remove(browser)
                raise

        elif use_session:
            response = self._session.request(method, self.url, **self.requests_kwargs)
            response = Response(response)
        else:
            response = requests.request(method, self.url, **self.requests_kwargs)
            response = Response(response)

        if save_cached:
            self.save_cached(response, expire_time=self.__class__.cached_expire_time)

        return response
			
 
				+
			
 
				+    def proxies(self):
			
 
				+        """
			
 
				+
			
 
				+        Returns: {"https": "https://ip:port", "http": "http://ip:port"}
			
 
				+
			
 
				+        """
			
 
				+        return self.requests_kwargs.get("proxies")
			
 
				+
			
 
				+    def proxy(self):
			
 
				+        """
			
 
				+
			
 
				+        Returns: ip:port
			
 
				+
			
 
				+        """
			
 
				+        proxies = self.proxies()
			
 
				+        if proxies:
			
 
				+            return proxies.get("http", "").strip("http://") or proxies.get(
			
 
				+                "https", ""
			
 
				+            ).strip("https://")
			
 
				+
			
 
				+    def user_agent(self):
			
 
				+        headers = self.requests_kwargs.get("headers")
			
 
				+        if headers:
			
 
				+            return headers.get("user_agent") or headers.get("User-Agent")
			
 
				+
			
 
    @property
    def fingerprint(self):
        """
        Unique identifier of the request.

        Built with ``tools.get_md5`` from the canonicalized url followed by
        every non-empty one of the params, data, files, auth, cert and json
        options.

        @return: value returned by ``tools.get_md5``
        """
        # normalize the url first
        canonical_url = tools.canonicalize_url(self.__dict__.get("url", ""))
        parts = [canonical_url]
        parts.extend(
            self.requests_kwargs.get(option)
            for option in ["params", "data", "files", "auth", "cert", "json"]
            if self.requests_kwargs.get(option)
        )

        return tools.get_md5(*parts)
			
 
				+
			
 
				+    @property
			
 
				+    def _cache_db(self):
			
 
				+        if not self.__class__.cache_db:
			
 
				+            self.__class__.cache_db = RedisDB()  # .from_url(setting.pika_spider_1_uri)
			
 
				+
			
 
				+        return self.__class__.cache_db
			
 
				+
			
 
				+    @property
			
 
				+    def _cached_redis_key(self):
			
 
				+        if self.__class__.cached_redis_key:
			
 
				+            return (
			
 
				+                f"response_cached:{self.__class__.cached_redis_key}:{self.fingerprint}"
			
 
				+            )
			
 
				+        else:
			
 
				+            return f"response_cached:test:{self.fingerprint}"
			
 
				+
			
 
				+    def save_cached(self, response, expire_time=1200):
			
 
				+        """
			
 
				+        使用redis保存response 用于调试 不用每回都下载
			
 
				+        @param response:
			
 
				+        @param expire_time: 过期时间
			
 
				+        @return:
			
 
				+        """
			
 
				+
			
 
				+        self._cache_db.strset(self._cached_redis_key, response.to_dict, ex=expire_time)
			
 
				+
			
 
    def get_response_from_cached(self, save_cached=True):
        """
        Get the response from the cache, downloading it when nothing is cached.
        Notes:
            Attributes that are empty on a cached response:
                -raw : urllib3.response.HTTPResponse
                -connection: requests.adapters.HTTPAdapter
                -history

            Attributes whose meaning changes:
                - request changes from requests to Request
        @param: save_cached when there is no cache the page is downloaded directly; whether to cache it afterwards
        @return: Response
        """
        response_dict = self._cache_db.strget(self._cached_redis_key)
        if not response_dict:
            log.info("无response缓存  重新下载")
            response_obj = self.get_response(save_cached=save_cached)
        else:
            # NOTE(review): eval() executes whatever text is stored under the
            # cache key, so anyone able to write to that key can run code
            # here. Consider a safe deserializer before using this outside of
            # local debugging.
            response_dict = eval(response_dict)
            response_obj = Response.from_dict(response_dict)
        return response_obj
			
 
				+
			
 
				+    def del_response_cached(self):
			
 
				+        self._cache_db.clear(self._cached_redis_key)
			
 
				+
			
 
				+    @classmethod
			
 
				+    def from_dict(cls, request_dict):
			
 
				+        for key, value in request_dict.items():
			
 
				+            if isinstance(value, bytes):  # 反序列化 如item
			
 
				+                request_dict[key] = tools.loads_obj(value)
			
 
				+
			
 
				+        return cls(**request_dict)
			
 
				+
			
 
				+    def copy(self):
			
 
				+        return self.__class__.from_dict(self.to_dict)
			
--- a/FworkSpider/feapder/network/response.py
+++ b/FworkSpider/feapder/network/response.py
@@ -0,0 +1,356 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-07-26 11:40:28
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import datetime
			
 
				+import os
			
 
				+import re
			
 
				+import time
			
 
				+from urllib.parse import urlparse, urlunparse, urljoin
			
 
				+
			
 
				+from bs4 import UnicodeDammit, BeautifulSoup
			
 
				+from requests.cookies import RequestsCookieJar
			
 
				+from requests.models import Response as res
			
 
				+from w3lib.encoding import http_content_type_encoding, html_body_declared_encoding
			
 
				+
			
 
				+from feapder.network.selector import Selector
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
# Encoding name that the `text` property does not decode with directly; it
# falls back to encoding detection instead.
FAIL_ENCODING = "ISO-8859-1"

# Special characters in html source that must be removed, otherwise they
# interfere with building the etree
SPECIAL_CHARACTERS = [
    # Remove control characters. Full list: https://zh.wikipedia.org/wiki/%E6%8E%A7%E5%88%B6%E5%AD%97%E7%AC%A6
    "[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]"
]

# Compiled form of SPECIAL_CHARACTERS
SPECIAL_CHARACTER_PATTERNS = [
    re.compile(special_character) for special_character in SPECIAL_CHARACTERS
]
			
 
				+
			
 
				+
			
 
				+class Response(res):
			
 
    def __init__(self, response):
        """
        Wrap an existing response object.

        @param response: object whose ``__dict__`` is copied onto this instance
        """
        super(Response, self).__init__()
        # Take over every instance attribute of the wrapped response
        self.__dict__.update(response.__dict__)

        # Caches filled on first access by the selector / text / json properties
        self._cached_selector = None
        self._cached_text = None
        self._cached_json = None

        # No encoding chosen yet; the `encoding` property resolves it on first read
        self._encoding = None

        self.encoding_errors = "strict"  # strict / replace / ignore
			
 
				+
			
 
    @classmethod
    def from_dict(cls, response_dict):
        """
        Build a Response object from a dict.

        The passed dict is modified in place: ``cookies`` becomes a
        ``RequestsCookieJar``, ``elapsed`` (a microsecond count) becomes a
        ``timedelta``, ``connection`` is set to None and the content is marked
        as already consumed.

        @param response_dict: a native response.__dict__
        @return: Response
        """
        jar = RequestsCookieJar()
        jar.update(other=response_dict["cookies"])
        response_dict["cookies"] = jar

        elapsed_microseconds = response_dict["elapsed"]
        response_dict["elapsed"] = datetime.timedelta(
            0, 0, elapsed_microseconds
        )  # time taken
        response_dict["connection"] = None
        response_dict["_content_consumed"] = True

        native = res()
        native.__dict__.update(response_dict)
        return cls(native)
			
 
				+
			
 
				+    @property
			
 
				+    def to_dict(self):
			
 
				+        response_dict = {
			
 
				+            "_content": self.content,
			
 
				+            "cookies": self.cookies.get_dict(),
			
 
				+            "encoding": self.encoding,
			
 
				+            "headers": self.headers,
			
 
				+            "status_code": self.status_code,
			
 
				+            "elapsed": self.elapsed.microseconds,  # 耗时
			
 
				+            "url": self.url,
			
 
				+        }
			
 
				+
			
 
				+        return response_dict
			
 
				+
			
 
				+    def __clear_cache(self):
			
 
				+        self.__dict__["_cached_selector"] = None
			
 
				+        self.__dict__["_cached_text"] = None
			
 
				+        self.__dict__["_cached_json"] = None
			
 
				+
			
 
				+    @property
			
 
				+    def encoding(self):
			
 
				+        """
			
 
				+        编码优先级：自定义编码 > header中编码 > 页面编码 > 根据content猜测的编码
			
 
				+        """
			
 
				+        self._encoding = (
			
 
				+            self._encoding
			
 
				+            or self._headers_encoding()
			
 
				+            or self._body_declared_encoding()
			
 
				+            or self.apparent_encoding
			
 
				+        )
			
 
				+        return self._encoding
			
 
				+
			
 
				+    @encoding.setter
			
 
				+    def encoding(self, val):
			
 
				+        self.__clear_cache()
			
 
				+        self._encoding = val
			
 
				+
			
 
				+    code = encoding
			
 
				+
			
 
				+    def _headers_encoding(self):
			
 
				+        """
			
 
				+        从headers获取头部charset编码
			
 
				+        """
			
 
				+        content_type = self.headers.get("Content-Type") or self.headers.get(
			
 
				+            "content-type"
			
 
				+        )
			
 
				+        if content_type:
			
 
				+            return (
			
 
				+                http_content_type_encoding(content_type) or "utf-8"
			
 
				+                if "application/json" in content_type
			
 
				+                else None
			
 
				+            )
			
 
				+
			
 
    def _body_declared_encoding(self):
        """
        Get the encoding declared inside html, xml etc., e.g. <meta charset="...">
        """
        body = self.content
        return html_body_declared_encoding(body)
			
 
				+
			
 
				+    def _get_unicode_html(self, html):
			
 
				+        if not html or not isinstance(html, bytes):
			
 
				+            return html
			
 
				+
			
 
				+        converted = UnicodeDammit(html, is_html=True)
			
 
				+        if not converted.unicode_markup:
			
 
				+            raise Exception(
			
 
				+                "Failed to detect encoding of article HTML, tried: %s"
			
 
				+                % ", ".join(converted.tried_encodings)
			
 
				+            )
			
 
				+
			
 
				+        html = converted.unicode_markup
			
 
				+        return html
			
 
				+
			
 
				+    def _make_absolute(self, link):
			
 
				+        """Makes a given link absolute."""
			
 
				+        try:
			
 
				+
			
 
				+            link = link.strip()
			
 
				+
			
 
				+            # Parse the link with stdlib.
			
 
				+            parsed = urlparse(link)._asdict()
			
 
				+
			
 
				+            # If link is relative, then join it with base_url.
			
 
				+            if not parsed["netloc"]:
			
 
				+                return urljoin(self.url, link)
			
 
				+
			
 
				+            # Link is absolute; if it lacks a scheme, add one from base_url.
			
 
				+            if not parsed["scheme"]:
			
 
				+                parsed["scheme"] = urlparse(self.url).scheme
			
 
				+
			
 
				+                # Reconstruct the URL to incorporate the new scheme.
			
 
				+                parsed = (v for v in parsed.values())
			
 
				+                return urlunparse(parsed)
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            log.error(
			
 
				+                "Invalid URL <{}> can't make absolute_link. exception: {}".format(
			
 
				+                    link, e
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        # Link is absolute and complete with scheme; nothing to be done here.
			
 
				+        return link
			
 
				+
			
 
				+    def _absolute_links(self, text):
			
 
				+        regexs = [
			
 
				+            r'(<(?i)a.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # a
			
 
				+            r'(<(?i)img.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # img
			
 
				+            r'(<(?i)link.*?href\s*?=\s*?["\'])(.+?)(["\'])',  # css
			
 
				+            r'(<(?i)script.*?src\s*?=\s*?["\'])(.+?)(["\'])',  # js
			
 
				+        ]
			
 
				+
			
 
				+        for regex in regexs:
			
 
				+
			
 
				+            def replace_href(text):
			
 
				+                # html = text.group(0)
			
 
				+                link = text.group(2)
			
 
				+                absolute_link = self._make_absolute(link)
			
 
				+
			
 
				+                # return re.sub(regex, r'\1{}\3'.format(absolute_link), html) # 使用正则替换，个别字符不支持。如该网址源代码http://permit.mep.gov.cn/permitExt/syssb/xxgk/xxgk!showImage.action?dataid=0b092f8115ff45c5a50947cdea537726
			
 
				+                return text.group(1) + absolute_link + text.group(3)
			
 
				+
			
 
				+            text = re.sub(regex, replace_href, text, flags=re.S)
			
 
				+
			
 
				+        return text
			
 
				+
			
 
    def _del_special_character(self, text):
        """
        Remove special characters: every match of each pattern in
        SPECIAL_CHARACTER_PATTERNS is deleted from ``text``.
        """
        cleaned = text
        for pattern in SPECIAL_CHARACTER_PATTERNS:
            cleaned = pattern.sub("", cleaned)

        return cleaned
			
 
				+
			
 
				+    @property
			
 
				+    def __text(self):
			
 
				+        """Content of the response, in unicode.
			
 
				+
			
 
				+        If Response.encoding is None, encoding will be guessed using
			
 
				+        ``chardet``.
			
 
				+
			
 
				+        The encoding of the response content is determined based solely on HTTP
			
 
				+        headers, following RFC 2616 to the letter. If you can take advantage of
			
 
				+        non-HTTP knowledge to make a better guess at the encoding, you should
			
 
				+        set ``r.encoding`` appropriately before accessing this property.
			
 
				+        """
			
 
				+
			
 
				+        if not self.content:
			
 
				+            return ""
			
 
				+
			
 
				+        # Decode unicode from given encoding.
			
 
				+        try:
			
 
				+            content = str(self.content, self.encoding, errors=self.encoding_errors)
			
 
				+        except (LookupError, TypeError):
			
 
				+            # A LookupError is raised if the encoding was not found which could
			
 
				+            # indicate a misspelling or similar mistake.
			
 
				+            #
			
 
				+            # A TypeError can be raised if encoding is None
			
 
				+            #
			
 
				+            # So we try blindly encoding.
			
 
				+            content = str(self.content, errors=self.encoding_errors)
			
 
				+
			
 
				+        return content
			
 
				+
			
 
    @property
    def text(self):
        """Decoded, post-processed response body; computed once and cached.

        When ``self.encoding`` is set and differs from ``FAIL_ENCODING``
        (compared upper-cased), the body is decoded through the private
        ``__text`` property. If no usable encoding is declared, or that
        decoding raises ``UnicodeDecodeError``, the body is produced by
        ``self._get_unicode_html(self.content)`` instead.

        A non-empty result is then passed through ``self._absolute_links``
        and ``self._del_special_character`` (both defined elsewhere in the
        class) before being returned.

        Returns:
            The cached text held in ``self._cached_text``.
        """
        if self._cached_text is not None:
            return self._cached_text

        declared = self.encoding
        if not declared or declared.upper() == FAIL_ENCODING:
            self._cached_text = self._get_unicode_html(self.content)
        else:
            try:
                self._cached_text = self.__text
            except UnicodeDecodeError:
                self._cached_text = self._get_unicode_html(self.content)

        if self._cached_text:
            # Same order as always: fix links first, then strip characters.
            for transform in (self._absolute_links, self._del_special_character):
                self._cached_text = transform(self._cached_text)

        return self._cached_text
			
 
				+
			
 
    @text.setter
    def text(self, html):
        """Replace the cached text with ``html`` and rebuild the selector.

        The new value goes through the same post-processing as the getter
        (``_absolute_links`` followed by ``_del_special_character``), after
        which ``self._cached_selector`` is rebuilt from ``self.text``.

        Args:
            html: the markup to install as this response's text.
        """
        self._cached_text = html
        for transform in (self._absolute_links, self._del_special_character):
            self._cached_text = transform(self._cached_text)
        self._cached_selector = Selector(self.text)
			
 
				+
			
 
    @property
    def json(self, **kwargs):
        """Parsed JSON body, computed on first access and cached.

        On the first access ``self.encoding`` is defaulted to ``"utf-8"``
        when it is unset, then the parent class's ``json()`` does the parsing.
        The result is stored in ``self._cached_json``.

        NOTE(review): this is a property, so ``**kwargs`` can never be
        supplied through normal attribute access and is always empty.

        Returns:
            The object produced by the parent ``json()`` call.
        """
        if self._cached_json is not None:
            return self._cached_json

        self.encoding = self.encoding or "utf-8"
        self._cached_json = super(Response, self).json(**kwargs)
        return self._cached_json
			
 
				+
			
 
    @property
    def content(self):
        """Raw response body, exactly as provided by the parent class."""
        return super(Response, self).content
			
 
				+
			
 
    @property
    def is_html(self):
        """Whether the ``Content-Type`` response header declares HTML.

        Media types are case-insensitive, so the header value is lower-cased
        before the check; ``Text/HTML`` is recognised just like ``text/html``.

        Returns:
            bool: ``True`` when ``"text/html"`` occurs in the header value,
            ``False`` otherwise (including when the header is missing or
            empty).
        """
        # `or ""` guards against a header that is present but set to None.
        content_type = self.headers.get("Content-Type", "") or ""
        return "text/html" in content_type.lower()
			
 
				+
			
 
    @property
    def selector(self):
        """``Selector`` built from ``self.text``; created on first access.

        Returns:
            The selector stored in ``self._cached_selector``.
        """
        if self._cached_selector is not None:
            return self._cached_selector

        self._cached_selector = Selector(self.text)
        return self._cached_selector
			
 
				+
			
 
    def bs4(self, features="html.parser"):
        """Parse ``self.text`` with BeautifulSoup.

        Args:
            features: parser name handed to ``BeautifulSoup``.

        Returns:
            The ``BeautifulSoup`` object built from the response text.
        """
        return BeautifulSoup(self.text, features)
			
 
				+
			
 
    def extract(self):
        """Return whatever ``self.selector.get()`` yields for this response."""
        root = self.selector
        return root.get()
			
 
				+
			
 
    def xpath(self, query, **kwargs):
        """Run an XPath ``query`` against ``self.selector``.

        Args:
            query: the XPath expression.
            **kwargs: forwarded unchanged to the selector's ``xpath``.

        Returns:
            The result of ``self.selector.xpath(query, **kwargs)``.
        """
        root = self.selector
        return root.xpath(query, **kwargs)
			
 
				+
			
 
    def css(self, query):
        """Run a CSS ``query`` against ``self.selector`` and return the result."""
        root = self.selector
        return root.css(query)
			
 
				+
			
 
    def re(self, regex, replace_entities=False):
        """
        @summary: Regex match against the response selector.
        Note: page source such as <a class='page-numbers'... is normalised to
        <a class="page-numbers", so a pattern should be written as
        <a class="(.*?)". Quotes in non-HTML text are left as they are.
        For convenience, single and double quotes in a string pattern are
        made interchangeable automatically.
        ---------
        @param regex: pattern string or an object from re.compile
        @param replace_entities: when True, entities such as &nbsp; are removed
            and &quot; becomes ", which changes the page structure. When
            extracting JSON from page source, False is recommended.
        ---------
        @result: list
        """
        # Only string patterns are rewritten: every quote character becomes a
        # character class that matches either quote style. Compiled patterns
        # are passed through untouched.
        pattern = re.sub("['\"]", "['\"]", regex) if isinstance(regex, str) else regex
        return self.selector.re(pattern, replace_entities)
			
 
				+
			
 
    def re_first(self, regex, default=None, replace_entities=False):
        """
        @summary: Regex match against the response selector, first hit only.
        Note: page source such as <a class='page-numbers'... is normalised to
        <a class="page-numbers", so a pattern should be written as
        <a class="(.*?)". Quotes in non-HTML text are left as they are.
        For convenience, single and double quotes in a string pattern are
        made interchangeable automatically.
        ---------
        @param regex: pattern string or an object from re.compile
        @param default: value returned when nothing matches
        @param replace_entities: when True, entities such as &nbsp; are removed
            and &quot; becomes ", which changes the page structure. When
            extracting JSON from page source, False is recommended.
        ---------
        @result: the first match, or the default value
        """
        # Only string patterns are rewritten: every quote character becomes a
        # character class that matches either quote style.
        pattern = re.sub("['\"]", "['\"]", regex) if isinstance(regex, str) else regex
        return self.selector.re_first(pattern, default, replace_entities)
			
 
				+
			
 
    def close_browser(self, request):
        """Detach this response's browser, if it has one.

        When the response carries a ``browser`` attribute, that object is
        passed to ``request._webdriver_pool.remove`` and the attribute is
        deleted. Without a ``browser`` attribute the call does nothing.

        Args:
            request: object exposing a ``_webdriver_pool`` with ``remove``.
        """
        if not hasattr(self, "browser"):
            return

        pool = request._webdriver_pool
        pool.remove(self.browser)
        del self.browser
			
 
				+
			
 
    def __del__(self):
        """Call ``self.close()`` when the object is finalised."""
        self.close()
			
 
				+
			
 
    def open(self, delete_temp_file=False):
        """Dump the response text to ``temp.html`` and open it via the shell.

        The file is written to the current working directory using
        ``self.encoding``; characters that cannot be encoded are replaced.
        ``self.encoding_errors`` is switched to ``"replace"`` before the text
        is read and is left that way afterwards. The shell command
        ``open temp.html`` is then run.

        NOTE(review): ``open`` as a shell command looks macOS-specific;
        confirm the behaviour on other platforms.

        Args:
            delete_temp_file: when true, wait one second and then remove
                ``temp.html``.
        """
        with open("temp.html", "w", encoding=self.encoding, errors="replace") as out:
            # Set before self.text is read so a lossy decode is tolerated.
            self.encoding_errors = "replace"
            out.write(self.text)

        os.system("open temp.html")

        if not delete_temp_file:
            return

        # Short pause before the file is removed.
        time.sleep(1)
        os.remove("temp.html")
			
--- a/FworkSpider/feapder/network/selector.py
+++ b/FworkSpider/feapder/network/selector.py
@@ -0,0 +1,155 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-10-08 15:33:37
			
 
				+---------
			
 
				+@summary: 重新定义 selector
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+import re
			
 
				+
			
 
				+import six
			
 
				+from lxml import etree
			
 
				+from parsel import Selector as ParselSelector
			
 
				+from parsel import SelectorList as ParselSelectorList
			
 
				+from w3lib.html import replace_entities as w3lib_replace_entities
			
 
				+
			
 
				+
			
 
				+def extract_regex(regex, text, replace_entities=True, flags=0):
			
 
				+    """Extract a list of unicode strings from the given text/encoding using the following policies:
			
 
				+    * if the regex contains a named group called "extract" that will be returned
			
 
				+    * if the regex contains multiple numbered groups, all those will be returned (flattened)
			
 
				+    * if the regex doesn't contain any group the entire regex matching is returned
			
 
				+    """
			
 
				+    if isinstance(regex, six.string_types):
			
 
				+        regex = re.compile(regex, flags=flags)
			
 
				+
			
 
				+    if "extract" in regex.groupindex:
			
 
				+        # named group
			
 
				+        try:
			
 
				+            extracted = regex.search(text).group("extract")
			
 
				+        except AttributeError:
			
 
				+            strings = []
			
 
				+        else:
			
 
				+            strings = [extracted] if extracted is not None else []
			
 
				+    else:
			
 
				+        # full regex or numbered groups
			
 
				+        strings = regex.findall(text)
			
 
				+
			
 
				+    # strings = flatten(strings) # 这东西会把多维列表铺平
			
 
				+    if not replace_entities:
			
 
				+        return strings
			
 
				+
			
 
				+    values = []
			
 
				+    for value in strings:
			
 
				+        if isinstance(value, (list, tuple)):  # w3lib_replace_entities 不能接收list tuple
			
 
				+            values.append(
			
 
				+                [w3lib_replace_entities(v, keep=["lt", "amp"]) for v in value]
			
 
				+            )
			
 
				+        else:
			
 
				+            values.append(w3lib_replace_entities(value, keep=["lt", "amp"]))
			
 
				+
			
 
				+    return values
			
 
				+
			
 
				+
			
 
				+def create_root_node(text, parser_cls, base_url=None):
			
 
				+    """Create root node for text using given parser class.
			
 
				+    """
			
 
				+    body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
			
 
				+    parser = parser_cls(recover=True, encoding="utf8", huge_tree=True)
			
 
				+    root = etree.fromstring(body, parser=parser, base_url=base_url)
			
 
				+    if root is None:
			
 
				+        root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
			
 
				+    return root
			
 
				+
			
 
				+
			
 
				+class SelectorList(ParselSelectorList):
			
 
				+    """
			
 
				+    The :class:`SelectorList` class is a subclass of the builtin ``list``
			
 
				+    class, which provides a few additional methods.
			
 
				+    """
			
 
				+
			
 
				+    def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
			
 
				+        """
			
 
				+        Call the ``.re()`` method for the first element in this list and
			
 
				+        return the result in an unicode string. If the list is empty or the
			
 
				+        regex doesn't match anything, return the default value (``None`` if
			
 
				+        the argument is not provided).
			
 
				+
			
 
				+        By default, character entity references are replaced by their
			
 
				+        corresponding character (except for ``&amp;`` and ``&lt;``.
			
 
				+        Passing ``replace_entities`` as ``False`` switches off these
			
 
				+        replacements.
			
 
				+        """
			
 
				+
			
 
				+        datas = self.re(regex, replace_entities=replace_entities, flags=flags)
			
 
				+        return datas[0] if datas else default
			
 
				+
			
 
				+    def re(self, regex, replace_entities=True, flags=re.S):
			
 
				+        """
			
 
				+        Call the ``.re()`` method for each element in this list and return
			
 
				+        their results flattened, as a list of unicode strings.
			
 
				+
			
 
				+        By default, character entity references are replaced by their
			
 
				+        corresponding character (except for ``&amp;`` and ``&lt;``.
			
 
				+        Passing ``replace_entities`` as ``False`` switches off these
			
 
				+        replacements.
			
 
				+        """
			
 
				+        datas = [
			
 
				+            x.re(regex, replace_entities=replace_entities, flags=flags) for x in self
			
 
				+        ]
			
 
				+        return datas[0] if len(datas) == 1 else datas
			
 
				+
			
 
				+
			
 
				+class Selector(ParselSelector):
			
 
				+    selectorlist_cls = SelectorList
			
 
				+
			
 
				+    def __str__(self):
			
 
				+        data = repr(self.get())
			
 
				+        return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
			
 
				+
			
 
				+    __repr__ = __str__
			
 
				+
			
 
				+    def __init__(self, text=None, *args, **kwargs):
			
 
				+        # 先将&nbsp; 转为空格，否则selector 会转为 \xa0
			
 
				+        if text:
			
 
				+            text = re.sub("&nbsp;", "\x20", text)
			
 
				+        super(Selector, self).__init__(text, *args, **kwargs)
			
 
				+
			
 
				+    def re_first(self, regex, default=None, replace_entities=True, flags=re.S):
			
 
				+        """
			
 
				+        Apply the given regex and return the first unicode string which
			
 
				+        matches. If there is no match, return the default value (``None`` if
			
 
				+        the argument is not provided).
			
 
				+
			
 
				+        By default, character entity references are replaced by their
			
 
				+        corresponding character (except for ``&amp;`` and ``&lt;``.
			
 
				+        Passing ``replace_entities`` as ``False`` switches off these
			
 
				+        replacements.
			
 
				+        """
			
 
				+
			
 
				+        datas = self.re(regex, replace_entities=replace_entities, flags=flags)
			
 
				+
			
 
				+        return datas[0] if datas else default
			
 
				+
			
 
				+    def re(self, regex, replace_entities=True, flags=re.S):
			
 
				+        """
			
 
				+        Apply the given regex and return a list of unicode strings with the
			
 
				+        matches.
			
 
				+
			
 
				+        ``regex`` can be either a compiled regular expression or a string which
			
 
				+        will be compiled to a regular expression using ``re.compile(regex)``.
			
 
				+
			
 
				+        By default, character entity references are replaced by their
			
 
				+        corresponding character (except for ``&amp;`` and ``&lt;``.
			
 
				+        Passing ``replace_entities`` as ``False`` switches off these
			
 
				+        replacements.
			
 
				+        """
			
 
				+
			
 
				+        return extract_regex(
			
 
				+            regex, self.get(), replace_entities=replace_entities, flags=flags
			
 
				+        )
			
 
				+
			
 
				+    def _get_root(self, text, base_url=None):
			
 
				+        return create_root_node(text, self._parser, base_url=base_url)
			
--- a/FworkSpider/feapder/network/user_agent.py
+++ b/FworkSpider/feapder/network/user_agent.py
@@ -0,0 +1,389 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2016-12-28 17:55
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import random
			
 
				+
			
 
				+USER_AGENTS = {
			
 
				+    "chrome": [
			
 
				+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
			
 
				+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
			
 
				+        "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
			
 
				+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
			
 
				+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
			
 
				+        "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
			
 
				+        "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
			
 
				+        "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
			
 
				+    ],
			
 
				+    "opera": [
			
 
				+        "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
			
 
				+        "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
			
 
				+        "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
			
 
				+        "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
			
 
				+        "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
			
 
				+        "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
			
 
				+        "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
			
 
				+        "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
			
 
				+        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
			
 
				+        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
			
 
				+        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
			
 
				+        "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
			
 
				+        "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
			
 
				+        "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
			
 
				+        "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
			
 
				+        "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
			
 
				+        "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
			
 
				+        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
			
 
				+        "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
			
 
				+        "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
			
 
				+        "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
			
 
				+        "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
			
 
				+        "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
			
 
				+        "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
			
 
				+        "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
			
 
				+        "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
			
 
				+        "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
			
 
				+    ],
			
 
				+    "firefox": [
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
			
 
				+        "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
			
 
				+        "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
			
 
				+        "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
			
 
				+        "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101  Firefox/28.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
			
 
				+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
			
 
				+        "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
			
 
				+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
			
 
				+        "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0)  Gecko/20100101 Firefox/18.0",
			
 
				+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
			
 
				+    ],
			
 
				+    "internetexplorer": [
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
			
 
				+        "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0;  rv:11.0) like Gecko",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
			
 
				+        "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
			
 
				+        "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
			
 
				+        "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
			
 
				+        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; InfoPath.3; MS-RTC LM 8; .NET4.0C; .NET4.0E)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 4.0; Tablet PC 2.0; InfoPath.3; .NET4.0C; .NET4.0E)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; .NET4.0C; Tablet PC 2.0)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; FunWebProducts)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/13.0.782.215)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; chromeframe/11.0.696.57)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.1; SV1; .NET CLR 2.8.52393; WOW64; en-US)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; chromeframe/11.0.696.57)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/4.0; GTB7.4; InfoPath.3; SV1; .NET CLR 3.1.76908; WOW64; en-US)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.8.36217; WOW64; en-US)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; .NET CLR 2.7.58687; SLCC2; Media Center PC 5.0; Zune 3.4; Tablet PC 3.6; InfoPath.3)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; Media Center PC 4.0; SLCC1; .NET CLR 3.0.04320)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.1; SLCC1; .NET CLR 1.1.4322)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 5.0; Trident/4.0; InfoPath.1; SV1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 3.0.04506.30)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.0; Trident/4.0; FBSMTWB; .NET CLR 2.0.34861; .NET CLR 3.0.3746.3218; .NET CLR 3.5.33652; msn OptimizedIE8;ENUS)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; Media Center PC 6.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET4.0C)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Zune 3.0)",
			
 
				+    ],
			
 
				+    "safari": [
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; de-at) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; da-dk) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; tr-TR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ko-KR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; cs-CZ) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_5_8; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; zh-cn) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; sv-se) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ko-kr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; ja-jp) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; it-it) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-fr) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; es-es) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-us) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-gb) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; de-de) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; sv-SE) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; hu-HU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; de-DE) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ja-JP) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; it-IT) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_7; en-us) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; fr-ch) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; de-de) AppleWebKit/534.15+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; ar) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Android 2.2; Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-HK) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; tr-TR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; nb-NO) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.0; fr-FR) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 5.1; ru-RU) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
			
 
				+        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; zh-cn) AppleWebKit/533.18.1 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
			
 
				+    ],
			
 
				+    "mobile": [
			
 
				+        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
			
 
				+        "Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/14.2 Safari/536.2+",
			
 
				+        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
			
 
				+        "Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/14.2 Mobile Safari/537.10+",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/14.2 Mobile Safari/534.30",
			
 
				+        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/14.2 Mobile/14E304 Safari/602.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/14.2 Mobile/15A372 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Mobile/15E148 Safari/604.1",
			
 
				+        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
			
 
				+        "Mozilla/5.0 (Mobile; LYF/F300B/LYF-F300B-001-01-15-130718-i;Android; rv:89.0 Gecko/48.0 Firefox/90.0 KAIOS/2.5",
			
 
				+        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
			
 
				+        "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
			
 
				+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
			
 
				+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
			
 
				+        "Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36 Edge/14.14263",
			
 
				+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)",
			
 
				+        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
			
 
				+        "Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 11; Pixel 4a (5G)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Linux; Android 7.0; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Mobile Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
			
 
				+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
			
 
				+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.2 Safari/605.1.15",
			
 
				+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4576.0 Safari/537.36 Edg/93.0.4576.0",
			
 
				+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0 Gecko/20100101 Firefox/90.0",
			
 
				+    ],
			
 
				+}
			
 
				+
			
 
				+
			
 
				def get(ua_type: str = None):
				    """
				    Return one randomly chosen user-agent string.

				    Args:
				        ua_type: a key of USER_AGENTS naming the browser family to draw
				            from. When omitted or falsy, a family is picked at random.

				    Returns: a user-agent string taken from USER_AGENTS[ua_type]

				    Raises:
				        ValueError: if ua_type is given but is not a key of USER_AGENTS
				    """
				    known_types = list(USER_AGENTS.keys())

				    if ua_type:
				        # An explicit family must be one of the configured ones
				        if ua_type not in USER_AGENTS:
				            raise ValueError(
				                "ua_type error, expect one of {}".format(known_types)
				            )
				    else:
				        ua_type = random.choice(known_types)

				    candidates = USER_AGENTS[ua_type]
				    return random.choice(candidates)
			
--- a/FworkSpider/feapder/pipelines/__init__.py
+++ b/FworkSpider/feapder/pipelines/__init__.py
@@ -0,0 +1,56 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021/3/17 10:57 下午
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import abc
			
 
				+from typing import Dict, List, Tuple
			
 
				+
			
 
				+
			
 
				class BasePipeline(metaclass=abc.ABCMeta):
				    """
				    Abstract base class for item pipelines.

				    A pipeline is single-threaded and saves data in batches, so writing
				    network-request code here (e.g. downloading images) is discouraged.
				    """

				    @abc.abstractmethod
				    def save_items(self, table, items: List[Dict]) -> bool:
				        """
				        Save a batch of items.
				        Args:
				            table: table name
				            items: the data, [{},{},...]

				        Returns: whether the save succeeded, True / False
				                 If False, this batch is not added to the dedup store, so
				                 that it can be saved again later.

				        """

				        return True

				    def update_items(
				        self, table, items: List[Dict], update_keys: Tuple = ()
				    ) -> bool:
				        """
				        Update a batch of items. Used together with UpdateItem; if the
				        spider does not use UpdateItem, this method need not be implemented.
				        Args:
				            table: table name
				            items: the data, [{},{},...]
				            update_keys: fields to update, e.g. ("title", "publish_time").
				                Defaults to an empty tuple. (The parameter used to be
				                declared as ``update_keys=Tuple``, which made the
				                ``typing.Tuple`` object itself the default value.)

				        Returns: whether the update succeeded, True / False
				                 If False, this batch is not added to the dedup store, so
				                 that it can be saved again later.

				        """

				        return True

				    def close(self):
				        """
				        Close the pipeline; called when the spider finishes.
				        Returns:

				        """
				        pass
			
--- a/FworkSpider/feapder/pipelines/console_pipeline.py
+++ b/FworkSpider/feapder/pipelines/console_pipeline.py
@@ -0,0 +1,47 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021/3/18 12:39 上午
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+from feapder.pipelines import BasePipeline
			
 
				+from typing import Dict, List, Tuple
			
 
				+
			
 
				+
			
 
				class ConsolePipeline(BasePipeline):
				    """
				    Pipeline that stores nothing: both save_items and update_items simply
				    report success.

				    A pipeline is single-threaded and saves data in batches, so writing
				    network-request code here (e.g. downloading images) is discouraged.
				    """

				    def save_items(self, table, items: List[Dict]) -> bool:
				        """
				        Save a batch of items (no-op in this pipeline).
				        Args:
				            table: table name
				            items: the data, [{},{},...]

				        Returns: whether the save succeeded, True / False
				                 If False, this batch is not added to the dedup store, so
				                 that it can be saved again later.

				        """

				        return True

				    def update_items(
				        self, table, items: List[Dict], update_keys: Tuple = ()
				    ) -> bool:
				        """
				        Update a batch of items (no-op in this pipeline).
				        Args:
				            table: table name
				            items: the data, [{},{},...]
				            update_keys: fields to update, e.g. ("title", "publish_time").
				                Defaults to an empty tuple. (The parameter used to be
				                declared as ``update_keys=Tuple``, which made the
				                ``typing.Tuple`` object itself the default value.)

				        Returns: whether the update succeeded, True / False
				                 If False, this batch is not added to the dedup store, so
				                 that it can be saved again later.

				        """

				        return True
			
--- a/FworkSpider/feapder/pipelines/mongo_pipeline.py
+++ b/FworkSpider/feapder/pipelines/mongo_pipeline.py
@@ -0,0 +1,84 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021-04-18 14:12:21
			
 
				+---------
			
 
				+@summary: 导出数据
			
 
				+---------
			
 
				+@author: Mkdir700
			
 
				+@email:  mkdir700@gmail.com
			
 
				+"""
			
 
				+from typing import Dict, List, Tuple
			
 
				+
			
 
				+from feapder.db.mongodb import MongoDB
			
 
				+from feapder.pipelines import BasePipeline
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				class MongoPipeline(BasePipeline):
				    """
				    Pipeline that writes item batches through a MongoDB helper.
				    """

				    def __init__(self):
				        # Created on first access of `to_db`, not at construction time
				        self._to_db = None

				    @property
				    def to_db(self):
				        """
				        The MongoDB helper used for writes, instantiated on first use.
				        """
				        if not self._to_db:
				            self._to_db = MongoDB()

				        return self._to_db

				    def save_items(self, table, items: List[Dict]) -> bool:
				        """
				        Save a batch of items.
				        Args:
				            table: collection name
				            items: the data, [{},{},...]

				        Returns: whether the save succeeded, True / False
				                 If False, this batch is not added to the dedup store, so
				                 that it can be saved again later.

				        """
				        try:
				            # NOTE(review): add_batch's return value is treated as the
				            # number of newly inserted documents - confirm against MongoDB
				            add_count = self.to_db.add_batch(coll_name=table, datas=items)
				            datas_size = len(items)
				            log.info(
				                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
				                % (datas_size, table, add_count, datas_size - add_count)
				            )
				            return True
				        except Exception as e:
				            # Any failure is logged and reported as False to the caller
				            log.exception(e)
				            return False

				    def update_items(
				        self, table, items: List[Dict], update_keys: Tuple = ()
				    ) -> bool:
				        """
				        Update a batch of items.
				        Args:
				            table: collection name
				            items: the data, [{},{},...]
				            update_keys: fields to update, e.g. ("title", "publish_time").
				                When empty (the default), every key of the first item is
				                used. (The parameter used to be declared as
				                ``update_keys=Tuple``; that default is truthy, so the
				                fallback below never ran and ``typing.Tuple`` itself was
				                passed on as ``update_columns``.)

				        Returns: whether the update succeeded, True / False
				                 If False, this batch is not added to the dedup store, so
				                 that it can be saved again later.

				        """
				        try:
				            add_count = self.to_db.add_batch(
				                coll_name=table,
				                datas=items,
				                # No explicit keys: fall back to all keys of the first item
				                update_columns=update_keys or list(items[0].keys()),
				            )
				            datas_size = len(items)
				            update_count = datas_size - add_count
				            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
				                datas_size,
				                table,
				                add_count,
				                update_count,
				            )
				            if update_keys:
				                msg += " 更新字段为 {}".format(update_keys)
				            log.info(msg)

				            return True
				        except Exception as e:
				            # Any failure (including an empty `items` with no update_keys)
				            # is logged and reported as False to the caller
				            log.exception(e)
				            return False
			
--- a/FworkSpider/feapder/pipelines/mysql_pipeline.py
+++ b/FworkSpider/feapder/pipelines/mysql_pipeline.py
@@ -0,0 +1,74 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-07-29 22:48:30
			
 
				+---------
			
 
				+@summary: 导出数据
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+from typing import Dict, List, Tuple
			
 
				+
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder.db.mysqldb import MysqlDB
			
 
				+from feapder.pipelines import BasePipeline
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				class MysqlPipeline(BasePipeline):
				    """Pipeline that exports items to MySQL through a lazily created ``MysqlDB``."""

				    def __init__(self):
				        # the connection is created on first access of ``to_db``
				        self._to_db = None

				    @property
				    def to_db(self):
				        """Return the ``MysqlDB`` instance, creating it on first use."""
				        if not self._to_db:
				            self._to_db = MysqlDB()

				        return self._to_db

				    def save_items(self, table, items: List[Dict]) -> bool:
				        """
				        Save data.

				        Args:
				            table: table name
				            items: data, [{}, {}, ...]

				        Returns: True unless ``add_batch`` returned None.
				                 Per the original note: when False, this batch is not added
				                 to the dedup store, so it can be exported again.
				        """
				        sql, datas = tools.make_batch_sql(table, items)
				        add_count = self.to_db.add_batch(sql, datas)
				        datas_size = len(datas)
				        if add_count:
				            log.info(
				                "共导出 %s 条数据 到 %s, 重复 %s 条" % (datas_size, table, datas_size - add_count)
				            )

				        # identity check: a count of 0 is still a success, only None is a failure
				        return add_count is not None

				    def update_items(self, table, items: List[Dict], update_keys: Tuple = ()) -> bool:
				        """
				        Update data.

				        Args:
				            table: table name
				            items: data, [{}, {}, ...]
				            update_keys: fields to update, e.g. ("title", "publish_time");
				                when empty, every key of the first item is used

				        Returns: True unless ``add_batch`` returned None.
				                 Per the original note: when False, this batch is not added
				                 to the dedup store, so it can be exported again.
				        """
				        # The default used to be the ``typing.Tuple`` object itself, which is
				        # truthy and was therefore passed on as ``update_columns``.
				        sql, datas = tools.make_batch_sql(
				            table, items, update_columns=update_keys or list(items[0].keys())
				        )
				        update_count = self.to_db.add_batch(sql, datas)
				        if update_count:
				            # NOTE(review): the halving presumably compensates for MySQL counting
				            # two affected rows per row updated via ON DUPLICATE KEY UPDATE;
				            # confirm against tools.make_batch_sql.
				            msg = "共更新 %s 条数据 到 %s" % (update_count // 2, table)
				            if update_keys:
				                msg += " 更新字段为 {}".format(update_keys)
				            log.info(msg)

				        return update_count is not None
			
--- a/FworkSpider/feapder/requirements.txt
+++ b/FworkSpider/feapder/requirements.txt
@@ -0,0 +1,17 @@
 
				+better-exceptions>=0.2.2
			
 
				+DBUtils>=2.0
			
 
				+parsel>=1.5.2
			
 
				+PyExecJS>=1.5.1
			
 
				+pymongo>=3.10.1
			
 
				+PyMySQL>=0.9.3
			
 
				+redis>=2.10.6
			
 
				+requests>=2.22.0
			
 
				+selenium>=3.141.0
			
 
				+bs4>=0.0.1
			
 
				+ipython>=7.14.0
			
 
				+bitarray>=1.5.3
			
 
				+redis-py-cluster>=2.1.0
			
 
				+cryptography>=3.3.2
			
 
				+urllib3>=1.25.8
			
 
				+loguru>=0.5.3
			
 
				+influxdb>=5.3.1
			
--- a/FworkSpider/feapder/setting.py
+++ b/FworkSpider/feapder/setting.py
@@ -0,0 +1,172 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""爬虫配置文件"""
			
 
				+import os
			
 
				+
			
 
				# Redis key names
				# task table template
				TAB_REQUSETS = "{redis_key}:z_requsets"
				# failed task template
				TAB_FAILED_REQUSETS = "{redis_key}:z_failed_requsets"
				# template for items that failed to be saved
				TAB_FAILED_ITEMS = "{redis_key}:s_failed_items"
				# spider status table template
				TAB_SPIDER_STATUS = "{redis_key}:z_spider_status"
				# spider time record table
				TAB_SPIDER_TIME = "{redis_key}:h_spider_time"

				# MYSQL
				MYSQL_IP = os.getenv("MYSQL_IP")
				MYSQL_PORT = int(os.getenv("MYSQL_PORT", 3306))
				MYSQL_DB = os.getenv("MYSQL_DB")
				MYSQL_USER_NAME = os.getenv("MYSQL_USER_NAME")
				MYSQL_USER_PASS = os.getenv("MYSQL_USER_PASS")

				# MONGODB
				MONGO_IP = os.getenv("MONGO_IP", "localhost")
				MONGO_PORT = int(os.getenv("MONGO_PORT", 27017))
				MONGO_DB = os.getenv("MONGO_DB")
				MONGO_USER_NAME = os.getenv("MONGO_USER_NAME")
				MONGO_USER_PASS = os.getenv("MONGO_USER_PASS")

				# REDIS
				# ip:port; several may be given as a list or comma separated,
				# e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
				REDISDB_IP_PORTS = os.getenv("REDISDB_IP_PORTS")
				REDISDB_USER_PASS = os.getenv("REDISDB_USER_PASS")
				REDISDB_DB = int(os.getenv("REDISDB_DB", 0))
				# for redis sentinel mode
				REDISDB_SERVICE_NAME = os.getenv("REDISDB_SERVICE_NAME")

				# pipelines used to store data; customizable, MysqlPipeline by default
				ITEM_PIPELINES = [
				    "feapder.pipelines.mysql_pipeline.MysqlPipeline",
				    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
				]
				# max failures when exporting data (save and update); an alert is raised beyond this
				EXPORT_DATA_MAX_FAILED_TIMES = 10
				# max retries when exporting data (save and update); retrying stops beyond this
				EXPORT_DATA_MAX_RETRY_TIMES = 10

				# spider related
				# COLLECTOR
				COLLECTOR_SLEEP_TIME = 1  # interval between fetches from the task queue into the in-memory queue
				COLLECTOR_TASK_COUNT = 10  # number of tasks fetched each time

				# SPIDER
				SPIDER_THREAD_COUNT = 1  # spider concurrency
				# download interval in seconds; random is supported, e.g.
				# SPIDER_SLEEP_TIME = [2, 5] gives a random interval between 2 and 5 seconds, inclusive
				SPIDER_SLEEP_TIME = 0
				SPIDER_TASK_COUNT = 1  # number of tasks each parser takes from the in-memory queue
				SPIDER_MAX_RETRY_TIMES = 100  # max retries per request
				# whether tasks are added automatically; when False, start_monitor_task
				# must be called manually (intended for multi-process setups)
				SPIDER_AUTO_START_REQUESTS = True
				KEEP_ALIVE = False  # whether the spider stays resident

				# browser rendering
				WEBDRIVER = dict(
				    pool_size=1,  # number of browsers
				    load_images=True,  # whether to load images
				    user_agent=None,  # string, or zero-argument function returning the user_agent
				    proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or zero-argument function returning the proxy address
				    headless=False,  # whether the browser is headless
				    driver_type="CHROME",  # CHROME, PHANTOMJS, FIREFOX
				    timeout=30,  # request timeout
				    window_size=(1024, 800),  # window size
				    executable_path=None,  # browser path; None means the default path
				    render_time=0,  # render time: wait this long after opening the page before reading the source
				    custom_argument=["--ignore-certificate-errors"],  # custom browser arguments
				)

				# re-crawl failed requests when the spider starts
				RETRY_FAILED_REQUESTS = False
				# save failed requests
				SAVE_FAILED_REQUEST = True
				# request loss protection: a request not finished within
				# REQUEST_LOST_TIMEOUT is dispatched again
				REQUEST_LOST_TIMEOUT = 600  # 10 minutes
				# network request timeout
				REQUEST_TIMEOUT = 22  # time to wait for the server response; float, or (connect timeout, read timeout) tuple

				# download cache, stored in redis; because of memory limits it is recommended
				# only for development/debugging, to avoid a network request on every debug run
				RESPONSE_CACHED_ENABLE = False  # whether to enable the download cache; suggested True for costly data or data whose requirements change often
				RESPONSE_CACHED_EXPIRE_TIME = 3600  # cache time in seconds
				RESPONSE_CACHED_USED = False  # whether to use the cache; can be True when re-collecting data

				# root key in redis for items and requests
				REDIS_KEY = ""
				# keys deleted when the spider starts; type: tuple/bool/string; regex supported.
				# Commonly used to clear the task queue; otherwise a restart resumes where it stopped
				DELETE_KEYS = []

				# proxy
				PROXY_EXTRACT_API = None  # proxy extraction API; returned proxies are separated by \r\n
				PROXY_ENABLE = True

				# random headers
				RANDOM_HEADERS = True
				# UserAgent type; supports 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari', 'mobile'; random type if unspecified
				USER_AGENT_TYPE = "chrome"
				# default user agent; has no effect when RANDOM_HEADERS=True
				DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
				# requests uses a session
				USE_SESSION = False

				# dedup
				ITEM_FILTER_ENABLE = False  # item dedup
				ITEM_FILTER_SETTING = dict(
				    filter_type=1  # permanent (BloomFilter) = 1, in-memory (MemoryFilter) = 2, temporary (ExpireFilter) = 3
				)
				REQUEST_FILTER_ENABLE = False  # request dedup
				REQUEST_FILTER_SETTING = dict(
				    filter_type=3,  # permanent (BloomFilter) = 1, in-memory (MemoryFilter) = 2, temporary (ExpireFilter) = 3
				    expire_time=2592000,  # expiry time: 1 month
				)

				# alerts: DingTalk, WeCom (enterprise WeChat) and email are supported
				# DingTalk alerts
				DINGDING_WARNING_URL = ""  # DingTalk robot api
				DINGDING_WARNING_PHONE = ""  # people to alert; a list is supported for several
				DINGDING_WARNING_ALL = False  # whether to notify everyone; default False
				# email alerts
				EMAIL_SENDER = ""  # sender
				EMAIL_PASSWORD = ""  # authorization code
				EMAIL_RECEIVER = ""  # receiver; a list is supported for several
				EMAIL_SMTPSERVER = "smtp.163.com"  # mail server; 163 mail by default
				# WeCom alerts
				WECHAT_WARNING_URL = ""  # WeCom robot api
				WECHAT_WARNING_PHONE = ""  # people to alert, @-mentioned in the group; a list is supported for several
				WECHAT_WARNING_ALL = False  # whether to notify everyone; default False
				# intervals
				WARNING_INTERVAL = 3600  # interval between identical alerts, to prevent flooding; 0 disables dedup
				WARNING_LEVEL = "DEBUG"  # alert level, DEBUG / ERROR
				WARNING_FAILED_COUNT = 1000  # failed task count; an alert is raised above WARNING_FAILED_COUNT

				LOG_NAME = os.path.basename(os.getcwd())
				LOG_PATH = "log/%s.log" % LOG_NAME  # log file path
				LOG_LEVEL = "DEBUG"
				LOG_COLOR = True  # whether output is colored
				LOG_IS_WRITE_TO_CONSOLE = True  # whether to print to the console
				LOG_IS_WRITE_TO_FILE = False  # whether to write to a file
				LOG_MODE = "w"  # file write mode
				LOG_MAX_BYTES = 10 * 1024 * 1024  # max bytes per log file
				LOG_BACKUP_COUNT = 20  # number of log files kept
				LOG_ENCODING = "utf8"  # log file encoding
				OTHERS_LOG_LEVAL = "ERROR"  # log level of third-party libraries

				# metrics: influxdb configuration
				INFLUXDB_HOST = os.getenv("INFLUXDB_HOST", "localhost")
				INFLUXDB_PORT = int(os.getenv("INFLUXDB_PORT", 8086))
				INFLUXDB_UDP_PORT = int(os.getenv("INFLUXDB_UDP_PORT", 8089))
				INFLUXDB_USER = os.getenv("INFLUXDB_USER")
				INFLUXDB_PASSWORD = os.getenv("INFLUXDB_PASSWORD")
				INFLUXDB_DATABASE = os.getenv("INFLUXDB_DB")
				# measurement the metrics are stored in; on the spider management system it is named after the task_id
				INFLUXDB_MEASUREMENT = ("task_" + os.getenv("TASK_ID")) if os.getenv("TASK_ID") else None
				# other metrics arguments; influxdb arguments given here override the ones above
				METRICS_OTHER_ARGS = dict(retention_policy_duration="180d", emit_interval=60)

				############# import the user-defined setting #############
				try:
				    from setting import *
				except Exception:
				    # Best effort, as before: without a usable project-level ``setting``
				    # module the defaults above stay in effect. ``Exception`` instead of a
				    # bare ``except`` so KeyboardInterrupt/SystemExit are no longer swallowed.
				    pass
				else:
				    try:
				        # compatibility with old-style configuration
				        KEEP_ALIVE = not AUTO_STOP_WHEN_SPIDER_DONE
				    except NameError:
				        # the legacy option is not defined; keep KEEP_ALIVE as it is
				        pass
			
--- a/FworkSpider/feapder/templates/air_spider_template.tmpl
+++ b/FworkSpider/feapder/templates/air_spider_template.tmpl
@@ -0,0 +1,22 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on {DATE}
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: {USER}
			
 
				+"""
			
 
				+
			
 
				+import feapder
			
 
				+
			
 
				+
			
 
				# NOTE(review): ``${spider_name}`` is a template placeholder, presumably
				# substituted when a spider is generated from this file - confirm against
				# the generator.
				class ${spider_name}(feapder.AirSpider):
				    def start_requests(self):
				        # example seed request; replace the URL with the real entry point
				        yield feapder.Request("https://www.baidu.com")

				    def parse(self, request, response):
				        # stub callback: only prints the response
				        print(response)


				if __name__ == "__main__":
				    ${spider_name}().start()
			
--- a/FworkSpider/feapder/templates/batch_spider_template.tmpl
+++ b/FworkSpider/feapder/templates/batch_spider_template.tmpl
@@ -0,0 +1,45 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on {DATE}
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: {USER}
			
 
				+"""
			
 
				+
			
 
				+import feapder
			
 
				+
			
 
				+
			
 
				class ${spider_name}(feapder.BatchSpider):
				    # custom database settings; can be removed if the project has a setting.py file
				    __custom_setting__ = dict(
				        REDISDB_IP_PORTS="localhost:6379",
				        REDISDB_USER_PASS="",
				        REDISDB_DB=0,
				        MYSQL_IP="localhost",
				        MYSQL_PORT=3306,
				        MYSQL_DB="feapder",
				        MYSQL_USER_NAME="feapder",
				        MYSQL_USER_PASS="feapder123",
				    )

				    def start_requests(self, task):
				        # example request; ``task`` is not used by this stub
				        yield feapder.Request("https://www.baidu.com")

				    def parse(self, request, response):
				        # stub callback: only prints the response
				        print(response)


				if __name__ == "__main__":
				    spider = ${spider_name}(
				        redis_key="xxx:xxxx",  # root key in redis for tasks and related information
				        task_table="",  # task table in mysql
				        task_keys=["id", "xxx"],  # field names to read from the task table; several may be added
				        task_state="state",  # task state field in mysql
				        batch_record_table="xxx_batch_record",  # batch record table in mysql
				        batch_name="xxx(周全)",  # batch name
				        batch_interval=7,  # batch period in days; for hours write 1 / 24
				    )

				    # spider.start_monitor_task() # dispatch and monitor tasks
				    spider.start() # crawl
			
--- a/FworkSpider/feapder/templates/detail_template.tmpl
+++ b/FworkSpider/feapder/templates/detail_template.tmpl
@@ -0,0 +1,105 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on {DATE}
			
 
				+---------
			
 
				+@summary:  ${spider_name}
			
 
				+---------
			
 
				+@author: {USER}
			
 
				+"""
			
 
				+import sys
			
 
				+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
			
 
				+import time
			
 
				+from urllib.parse import urljoin
			
 
				+
			
 
				+import feapder
			
 
				+from feapder.utils.tools import wechat_warning
			
 
				+import execjs
			
 
				+from items.spider_item import DataBakItem, MgpListItem
			
 
				+from feapder.db.mongodb import MongoDB
			
 
				+
			
 
				+
			
 
				+
			
 
				+class ${spider_name}(feapder.Spider):
			
 
				+    _to_db = None
			
 
				+    db_name = 'mgp_list'
			
 
				+    send_list = []
			
 
				+    # 定义mongo链接
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = MongoDB()
			
 
				+        return self._to_db
			
 
				+
			
 
				+    def start_requests(self):
			
 
				+        while True:
			
 
				+            data_lsit = self.to_db.find(self.db_name,{"parser_name":"${spider_name}"},sort={"failed":-1},limit=50)
			
 
				+            for item in data_lsit:
			
 
				+                request_params = item.get("request_params")
			
 
				+
			
 
				+                '''可自定义'''
			
 
				+
			
 
				+                yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),
			
 
				+                                      deal_detail=item.get("deal_detail"),**request_params,
			
 
				+                                      callback=eval(item.get("parse")),base_info=item,proxies=item.get("proxies"))
			
 
				+                self.to_db.delete(self.db_name,item)
			
 
				+            break
			
 
				+
			
 
				+    def detail_get(self,request,response):
			
 
				+        '''需自定义解析规则'''
			
 
				+        items = request.item
			
 
				+        list_item = DataBakItem()
			
 
				+        for key in items:
			
 
				+            list_item.__setitem__(key,items[key])
			
 
				+        html = ''
			
 
				+        # for xpath in request.deal_detail:
			
 
				+        #    html = response.xpath(xpath).extract_first()
			
 
				+        #    if html is not None:
			
 
				+        #        break
			
 
				+
			
 
				+        list_item.contenthtml = html
			
 
				+        # if request.files:
			
 
				+        #     files_info = request.files
			
 
				+        #     files =  response.xpath(files_info.get("xpath")).extract()
			
 
				+        #     for file_url in files:
			
 
				+        #         if files_info.get("host"):
			
 
				+        #             file_url = urljoin(files_info.get("host"), file_url)
			
 
				+        #         if file_url.split(".")[-1] in files.get("other_files"):
			
 
				+        #             continue
			
 
				+        yield list_item
			
 
				+
			
 
				+
			
 
				+
			
 
				+    def failed_request(self, request, response):
			
 
				+        '''请求、解析次数超过上限后，将原信息重新保存至mongo，并修改failed字段'''
			
 
				+        mgp = MgpListItem()
			
 
				+        items = request.base_info
			
 
				+        for key in items:
			
 
				+            mgp.__setitem__(key,items[key])
			
 
				+        mgp.failed +=1
			
 
				+        print(f'......{mgp.failed}')
			
 
				+        if mgp.pri > 5:
			
 
				+            if mgp.failed in(10,30,50,100,200)or mgp.failed>200:
			
 
				+                if self.send_list.count(mgp.item.get("site")) == mgp.pri - 5:
			
 
				+                    '''
			
 
				+                    根据爬虫优先级报警'''
			
 
				+                    info= f'''`
			
 
				+        您的爬虫出现超<font color="#FF0000">{mgp.failed}</font>次请求、解析失败的任务。
			
 
				+        > **爬虫名称:** {mgp.item.get("site")}
			
 
				+        > **栏目名称:** {mgp.item.get("channel")}
			
 
				+        > **爬虫代码:** {mgp.item.get("spidercode")}
			
 
				+        > **爬虫等级:** {mgp.pri}
			
 
				+        > **所属管理人员:** {mgp.author}
			
 
				+        请登录剑鱼爬虫管理平台查看详情。
			
 
				+        `'''
			
 
				+                    wechat_warning(info)
			
 
				+                    self.send_list.append(mgp.item.get("site"))
			
 
				+        yield mgp
			
 
				+
			
 
				+
			
 
				+    def end_callback(self):
			
 
				+        print("爬虫结束")
			
 
				+
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    Details(redis_key="fwork:details1").start()
			
--- a/FworkSpider/feapder/templates/item_template.tmpl
+++ b/FworkSpider/feapder/templates/item_template.tmpl
@@ -0,0 +1,22 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on {DATE}
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: {USER}
			
 
				+"""
			
 
				+
			
 
				+from feapder import Item
			
 
				+
			
 
				+
			
 
				class ${item_name}Item(Item):
				    """
				    This class was generated by feapder.
				    command: feapder create -i ${table_name}.
				    """

				    __table_name__ = "${table_name}"

				    def __init__(self, *args, **kwargs):
				        # NOTE(review): ``${propertys}`` is a template placeholder, presumably
				        # expanded into the item's attribute assignments - confirm against
				        # the generator.
				        ${propertys}
			
--- a/FworkSpider/feapder/templates/project_template/CHECK_DATA.md
+++ b/FworkSpider/feapder/templates/project_template/CHECK_DATA.md
@@ -0,0 +1,49 @@
 
				+# 数据审核 
			
 
				+## 表说明：
			
 
				+
			
 
				+> 表名 含义（更新策略）
			
 
				+
			
 
				+## 一、准确性
			
 
				+
			
 
				+**字段设计是否满足需求？ 表之间的关联字段是否满足要求？ （需要人工检查）**
			
 
				+
			
 
				+> 注意：是否设计了自增 id，id 的类型是否设置为 bigint？
			
 
				+> 注意：unique index 是否需要设计？
			
 
				+> 注意：各张表之间是否需要设计关联字段；
			
 
				+
			
 
				+* [ ] 是
			
 
				+* [ ] 否
			
 
				+
			
 
				+**各字段采集内容及存储格式是否满足要求？是否与网页一致？是否有信息缺失？**
			
 
				+
			
 
				+> 备注：可尝试对每个字段进行升降序排列，然后抽样检查；
			
 
				+     
			
 
				+**是否考虑了网站同一类数据可能出现的数据格式不一致情况？**
			
 
				+
			
 
				+> 建议：代码对各个字段不做兼容性处理、数据不一致则抛出异常并记录 
			
 
				+
			
 
				+* [ ] 是
			
 
				+* [ ] 否
			
 
				+
			
 
				+## 二、全量性
			
 
				+
			
 
				+**如果是增量采集，是否最早信息和最晚信息都采集了，同时条目总数是否正确；**
			
 
				+**如果是批次采集，是否每个批次都有？**
			
 
				+
			
 
				+>备注：需要去网页端评估单个批次的总量；
			
 
				+>参考sql语句：SELECT count(1), batch_date from [table_name] GROUP BY batch_date;
			
 
				+
			
 
				+**如果与另外一张表有关联关系，是否信息关联完整？**
			
 
				+
			
 
				+## 三、稳定性
			
 
				+
			
 
				+* [ ] 是否能够长期稳定采集？ 
			
 
				+* [ ] 是否加IP代理？
			
 
				+* [ ] 是否支持断点续跑?
			
 
				+* [ ] 是否能确保按时启动，定期采集?
			
 
				+* [ ] 是否已开启报警？ 
			
 
				+
			
 
				+## 四、采集频次、类型、存储方式
			
 
				+
			
 
				+* [ ] 采集频次是否满足要求？
			
 
				+* [ ] 采集类型是否满足要求：增量采集 or 批次采集? 
			
--- a/FworkSpider/feapder/templates/project_template/README.md
+++ b/FworkSpider/feapder/templates/project_template/README.md
@@ -0,0 +1,8 @@
 
				+# xxx爬虫文档
			
 
				+## 调研
			
 
				+
			
 
				+## 数据库设计
			
 
				+
			
 
				+## 爬虫逻辑
			
 
				+
			
 
				+## 项目架构
			
--- a/FworkSpider/feapder/templates/project_template/items/__init__.py
+++ b/FworkSpider/feapder/templates/project_template/items/__init__.py
--- a/FworkSpider/feapder/templates/project_template/main.py
+++ b/FworkSpider/feapder/templates/project_template/main.py
@@ -0,0 +1,79 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on {DATE}
			
 
				+---------
			
 
				+@summary: 爬虫入口
			
 
				+---------
			
 
				+@author: {USER}
			
 
				+"""
			
 
				+
			
 
				+from feapder import ArgumentParser
			
 
				+
			
 
				+from spiders import *
			
 
				+
			
 
				def crawl_xxx():
				    """
				    AirSpider entry point (placeholder: rename ``xxx`` to the real spider).
				    """
				    spider = xxx.XXXSpider()
				    spider.start()
			
 
				+
			
 
				def crawl_xxx():
				    """
				    Spider entry point (placeholder: rename ``xxx`` to the real spider).
				    """
				    spider = xxx.XXXSpider(redis_key="xxx:xxx")
				    spider.start()
			
 
				+
			
 
				+
			
 
				def crawl_xxx(args):
				    """
				    BatchSpider entry point (placeholder: rename ``xxx`` to the real spider).

				    ``args`` selects the action: 1 starts the task monitor, 2 starts
				    crawling, 3 initializes the tasks; any other value does nothing.
				    """
				    spider = xxx_spider.XXXSpider(
				        task_table="",  # task table in mysql
				        batch_record_table="",  # batch record table in mysql
				        batch_name="xxx(周全)",  # batch name
				        batch_interval=7,  # batch period in days; for hours write 1 / 24
				        task_keys=["id", "xxx"],  # field names to read from the task table; several may be added
				        redis_key="xxx:xxxx",  # root key in redis for requests and related information
				        task_state="state",  # task state field in mysql
				    )

				    if args == 1:
				        spider.start_monitor_task()
				    elif args == 2:
				        spider.start()
				    elif args == 3:
				        spider.init_task()
			
 
				+
			
 
				+
			
 
				if __name__ == "__main__":
				    parser = ArgumentParser(description="xxx爬虫")

				    parser.add_argument(
				        "--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx
				    )
				    parser.add_argument(
				        "--crawl_xxx", action="store_true", help="xxx爬虫", function=crawl_xxx
				    )
				    parser.add_argument(
				        "--crawl_xxx",
				        type=int,
				        nargs=1,
				        help="xxx爬虫",
				        choices=[1, 2, 3],
				        function=crawl_xxx,
				    )

				    parser.start()

				    # main.py is the single entry point for starting spiders; it starts several
				    # spiders from the command line. With only one spider, main.py is optional.
				    # Replace the xxx above with the real spider names.
				    # Show the available commands: python main.py --help
				    # Running an AirSpider or Spider: python main.py --crawl_xxx
				    # Running a BatchSpider:
				    # 1. dispatch tasks: python main.py --crawl_xxx 1
				    # 2. crawl: python main.py --crawl_xxx 2
				    # 3. reset tasks: python main.py --crawl_xxx 3
			
 
				+
			
--- a/FworkSpider/feapder/templates/project_template/setting.py
+++ b/FworkSpider/feapder/templates/project_template/setting.py
@@ -0,0 +1,137 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""爬虫配置文件"""
			
 
				+# import os
			
 
				+# import sys
			
 
				+#
			
 
				+# # MYSQL
			
 
				+# MYSQL_IP = "localhost"
			
 
				+# MYSQL_PORT = 3306
			
 
				+# MYSQL_DB = ""
			
 
				+# MYSQL_USER_NAME = ""
			
 
				+# MYSQL_USER_PASS = ""
			
 
				+#
			
 
				+# # MONGODB
			
 
				+# MONGO_IP = "localhost"
			
 
				+# MONGO_PORT = 27017
			
 
				+# MONGO_DB = ""
			
 
				+# MONGO_USER_NAME = ""
			
 
				+# MONGO_USER_PASS = ""
			
 
				+#
			
 
				+# # REDIS
			
 
				+# # ip:port 多个可写为列表或者逗号隔开 如 ip1:port1,ip2:port2 或 ["ip1:port1", "ip2:port2"]
			
 
				+# REDISDB_IP_PORTS = "localhost:6379"
			
 
				+# REDISDB_USER_PASS = ""
			
 
				+# REDISDB_DB = 0
			
 
				+# # 适用于redis哨兵模式
			
 
				+# REDISDB_SERVICE_NAME = ""
			
 
				+#
			
 
				+# # 数据入库的pipeline，可自定义，默认MysqlPipeline
			
 
				+# ITEM_PIPELINES = [
			
 
				+#     "feapder.pipelines.mysql_pipeline.MysqlPipeline",
			
 
				+#     # "feapder.pipelines.mongo_pipeline.MongoPipeline",
			
 
				+# ]
			
 
				+# EXPORT_DATA_MAX_FAILED_TIMES = 10 # 导出数据时最大的失败次数，包括保存和更新，超过这个次数报警
			
 
				+# EXPORT_DATA_MAX_RETRY_TIMES = 10 # 导出数据时最大的重试次数，包括保存和更新，超过这个次数则放弃重试
			
 
				+#
			
 
				+# # 爬虫相关
			
 
				+# # COLLECTOR
			
 
				+# COLLECTOR_SLEEP_TIME = 1  # 从任务队列中获取任务到内存队列的间隔
			
 
				+# COLLECTOR_TASK_COUNT = 10  # 每次获取任务数量
			
 
				+#
			
 
				+# # SPIDER
			
 
				+# SPIDER_THREAD_COUNT = 1  # 爬虫并发数
			
 
				+# SPIDER_SLEEP_TIME = 0  # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数，包含2和5
			
 
				+# SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
			
 
				+# SPIDER_MAX_RETRY_TIMES = 100  # 每个请求最大重试次数
			
 
				+# KEEP_ALIVE = False  # 爬虫是否常驻
			
 
				+#
			
 
				+# # 浏览器渲染
			
 
				+# WEBDRIVER = dict(
			
 
				+#     pool_size=1,  # 浏览器的数量
			
 
				+#     load_images=True,  # 是否加载图片
			
 
				+#     user_agent=None,  # 字符串 或 无参函数，返回值为user_agent
			
 
				+#     proxy=None,  # xxx.xxx.xxx.xxx:xxxx 或 无参函数，返回值为代理地址
			
 
				+#     headless=False,  # 是否为无头浏览器
			
 
				+#     driver_type="CHROME",  # CHROME、PHANTOMJS、FIREFOX
			
 
				+#     timeout=30,  # 请求超时时间
			
 
				+#     window_size=(1024, 800),  # 窗口大小
			
 
				+#     executable_path=None,  # 浏览器路径，默认为默认路径
			
 
				+#     render_time=0,  # 渲染时长，即打开网页等待指定时间后再获取源码
			
 
				+#     custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
			
 
				+# )
			
 
				+#
			
 
				+# # 爬虫启动时，重新抓取失败的requests
			
 
				+# RETRY_FAILED_REQUESTS = False
			
 
				+# # 保存失败的request
			
 
				+# SAVE_FAILED_REQUEST = True
			
 
				+# # request防丢机制。（指定的REQUEST_LOST_TIMEOUT时间内request还没做完，会重新下发 重做）
			
 
				+# REQUEST_LOST_TIMEOUT = 600  # 10分钟
			
 
				+# # request网络请求超时时间
			
 
				+# REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间，浮点数，或(connect timeout, read timeout)元组
			
 
				+#
			
 
				+# # 下载缓存 利用redis缓存，但由于内存大小限制，所以建议仅供开发调试代码时使用，防止每次debug都需要网络请求
			
 
				+# RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据，建议设置为True
			
 
				+# RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
			
 
				+# RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
			
 
				+#
			
 
				+# # 设置代理
			
 
				+# PROXY_EXTRACT_API = None  # 代理提取API ，返回的代理分割符为\r\n
			
 
				+# PROXY_ENABLE = True
			
 
				+#
			
 
				+# # 随机headers
			
 
				+# RANDOM_HEADERS = True
			
 
				+# # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari'，'mobile' 若不指定则随机类型
			
 
				+# USER_AGENT_TYPE = "chrome"
			
 
				+# # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
			
 
				+# DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
			
 
				+# # requests 使用session
			
 
				+# USE_SESSION = False
			
 
				+#
			
 
				+# # 去重
			
 
				+# ITEM_FILTER_ENABLE = False  # item 去重
			
 
				+# REQUEST_FILTER_ENABLE = False  # request 去重
			
 
				+# ITEM_FILTER_SETTING = dict(
			
 
				+#     filter_type=1  # 永久去重（BloomFilter） = 1 、内存去重（MemoryFilter） = 2、 临时去重（ExpireFilter）= 3
			
 
				+# )
			
 
				+# REQUEST_FILTER_ENABLE = False  # request 去重
			
 
				+# REQUEST_FILTER_SETTING = dict(
			
 
				+#     filter_type=3,  # 永久去重（BloomFilter） = 1 、内存去重（MemoryFilter） = 2、 临时去重（ExpireFilter）= 3
			
 
				+#     expire_time=2592000,  # 过期时间1个月
			
 
				+# )
			
 
				+#
			
 
				+# # 报警 支持钉钉、企业微信、邮件
			
 
				+# # 钉钉报警
			
 
				+# DINGDING_WARNING_URL = ""  # 钉钉机器人api
			
 
				+# DINGDING_WARNING_PHONE = ""  # 报警人 支持列表，可指定多个
			
 
				+# DINGDING_WARNING_ALL = False # 是否提示所有人， 默认为False
			
 
				+# # 邮件报警
			
 
				+# EMAIL_SENDER = ""  # 发件人
			
 
				+# EMAIL_PASSWORD = ""  # 授权码
			
 
				+# EMAIL_RECEIVER = ""  # 收件人 支持列表，可指定多个
			
 
				+# EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱
			
 
				+# # 企业微信报警
			
 
				+# WECHAT_WARNING_URL = ""  # 企业微信机器人api
			
 
				+# WECHAT_WARNING_PHONE = ""  # 报警人 将会在群内@此人, 支持列表，可指定多人
			
 
				+# WECHAT_WARNING_ALL = False  # 是否提示所有人， 默认为False
			
 
				+# # 时间间隔
			
 
				+# WARNING_INTERVAL = 3600  # 相同报警的报警时间间隔，防止刷屏; 0表示不去重
			
 
				+# WARNING_LEVEL = "DEBUG"  # 报警级别， DEBUG / ERROR
			
 
				+# WARNING_FAILED_COUNT = 1000  # 任务失败数 超过WARNING_FAILED_COUNT则报警
			
 
				+#
			
 
				+# LOG_NAME = os.path.basename(os.getcwd())
			
 
				+# LOG_PATH = "log/%s.log" % LOG_NAME  # log存储路径
			
 
				+# LOG_LEVEL = "DEBUG"
			
 
				+# LOG_COLOR = True  # 是否带有颜色
			
 
				+# LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
			
 
				+# LOG_IS_WRITE_TO_FILE = False  # 是否写文件
			
 
				+# LOG_MODE = "w"  # 写文件的模式
			
 
				+# LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
			
 
				+# LOG_BACKUP_COUNT = 20  # 日志文件保留数量
			
 
				+# LOG_ENCODING = "utf8"  # 日志文件编码
			
 
				+# OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
			
 
				+#
			
 
				+# # 切换工作路径为当前项目路径
			
 
				+# project_path = os.path.abspath(os.path.dirname(__file__))
			
 
				+# os.chdir(project_path)  # 切换工作路经
			
 
				+# sys.path.insert(0, project_path)
			
 
				+# print('当前工作路径为 ' + os.getcwd())
			
--- a/FworkSpider/feapder/templates/project_template/spiders/__init__.py
+++ b/FworkSpider/feapder/templates/project_template/spiders/__init__.py
--- a/FworkSpider/feapder/templates/spider_list_template.tmpl
+++ b/FworkSpider/feapder/templates/spider_list_template.tmpl
@@ -0,0 +1,88 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on {DATE}
			
 
				+---------
			
 
				+@summary: ${spider_name}
			
 
				+---------
			
 
				+@author: {USER}
			
 
				+"""
			
 
				+import sys
			
 
				+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
			
 
				+import feapder
			
 
				+from items.spider_item import DataBakItem,MgpListItem,ListItem
			
 
				+from feapder.dedup import Dedup
			
 
				+from collections import namedtuple
			
 
				+
			
 
				+
			
 
				+class ${spider_name}(feapder.Spider):
			
 
				+
			
 
				+    def start_callback(self):
			
 
				+         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])
			
 
				+
			
 
				+         self.menus = [
			
 
				+             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "自定义参数", 1),
			
 
				+             Menu('${spider_name}抓取栏目', '${spider_name}爬虫code', "Notice", 1),
			
 
				+         ]
			
 
				+    def start_requests(self):
			
 
				+         for menu in self.menus:
			
 
				+             for page in range(1,menu.crawl_page+1):
			
 
				+                 start_url = f''
			
 
				+                 yield feapder.Request(url=start_url, item=menu._asdict(),proxies=False)
			
 
				+
			
 
				+    def parse(self, request, response):
			
 
				+        menu = request.item
			
 
				+        dedup = Dedup(Dedup.BloomFilter)
			
 
				+        href_list = []
			
 
				+        info_list = []
			
 
				+        for info in info_list:
			
 
				+            href = ''
			
 
				+            title = ''
			
 
				+            create_time = ''
			
 
				+
			
 
				+            data_item = DataBakItem()  # 存储数据的管道
			
 
				+            data_item.href = href  # 标书链接
			
 
				+            data_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 （编辑器定的）
			
 
				+            data_item.spidercode = menu.get("code")  # 最上方定义的爬虫code（编辑器定的）
			
 
				+            data_item.title = title  # 标题
			
 
				+            data_item.publishtime = create_time  # 标书发布时间
			
 
				+            data_item.site = "*******记得编辑平台名称"
			
 
				+            data_item.area = "全国"  # 城市默认:全国
			
 
				+            data_item.city = ""  # 城市 默认为空
			
 
				+            ss = dedup.filter_exist_data([href])
			
 
				+            if ss == []:
			
 
				+                continue
			
 
				+            list_item =  MgpListItem()
			
 
				+            list_item.parse = "self.detail_get"
			
 
				+            list_item.parser_name = "details"
			
 
				+            list_item.item = data_item.to_dict
			
 
				+            list_item.deal_detail = ['//div[@class="****"]',"*****"]
			
 
				+            list_item.proxies = False
			
 
				+            list_item.parse_url = href
			
 
				+            list_item.pri = 1
			
 
				+            list.files={
			
 
				+                "list_xpath":'//div[@class="notice-foot"]/a',
			
 
				+                "url_xpath":'./@href',
			
 
				+                "name_xpath":'./text()',
			
 
				+                "files_type":('zip','doxc','ftp'),
			
 
				+                "file_type":'zip',
			
 
				+                "url_key":'attachmentDownload',
			
 
				+                # "host":'http',
			
 
				+                "kwargs":{"headers": {
			
 
				+                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
			
 
				+                }}
			
 
				+            href_list.append(href)
			
 
				+            yield list_item
			
 
				+        list = ListItem()
			
 
				+        list.site = self.site
			
 
				+        list.channel = menu.get("channel")
			
 
				+        list.spidercode = menu.get("code")
			
 
				+        list.url = request.url
			
 
				+        list.count = len(info_list)
			
 
				+        list.rel_count = len(href_list)
			
 
				+        dedup.add(href_list)
			
 
				+
			
 
				+    def end_callback(self):
			
 
				+        print("爬虫结束")
			
 
				+
			
 
				if __name__ == "__main__":
				    # ``{USER}`` and ``${spider_name}`` are template placeholders
				    ${spider_name}(redis_key="{USER}:${spider_name}").start()
			
--- a/FworkSpider/feapder/templates/spider_template.tmpl
+++ b/FworkSpider/feapder/templates/spider_template.tmpl
@@ -0,0 +1,67 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on {DATE}
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: {USER}
			
 
				+"""
			
 
				+
			
 
				+import feapder
			
 
				+from items.spider_item import DataBakItem
			
 
				+from untils.proxy_pool import ProxyPool
			
 
				+from feapder.dedup import Dedup
			
 
				+from collections import namedtuple
			
 
				+
			
 
				+
			
 
class ${spider_name}(feapder.Spider):
    # Custom database settings; if the project has a setting.py file, this
    # customisation can be deleted.
    # NOTE(review): `${spider_name}` is a template placeholder, and `parse`
    # below uses names (info_list, href, title, create_time) that this template
    # never defines — presumably the author of each generated spider fills
    # them in.
    def start_callback(self):
         """Initialise the request counter, the proxy pool and the menu list."""
         self.count = 0
         self.prox_pool = ProxyPool()
         Menu = namedtuple('Menu', ['channel', 'code', 'types', 'crawl_page'])

         # Placeholder entries: both rows are identical until edited.
         self.menus = [
             Menu('${spider_name}', '${spider_name}', "Notice", 1),
             Menu('${spider_name}', '${spider_name}', "Notice", 1),
         ]
    def start_requests(self):
         """Yield one request per menu, carrying the menu as a dict in `item`."""
         for menu in self.menus:
            # Empty placeholder; the list-page URL has to be filled in.
            start_url = f''
            yield feapder.Request(url=start_url, item=menu._asdict())

    def parse(self, request, response):
        """Build a DataBakItem per list entry and yield a detail request for it."""
        menu = request.item
        self.count += 1   # a counter
        for info in info_list:
            list_item = DataBakItem()  # item that carries the stored data
            list_item.href = href  # link to the tender document
            list_item.channel = menu.get("channel")  # crawl channel defined at the top (assigned by the editor)
            list_item.spidercode = menu.get("code")  # spider code defined at the top (assigned by the editor)
            list_item.title = title  # title
            list_item.publishtime = create_time  # publish time of the tender

            list_item.site = "#######记得编辑平台名称"
            list_item.area = "全国"  # area, default: nationwide
            list_item.city = ""  # city, empty by default
            # NOTE(review): a new Dedup is constructed for every entry.
            dedup = Dedup(Dedup.BloomFilter)
            ss = dedup.filter_exist_data([href])
            # An empty result skips the entry — presumably it means the href
            # was already seen; confirm against Dedup.filter_exist_data.
            if ss == []:
                continue
            yield feapder.Request(href, callback=self.detail, item=list_item)
    def detail(self,request,response):
        """Attach the extracted detail HTML to the item and yield it."""
        list_item = request.item
        html = response.xpath("//div[@class='']").extract_first()  # detail content of the tender
        list_item.contenthtml = html
        yield list_item

    def end_callback(self):
        """Print a fixed "spider finished" message (the Chinese literal below)."""
        print("爬虫结束")
        # wechat_warning(f"爬虫名称 爬虫结束\n共抓取{self.count}次详情页数据")

    def download_midware(self, request):
        """Set `request.proxies` from the proxy pool and return the request."""
        request.proxies = self.prox_pool.get()
        return request
			
 
				+
			
 
# Script entry point: build the spider with a redis key and start it.
# `${spider_name}` and `{USER}` are template placeholders — presumably filled
# in when a spider file is generated from this template (confirm in generator).
if __name__ == "__main__":
    ${spider_name}(redis_key="{USER}:${spider_name}").start()
			
--- a/FworkSpider/feapder/utils/__init__.py
+++ b/FworkSpider/feapder/utils/__init__.py
@@ -0,0 +1,9 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+'''
			
 
				+Created on 2019/11/5 4:41 PM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+'''
			
--- a/FworkSpider/feapder/utils/aliyun.py
+++ b/FworkSpider/feapder/utils/aliyun.py
@@ -0,0 +1,168 @@
 
				+import hashlib
			
 
				+import os
			
 
				+import traceback
			
 
				+import oss2
			
 
				+import requests
			
 
				+from feapder import setting
			
 
				+import time
			
 
				+
			
 
				+class UploadOSS:
			
 
				+    """阿里云 oss"""
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        oss_conf = setting.oss_
			
 
				+        self.file_path: str = ""
			
 
				+        self.file_stream: bytes = b''
			
 
				+        self.__acc_key_id = oss_conf['key_id']
			
 
				+        self.__acc_key_secret = oss_conf['key_secret']
			
 
				+        self.__endpoint = oss_conf['endpoint']
			
 
				+        self.__bucket_name = oss_conf['bucket_name']
			
 
				+
			
 
				+    @property
			
 
				+    def fid(self):
			
 
				+        """
			
 
				+        文本摘要值
			
 
				+
			
 
				+        @return: 十六进制摘要值
			
 
				+        """
			
 
				+        sha1 = hashlib.sha1()
			
 
				+        sha1.update(str(self.file_stream).encode("utf-8"))
			
 
				+        return sha1.hexdigest()
			
 
				+
			
 
				+    @property
			
 
				+    def file_size(self):
			
 
				+        """
			
 
				+        文件的大小,将字节(bytes)转化(kb/M/G单位)
			
 
				+
			
 
				+        @return: 文件大小
			
 
				+        """
			
 
				+        try:
			
 
				+            size = os.path.getsize(self.file_path)
			
 
				+        except Exception:
			
 
				+            traceback.print_exc()
			
 
				+        else:
			
 
				+            try:
			
 
				+                _kb = float(size) / 1024
			
 
				+            except:
			
 
				+                return "Error"
			
 
				+            else:
			
 
				+                if _kb >= 1024:
			
 
				+                    _M = _kb / 1024
			
 
				+                    if _M >= 1024:
			
 
				+                        _G = _M / 1024
			
 
				+                        return "{:.1f} G".format(_G)
			
 
				+                    else:
			
 
				+                        return "{:.1f} M".format(_M)
			
 
				+                else:
			
 
				+                    return "{:.1f} kb".format(_kb)
			
 
				+
			
 
				+    def get_state(self, attachment,count=0, **kwargs):
			
 
				+        """
			
 
				+        下载附件并上传阿里oss
			
 
				+
			
 
				+        @param attachment: 附件
			
 
				+        @return: 附件处理结果
			
 
				+        """
			
 
				+        request_params = {
			
 
				+            'headers': setting.headers,
			
 
				+            'timeout': 20,
			
 
				+            'stream': True,
			
 
				+            **kwargs
			
 
				+        }
			
 
				+        with requests.get(attachment["org_url"], **request_params) as req:
			
 
				+            if req.status_code == 200:
			
 
				+                self.file_stream = req.content
			
 
				+                # img_dir = "file"
			
 
				+                img_dir = f"file/{attachment['channel']}"
			
 
				+                # 文件夹不存在则创建文件夹
			
 
				+                if not os.path.exists(img_dir):
			
 
				+                    os.makedirs(img_dir, mode=0o777, exist_ok=True)
			
 
				+                # 打开目录,放入下载的附件
			
 
				+                filname = hashlib.md5(attachment["filename"].encode("utf-8"))
			
 
				+                filname = filname.hexdigest() #加密1次
			
 
				+                types = attachment["ftype"]
			
 
				+                self.file_path = "{}/{}".format(img_dir, filname+'.'+types)
			
 
				+                with open(self.file_path, 'wb') as f:
			
 
				+                    f.write(self.file_stream)
			
 
				+                # 上传附件
			
 
				+                self.put_oss_from_local()
			
 
				+                file_state = self.file_state(attachment)
			
 
				+                # 删除附件
			
 
				+                os.remove(self.file_path)
			
 
				+                # 返回附件上传处理信息
			
 
				+                return file_state
			
 
				+            else:
			
 
				+                if count<3:
			
 
				+                    self.post_state(attachment,count=count+1, **kwargs)
			
 
				+                else:
			
 
				+                    # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
			
 
				+                    attachment["url"] = 'oss'
			
 
				+                    attachment["fid"] = self.fid + "." + attachment["ftype"]
			
 
				+                    attachment["size"] = '0kb'
			
 
				+                    attachment["false"] = True
			
 
				+                    return attachment
			
 
				+    def post_state(self, attachment,count=0, **kwargs):
			
 
				+        """
			
 
				+        下载附件并上传阿里oss
			
 
				+
			
 
				+        @param attachment: 附件
			
 
				+        @return: 附件处理结果
			
 
				+        """
			
 
				+        request_params = {
			
 
				+            'headers': setting.headers,
			
 
				+            'timeout': 20,
			
 
				+            'stream': True,
			
 
				+            **kwargs
			
 
				+        }
			
 
				+        with requests.post(attachment["org_url"], **request_params) as req:
			
 
				+            if req.status_code == 200:
			
 
				+                self.file_stream = req.content
			
 
				+                img_dir = f"file/{attachment['channel']}"
			
 
				+                # 文件夹不存在则创建文件夹
			
 
				+                if not os.path.exists(img_dir):
			
 
				+                    os.makedirs(img_dir, mode=0o777, exist_ok=True)
			
 
				+                # 打开目录,放入下载的附件
			
 
				+                filname = hashlib.md5(attachment["filename"].encode("utf-8"))
			
 
				+                filname = filname.hexdigest()  # 加密1次
			
 
				+                types = attachment["ftype"]
			
 
				+                self.file_path = "{}/{}".format(img_dir, filname + '.' + types)
			
 
				+
			
 
				+                with open(self.file_path, 'wb') as f:
			
 
				+                    f.write(self.file_stream)
			
 
				+                # 上传附件
			
 
				+                self.put_oss_from_local()
			
 
				+                file_state = self.file_state(attachment)
			
 
				+                # 删除附件
			
 
				+                # os.remove(self.file_path)
			
 
				+                # 返回附件上传处理信息
			
 
				+                return file_state
			
 
				+            else:
			
 
				+                if count<3:
			
 
				+                    self.post_state(attachment,count=count+1, **kwargs)
			
 
				+                else:
			
 
				+                    attachment["url"] = 'oss'
			
 
				+                    attachment["fid"] = self.fid + "." + attachment["ftype"]
			
 
				+                    attachment["size"] = '0kb'
			
 
				+                    attachment["false"] = True
			
 
				+                    return attachment
			
 
				+
			
 
				+    def put_oss_from_local(self):
			
 
				+        """上传一个本地文件到阿里OSS的普通文件"""
			
 
				+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
			
 
				+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
			
 
				+        bucket.put_object_from_file(self.fid, self.file_path)
			
 
				+
			
 
				+    def file_state(self, attachment):
			
 
				+        """
			
 
				+        文件信息
			
 
				+
			
 
				+        @param attachment: 附件
			
 
				+        @return: 附件上传处理信息
			
 
				+        """
			
 
				+        # attachment["ftype"] = str(attachment["filename"]).split(".")[1]
			
 
				+        attachment["url"] = 'oss'
			
 
				+        attachment["fid"] = self.fid + "." + attachment["ftype"]
			
 
				+        attachment["size"] = self.file_size
			
 
				+        return attachment
			
 
				+
			
 
				+
			
--- a/FworkSpider/feapder/utils/custom_argparse.py
+++ b/FworkSpider/feapder/utils/custom_argparse.py
@@ -0,0 +1,63 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-10-15 14:32:12
			
 
				+---------
			
 
				+@summary: 封装ArgumentParser， 使其支持function， 调用start自动执行
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import argparse
			
 
				+
			
 
				+
			
 
				+class ArgumentParser(argparse.ArgumentParser):
			
 
				+    def __init__(self, *args, **kwargs):
			
 
				+        self.functions = {}
			
 
				+
			
 
				+        super(ArgumentParser, self).__init__(*args, **kwargs)
			
 
				+
			
 
				+    def add_argument(self, *args, **kwargs):
			
 
				+        function = kwargs.pop("function") if "function" in kwargs else None
			
 
				+        key = self._get_optional_kwargs(*args, **kwargs).get("dest")
			
 
				+        self.functions[key] = function
			
 
				+
			
 
				+        return super(ArgumentParser, self).add_argument(*args, **kwargs)
			
 
				+
			
 
				+    def start(self, args=None, namespace=None):
			
 
				+        args = self.parse_args(args=args, namespace=namespace)
			
 
				+        for key, value in vars(args).items():  # vars() 函数返回对象object的属性和属性值的字典对象
			
 
				+            if value not in (None, False):
			
 
				+                if callable(self.functions[key]):
			
 
				+                    if value != True:
			
 
				+                        if isinstance(value, list) and len(value) == 1:
			
 
				+                            value = value[0]
			
 
				+                        self.functions[key](value)
			
 
				+                    else:
			
 
				+                        self.functions[key]()
			
 
				+
			
 
				+    def run(self, args, values=None):
			
 
				+        if args in self.functions:
			
 
				+            if values:
			
 
				+                self.functions[args](values)
			
 
				+            else:
			
 
				+                self.functions[args]()
			
 
				+
			
 
				+        else:
			
 
				+            raise Exception(f"无此方法: {args}")
			
 
				+
			
 
				+
			
 
# Manual demo: run this module with --test and/or --test2 N to see the
# callbacks registered through `function=` being invoked by parser.start().
if __name__ == "__main__":

    def test():
        # Callback for a flag argument: called without arguments.
        print("test not args func")

    def test2(args):
        # Callback for an argument that carries a value.
        print("test args func", args)

    parser = ArgumentParser(description="测试")

    parser.add_argument("--test2", type=int, nargs=1, help="(1|2）", function=test2)
    parser.add_argument("--test", action="store_true", help="", function=test)

    parser.start()
			
--- a/FworkSpider/feapder/utils/email_sender.py
+++ b/FworkSpider/feapder/utils/email_sender.py
@@ -0,0 +1,93 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2020/2/19 12:57 PM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+import smtplib
			
 
				+from email.header import Header
			
 
				+from email.mime.multipart import MIMEMultipart
			
 
				+from email.mime.text import MIMEText
			
 
				+from email.utils import formataddr
			
 
				+
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				+class EmailSender(object):
			
 
				+    SENDER = "feapder报警系统"
			
 
				+
			
 
				+    def __init__(self, username, password, smtpserver="smtp.163.com"):
			
 
				+        self.username = username
			
 
				+        self.password = password
			
 
				+        self.smtpserver = smtpserver
			
 
				+        self.smtp_client = smtplib.SMTP_SSL(smtpserver)
			
 
				+        self.sender = EmailSender.SENDER
			
 
				+
			
 
				+    def __enter__(self):
			
 
				+        self.login()
			
 
				+        return self
			
 
				+
			
 
				+    def __exit__(self, exc_type, exc_val, exc_tb):
			
 
				+        self.quit()
			
 
				+
			
 
				+    def quit(self):
			
 
				+        self.smtp_client.quit()
			
 
				+
			
 
				+    def login(self):
			
 
				+        self.smtp_client.connect(self.smtpserver)
			
 
				+        self.smtp_client.login(self.username, self.password)
			
 
				+
			
 
				+    def send(
			
 
				+        self,
			
 
				+        receivers: list,
			
 
				+        title: str,
			
 
				+        content: str,
			
 
				+        content_type: str = "plain",
			
 
				+        filepath: str = None,
			
 
				+    ):
			
 
				+        """
			
 
				+
			
 
				+        Args:
			
 
				+            receivers:
			
 
				+            title:
			
 
				+            content:
			
 
				+            content_type: html / plain
			
 
				+            filepath:
			
 
				+
			
 
				+        Returns:
			
 
				+
			
 
				+        """
			
 
				+        # 创建一个带附件的实例
			
 
				+        message = MIMEMultipart()
			
 
				+        message["From"] = formataddr(
			
 
				+            (self.sender, self.username)
			
 
				+        )  # 括号里的对应发件人邮箱昵称、发件人邮箱账号
			
 
				+        message["To"] = formataddr((receivers[0], receivers[0]))  # ",".join(receivers)
			
 
				+
			
 
				+        message["Subject"] = Header(title, "utf-8")
			
 
				+
			
 
				+        content = MIMEText(content, content_type, "utf-8")
			
 
				+        message.attach(content)
			
 
				+
			
 
				+        # 构造附件
			
 
				+        if filepath:
			
 
				+            attach = MIMEText(open(filepath, "rb").read(), "base64", "utf-8")
			
 
				+            attach.add_header(
			
 
				+                "content-disposition",
			
 
				+                "attachment",
			
 
				+                filename=("utf-8", "", os.path.basename(filepath)),
			
 
				+            )
			
 
				+            message.attach(attach)
			
 
				+
			
 
				+        msg = message.as_string()
			
 
				+        # 此处直接发送多个邮箱有问题，改成一个个发送
			
 
				+        for receiver in receivers:
			
 
				+            log.debug("发送邮件到 {}".format(receiver))
			
 
				+            self.smtp_client.sendmail(self.username, receiver, msg)
			
 
				+        log.debug("邮件发送成功！！！")
			
 
				+        return True
			
--- a/FworkSpider/feapder/utils/js/stealth.min.js
+++ b/FworkSpider/feapder/utils/js/stealth.min.js
--- a/FworkSpider/feapder/utils/log.py
+++ b/FworkSpider/feapder/utils/log.py
@@ -0,0 +1,265 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-12-08 16:50
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+import logging
			
 
				+import os
			
 
				+import sys
			
 
				+import time
			
 
				+from logging.handlers import BaseRotatingHandler
			
 
				+
			
 
				+import loguru
			
 
				+import pymongo
			
 
				+from better_exceptions import format_exception
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+
			
 
# Format string handed to logging.Formatter in get_logger.
LOG_FORMAT = "%(threadName)s|%(asctime)s|%(filename)s|%(funcName)s|line:%(lineno)d|%(levelname)s| %(message)s"
# When truthy, get_logger replaces the formatter's formatException with
# better_exceptions.format_exception.
PRINT_EXCEPTION_DETAILS = True
			
 
				+
			
 
				+
			
 
				+class InterceptHandler(logging.Handler):
			
 
				+    def emit(self, record):
			
 
				+        # Retrieve context where the logging call occurred, this happens to be in the 6th frame upward
			
 
				+        logger_opt = loguru.logger.opt(depth=6, exception=record.exc_info)
			
 
				+        logger_opt.log(record.levelname, record.getMessage())
			
 
				+
			
 
				+
			
 
				+# 重写 RotatingFileHandler 自定义log的文件名
			
 
				+# 原来 xxx.log xxx.log.1 xxx.log.2 xxx.log.3 文件由近及远
			
 
				+# 现在 xxx.log xxx1.log xxx2.log  如果backup_count 是2位数时  则 01  02  03 三位数 001 002 .. 文件由近及远
			
 
				+class RotatingFileHandler(BaseRotatingHandler):
			
 
				+    def __init__(
			
 
				+        self, filename, mode="a", max_bytes=0, backup_count=0, encoding=None, delay=0
			
 
				+    ):
			
 
				+        BaseRotatingHandler.__init__(self, filename, mode, encoding, delay)
			
 
				+        self.max_bytes = max_bytes
			
 
				+        self.backup_count = backup_count
			
 
				+        self.placeholder = str(len(str(backup_count)))
			
 
				+        self._to_db = None
			
 
				+        self.filename = filename
			
 
				+
			
 
				+
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = pymongo.MongoClient(setting.MONGO_IP, setting.MONGO_PORT)
			
 
				+
			
 
				+        return self._to_db.pyspider
			
 
				+
			
 
				+
			
 
				+    def shouldRollover(self, record):
			
 
				+        parmars = {
			
 
				+            "spider_name":record.name,
			
 
				+            "msg":record.msg,
			
 
				+            "Message":str(record.getMessage)
			
 
				+        }
			
 
				+        if record.levelname == "ERROR":
			
 
				+            crawl_type = 'list'
			
 
				+            if 'detail' in record.name:
			
 
				+                crawl_type = 'detail'
			
 
				+            url = ''
			
 
				+            item={
			
 
				+                "recordname":record.name,
			
 
				+                "spidercode":"spidercode",
			
 
				+                "author":self.filename,
			
 
				+                "account":"",
			
 
				+                "crawl_time":time.time(),
			
 
				+                "crawl_type": crawl_type,
			
 
				+                "status_code":"status_code",
			
 
				+                "url":url,
			
 
				+                "reason":record.msg,
			
 
				+                'parmars': parmars,
			
 
				+            }
			
 
				+
			
 
				+            # print('<<<<<<<<<<<<<<<<<<<<<<<插入error_info')
			
 
				+            # print(item)
			
 
				+            # print(self.to_db.error_info)
			
 
				+            # self.to_db.error_info.insert_one(item)
			
 
				+
			
 
				+
			
 
				+
			
 
				+def get_logger(
			
 
				+    name=None,
			
 
				+    path=None,
			
 
				+    log_level=None,
			
 
				+    is_write_to_console=None,
			
 
				+    is_write_to_file=None,
			
 
				+    color=None,
			
 
				+    mode=None,
			
 
				+    max_bytes=None,
			
 
				+    backup_count=None,
			
 
				+    encoding=None,
			
 
				+):
			
 
				+    """
			
 
				+    @summary: 获取log
			
 
				+    ---------
			
 
				+    @param name: log名
			
 
				+    @param path: log文件存储路径 如 D://xxx.log
			
 
				+    @param log_level: log等级 CRITICAL/ERROR/WARNING/INFO/DEBUG
			
 
				+    @param is_write_to_console: 是否输出到控制台
			
 
				+    @param is_write_to_file: 是否写入到文件 默认否
			
 
				+    @param color：是否有颜色
			
 
				+    @param mode：写文件模式
			
 
				+    @param max_bytes： 每个日志文件的最大字节数
			
 
				+    @param backup_count：日志文件保留数量
			
 
				+    @param encoding：日志文件编码
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+    # 加载setting里最新的值
			
 
				+    name = name or setting.LOG_NAME
			
 
				+    path = path or setting.LOG_PATH
			
 
				+    log_level = log_level or setting.LOG_LEVEL
			
 
				+    is_write_to_console = (
			
 
				+        is_write_to_console
			
 
				+        if is_write_to_console is not None
			
 
				+        else setting.LOG_IS_WRITE_TO_CONSOLE
			
 
				+    )
			
 
				+    is_write_to_file = (
			
 
				+        is_write_to_file
			
 
				+        if is_write_to_file is not None
			
 
				+        else setting.LOG_IS_WRITE_TO_FILE
			
 
				+    )
			
 
				+    color = color if color is not None else setting.LOG_COLOR
			
 
				+    mode = mode or setting.LOG_MODE
			
 
				+    max_bytes = max_bytes or setting.LOG_MAX_BYTES
			
 
				+    backup_count = backup_count or setting.LOG_BACKUP_COUNT
			
 
				+    encoding = encoding or setting.LOG_ENCODING
			
 
				+
			
 
				+    # logger 配置
			
 
				+    name = name.split(os.sep)[-1].split(".")[0]  # 取文件名
			
 
				+
			
 
				+    logger = logging.getLogger(name)
			
 
				+    logger.setLevel(log_level)
			
 
				+
			
 
				+    formatter = logging.Formatter(LOG_FORMAT)
			
 
				+    if PRINT_EXCEPTION_DETAILS:
			
 
				+        formatter.formatException = lambda exc_info: format_exception(*exc_info)
			
 
				+
			
 
				+    # 定义一个RotatingFileHandler，最多备份5个日志文件，每个日志文件最大10M
			
 
				+    if is_write_to_file:
			
 
				+        # if path and not os.path.exists(os.path.dirname(path)):
			
 
				+        #     os.makedirs(os.path.dirname(path))
			
 
				+
			
 
				+        rf_handler = RotatingFileHandler(
			
 
				+            path,
			
 
				+            mode=mode,
			
 
				+            max_bytes=max_bytes,
			
 
				+            backup_count=backup_count,
			
 
				+            encoding=encoding,
			
 
				+        )
			
 
				+        rf_handler.setFormatter(formatter)
			
 
				+        logger.addHandler(rf_handler)
			
 
				+    if color and is_write_to_console:
			
 
				+        loguru_handler = InterceptHandler()
			
 
				+        loguru_handler.setFormatter(formatter)
			
 
				+        # logging.basicConfig(handlers=[loguru_handler], level=0)
			
 
				+        logger.addHandler(loguru_handler)
			
 
				+    elif is_write_to_console:
			
 
				+        stream_handler = logging.StreamHandler()
			
 
				+        stream_handler.stream = sys.stdout
			
 
				+        stream_handler.setFormatter(formatter)
			
 
				+        logger.addHandler(stream_handler)
			
 
				+
			
 
				+    _handler_list = []
			
 
				+    _handler_name_list = []
			
 
				+    # 检查是否存在重复handler
			
 
				+    for _handler in logger.handlers:
			
 
				+        if str(_handler) not in _handler_name_list:
			
 
				+            _handler_name_list.append(str(_handler))
			
 
				+            _handler_list.append(_handler)
			
 
				+    logger.handlers = _handler_list
			
 
				+    return logger
			
 
				+
			
 
				+
			
 
				+# logging.disable(logging.DEBUG) # 关闭所有log
			
 
				+
			
 
				+# 不让打印log的配置
			
 
				+STOP_LOGS = [
			
 
				+    # ES
			
 
				+    "urllib3.response",
			
 
				+    "urllib3.connection",
			
 
				+    "elasticsearch.trace",
			
 
				+    "requests.packages.urllib3.util",
			
 
				+    "requests.packages.urllib3.util.retry",
			
 
				+    "urllib3.util",
			
 
				+    "requests.packages.urllib3.response",
			
 
				+    "requests.packages.urllib3.contrib.pyopenssl",
			
 
				+    "requests.packages",
			
 
				+    "urllib3.util.retry",
			
 
				+    "requests.packages.urllib3.contrib",
			
 
				+    "requests.packages.urllib3.connectionpool",
			
 
				+    "requests.packages.urllib3.poolmanager",
			
 
				+    "urllib3.connectionpool",
			
 
				+    "requests.packages.urllib3.connection",
			
 
				+    "elasticsearch",
			
 
				+    "log_request_fail",
			
 
				+    # requests
			
 
				+    "requests",
			
 
				+    "selenium.webdriver.remote.remote_connection",
			
 
				+    "selenium.webdriver.remote",
			
 
				+    "selenium.webdriver",
			
 
				+    "selenium",
			
 
				+    # markdown
			
 
				+    "MARKDOWN",
			
 
				+    "build_extension",
			
 
				+    # newspaper
			
 
				+    "calculate_area",
			
 
				+    "largest_image_url",
			
 
				+    "newspaper.images",
			
 
				+    "newspaper",
			
 
				+    "Importing",
			
 
				+    "PIL",
			
 
				+]
			
 
				+
			
 
				+# 关闭日志打印
			
 
				+for STOP_LOG in STOP_LOGS:
			
 
				+    log_level = eval("logging." + setting.OTHERS_LOG_LEVAL)
			
 
				+    logging.getLogger(STOP_LOG).setLevel(log_level)
			
 
				+
			
 
				+# print(logging.Logger.manager.loggerDict) # 取使用debug模块的name
			
 
				+
			
 
				+# 日志级别大小关系为：CRITICAL > ERROR > WARNING > INFO > DEBUG
			
 
				+
			
 
				+
			
 
				+class Log:
			
 
				+    log = None
			
 
				+
			
 
				+    def __getattr__(self, name):
			
 
				+        # 调用log时再初始化，为了加载最新的setting
			
 
				+        if self.__class__.log is None:
			
 
				+            self.__class__.log = get_logger()
			
 
				+        return getattr(self.__class__.log, name)
			
 
				+
			
 
				+    @property
			
 
				+    def debug(self):
			
 
				+        return self.__class__.log.debug
			
 
				+
			
 
				+    @property
			
 
				+    def info(self):
			
 
				+        return self.__class__.log.info
			
 
				+
			
 
				+    @property
			
 
				+    def warning(self):
			
 
				+        return self.__class__.log.warning
			
 
				+
			
 
				+    @property
			
 
				+    def exception(self):
			
 
				+        return self.__class__.log.exception
			
 
				+
			
 
				+    @property
			
 
				+    def error(self):
			
 
				+        return self.__class__.log.error
			
 
				+
			
 
				+    @property
			
 
				+    def critical(self):
			
 
				+        return self.__class__.log.critical
			
 
				+
			
 
				+
			
 
				+log = Log()
			
--- a/FworkSpider/feapder/utils/metrics.py
+++ b/FworkSpider/feapder/utils/metrics.py
@@ -0,0 +1,539 @@
 
				+import concurrent.futures
			
 
				+import json
			
 
				+import os
			
 
				+import queue
			
 
				+import random
			
 
				+import socket
			
 
				+import threading
			
 
				+import time
			
 
				+from collections import Counter
			
 
				+from typing import Any
			
 
				+
			
 
				+from influxdb import InfluxDBClient
			
 
				+
			
 
				+from feapder import setting
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.tools import aio_wrap, ensure_float, ensure_int
			
 
				+
			
 
# PID of the process in which init() last completed successfully; None until
# then. init() compares it with os.getpid() and returns early when they match.
_inited_pid = None
# this thread should stop running in the forked process
# Single-worker pool used by the aemit_* coroutine wrappers at the bottom of
# this module.
_executor = concurrent.futures.ThreadPoolExecutor(
    max_workers=1, thread_name_prefix="metrics"
)
			
 
				+
			
 
				+
			
 
				+class MetricsEmitter:
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        influxdb,
			
 
				+        *,
			
 
				+        batch_size=10,
			
 
				+        max_timer_seq=0,
			
 
				+        emit_interval=10,
			
 
				+        retention_policy=None,
			
 
				+        ratio=1.0,
			
 
				+        debug=False,
			
 
				+        add_hostname=False,
			
 
				+        max_points=10240,
			
 
				+        default_tags=None,
			
 
				+        time_precision="s",
			
 
				+    ):
			
 
				+        """
			
 
				+        Args:
			
 
				+            influxdb: influxdb instance
			
 
				+            batch_size: 打点的批次大小
			
 
				+            max_timer_seq: 每个时间间隔内最多收集多少个 timer 类型点, 0 表示不限制
			
 
				+            emit_interval: 最多等待多长时间必须打点
			
 
				+            retention_policy: 对应的 retention policy
			
 
				+            ratio: store 和 timer 类型采样率，比如 0.1 表示只有 10% 的点会留下
			
 
				+            debug: 是否打印调试日志
			
 
				+            add_hostname: 是否添加 hostname 作为 tag
			
 
				+            max_points: 本地 buffer 最多累计多少个点
			
 
				+            time_precision: 打点精度 默认 s
			
 
				+        """
			
 
				+        self.pending_points = queue.Queue()
			
 
				+        self.batch_size = batch_size
			
 
				+        self.influxdb: InfluxDBClient = influxdb
			
 
				+        self.tagkv = {}
			
 
				+        self.max_timer_seq = max_timer_seq
			
 
				+        self.lock = threading.Lock()
			
 
				+        self.hostname = socket.gethostname()
			
 
				+        self.last_emit_ts = time.time()  # 上次提交时间
			
 
				+        self.emit_interval = emit_interval  # 提交间隔
			
 
				+        self.max_points = max_points
			
 
				+        self.retention_policy = retention_policy  # 支持自定义保留策略
			
 
				+        self.debug = debug
			
 
				+        self.add_hostname = add_hostname
			
 
				+        self.ratio = ratio
			
 
				+        self.default_tags = default_tags or {}
			
 
				+        self.time_precision = time_precision
			
 
				+
			
 
				+    def define_tagkv(self, tagk, tagvs):
			
 
				+        self.tagkv[tagk] = set(tagvs)
			
 
				+
			
 
				+    def _point_tagset(self, p):
			
 
				+        return f"{p['measurement']}-{sorted(p['tags'].items())}-{p['time']}"
			
 
				+
			
 
				+    def _accumulate_points(self, points):
			
 
				+        """
			
 
				+        对于处于同一个 key 的点做聚合
			
 
				+
			
 
				+          - 对于 counter 类型，同一个 key 的值(_count)可以累加
			
 
				+          - 对于 store 类型，不做任何操作，influxdb 会自行覆盖
			
 
				+          - 对于 timer 类型，通过添加一个 _seq 值来区分每个不同的点
			
 
				+        """
			
 
				+        counters = {}  # 临时保留 counter 类型的值
			
 
				+        timer_seqs = Counter()  # 记录不同 key 的 timer 序列号
			
 
				+        new_points = []
			
 
				+
			
 
				+        for point in points:
			
 
				+            point_type = point["tags"].get("_type", None)
			
 
				+            tagset = self._point_tagset(point)
			
 
				+
			
 
				+            # counter 类型全部聚合，不做丢弃
			
 
				+            if point_type == "counter":
			
 
				+                if tagset not in counters:
			
 
				+                    counters[tagset] = point
			
 
				+                else:
			
 
				+                    counters[tagset]["fields"]["_count"] += point["fields"]["_count"]
			
 
				+            elif point_type == "timer":
			
 
				+                if self.max_timer_seq and timer_seqs[tagset] > self.max_timer_seq:
			
 
				+                    continue
			
 
				+                # 掷一把骰子，如果足够幸运才打点
			
 
				+                if self.ratio < 1.0 and random.random() > self.ratio:
			
 
				+                    continue
			
 
				+                # 增加 _seq tag，以便区分不同的点
			
 
				+                point["tags"]["_seq"] = timer_seqs[tagset]
			
 
				+                timer_seqs[tagset] += 1
			
 
				+                new_points.append(point)
			
 
				+            else:
			
 
				+                if self.ratio < 1.0 and random.random() > self.ratio:
			
 
				+                    continue
			
 
				+                new_points.append(point)
			
 
				+
			
 
				+        # 把累加得到的 counter 值添加进来
			
 
				+        new_points.extend(counters.values())
			
 
				+        return new_points
			
 
				+
			
 
				+    def _get_ready_emit(self, force=False):
			
 
				+        """
			
 
				+        把当前 pending 的值做聚合并返回
			
 
				+        """
			
 
				+        if self.debug:
			
 
				+            log.info("got %s raw points", self.pending_points.qsize())
			
 
				+
			
 
				+        # 从 pending 中读取点, 设定一个最大值，避免一直打点，一直获取
			
 
				+        points = []
			
 
				+        while len(points) < self.max_points or force:
			
 
				+            try:
			
 
				+                points.append(self.pending_points.get_nowait())
			
 
				+            except queue.Empty:
			
 
				+                break
			
 
				+
			
 
				+        # 聚合点
			
 
				+        points = self._accumulate_points(points)
			
 
				+
			
 
				+        if self.debug:
			
 
				+            log.info("got %s point", len(points))
			
 
				+            log.info(json.dumps(points, indent=4))
			
 
				+
			
 
				+        return points
			
 
				+
			
 
				+    def emit(self, point=None, force=False):
			
 
				+        """
			
 
				+        1. 添加新点到 pending
			
 
				+        2. 如果符合条件，尝试聚合并打点
			
 
				+        3. 更新打点时间
			
 
				+
			
 
				+        :param point:
			
 
				+        :param force: 强制提交所有点 默认False
			
 
				+        :return:
			
 
				+        """
			
 
				+        if point:
			
 
				+            self.pending_points.put(point)
			
 
				+
			
 
				+        # 判断是否需要提交点 1、数量 2、间隔 3、强力打点
			
 
				+        if not (
			
 
				+            force
			
 
				+            or self.pending_points.qsize() >= self.max_points  # noqa: W503
			
 
				+            or time.time() - self.last_emit_ts > self.emit_interval  # noqa: W503
			
 
				+        ):
			
 
				+            return
			
 
				+
			
 
				+        # 需要打点，读取可以打点的值, 确保只有一个线程在做点的压缩
			
 
				+        with self.lock:
			
 
				+            points = self._get_ready_emit(force=force)
			
 
				+
			
 
				+            if not points:
			
 
				+                return
			
 
				+            try:
			
 
				+                self.influxdb.write_points(
			
 
				+                    points,
			
 
				+                    batch_size=self.batch_size,
			
 
				+                    time_precision=self.time_precision,
			
 
				+                    retention_policy=self.retention_policy,
			
 
				+                )
			
 
				+            except Exception:
			
 
				+                log.exception("error writing points")
			
 
				+
			
 
				+            self.last_emit_ts = time.time()
			
 
				+
			
 
				+    def flush(self):
			
 
				+        if self.debug:
			
 
				+            log.info("start draining points %s", self.pending_points.qsize())
			
 
				+        self.emit(force=True)
			
 
				+
			
 
				+    def close(self):
			
 
				+        self.flush()
			
 
				+        try:
			
 
				+            self.influxdb.close()
			
 
				+        except Exception as e:
			
 
				+            log.exception(e)
			
 
				+
			
 
				+    def make_point(self, measurement, tags: dict, fields: dict, timestamp=None):
			
 
				+        """
			
 
				+        默认的时间戳是"秒"级别的
			
 
				+        """
			
 
				+        assert measurement, "measurement can't be null"
			
 
				+        tags = tags.copy() if tags else {}
			
 
				+        tags.update(self.default_tags)
			
 
				+        fields = fields.copy() if fields else {}
			
 
				+        if timestamp is None:
			
 
				+            timestamp = int(time.time())
			
 
				+        # 支持自定义hostname
			
 
				+        if self.add_hostname and "hostname" not in tags:
			
 
				+            tags["hostname"] = self.hostname
			
 
				+        point = dict(measurement=measurement, tags=tags, fields=fields, time=timestamp)
			
 
				+        if self.tagkv:
			
 
				+            for tagk, tagv in tags.items():
			
 
				+                if tagv not in self.tagkv[tagk]:
			
 
				+                    raise ValueError("tag value = %s not in %s", tagv, self.tagkv[tagk])
			
 
				+        return point
			
 
				+
			
 
				+    def get_counter_point(
			
 
				+        self,
			
 
				+        measurement: str,
			
 
				+        key: str = None,
			
 
				+        count: int = 1,
			
 
				+        tags: dict = None,
			
 
				+        timestamp: int = None,
			
 
				+    ):
			
 
				+        """
			
 
				+        counter 不能被覆盖
			
 
				+        """
			
 
				+        tags = tags.copy() if tags else {}
			
 
				+        if key is not None:
			
 
				+            tags["_key"] = key
			
 
				+        tags["_type"] = "counter"
			
 
				+        count = ensure_int(count)
			
 
				+        fields = dict(_count=count)
			
 
				+        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
			
 
				+        return point
			
 
				+
			
 
				+    def get_store_point(
			
 
				+        self,
			
 
				+        measurement: str,
			
 
				+        key: str = None,
			
 
				+        value: Any = 0,
			
 
				+        tags: dict = None,
			
 
				+        timestamp=None,
			
 
				+    ):
			
 
				+        tags = tags.copy() if tags else {}
			
 
				+        if key is not None:
			
 
				+            tags["_key"] = key
			
 
				+        tags["_type"] = "store"
			
 
				+        fields = dict(_value=value)
			
 
				+        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
			
 
				+        return point
			
 
				+
			
 
				+    def get_timer_point(
			
 
				+        self,
			
 
				+        measurement: str,
			
 
				+        key: str = None,
			
 
				+        duration: float = 0,
			
 
				+        tags: dict = None,
			
 
				+        timestamp=None,
			
 
				+    ):
			
 
				+        tags = tags.copy() if tags else {}
			
 
				+        if key is not None:
			
 
				+            tags["_key"] = key
			
 
				+        tags["_type"] = "timer"
			
 
				+        fields = dict(_duration=ensure_float(duration))
			
 
				+        point = self.make_point(measurement, tags, fields, timestamp=timestamp)
			
 
				+        return point
			
 
				+
			
 
				+    def emit_any(self, *args, **kwargs):
			
 
				+        point = self.make_point(*args, **kwargs)
			
 
				+        self.emit(point)
			
 
				+
			
 
				+    def emit_counter(self, *args, **kwargs):
			
 
				+        point = self.get_counter_point(*args, **kwargs)
			
 
				+        self.emit(point)
			
 
				+
			
 
				+    def emit_store(self, *args, **kwargs):
			
 
				+        point = self.get_store_point(*args, **kwargs)
			
 
				+        self.emit(point)
			
 
				+
			
 
				+    def emit_timer(self, *args, **kwargs):
			
 
				+        point = self.get_timer_point(*args, **kwargs)
			
 
				+        self.emit(point)
			
 
				+
			
 
				+
			
 
# Module-wide emitter used by the emit_* / flush / close functions below;
# None until init() has created it, in which case those functions do nothing.
_emitter: MetricsEmitter = None
# Default measurement, used when a caller does not pass one; set by init().
_measurement: str = None
			
 
				+
			
 
				+
			
 
				+def init(
			
 
				+    *,
			
 
				+    influxdb_host=None,
			
 
				+    influxdb_port=None,
			
 
				+    influxdb_udp_port=None,
			
 
				+    influxdb_database=None,
			
 
				+    influxdb_user=None,
			
 
				+    influxdb_password=None,
			
 
				+    influxdb_measurement=None,
			
 
				+    retention_policy=None,
			
 
				+    retention_policy_duration="180d",
			
 
				+    emit_interval=60,
			
 
				+    batch_size=10,
			
 
				+    debug=False,
			
 
				+    use_udp=False,
			
 
				+    timeout=10,
			
 
				+    time_precision="s",
			
 
				+    **kwargs,
			
 
				+):
			
 
				+    """
			
 
				+    打点监控初始化
			
 
				+    Args:
			
 
				+        influxdb_host:
			
 
				+        influxdb_port:
			
 
				+        influxdb_udp_port:
			
 
				+        influxdb_database:
			
 
				+        influxdb_user:
			
 
				+        influxdb_password:
			
 
				+        influxdb_measurement: 存储的表，也可以在打点的时候指定
			
 
				+        retention_policy: 保留策略
			
 
				+        retention_policy_duration: 保留策略过期时间
			
 
				+        emit_interval: 打点最大间隔
			
 
				+        batch_size: 打点的批次大小
			
 
				+        debug: 是否开启调试
			
 
				+        use_udp: 是否使用udp协议打点
			
 
				+        timeout: 与influxdb建立连接时的超时时间
			
 
				+        time_precision: 打点精度 默认秒
			
 
				+        **kwargs: 可传递MetricsEmitter类的参数
			
 
				+
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    global _inited_pid, _emitter, _measurement
			
 
				+    if _inited_pid == os.getpid():
			
 
				+        return
			
 
				+
			
 
				+    influxdb_host = influxdb_host or setting.INFLUXDB_HOST
			
 
				+    influxdb_port = influxdb_port or setting.INFLUXDB_PORT
			
 
				+    influxdb_udp_port = influxdb_udp_port or setting.INFLUXDB_UDP_PORT
			
 
				+    influxdb_database = influxdb_database or setting.INFLUXDB_DATABASE
			
 
				+    influxdb_user = influxdb_user or setting.INFLUXDB_USER
			
 
				+    influxdb_password = influxdb_password or setting.INFLUXDB_PASSWORD
			
 
				+    _measurement = influxdb_measurement or setting.INFLUXDB_MEASUREMENT
			
 
				+    retention_policy = (
			
 
				+        retention_policy or f"{influxdb_database}_{retention_policy_duration}"
			
 
				+    )
			
 
				+
			
 
				+    if not all(
			
 
				+        [
			
 
				+            influxdb_host,
			
 
				+            influxdb_port,
			
 
				+            influxdb_udp_port,
			
 
				+            influxdb_database,
			
 
				+            influxdb_user,
			
 
				+            influxdb_password,
			
 
				+        ]
			
 
				+    ):
			
 
				+        return
			
 
				+
			
 
				+    influxdb_client = InfluxDBClient(
			
 
				+        host=influxdb_host,
			
 
				+        port=influxdb_port,
			
 
				+        udp_port=influxdb_udp_port,
			
 
				+        database=influxdb_database,
			
 
				+        use_udp=use_udp,
			
 
				+        timeout=timeout,
			
 
				+        username=influxdb_user,
			
 
				+        password=influxdb_password,
			
 
				+    )
			
 
				+    # 创建数据库
			
 
				+    if influxdb_database:
			
 
				+        try:
			
 
				+            influxdb_client.create_database(influxdb_database)
			
 
				+            influxdb_client.create_retention_policy(
			
 
				+                retention_policy,
			
 
				+                retention_policy_duration,
			
 
				+                replication="1",
			
 
				+                default=True,
			
 
				+            )
			
 
				+        except Exception as e:
			
 
				+            log.error("metrics init falied: {}".format(e))
			
 
				+            return
			
 
				+
			
 
				+    _emitter = MetricsEmitter(
			
 
				+        influxdb_client,
			
 
				+        debug=debug,
			
 
				+        batch_size=batch_size,
			
 
				+        time_precision=time_precision,
			
 
				+        retention_policy=retention_policy,
			
 
				+        emit_interval=emit_interval,
			
 
				+        **kwargs,
			
 
				+    )
			
 
				+    _inited_pid = os.getpid()
			
 
				+    log.info("metrics init successfully")
			
 
				+
			
 
				+
			
 
				+def emit_any(
			
 
				+    tags: dict,
			
 
				+    fields: dict,
			
 
				+    *,
			
 
				+    classify: str = "",
			
 
				+    measurement: str = None,
			
 
				+    timestamp=None,
			
 
				+):
			
 
				+    """
			
 
				+    原生的打点，不进行额外的处理
			
 
				+    Args:
			
 
				+        tags: influxdb的tag的字段和值
			
 
				+        fields: influxdb的field的字段和值
			
 
				+        classify: 点的类别
			
 
				+        measurement: 存储的表
			
 
				+        timestamp: 点的时间搓，默认为当前时间
			
 
				+
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    if not _emitter:
			
 
				+        return
			
 
				+
			
 
				+    tags = tags or {}
			
 
				+    tags["_classify"] = classify
			
 
				+    measurement = measurement or _measurement
			
 
				+    _emitter.emit_any(measurement, tags, fields, timestamp)
			
 
				+
			
 
				+
			
 
				+def emit_counter(
			
 
				+    key: str = None,
			
 
				+    count: int = 1,
			
 
				+    *,
			
 
				+    classify: str = "",
			
 
				+    tags: dict = None,
			
 
				+    measurement: str = None,
			
 
				+    timestamp: int = None,
			
 
				+):
			
 
				+    """
			
 
				+    聚合打点，即会将一段时间内的点求和，然后打一个点数和
			
 
				+    Args:
			
 
				+        key: 与点绑定的key值
			
 
				+        count: 点数
			
 
				+        classify: 点的类别
			
 
				+        tags: influxdb的tag的字段和值
			
 
				+        measurement: 存储的表
			
 
				+        timestamp: 点的时间搓，默认为当前时间
			
 
				+
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    if not _emitter:
			
 
				+        return
			
 
				+
			
 
				+    tags = tags or {}
			
 
				+    tags["_classify"] = classify
			
 
				+    measurement = measurement or _measurement
			
 
				+    _emitter.emit_counter(measurement, key, count, tags, timestamp)
			
 
				+
			
 
				+
			
 
				+def emit_timer(
			
 
				+    key: str = None,
			
 
				+    duration: float = 0,
			
 
				+    *,
			
 
				+    classify: str = "",
			
 
				+    tags: dict = None,
			
 
				+    measurement: str = None,
			
 
				+    timestamp=None,
			
 
				+):
			
 
				+    """
			
 
				+    时间打点，用于监控程序的运行时长等，每个duration一个点，不会被覆盖
			
 
				+    Args:
			
 
				+        key: 与点绑定的key值
			
 
				+        duration: 时长
			
 
				+        classify: 点的类别
			
 
				+        tags: influxdb的tag的字段和值
			
 
				+        measurement: 存储的表
			
 
				+        timestamp: 点的时间搓，默认为当前时间
			
 
				+
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    if not _emitter:
			
 
				+        return
			
 
				+
			
 
				+    tags = tags or {}
			
 
				+    tags["_classify"] = classify
			
 
				+    measurement = measurement or _measurement
			
 
				+    _emitter.emit_timer(measurement, key, duration, tags, timestamp)
			
 
				+
			
 
				+
			
 
				+def emit_store(
			
 
				+    key: str = None,
			
 
				+    value: Any = 0,
			
 
				+    *,
			
 
				+    classify: str = "",
			
 
				+    tags: dict = None,
			
 
				+    measurement: str,
			
 
				+    timestamp=None,
			
 
				+):
			
 
				+    """
			
 
				+    直接打点，不进行额外的处理
			
 
				+    Args:
			
 
				+        key: 与点绑定的key值
			
 
				+        value: 点的值
			
 
				+        classify: 点的类别
			
 
				+        tags: influxdb的tag的字段和值
			
 
				+        measurement: 存储的表
			
 
				+        timestamp: 点的时间搓，默认为当前时间
			
 
				+
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    if not _emitter:
			
 
				+        return
			
 
				+
			
 
				+    tags = tags or {}
			
 
				+    tags["_classify"] = classify
			
 
				+    measurement = measurement or _measurement
			
 
				+    _emitter.emit_store(measurement, key, value, tags, timestamp)
			
 
				+
			
 
				+
			
 
				+def flush():
			
 
				+    """
			
 
				+    强刷点到influxdb
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    if not _emitter:
			
 
				+        return
			
 
				+    _emitter.flush()
			
 
				+
			
 
				+
			
 
				+def close():
			
 
				+    """
			
 
				+    关闭
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    if not _emitter:
			
 
				+        return
			
 
				+    _emitter.close()
			
 
				+
			
 
				+
			
 
# Coroutine variants of the emit functions, produced by aio_wrap with the
# module's single-worker executor.
aemit_counter = aio_wrap(executor=_executor)(emit_counter)
aemit_store = aio_wrap(executor=_executor)(emit_store)
aemit_timer = aio_wrap(executor=_executor)(emit_timer)
			
--- a/FworkSpider/feapder/utils/perfect_dict.py
+++ b/FworkSpider/feapder/utils/perfect_dict.py
@@ -0,0 +1,94 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021/4/8 11:32 上午
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+
			
 
				+def ensure_value(value):
			
 
				+    if isinstance(value, (list, tuple)):
			
 
				+        _value = []
			
 
				+        for v in value:
			
 
				+            _value.append(ensure_value(v))
			
 
				+
			
 
				+        if isinstance(value, tuple):
			
 
				+            value = tuple(_value)
			
 
				+        else:
			
 
				+            value = _value
			
 
				+
			
 
				+    if isinstance(value, dict):
			
 
				+        return PerfectDict(value)
			
 
				+    else:
			
 
				+        return value
			
 
				+
			
 
				+
			
 
class PerfectDict(dict):
    """
    A dict whose entries can also be read as attributes and by position.

    Nested dicts (including those inside lists and tuples) are wrapped in
    PerfectDict when they are read. Iterating yields the values, not the keys.

    >>> data = PerfectDict({"id":1, "url":"xxx"})
    >>> data
    {'id': 1, 'url': 'xxx'}
    >>> data = PerfectDict(id=1, url="xxx")
    >>> data
    {'id': 1, 'url': 'xxx'}
    >>> data.id
    1
    >>> data.get("id")
    1
    >>> data["id"]
    1
    >>> id, url = data
    >>> id
    1
    >>> url
    'xxx'
    >>> data[0]
    1
    >>> data[1]
    'xxx'
    >>> data = PerfectDict({"a": 1, "b": {"b1": 2}, "c": [{"c1": [{"d": 1}]}]})
    >>> data.b.b1
    2
    >>> data[1].b1
    2
    >>> data.get("b").b1
    2
    >>> data.c[0].c1
    [{'d': 1}]
    >>> data.c[0].c1[0]
    {'d': 1}
    """

    def __init__(self, _dict: dict = None, _values: list = None, **kwargs):
        """
        Args:
            _dict: mapping to wrap; when empty or missing, ``kwargs`` is used
            _values: values returned for integer indexes and iteration;
                defaults to the values of the wrapped mapping
            **kwargs: entries given as keyword arguments
        """
        # The instance __dict__ is the passed mapping itself, not a copy, so
        # attribute lookup resolves the mapping's keys directly.
        # NOTE(review): because of this aliasing, the ``__values__``
        # assignment below is stored as a "__values__" key in the caller's
        # dict - confirm whether that side effect is acceptable.
        self.__dict__ = _dict or kwargs or {}
        # Drop a "__values__" entry if the mapping already has one
        # (presumably left by an earlier wrap of the same dict).
        self.__dict__.pop("__values__", None)
        super().__init__(self.__dict__, **kwargs)
        self.__values__ = _values or list(self.__dict__.values())

    def __getitem__(self, key):
        # Integer keys index into the positional values; any other key is
        # looked up in the wrapped mapping.
        if isinstance(key, int):
            value = self.__values__[key]
        else:
            value = self.__dict__[key]

        return ensure_value(value)

    def __iter__(self, *args, **kwargs):
        # Yields values rather than keys, which is what makes
        # ``id, url = data`` unpack the values.
        for value in self.__values__:
            yield ensure_value(value)

    def __getattribute__(self, item):
        value = object.__getattribute__(self, item)
        # The two internal attributes are returned raw; everything else goes
        # through ensure_value so nested dicts come back wrapped.
        if item == "__dict__" or item == "__values__":
            return value
        return ensure_value(value)

    def get(self, key, default=None):
        """Return the wrapped value for ``key``, or ``default`` if absent."""
        if key in self.__dict__:
            value = self.__dict__[key]
            return ensure_value(value)

        return default
			
--- a/FworkSpider/feapder/utils/redis_lock.py
+++ b/FworkSpider/feapder/utils/redis_lock.py
@@ -0,0 +1,115 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2019/11/5 5:25 PM
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+import threading
			
 
				+import time
			
 
				+
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.utils.log import log
			
 
				+
			
 
				+
			
 
				+class RedisLock:
			
 
				+    redis_cli = None
			
 
				+
			
 
				+    def __init__(self, key, redis_cli=None, wait_timeout=0, lock_timeout=86400):
			
 
				+        """
			
 
				+        redis超时锁
			
 
				+        :param key: 存储锁的key redis_lock:[key]
			
 
				+        :param redis_cli: redis客户端对象
			
 
				+        :param wait_timeout: 等待加锁超时时间，为0时则不等待加锁，加锁失败
			
 
				+        :param lock_timeout: 锁超时时间 为0时则不会超时，直到锁释放或意外退出，默认超时为1天
			
 
				+
			
 
				+        用法示例:
			
 
				+        with RedisLock(key="test") as _lock:
			
 
				+            if _lock.locked:
			
 
				+                # 用来判断是否加上了锁
			
 
				+                # do somethings
			
 
				+        """
			
 
				+        self.redis_conn = redis_cli
			
 
				+        self.lock_key = "redis_lock:{}".format(key)
			
 
				+        # 锁超时时间
			
 
				+        self.lock_timeout = lock_timeout
			
 
				+        # 等待加锁时间
			
 
				+        self.wait_timeout = wait_timeout
			
 
				+        self.locked = False
			
 
				+        self.stop_prolong_life = False
			
 
				+
			
 
				+    @property
			
 
				+    def redis_conn(self):
			
 
				+        if not self.__class__.redis_cli:
			
 
				+            self.__class__.redis_cli = RedisDB().get_redis_obj()
			
 
				+
			
 
				+        return self.__class__.redis_cli
			
 
				+
			
 
				+    @redis_conn.setter
			
 
				+    def redis_conn(self, cli):
			
 
				+        self.__class__.redis_cli = cli
			
 
				+
			
 
				+    def __enter__(self):
			
 
				+        if not self.locked:
			
 
				+            self.acquire()
			
 
				+            # 延长锁的时间
			
 
				+            thread = threading.Thread(target=self.prolong_life)
			
 
				+            thread.setDaemon(True)
			
 
				+            thread.start()
			
 
				+        return self
			
 
				+
			
 
				+    def __exit__(self, exc_type, exc_val, exc_tb):
			
 
				+        self.stop_prolong_life = True
			
 
				+        self.release()
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "<RedisLock: {} >".format(self.lock_key)
			
 
				+
			
 
				+    def acquire(self):
			
 
				+        start = time.time()
			
 
				+        while True:
			
 
				+            # 尝试加锁
			
 
				+            if self.redis_conn.set(self.lock_key, time.time(), nx=True, ex=5):
			
 
				+                self.locked = True
			
 
				+                break
			
 
				+
			
 
				+            if self.wait_timeout > 0:
			
 
				+                if time.time() - start > self.wait_timeout:
			
 
				+                    log.info("加锁失败")
			
 
				+                    break
			
 
				+            else:
			
 
				+                break
			
 
				+            log.debug("等待加锁: {} wait:{}".format(self, time.time() - start))
			
 
				+            if self.wait_timeout > 10:
			
 
				+                time.sleep(5)
			
 
				+            else:
			
 
				+                time.sleep(1)
			
 
				+        return
			
 
				+
			
 
				+    def release(self):
			
 
				+        if self.locked:
			
 
				+            self.redis_conn.delete(self.lock_key)
			
 
				+            self.locked = False
			
 
				+        return
			
 
				+
			
 
				+    def prolong_life(self):
			
 
				+        """
			
 
				+        延长锁的过期时间
			
 
				+        :return:
			
 
				+        """
			
 
				+
			
 
				+        spend_time = 0
			
 
				+        while not self.stop_prolong_life:
			
 
				+            expire = self.redis_conn.ttl(self.lock_key)
			
 
				+            if expire < 0:  # key 不存在
			
 
				+                time.sleep(1)
			
 
				+                continue
			
 
				+            self.redis_conn.expire(self.lock_key, expire + 5)  # 延长5秒
			
 
				+            time.sleep(expire)  # 临过期5秒前，再次延长
			
 
				+            spend_time += expire
			
 
				+            if self.lock_timeout and spend_time > self.lock_timeout:
			
 
				+                log.info("锁超时，释放")
			
 
				+                self.redis_conn.delete(self.lock_key)
			
 
				+                break
			
--- a/FworkSpider/feapder/utils/tools.py
+++ b/FworkSpider/feapder/utils/tools.py
@@ -0,0 +1,2554 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018-09-06 14:21
			
 
				+---------
			
 
				+@summary: 工具
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+import asyncio
			
 
				+import calendar
			
 
				+import codecs
			
 
				+import configparser  # 读配置文件的
			
 
				+import datetime
			
 
				+import functools
			
 
				+import hashlib
			
 
				+import html
			
 
				+import json
			
 
				+import os
			
 
				+import pickle
			
 
				+import random
			
 
				+import re
			
 
				+import socket
			
 
				+import ssl
			
 
				+import string
			
 
				+import sys
			
 
				+import time
			
 
				+import traceback
			
 
				+import urllib
			
 
				+import urllib.parse
			
 
				+import uuid
			
 
				+import weakref
			
 
				+from functools import partial, wraps
			
 
				+from hashlib import md5
			
 
				+from pprint import pformat
			
 
				+from pprint import pprint
			
 
				+from urllib import request
			
 
				+from urllib.parse import urljoin
			
 
				+
			
 
				+import execjs  # pip install PyExecJS
			
 
				+import redis
			
 
				+import requests
			
 
				+import six
			
 
				+from requests.cookies import RequestsCookieJar
			
 
				+from w3lib.url import canonicalize_url as _canonicalize_url
			
 
				+
			
 
				+import feapder.setting as setting
			
 
				+from feapder.utils.email_sender import EmailSender
			
 
				+from feapder.utils.log import log
			
 
os.environ["EXECJS_RUNTIME"] = "Node"  # make execjs run JavaScript with Node

# Disable SSL certificate verification globally.
# NOTE(review): this swaps the default HTTPS context factory of the `ssl`
# module for the unverified one, which affects every user of that default in
# the process, not only this module - confirm this is intended.
ssl._create_default_https_context = ssl._create_unverified_context

# Timeout (seconds) passed to the requests calls in this module.
TIME_OUT = 30
TIMER_TIME = 5
			
 
				+
			
 
				+redisdb = None
			
 
				+
			
 
				+
			
 
				+def get_redisdb():
			
 
				+    global redisdb
			
 
				+    if not redisdb:
			
 
				+        ip, port = setting.REDISDB_IP_PORTS.split(":")
			
 
				+        redisdb = redis.Redis(
			
 
				+            host=ip,
			
 
				+            port=port,
			
 
				+            db=setting.REDISDB_DB,
			
 
				+            password=setting.REDISDB_USER_PASS,
			
 
				+            decode_responses=True,
			
 
				+        )  # redis默认端口是6379
			
 
				+    return redisdb
			
 
				+
			
 
				+
			
 
				+# 装饰器
			
 
				+class Singleton(object):
			
 
				+    def __init__(self, cls):
			
 
				+        self._cls = cls
			
 
				+        self._instance = {}
			
 
				+
			
 
				+    def __call__(self, *args, **kwargs):
			
 
				+        if self._cls not in self._instance:
			
 
				+            self._instance[self._cls] = self._cls(*args, **kwargs)
			
 
				+        return self._instance[self._cls]
			
 
				+
			
 
				+
			
 
				+def log_function_time(func):
			
 
				+    try:
			
 
				+
			
 
				+        @functools.wraps(func)  # 将函数的原来属性付给新函数
			
 
				+        def calculate_time(*args, **kw):
			
 
				+            began_time = time.time()
			
 
				+            callfunc = func(*args, **kw)
			
 
				+            end_time = time.time()
			
 
				+            log.debug(func.__name__ + " run time  = " + str(end_time - began_time))
			
 
				+            return callfunc
			
 
				+
			
 
				+        return calculate_time
			
 
				+    except:
			
 
				+        log.debug("求取时间无效 因为函数参数不符")
			
 
				+        return func
			
 
				+
			
 
				+
			
 
				+def run_safe_model(module_name):
			
 
				+    def inner_run_safe_model(func):
			
 
				+        try:
			
 
				+
			
 
				+            @functools.wraps(func)  # 将函数的原来属性付给新函数
			
 
				+            def run_func(*args, **kw):
			
 
				+                callfunc = None
			
 
				+                try:
			
 
				+                    callfunc = func(*args, **kw)
			
 
				+                except Exception as e:
			
 
				+                    log.error(module_name + ": " + func.__name__ + " - " + str(e))
			
 
				+                    traceback.print_exc()
			
 
				+                return callfunc
			
 
				+
			
 
				+            return run_func
			
 
				+        except Exception as e:
			
 
				+            log.error(module_name + ": " + func.__name__ + " - " + str(e))
			
 
				+            traceback.print_exc()
			
 
				+            return func
			
 
				+
			
 
				+    return inner_run_safe_model
			
 
				+
			
 
				+
			
 
				+def memoizemethod_noargs(method):
			
 
				+    """Decorator to cache the result of a method (without arguments) using a
			
 
				+    weak reference to its object
			
 
				+    """
			
 
				+    cache = weakref.WeakKeyDictionary()
			
 
				+
			
 
				+    @functools.wraps(method)
			
 
				+    def new_method(self, *args, **kwargs):
			
 
				+        if self not in cache:
			
 
				+            cache[self] = method(self, *args, **kwargs)
			
 
				+        return cache[self]
			
 
				+
			
 
				+    return new_method
			
 
				+
			
 
				+
			
 
				+########################【网页解析相关】###############################
			
 
				+
			
 
				+
			
 
				+# @log_function_time
			
 
				+def get_html_by_requests(
			
 
				+    url, headers=None, code="utf-8", data=None, proxies={}, with_response=False
			
 
				+):
			
 
				+    html = ""
			
 
				+    r = None
			
 
				+    try:
			
 
				+        if data:
			
 
				+            r = requests.post(
			
 
				+                url, headers=headers, timeout=TIME_OUT, data=data, proxies=proxies
			
 
				+            )
			
 
				+        else:
			
 
				+            r = requests.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies)
			
 
				+
			
 
				+        if code:
			
 
				+            r.encoding = code
			
 
				+        html = r.text
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        log.error(e)
			
 
				+    finally:
			
 
				+        r and r.close()
			
 
				+
			
 
				+    if with_response:
			
 
				+        return html, r
			
 
				+    else:
			
 
				+        return html
			
 
				+
			
 
				+
			
 
				+def get_json_by_requests(
			
 
				+    url,
			
 
				+    params=None,
			
 
				+    headers=None,
			
 
				+    data=None,
			
 
				+    proxies={},
			
 
				+    with_response=False,
			
 
				+    cookies=None,
			
 
				+):
			
 
				+    json = {}
			
 
				+    response = None
			
 
				+    try:
			
 
				+        # response = requests.get(url, params = params)
			
 
				+        if data:
			
 
				+            response = requests.post(
			
 
				+                url,
			
 
				+                headers=headers,
			
 
				+                data=data,
			
 
				+                params=params,
			
 
				+                timeout=TIME_OUT,
			
 
				+                proxies=proxies,
			
 
				+                cookies=cookies,
			
 
				+            )
			
 
				+        else:
			
 
				+            response = requests.get(
			
 
				+                url,
			
 
				+                headers=headers,
			
 
				+                params=params,
			
 
				+                timeout=TIME_OUT,
			
 
				+                proxies=proxies,
			
 
				+                cookies=cookies,
			
 
				+            )
			
 
				+        response.encoding = "utf-8"
			
 
				+        json = response.json()
			
 
				+    except Exception as e:
			
 
				+        log.error(e)
			
 
				+    finally:
			
 
				+        response and response.close()
			
 
				+
			
 
				+    if with_response:
			
 
				+        return json, response
			
 
				+    else:
			
 
				+        return json
			
 
				+
			
 
				+
			
 
				+def get_cookies(response):
			
 
				+    cookies = requests.utils.dict_from_cookiejar(response.cookies)
			
 
				+    return cookies
			
 
				+
			
 
				+
			
 
				+def get_cookies_from_str(cookie_str):
			
 
				+    """
			
 
				+    >>> get_cookies_from_str("key=value; key2=value2; key3=; key4=; ")
			
 
				+    {'key': 'value', 'key2': 'value2', 'key3': '', 'key4': ''}
			
 
				+
			
 
				+    Args:
			
 
				+        cookie_str: key=value; key2=value2; key3=; key4=
			
 
				+
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    cookies = {}
			
 
				+    for cookie in cookie_str.split(";"):
			
 
				+        cookie = cookie.strip()
			
 
				+        if not cookie:
			
 
				+            continue
			
 
				+        key, value = cookie.split("=", 1)
			
 
				+        key = key.strip()
			
 
				+        value = value.strip()
			
 
				+        cookies[key] = value
			
 
				+
			
 
				+    return cookies
			
 
				+
			
 
				+
			
 
				+def get_cookies_jar(cookies):
			
 
				+    """
			
 
				+    @summary: 适用于selenium生成的cookies转requests的cookies
			
 
				+    requests.get(xxx, cookies=jar)
			
 
				+    参考：https://www.cnblogs.com/small-bud/p/9064674.html
			
 
				+
			
 
				+    ---------
			
 
				+    @param cookies: [{},{}]
			
 
				+    ---------
			
 
				+    @result: cookie jar
			
 
				+    """
			
 
				+
			
 
				+    cookie_jar = RequestsCookieJar()
			
 
				+    for cookie in cookies:
			
 
				+        cookie_jar.set(cookie["name"], cookie["value"])
			
 
				+
			
 
				+    return cookie_jar
			
 
				+
			
 
				+
			
 
				+def get_cookies_from_selenium_cookie(cookies):
			
 
				+    """
			
 
				+    @summary: 适用于selenium生成的cookies转requests的cookies
			
 
				+    requests.get(xxx, cookies=jar)
			
 
				+    参考：https://www.cnblogs.com/small-bud/p/9064674.html
			
 
				+
			
 
				+    ---------
			
 
				+    @param cookies: [{},{}]
			
 
				+    ---------
			
 
				+    @result: cookie jar
			
 
				+    """
			
 
				+
			
 
				+    cookie_dict = {}
			
 
				+    for cookie in cookies:
			
 
				+        if cookie.get("name"):
			
 
				+            cookie_dict[cookie["name"]] = cookie["value"]
			
 
				+
			
 
				+    return cookie_dict
			
 
				+
			
 
				+
			
 
				+def cookiesjar2str(cookies):
			
 
				+    str_cookie = ""
			
 
				+    for k, v in requests.utils.dict_from_cookiejar(cookies).items():
			
 
				+        str_cookie += k
			
 
				+        str_cookie += "="
			
 
				+        str_cookie += v
			
 
				+        str_cookie += "; "
			
 
				+    return str_cookie
			
 
				+
			
 
				+
			
 
				+def cookies2str(cookies):
			
 
				+    str_cookie = ""
			
 
				+    for k, v in cookies.items():
			
 
				+        str_cookie += k
			
 
				+        str_cookie += "="
			
 
				+        str_cookie += v
			
 
				+        str_cookie += "; "
			
 
				+    return str_cookie
			
 
				+
			
 
				+
			
 
				+def get_urls(
			
 
				+    html,
			
 
				+    stop_urls=(
			
 
				+        "javascript",
			
 
				+        "+",
			
 
				+        ".css",
			
 
				+        ".js",
			
 
				+        ".rar",
			
 
				+        ".xls",
			
 
				+        ".exe",
			
 
				+        ".apk",
			
 
				+        ".doc",
			
 
				+        ".jpg",
			
 
				+        ".png",
			
 
				+        ".flv",
			
 
				+        ".mp4",
			
 
				+    ),
			
 
				+):
			
 
				+    # 不匹配javascript、 +、 # 这样的url
			
 
				+    regex = r'<a.*?href.*?=.*?["|\'](.*?)["|\']'
			
 
				+
			
 
				+    urls = get_info(html, regex)
			
 
				+    urls = sorted(set(urls), key=urls.index)
			
 
				+    if stop_urls:
			
 
				+        stop_urls = isinstance(stop_urls, str) and [stop_urls] or stop_urls
			
 
				+        use_urls = []
			
 
				+        for url in urls:
			
 
				+            for stop_url in stop_urls:
			
 
				+                if stop_url in url:
			
 
				+                    break
			
 
				+            else:
			
 
				+                use_urls.append(url)
			
 
				+
			
 
				+        urls = use_urls
			
 
				+    return urls
			
 
				+
			
 
				+
			
 
				+def get_full_url(root_url, sub_url):
			
 
				+    """
			
 
				+    @summary: 得到完整的ur
			
 
				+    ---------
			
 
				+    @param root_url: 根url （网页的url）
			
 
				+    @param sub_url:  子url （带有相对路径的 可以拼接成完整的）
			
 
				+    ---------
			
 
				+    @result: 返回完整的url
			
 
				+    """
			
 
				+
			
 
				+    return urljoin(root_url, sub_url)
			
 
				+
			
 
				+
			
 
				+def joint_url(url, params):
			
 
				+    # param_str = "?"
			
 
				+    # for key, value in params.items():
			
 
				+    #     value = isinstance(value, str) and value or str(value)
			
 
				+    #     param_str += key + "=" + value + "&"
			
 
				+    #
			
 
				+    # return url + param_str[:-1]
			
 
				+
			
 
				+    if not params:
			
 
				+        return url
			
 
				+
			
 
				+    params = urlencode(params)
			
 
				+    separator = "?" if "?" not in url else "&"
			
 
				+    return url + separator + params
			
 
				+
			
 
				+
			
 
				+def canonicalize_url(url):
			
 
				+    """
			
 
				+    url 归一化 会参数排序 及去掉锚点
			
 
				+    """
			
 
				+    return _canonicalize_url(url)
			
 
				+
			
 
				+
			
 
				+def get_url_md5(url):
			
 
				+    url = canonicalize_url(url)
			
 
				+    url = re.sub("^http://", "https://", url)
			
 
				+    return get_md5(url)
			
 
				+
			
 
				+
			
 
				+def fit_url(urls, identis):
			
 
				+    identis = isinstance(identis, str) and [identis] or identis
			
 
				+    fit_urls = []
			
 
				+    for link in urls:
			
 
				+        for identi in identis:
			
 
				+            if identi in link:
			
 
				+                fit_urls.append(link)
			
 
				+    return list(set(fit_urls))
			
 
				+
			
 
				+
			
 
				+def get_param(url, key):
			
 
				+    params = url.split("?")[-1].split("&")
			
 
				+    for param in params:
			
 
				+        key_value = param.split("=", 1)
			
 
				+        if key == key_value[0]:
			
 
				+            return key_value[1]
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def urlencode(params):
			
 
				+    """
			
 
				+    字典类型的参数转为字符串
			
 
				+    @param params:
			
 
				+    {
			
 
				+        'a': 1,
			
 
				+        'b': 2
			
 
				+    }
			
 
				+    @return: a=1&b=2
			
 
				+    """
			
 
				+    return urllib.parse.urlencode(params)
			
 
				+
			
 
				+
			
 
				+def urldecode(url):
			
 
				+    """
			
 
				+    将字符串类型的参数转为json
			
 
				+    @param url: xxx?a=1&b=2
			
 
				+    @return:
			
 
				+    {
			
 
				+        'a': 1,
			
 
				+        'b': 2
			
 
				+    }
			
 
				+    """
			
 
				+    params_json = {}
			
 
				+    params = url.split("?")[-1].split("&")
			
 
				+    for param in params:
			
 
				+        key, value = param.split("=")
			
 
				+        params_json[key] = unquote_url(value)
			
 
				+
			
 
				+    return params_json
			
 
				+
			
 
				+
			
 
				+def unquote_url(url, encoding="utf-8"):
			
 
				+    """
			
 
				+    @summary: 将url解码
			
 
				+    ---------
			
 
				+    @param url:
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    return urllib.parse.unquote(url, encoding=encoding)
			
 
				+
			
 
				+
			
 
				+def quote_url(url, encoding="utf-8"):
			
 
				+    """
			
 
				+    @summary: 将url编码 编码意思http://www.w3school.com.cn/tags/html_ref_urlencode.html
			
 
				+    ---------
			
 
				+    @param url:
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    return urllib.parse.quote(url, safe="%;/?:@&=+$,", encoding=encoding)
			
 
				+
			
 
				+
			
 
				+def quote_chinese_word(text, encoding="utf-8"):
			
 
				+    def quote_chinese_word_func(text):
			
 
				+        chinese_word = text.group(0)
			
 
				+        return urllib.parse.quote(chinese_word, encoding=encoding)
			
 
				+
			
 
				+    return re.sub("([\u4e00-\u9fa5]+)", quote_chinese_word_func, text, flags=re.S)
			
 
				+
			
 
				+
			
 
				+def unescape(str):
			
 
				+    """
			
 
				+    反转译
			
 
				+    """
			
 
				+    return html.unescape(str)
			
 
				+
			
 
				+
			
 
				+def excape(str):
			
 
				+    """
			
 
				+    转译
			
 
				+    """
			
 
				+    return html.escape(str)
			
 
				+
			
 
				+
			
 
				+_regexs = {}
			
 
				+
			
 
				+
			
 
				+# @log_function_time
			
 
				+def get_info(html, regexs, allow_repeat=True, fetch_one=False, split=None):
			
 
				+    regexs = isinstance(regexs, str) and [regexs] or regexs
			
 
				+
			
 
				+    infos = []
			
 
				+    for regex in regexs:
			
 
				+        if regex == "":
			
 
				+            continue
			
 
				+
			
 
				+        if regex not in _regexs.keys():
			
 
				+            _regexs[regex] = re.compile(regex, re.S)
			
 
				+
			
 
				+        if fetch_one:
			
 
				+            infos = _regexs[regex].search(html)
			
 
				+            if infos:
			
 
				+                infos = infos.groups()
			
 
				+            else:
			
 
				+                continue
			
 
				+        else:
			
 
				+            infos = _regexs[regex].findall(str(html))
			
 
				+
			
 
				+        if len(infos) > 0:
			
 
				+            # print(regex)
			
 
				+            break
			
 
				+
			
 
				+    if fetch_one:
			
 
				+        infos = infos if infos else ("",)
			
 
				+        return infos if len(infos) > 1 else infos[0]
			
 
				+    else:
			
 
				+        infos = allow_repeat and infos or sorted(set(infos), key=infos.index)
			
 
				+        infos = split.join(infos) if split else infos
			
 
				+        return infos
			
 
				+
			
 
				+
			
 
				+def table_json(table, save_one_blank=True):
			
 
				+    """
			
 
				+    将表格转为json 适应于 key：value 在一行类的表格
			
 
				+    @param table: 使用selector封装后的具有xpath的selector
			
 
				+    @param save_one_blank: 保留一个空白符
			
 
				+    @return:
			
 
				+    """
			
 
				+    data = {}
			
 
				+
			
 
				+    trs = table.xpath(".//tr")
			
 
				+    for tr in trs:
			
 
				+        tds = tr.xpath("./td|./th")
			
 
				+
			
 
				+        for i in range(0, len(tds), 2):
			
 
				+            if i + 1 > len(tds) - 1:
			
 
				+                break
			
 
				+
			
 
				+            key = tds[i].xpath("string(.)").extract_first(default="").strip()
			
 
				+            value = tds[i + 1].xpath("string(.)").extract_first(default="").strip()
			
 
				+            value = replace_str(value, "[\f\n\r\t\v]", "")
			
 
				+            value = replace_str(value, " +", " " if save_one_blank else "")
			
 
				+
			
 
				+            if key:
			
 
				+                data[key] = value
			
 
				+
			
 
				+    return data
			
 
				+
			
 
				+
			
 
				+def get_table_row_data(table):
			
 
				+    """
			
 
				+    获取表格里每一行数据
			
 
				+    @param table: 使用selector封装后的具有xpath的selector
			
 
				+    @return: [[],[]..]
			
 
				+    """
			
 
				+
			
 
				+    datas = []
			
 
				+    rows = table.xpath(".//tr")
			
 
				+    for row in rows:
			
 
				+        cols = row.xpath("./td|./th")
			
 
				+        row_datas = []
			
 
				+        for col in cols:
			
 
				+            data = col.xpath("string(.)").extract_first(default="").strip()
			
 
				+            row_datas.append(data)
			
 
				+        datas.append(row_datas)
			
 
				+
			
 
				+    return datas
			
 
				+
			
 
				+
			
 
				+def rows2json(rows, keys=None):
			
 
				+    """
			
 
				+    将行数据转为json
			
 
				+    @param rows: 每一行的数据
			
 
				+    @param keys: json的key，空时将rows的第一行作为key
			
 
				+    @return:
			
 
				+    """
			
 
				+    data_start_pos = 0 if keys else 1
			
 
				+    datas = []
			
 
				+    keys = keys or rows[0]
			
 
				+    for values in rows[data_start_pos:]:
			
 
				+        datas.append(dict(zip(keys, values)))
			
 
				+
			
 
				+    return datas
			
 
				+
			
 
				+
			
 
				+def get_form_data(form):
			
 
				+    """
			
 
				+    提取form中提交的数据
			
 
				+    :param form: 使用selector封装后的具有xpath的selector
			
 
				+    :return:
			
 
				+    """
			
 
				+    data = {}
			
 
				+    inputs = form.xpath(".//input")
			
 
				+    for input in inputs:
			
 
				+        name = input.xpath("./@name").extract_first()
			
 
				+        value = input.xpath("./@value").extract_first()
			
 
				+        if name:
			
 
				+            data[name] = value
			
 
				+
			
 
				+    return data
			
 
				+
			
 
				+
			
 
# Does not work well on macOS, so this get_tld-based version is disabled:
			
 
				+# def get_domain(url):
			
 
				+#     domain = ''
			
 
				+#     try:
			
 
				+#         domain = get_tld(url)
			
 
				+#     except Exception as e:
			
 
				+#         log.debug(e)
			
 
				+#     return domain
			
 
				+
			
 
				+
			
 
				+def get_domain(url):
			
 
				+    proto, rest = urllib.parse.splittype(url)
			
 
				+    domain, rest = urllib.parse.splithost(rest)
			
 
				+    return domain
			
 
				+
			
 
				+
			
 
				+def get_index_url(url):
			
 
				+    return "/".join(url.split("/")[:3])
			
 
				+
			
 
				+
			
 
				+def get_ip(domain):
			
 
				+    ip = socket.getaddrinfo(domain, "http")[0][4][0]
			
 
				+    return ip
			
 
				+
			
 
				+
			
 
				+def get_localhost_ip():
			
 
				+    """
			
 
				+    利用 UDP 协议来实现的，生成一个UDP包，把自己的 IP 放如到 UDP 协议头中，然后从UDP包中获取本机的IP。
			
 
				+    这个方法并不会真实的向外部发包，所以用抓包工具是看不到的
			
 
				+    :return:
			
 
				+    """
			
 
				+    s = None
			
 
				+    try:
			
 
				+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			
 
				+        s.connect(("8.8.8.8", 80))
			
 
				+        ip = s.getsockname()[0]
			
 
				+    finally:
			
 
				+        if s:
			
 
				+            s.close()
			
 
				+
			
 
				+    return ip
			
 
				+
			
 
				+
			
 
				+def ip_to_num(ip):
			
 
				+    import struct
			
 
				+
			
 
				+    ip_num = socket.ntohl(struct.unpack("I", socket.inet_aton(str(ip)))[0])
			
 
				+    return ip_num
			
 
				+
			
 
				+
			
 
				+def is_valid_proxy(proxy, check_url=None):
			
 
				+    """
			
 
				+    检验代理是否有效
			
 
				+    @param proxy: xxx.xxx.xxx:xxx
			
 
				+    @param check_url: 利用目标网站检查，目标网站url。默认为None， 使用代理服务器的socket检查, 但不能排除Connection closed by foreign host
			
 
				+    @return: True / False
			
 
				+    """
			
 
				+    is_valid = False
			
 
				+
			
 
				+    if check_url:
			
 
				+        proxies = {"http": f"http://{proxy}", "https": f"https://{proxy}"}
			
 
				+        headers = {
			
 
				+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
			
 
				+        }
			
 
				+        response = None
			
 
				+        try:
			
 
				+            response = requests.get(
			
 
				+                check_url, headers=headers, proxies=proxies, stream=True, timeout=20
			
 
				+            )
			
 
				+            is_valid = True
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            log.error("check proxy failed: {} {}".format(e, proxy))
			
 
				+
			
 
				+        finally:
			
 
				+            if response:
			
 
				+                response.close()
			
 
				+
			
 
				+    else:
			
 
				+        ip, port = proxy.split(":")
			
 
				+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sk:
			
 
				+            sk.settimeout(7)
			
 
				+            try:
			
 
				+                sk.connect((ip, int(port)))  # 检查代理服务器是否开着
			
 
				+                is_valid = True
			
 
				+
			
 
				+            except Exception as e:
			
 
				+                log.error("check proxy failed: {} {}:{}".format(e, ip, port))
			
 
				+
			
 
				+    return is_valid
			
 
				+
			
 
				+
			
 
				+def is_valid_url(url):
			
 
				+    """
			
 
				+    验证url是否合法
			
 
				+    :param url:
			
 
				+    :return:
			
 
				+    """
			
 
				+    if re.match(r"(^https?:/{2}\w.+$)|(ftp://)", url):
			
 
				+        return True
			
 
				+    else:
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def get_text(soup, *args):
			
 
				+    try:
			
 
				+        return soup.get_text()
			
 
				+    except Exception as e:
			
 
				+        log.error(e)
			
 
				+        return ""
			
 
				+
			
 
				+
			
 
				+def del_html_tag(content, except_line_break=False, save_img=False, white_replaced=""):
			
 
				+    """
			
 
				+    删除html标签
			
 
				+    @param content: html内容
			
 
				+    @param except_line_break: 保留p标签
			
 
				+    @param save_img: 保留图片
			
 
				+    @param white_replaced: 空白符替换
			
 
				+    @return:
			
 
				+    """
			
 
				+    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?)忽略大小写
			
 
				+    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
			
 
				+    content = replace_str(content, "<!--(.|\n)*?-->")
			
 
				+    content = replace_str(
			
 
				+        content, "(?!&[a-z]+=)&[a-z]+;?"
			
 
				+    )  # 干掉&nbsp等无用的字符 但&xxx= 这种表示参数的除外
			
 
				+    if except_line_break:
			
 
				+        content = content.replace("</p>", "/p")
			
 
				+        content = replace_str(content, "<[^p].*?>")
			
 
				+        content = content.replace("/p", "</p>")
			
 
				+        content = replace_str(content, "[ \f\r\t\v]")
			
 
				+
			
 
				+    elif save_img:
			
 
				+        content = replace_str(content, "(?!<img.+?>)<.+?>")  # 替换掉除图片外的其他标签
			
 
				+        content = replace_str(content, "(?! +)\s+", "\n")  # 保留空格
			
 
				+        content = content.strip()
			
 
				+
			
 
				+    else:
			
 
				+        content = replace_str(content, "<(.|\n)*?>")
			
 
				+        content = replace_str(content, "\s", white_replaced)
			
 
				+        content = content.strip()
			
 
				+
			
 
				+    return content
			
 
				+
			
 
				+
			
 
				+def del_html_js_css(content):
			
 
				+    content = replace_str(content, "(?i)<script(.|\n)*?</script>")  # (?)忽略大小写
			
 
				+    content = replace_str(content, "(?i)<style(.|\n)*?</style>")
			
 
				+    content = replace_str(content, "<!--(.|\n)*?-->")
			
 
				+
			
 
				+    return content
			
 
				+
			
 
				+
			
 
				+def is_have_chinese(content):
			
 
				+    regex = "[\u4e00-\u9fa5]+"
			
 
				+    chinese_word = get_info(content, regex)
			
 
				+    return chinese_word and True or False
			
 
				+
			
 
				+
			
 
				+def is_have_english(content):
			
 
				+    regex = "[a-zA-Z]+"
			
 
				+    english_words = get_info(content, regex)
			
 
				+    return english_words and True or False
			
 
				+
			
 
				+
			
 
				+def get_chinese_word(content):
			
 
				+    regex = "[\u4e00-\u9fa5]+"
			
 
				+    chinese_word = get_info(content, regex)
			
 
				+    return chinese_word
			
 
				+
			
 
				+
			
 
				+def get_english_words(content):
			
 
				+    regex = "[a-zA-Z]+"
			
 
				+    english_words = get_info(content, regex)
			
 
				+    return english_words or ""
			
 
				+
			
 
				+
			
 
				+##################################################
			
 
				+def get_json(json_str):
			
 
				+    """
			
 
				+    @summary: 取json对象
			
 
				+    ---------
			
 
				+    @param json_str: json格式的字符串
			
 
				+    ---------
			
 
				+    @result: 返回json对象
			
 
				+    """
			
 
				+
			
 
				+    try:
			
 
				+        return json.loads(json_str) if json_str else {}
			
 
				+    except Exception as e1:
			
 
				+        try:
			
 
				+            json_str = json_str.strip()
			
 
				+            json_str = json_str.replace("'", '"')
			
 
				+            keys = get_info(json_str, "(\w+):")
			
 
				+            for key in keys:
			
 
				+                json_str = json_str.replace(key, '"%s"' % key)
			
 
				+
			
 
				+            return json.loads(json_str) if json_str else {}
			
 
				+
			
 
				+        except Exception as e2:
			
 
				+            log.error(
			
 
				+                """
			
 
				+                e1: %s
			
 
				+                format json_str: %s
			
 
				+                e2: %s
			
 
				+                """
			
 
				+                % (e1, json_str, e2)
			
 
				+            )
			
 
				+
			
 
				+        return {}
			
 
				+
			
 
				+
			
 
				+def jsonp2json(jsonp):
			
 
				+    """
			
 
				+    将jsonp转为json
			
 
				+    @param jsonp: jQuery172013600082560040794_1553230569815({})
			
 
				+    @return:
			
 
				+    """
			
 
				+    try:
			
 
				+        return json.loads(re.match(".*?({.*}).*", jsonp, re.S).group(1))
			
 
				+    except:
			
 
				+        raise ValueError("Invalid Input")
			
 
				+
			
 
				+
			
 
				+def dumps_json(json_, indent=4, sort_keys=False):
			
 
				+    """
			
 
				+    @summary: 格式化json 用于打印
			
 
				+    ---------
			
 
				+    @param json_: json格式的字符串或json对象
			
 
				+    ---------
			
 
				+    @result: 格式化后的字符串
			
 
				+    """
			
 
				+    try:
			
 
				+        if isinstance(json_, str):
			
 
				+            json_ = get_json(json_)
			
 
				+
			
 
				+        json_ = json.dumps(
			
 
				+            json_, ensure_ascii=False, indent=indent, skipkeys=True, sort_keys=sort_keys
			
 
				+        )
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        log.error(e)
			
 
				+        json_ = pformat(json_)
			
 
				+
			
 
				+    return json_
			
 
				+
			
 
				+
			
 
				+def get_json_value(json_object, key):
			
 
				+    """
			
 
				+    @summary:
			
 
				+    ---------
			
 
				+    @param json_object: json对象或json格式的字符串
			
 
				+    @param key: 建值 如果在多个层级目录下 可写 key1.key2  如{'key1':{'key2':3}}
			
 
				+    ---------
			
 
				+    @result: 返回对应的值，如果没有，返回''
			
 
				+    """
			
 
				+    current_key = ""
			
 
				+    value = ""
			
 
				+    try:
			
 
				+        json_object = (
			
 
				+            isinstance(json_object, str) and get_json(json_object) or json_object
			
 
				+        )
			
 
				+
			
 
				+        current_key = key.split(".")[0]
			
 
				+        value = json_object[current_key]
			
 
				+
			
 
				+        key = key[key.find(".") + 1 :]
			
 
				+    except Exception as e:
			
 
				+        return value
			
 
				+
			
 
				+    if key == current_key:
			
 
				+        return value
			
 
				+    else:
			
 
				+        return get_json_value(value, key)
			
 
				+
			
 
				+
			
 
				+def get_all_keys(datas, depth=None, current_depth=0):
			
 
				+    """
			
 
				+    @summary: 获取json李所有的key
			
 
				+    ---------
			
 
				+    @param datas: dict / list
			
 
				+    @param depth: 字典key的层级 默认不限制层级 层级从1开始
			
 
				+    @param current_depth: 字典key的当前层级 不用传参
			
 
				+    ---------
			
 
				+    @result: 返回json所有的key
			
 
				+    """
			
 
				+
			
 
				+    keys = []
			
 
				+    if depth and current_depth >= depth:
			
 
				+        return keys
			
 
				+
			
 
				+    if isinstance(datas, list):
			
 
				+        for data in datas:
			
 
				+            keys.extend(get_all_keys(data, depth, current_depth=current_depth + 1))
			
 
				+    elif isinstance(datas, dict):
			
 
				+        for key, value in datas.items():
			
 
				+            keys.append(key)
			
 
				+            if isinstance(value, dict):
			
 
				+                keys.extend(get_all_keys(value, depth, current_depth=current_depth + 1))
			
 
				+
			
 
				+    return keys
			
 
				+
			
 
				+
			
 
				+def to_chinese(unicode_str):
			
 
				+    format_str = json.loads('{"chinese":"%s"}' % unicode_str)
			
 
				+    return format_str["chinese"]
			
 
				+
			
 
				+
			
 
				+##################################################
			
 
				+def replace_str(source_str, regex, replace_str=""):
			
 
				+    """
			
 
				+    @summary: 替换字符串
			
 
				+    ---------
			
 
				+    @param source_str: 原字符串
			
 
				+    @param regex: 正则
			
 
				+    @param replace_str: 用什么来替换 默认为''
			
 
				+    ---------
			
 
				+    @result: 返回替换后的字符串
			
 
				+    """
			
 
				+    str_info = re.compile(regex)
			
 
				+    return str_info.sub(replace_str, source_str)
			
 
				+
			
 
				+
			
 
				+def del_redundant_blank_character(text):
			
 
				+    """
			
 
				+    删除冗余的空白符， 只保留一个
			
 
				+    :param text:
			
 
				+    :return:
			
 
				+    """
			
 
				+    return re.sub("\s+", " ", text)
			
 
				+
			
 
				+
			
 
				+##################################################
			
 
				+def get_conf_value(config_file, section, key):
			
 
				+    cp = configparser.ConfigParser(allow_no_value=True)
			
 
				+    with codecs.open(config_file, "r", encoding="utf-8") as f:
			
 
				+        cp.read_file(f)
			
 
				+    return cp.get(section, key)
			
 
				+
			
 
				+
			
 
				+def mkdir(path):
			
 
				+    try:
			
 
				+        if not os.path.exists(path):
			
 
				+            os.makedirs(path)
			
 
				+    except OSError as exc:  # Python >2.5
			
 
				+        pass
			
 
				+
			
 
				+
			
 
				+def write_file(filename, content, mode="w", encoding="utf-8"):
			
 
				+    """
			
 
				+    @summary: 写文件
			
 
				+    ---------
			
 
				+    @param filename: 文件名（有路径）
			
 
				+    @param content: 内容
			
 
				+    @param mode: 模式 w/w+ (覆盖/追加)
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    directory = os.path.dirname(filename)
			
 
				+    mkdir(directory)
			
 
				+    with open(filename, mode, encoding=encoding) as file:
			
 
				+        file.writelines(content)
			
 
				+
			
 
				+
			
 
				+def read_file(filename, readlines=False, encoding="utf-8"):
			
 
				+    """
			
 
				+    @summary: 读文件
			
 
				+    ---------
			
 
				+    @param filename: 文件名（有路径）
			
 
				+    @param readlines: 按行读取 （默认False）
			
 
				+    ---------
			
 
				+    @result: 按行读取返回List，否则返回字符串
			
 
				+    """
			
 
				+
			
 
				+    content = None
			
 
				+    try:
			
 
				+        with open(filename, "r", encoding=encoding) as file:
			
 
				+            content = file.readlines() if readlines else file.read()
			
 
				+    except Exception as e:
			
 
				+        log.error(e)
			
 
				+
			
 
				+    return content
			
 
				+
			
 
				+
			
 
				+def get_oss_file_list(oss_handler, prefix, date_range_min, date_range_max=None):
			
 
				+    """
			
 
				+    获取文件列表
			
 
				+    @param prefix: 路径前缀 如 data/car_service_line/yiche/yiche_serial_zongshu_info
			
 
				+    @param date_range_min: 时间范围 最小值 日期分隔符为/ 如 2019/03/01 或 2019/03/01/00/00/00
			
 
				+    @param date_range_max: 时间范围 最大值 日期分隔符为/ 如 2019/03/01 或 2019/03/01/00/00/00
			
 
				+    @return: 每个文件路径 如 html/e_commerce_service_line/alibaba/alibaba_shop_info/2019/03/22/15/53/15/8ca8b9e4-4c77-11e9-9dee-acde48001122.json.snappy
			
 
				+    """
			
 
				+
			
 
				+    # 计算时间范围
			
 
				+    date_range_max = date_range_max or date_range_min
			
 
				+    date_format = "/".join(
			
 
				+        ["%Y", "%m", "%d", "%H", "%M", "%S"][: date_range_min.count("/") + 1]
			
 
				+    )
			
 
				+    time_interval = [
			
 
				+        {"days": 365},
			
 
				+        {"days": 31},
			
 
				+        {"days": 1},
			
 
				+        {"hours": 1},
			
 
				+        {"minutes": 1},
			
 
				+        {"seconds": 1},
			
 
				+    ][date_range_min.count("/")]
			
 
				+    date_range = get_between_date(
			
 
				+        date_range_min, date_range_max, date_format=date_format, **time_interval
			
 
				+    )
			
 
				+
			
 
				+    for date in date_range:
			
 
				+        file_folder_path = os.path.join(prefix, date)
			
 
				+        objs = oss_handler.list(prefix=file_folder_path)
			
 
				+        for obj in objs:
			
 
				+            filename = obj.key
			
 
				+            yield filename
			
 
				+
			
 
				+
			
 
				+def is_html(url):
			
 
				+    if not url:
			
 
				+        return False
			
 
				+
			
 
				+    try:
			
 
				+        content_type = request.urlopen(url).info().get("Content-Type", "")
			
 
				+
			
 
				+        if "text/html" in content_type:
			
 
				+            return True
			
 
				+        else:
			
 
				+            return False
			
 
				+    except Exception as e:
			
 
				+        log.error(e)
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def is_exist(file_path):
			
 
				+    """
			
 
				+    @summary: 文件是否存在
			
 
				+    ---------
			
 
				+    @param file_path:
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    return os.path.exists(file_path)
			
 
				+
			
 
				+
			
 
				+def download_file(url, file_path, *, call_func=None, proxies=None, data=None):
			
 
				+    """
			
 
				+    下载文件，会自动创建文件存储目录
			
 
				+    Args:
			
 
				+        url: 地址
			
 
				+        file_path: 文件存储地址
			
 
				+        call_func: 下载成功的回调
			
 
				+        proxies: 代理
			
 
				+        data: 请求体
			
 
				+
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    directory = os.path.dirname(file_path)
			
 
				+    mkdir(directory)
			
 
				+
			
 
				+    # 进度条
			
 
				+    def progress_callfunc(blocknum, blocksize, totalsize):
			
 
				+        """回调函数
			
 
				+        @blocknum : 已经下载的数据块
			
 
				+        @blocksize : 数据块的大小
			
 
				+        @totalsize: 远程文件的大小
			
 
				+        """
			
 
				+        percent = 100.0 * blocknum * blocksize / totalsize
			
 
				+        if percent > 100:
			
 
				+            percent = 100
			
 
				+        # print ('进度条 %.2f%%' % percent, end = '\r')
			
 
				+        sys.stdout.write("进度条 %.2f%%" % percent + "\r")
			
 
				+        sys.stdout.flush()
			
 
				+
			
 
				+    if url:
			
 
				+        try:
			
 
				+            if proxies:
			
 
				+                # create the object, assign it to a variable
			
 
				+                proxy = request.ProxyHandler(proxies)
			
 
				+                # construct a new opener using your proxy settings
			
 
				+                opener = request.build_opener(proxy)
			
 
				+                # install the openen on the module-level
			
 
				+                request.install_opener(opener)
			
 
				+
			
 
				+            request.urlretrieve(url, file_path, progress_callfunc, data)
			
 
				+
			
 
				+            if callable(call_func):
			
 
				+                call_func()
			
 
				+            return 1
			
 
				+        except Exception as e:
			
 
				+            log.error(e)
			
 
				+            return 0
			
 
				+    else:
			
 
				+        return 0
			
 
				+
			
 
				+
			
 
				+def get_file_list(path, ignore=[]):
			
 
				+    templist = path.split("*")
			
 
				+    path = templist[0]
			
 
				+    file_type = templist[1] if len(templist) >= 2 else ""
			
 
				+
			
 
				+    # 递归遍历文件
			
 
				+    def get_file_list_(path, file_type, ignore, all_file=[]):
			
 
				+        file_list = os.listdir(path)
			
 
				+
			
 
				+        for file_name in file_list:
			
 
				+            if file_name in ignore:
			
 
				+                continue
			
 
				+
			
 
				+            file_path = os.path.join(path, file_name)
			
 
				+            if os.path.isdir(file_path):
			
 
				+                get_file_list_(file_path, file_type, ignore, all_file)
			
 
				+            else:
			
 
				+                if not file_type or file_name.endswith(file_type):
			
 
				+                    all_file.append(file_path)
			
 
				+
			
 
				+        return all_file
			
 
				+
			
 
				+    return get_file_list_(path, file_type, ignore) if os.path.isdir(path) else [path]
			
 
				+
			
 
				+
			
 
				+def rename_file(old_name, new_name):
			
 
				+    os.rename(old_name, new_name)
			
 
				+
			
 
				+
			
 
				+def del_file(path, ignore=()):
			
 
				+    files = get_file_list(path, ignore)
			
 
				+    for file in files:
			
 
				+        try:
			
 
				+            os.remove(file)
			
 
				+        except Exception as e:
			
 
				+            log.error(
			
 
				+                """
			
 
				+                删除出错: %s
			
 
				+                Exception : %s
			
 
				+                """
			
 
				+                % (file, str(e))
			
 
				+            )
			
 
				+        finally:
			
 
				+            pass
			
 
				+
			
 
				+
			
 
				+def get_file_type(file_name):
			
 
				+    """
			
 
				+    @summary: 取文件后缀名
			
 
				+    ---------
			
 
				+    @param file_name:
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+    try:
			
 
				+        return os.path.splitext(file_name)[1]
			
 
				+    except Exception as e:
			
 
				+        log.exception(e)
			
 
				+
			
 
				+
			
 
				+def get_file_path(file_path):
			
 
				+    """
			
 
				+    @summary: 取文件路径
			
 
				+    ---------
			
 
				+    @param file_path: /root/a.py
			
 
				+    ---------
			
 
				+    @result: /root
			
 
				+    """
			
 
				+    try:
			
 
				+        return os.path.split(file_path)[0]
			
 
				+    except Exception as e:
			
 
				+        log.exception(e)
			
 
				+
			
 
				+
			
 
				+#############################################
			
 
				+
			
 
				+
			
 
				+def exec_js(js_code):
			
 
				+    """
			
 
				+    @summary: 执行js代码
			
 
				+    ---------
			
 
				+    @param js_code: js代码
			
 
				+    ---------
			
 
				+    @result: 返回执行结果
			
 
				+    """
			
 
				+
			
 
				+    return execjs.eval(js_code)
			
 
				+
			
 
				+
			
 
				+def compile_js(js_func):
			
 
				+    """
			
 
				+    @summary: 编译js函数
			
 
				+    ---------
			
 
				+    @param js_func:js函数
			
 
				+    ---------
			
 
				+    @result: 返回函数对象 调用 fun('js_funName', param1,param2)
			
 
				+    """
			
 
				+
			
 
				+    ctx = execjs.compile(js_func)
			
 
				+    return ctx.call
			
 
				+
			
 
				+
			
 
				+###############################################
			
 
				+
			
 
				+#############################################
			
 
				+
			
 
				+
			
 
				+def date_to_timestamp(date, time_format="%Y-%m-%d %H:%M:%S"):
			
 
				+    """
			
 
				+    @summary:
			
 
				+    ---------
			
 
				+    @param date:将"2011-09-28 10:00:00"时间格式转化为时间戳
			
 
				+    @param format:时间格式
			
 
				+    ---------
			
 
				+    @result: 返回时间戳
			
 
				+    """
			
 
				+
			
 
				+    timestamp = time.mktime(time.strptime(date, time_format))
			
 
				+    return int(timestamp)
			
 
				+
			
 
				+
			
 
				+def timestamp_to_date(timestamp, time_format="%Y-%m-%d %H:%M:%S"):
			
 
				+    """
			
 
				+    @summary:
			
 
				+    ---------
			
 
				+    @param timestamp: 将时间戳转化为日期
			
 
				+    @param format: 日期格式
			
 
				+    ---------
			
 
				+    @result: 返回日期
			
 
				+    """
			
 
				+    if timestamp is None:
			
 
				+        raise ValueError("timestamp is null")
			
 
				+
			
 
				+    date = time.localtime(timestamp)
			
 
				+    return time.strftime(time_format, date)
			
 
				+
			
 
				+
			
 
				+def get_current_timestamp():
			
 
				+    return int(time.time())
			
 
				+
			
 
				+
			
 
				+def get_current_date(date_format="%Y-%m-%d %H:%M:%S"):
			
 
				+    return datetime.datetime.now().strftime(date_format)
			
 
				+    # return time.strftime(date_format, time.localtime(time.time()))
			
 
				+
			
 
				+
			
 
				+def get_date_number(year=None, month=None, day=None):
			
 
				+    """
			
 
				+    @summary: 获取指定日期对应的日期数
			
 
				+    默认当前周
			
 
				+    ---------
			
 
				+    @param year: 2010
			
 
				+    @param month: 6
			
 
				+    @param day: 16
			
 
				+    ---------
			
 
				+    @result: (年号，第几周，第几天) 如 (2010, 24, 3)
			
 
				+    """
			
 
				+    if year and month and day:
			
 
				+        return datetime.date(year, month, day).isocalendar()
			
 
				+    elif not any([year, month, day]):
			
 
				+        return datetime.datetime.now().isocalendar()
			
 
				+    else:
			
 
				+        assert year, "year 不能为空"
			
 
				+        assert month, "month 不能为空"
			
 
				+        assert day, "day 不能为空"
			
 
				+
			
 
				+
			
 
				+def get_between_date(
			
 
				+    begin_date, end_date=None, date_format="%Y-%m-%d", **time_interval
			
 
				+):
			
 
				+    """
			
 
				+    @summary: 获取一段时间间隔内的日期，默认为每一天
			
 
				+    ---------
			
 
				+    @param begin_date: 开始日期 str 如 2018-10-01
			
 
				+    @param end_date: 默认为今日
			
 
				+    @param date_format: 日期格式，应与begin_date的日期格式相对应
			
 
				+    @param time_interval: 时间间隔 默认一天 支持 days、seconds、microseconds、milliseconds、minutes、hours、weeks
			
 
				+    ---------
			
 
				+    @result: list 值为字符串
			
 
				+    """
			
 
				+
			
 
				+    date_list = []
			
 
				+
			
 
				+    begin_date = datetime.datetime.strptime(begin_date, date_format)
			
 
				+    end_date = (
			
 
				+        datetime.datetime.strptime(end_date, date_format)
			
 
				+        if end_date
			
 
				+        else datetime.datetime.strptime(
			
 
				+            time.strftime(date_format, time.localtime(time.time())), date_format
			
 
				+        )
			
 
				+    )
			
 
				+    time_interval = time_interval or dict(days=1)
			
 
				+
			
 
				+    while begin_date <= end_date:
			
 
				+        date_str = begin_date.strftime(date_format)
			
 
				+        date_list.append(date_str)
			
 
				+
			
 
				+        begin_date += datetime.timedelta(**time_interval)
			
 
				+
			
 
				+    if end_date.strftime(date_format) not in date_list:
			
 
				+        date_list.append(end_date.strftime(date_format))
			
 
				+
			
 
				+    return date_list
			
 
				+
			
 
				+
			
 
				+def get_between_months(begin_date, end_date=None):
			
 
				+    """
			
 
				+    @summary: 获取一段时间间隔内的月份
			
 
				+    需要满一整月
			
 
				+    ---------
			
 
				+    @param begin_date: 开始时间 如 2018-01-01
			
 
				+    @param end_date: 默认当前时间
			
 
				+    ---------
			
 
				+    @result: 列表 如 ['2018-01', '2018-02']
			
 
				+    """
			
 
				+
			
 
				+    def add_months(dt, months):
			
 
				+        month = dt.month - 1 + months
			
 
				+        year = dt.year + month // 12
			
 
				+        month = month % 12 + 1
			
 
				+        day = min(dt.day, calendar.monthrange(year, month)[1])
			
 
				+        return dt.replace(year=year, month=month, day=day)
			
 
				+
			
 
				+    date_list = []
			
 
				+    begin_date = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
			
 
				+    end_date = (
			
 
				+        datetime.datetime.strptime(end_date, "%Y-%m-%d")
			
 
				+        if end_date
			
 
				+        else datetime.datetime.strptime(
			
 
				+            time.strftime("%Y-%m-%d", time.localtime(time.time())), "%Y-%m-%d"
			
 
				+        )
			
 
				+    )
			
 
				+    while begin_date <= end_date:
			
 
				+        date_str = begin_date.strftime("%Y-%m")
			
 
				+        date_list.append(date_str)
			
 
				+        begin_date = add_months(begin_date, 1)
			
 
				+    return date_list
			
 
				+
			
 
				+
			
 
				+def get_today_of_day(day_offset=0):
			
 
				+    return str(datetime.date.today() + datetime.timedelta(days=day_offset))
			
 
				+
			
 
				+
			
 
				+def get_days_of_month(year, month):
			
 
				+    """
			
 
				+    返回天数
			
 
				+    """
			
 
				+
			
 
				+    return calendar.monthrange(year, month)[1]
			
 
				+
			
 
				+
			
 
				+def get_firstday_of_month(date):
			
 
				+    """''
			
 
				+    date format = "YYYY-MM-DD"
			
 
				+    """
			
 
				+
			
 
				+    year, month, day = date.split("-")
			
 
				+    year, month, day = int(year), int(month), int(day)
			
 
				+
			
 
				+    days = "01"
			
 
				+    if int(month) < 10:
			
 
				+        month = "0" + str(int(month))
			
 
				+    arr = (year, month, days)
			
 
				+    return "-".join("%s" % i for i in arr)
			
 
				+
			
 
				+
			
 
				+def get_lastday_of_month(date):
			
 
				+    """''
			
 
				+    get the last day of month
			
 
				+    date format = "YYYY-MM-DD"
			
 
				+    """
			
 
				+    year, month, day = date.split("-")
			
 
				+    year, month, day = int(year), int(month), int(day)
			
 
				+
			
 
				+    days = calendar.monthrange(year, month)[1]
			
 
				+    month = add_zero(month)
			
 
				+    arr = (year, month, days)
			
 
				+    return "-".join("%s" % i for i in arr)
			
 
				+
			
 
				+
			
 
				+def get_firstday_month(month_offset=0):
			
 
				+    """''
			
 
				+    get the first day of month from today
			
 
				+    month_offset is how many months
			
 
				+    """
			
 
				+    (y, m, d) = get_year_month_and_days(month_offset)
			
 
				+    d = "01"
			
 
				+    arr = (y, m, d)
			
 
				+    return "-".join("%s" % i for i in arr)
			
 
				+
			
 
				+
			
 
				+def get_lastday_month(month_offset=0):
			
 
				+    """''
			
 
				+    get the last day of month from today
			
 
				+    month_offset is how many months
			
 
				+    """
			
 
				+    return "-".join("%s" % i for i in get_year_month_and_days(month_offset))
			
 
				+
			
 
				+
			
 
				+def get_last_month(month_offset=0):
			
 
				+    """''
			
 
				+    get the last day of month from today
			
 
				+    month_offset is how many months
			
 
				+    """
			
 
				+    return "-".join("%s" % i for i in get_year_month_and_days(month_offset)[:2])
			
 
				+
			
 
				+
			
 
				+def get_year_month_and_days(month_offset=0):
			
 
				+    """
			
 
				+    @summary:
			
 
				+    ---------
			
 
				+    @param month_offset: 月份偏移量
			
 
				+    ---------
			
 
				+    @result: ('2019', '04', '30')
			
 
				+    """
			
 
				+
			
 
				+    today = datetime.datetime.now()
			
 
				+    year, month = today.year, today.month
			
 
				+
			
 
				+    this_year = int(year)
			
 
				+    this_month = int(month)
			
 
				+    total_month = this_month + month_offset
			
 
				+    if month_offset >= 0:
			
 
				+        if total_month <= 12:
			
 
				+            days = str(get_days_of_month(this_year, total_month))
			
 
				+            total_month = add_zero(total_month)
			
 
				+            return (year, total_month, days)
			
 
				+        else:
			
 
				+            i = total_month // 12
			
 
				+            j = total_month % 12
			
 
				+            if j == 0:
			
 
				+                i -= 1
			
 
				+                j = 12
			
 
				+            this_year += i
			
 
				+            days = str(get_days_of_month(this_year, j))
			
 
				+            j = add_zero(j)
			
 
				+            return (str(this_year), str(j), days)
			
 
				+    else:
			
 
				+        if (total_month > 0) and (total_month < 12):
			
 
				+            days = str(get_days_of_month(this_year, total_month))
			
 
				+            total_month = add_zero(total_month)
			
 
				+            return (year, total_month, days)
			
 
				+        else:
			
 
				+            i = total_month // 12
			
 
				+            j = total_month % 12
			
 
				+            if j == 0:
			
 
				+                i -= 1
			
 
				+                j = 12
			
 
				+            this_year += i
			
 
				+            days = str(get_days_of_month(this_year, j))
			
 
				+            j = add_zero(j)
			
 
				+            return (str(this_year), str(j), days)
			
 
				+
			
 
				+
			
 
				+def add_zero(n):
			
 
				+    return "%02d" % n
			
 
				+
			
 
				+
			
 
				+def get_month(month_offset=0):
			
 
				+    """''
			
 
				+    获取当前日期前后N月的日期
			
 
				+    if month_offset>0, 获取当前日期前N月的日期
			
 
				+    if month_offset<0, 获取当前日期后N月的日期
			
 
				+    date format = "YYYY-MM-DD"
			
 
				+    """
			
 
				+    today = datetime.datetime.now()
			
 
				+    day = add_zero(today.day)
			
 
				+
			
 
				+    (y, m, d) = get_year_month_and_days(month_offset)
			
 
				+    arr = (y, m, d)
			
 
				+    if int(day) < int(d):
			
 
				+        arr = (y, m, day)
			
 
				+    return "-".join("%s" % i for i in arr)
			
 
				+
			
 
				+
			
 
				+@run_safe_model("format_date")
			
 
				+def format_date(date, old_format="", new_format="%Y-%m-%d %H:%M:%S"):
			
 
				+    """
			
 
				+    @summary: 格式化日期格式
			
 
				+    ---------
			
 
				+    @param date: 日期 eg：2017年4月17日 3时27分12秒
			
 
				+    @param old_format: 原来的日期格式 如 '%Y年%m月%d日 %H时%M分%S秒'
			
 
				+        %y 两位数的年份表示（00-99）
			
 
				+        %Y 四位数的年份表示（000-9999）
			
 
				+        %m 月份（01-12）
			
 
				+        %d 月内中的一天（0-31）
			
 
				+        %H 24小时制小时数（0-23）
			
 
				+        %I 12小时制小时数（01-12）
			
 
				+        %M 分钟数（00-59）
			
 
				+        %S 秒（00-59）
			
 
				+    @param new_format: 输出的日期格式
			
 
				+    ---------
			
 
				+    @result: 格式化后的日期，类型为字符串 如2017-4-17 03:27:12
			
 
				+    """
			
 
				+    if not date:
			
 
				+        return ""
			
 
				+
			
 
				+    if not old_format:
			
 
				+        regex = "(\d+)"
			
 
				+        numbers = get_info(date, regex, allow_repeat=True)
			
 
				+        formats = ["%Y", "%m", "%d", "%H", "%M", "%S"]
			
 
				+        old_format = date
			
 
				+        for i, number in enumerate(numbers[:6]):
			
 
				+            if i == 0 and len(number) == 2:  # 年份可能是两位 用小%y
			
 
				+                old_format = old_format.replace(
			
 
				+                    number, formats[i].lower(), 1
			
 
				+                )  # 替换一次 '2017年11月30日 11:49' 防止替换11月时，替换11小时
			
 
				+            else:
			
 
				+                old_format = old_format.replace(number, formats[i], 1)  # 替换一次
			
 
				+
			
 
				+    try:
			
 
				+        date_obj = datetime.datetime.strptime(date, old_format)
			
 
				+        if "T" in date and "Z" in date:
			
 
				+            date_obj += datetime.timedelta(hours=8)
			
 
				+            date_str = date_obj.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+        else:
			
 
				+            date_str = datetime.datetime.strftime(date_obj, new_format)
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        log.error("日期格式化出错，old_format = %s 不符合 %s 格式" % (old_format, date))
			
 
				+        date_str = date
			
 
				+
			
 
				+    return date_str
			
 
				+
			
 
				+
			
 
				+def transform_lower_num(data_str: str):
			
 
				+    num_map = {
			
 
				+        "一": "1",
			
 
				+        "二": "2",
			
 
				+        "三": "3",
			
 
				+        "四": "4",
			
 
				+        "五": "5",
			
 
				+        "六": "6",
			
 
				+        "七": "7",
			
 
				+        "八": "8",
			
 
				+        "九": "9",
			
 
				+        "十": "0",
			
 
				+    }
			
 
				+    pattern = f'[{"|".join(num_map.keys())}|零]'
			
 
				+    res = re.search(pattern, data_str)
			
 
				+    if not res:
			
 
				+        #  如果字符串中没有包含中文数字 不做处理 直接返回
			
 
				+        return data_str
			
 
				+
			
 
				+    data_str = data_str.replace("0", "零")
			
 
				+    for n in num_map:
			
 
				+        data_str = data_str.replace(n, num_map[n])
			
 
				+
			
 
				+    re_data_str = re.findall("\d+", data_str)
			
 
				+    for i in re_data_str:
			
 
				+        if len(i) == 3:
			
 
				+            new_i = i.replace("0", "")
			
 
				+            data_str = data_str.replace(i, new_i, 1)
			
 
				+        elif len(i) == 4:
			
 
				+            new_i = i.replace("10", "")
			
 
				+            data_str = data_str.replace(i, new_i, 1)
			
 
				+        elif len(i) == 2 and int(i) < 10:
			
 
				+            new_i = int(i) + 10
			
 
				+            data_str = data_str.replace(i, str(new_i), 1)
			
 
				+        elif len(i) == 1 and int(i) == 0:
			
 
				+            new_i = int(i) + 10
			
 
				+            data_str = data_str.replace(i, str(new_i), 1)
			
 
				+
			
 
				+    return data_str.replace("零", "0")
			
 
				+
			
 
				+
			
 
				+@run_safe_model("format_time")
			
 
				+def format_time(release_time, date_format="%Y-%m-%d %H:%M:%S"):
			
 
				+    """
			
 
				+    >>> format_time("2个月前")
			
 
				+    '2021-08-15 16:24:21'
			
 
				+    >>> format_time("2月前")
			
 
				+    '2021-08-15 16:24:36'
			
 
				+    """
			
 
				+    release_time = transform_lower_num(release_time)
			
 
				+    release_time = release_time.replace("日", "天").replace("/", "-")
			
 
				+
			
 
				+    if "年前" in release_time:
			
 
				+        years = re.compile("(\d+)\s*年前").findall(release_time)
			
 
				+        years_ago = datetime.datetime.now() - datetime.timedelta(
			
 
				+            days=int(years[0]) * 365
			
 
				+        )
			
 
				+        release_time = years_ago.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+
			
 
				+    elif "月前" in release_time:
			
 
				+        months = re.compile("(\d+)[\s个]*月前").findall(release_time)
			
 
				+        months_ago = datetime.datetime.now() - datetime.timedelta(
			
 
				+            days=int(months[0]) * 30
			
 
				+        )
			
 
				+        release_time = months_ago.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+
			
 
				+    elif "周前" in release_time:
			
 
				+        weeks = re.compile("(\d+)\s*周前").findall(release_time)
			
 
				+        weeks_ago = datetime.datetime.now() - datetime.timedelta(days=int(weeks[0]) * 7)
			
 
				+        release_time = weeks_ago.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+
			
 
				+    elif "天前" in release_time:
			
 
				+        ndays = re.compile("(\d+)\s*天前").findall(release_time)
			
 
				+        days_ago = datetime.datetime.now() - datetime.timedelta(days=int(ndays[0]))
			
 
				+        release_time = days_ago.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+
			
 
				+    elif "小时前" in release_time:
			
 
				+        nhours = re.compile("(\d+)\s*小时前").findall(release_time)
			
 
				+        hours_ago = datetime.datetime.now() - datetime.timedelta(hours=int(nhours[0]))
			
 
				+        release_time = hours_ago.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+
			
 
				+    elif "分钟前" in release_time:
			
 
				+        nminutes = re.compile("(\d+)\s*分钟前").findall(release_time)
			
 
				+        minutes_ago = datetime.datetime.now() - datetime.timedelta(
			
 
				+            minutes=int(nminutes[0])
			
 
				+        )
			
 
				+        release_time = minutes_ago.strftime("%Y-%m-%d %H:%M:%S")
			
 
				+
			
 
				+    elif "前天" in release_time:
			
 
				+        today = datetime.date.today()
			
 
				+        yesterday = today - datetime.timedelta(days=2)
			
 
				+        release_time = release_time.replace("前天", str(yesterday))
			
 
				+
			
 
				+    elif "昨天" in release_time:
			
 
				+        today = datetime.date.today()
			
 
				+        yesterday = today - datetime.timedelta(days=1)
			
 
				+        release_time = release_time.replace("昨天", str(yesterday))
			
 
				+
			
 
				+    elif "今天" in release_time:
			
 
				+        release_time = release_time.replace("今天", get_current_date("%Y-%m-%d"))
			
 
				+
			
 
				+    elif "刚刚" in release_time:
			
 
				+        release_time = get_current_date()
			
 
				+
			
 
				+    elif re.search("^\d\d:\d\d", release_time):
			
 
				+        release_time = get_current_date("%Y-%m-%d") + " " + release_time
			
 
				+
			
 
				+    elif not re.compile("\d{4}").findall(release_time):
			
 
				+        month = re.compile("\d{1,2}").findall(release_time)
			
 
				+        if month and int(month[0]) <= int(get_current_date("%m")):
			
 
				+            release_time = get_current_date("%Y") + "-" + release_time
			
 
				+        else:
			
 
				+            release_time = str(int(get_current_date("%Y")) - 1) + "-" + release_time
			
 
				+
			
 
				+    # 把日和小时粘在一起的拆开
			
 
				+    template = re.compile("(\d{4}-\d{1,2}-\d{2})(\d{1,2})")
			
 
				+    release_time = re.sub(template, r"\1 \2", release_time)
			
 
				+    release_time = format_date(release_time, new_format=date_format)
			
 
				+
			
 
				+    return release_time
			
 
				+
			
 
				+
			
 
				+def to_date(date_str, date_format="%Y-%m-%d %H:%M:%S"):
			
 
				+    return datetime.datetime.strptime(date_str, date_format)
			
 
				+
			
 
				+
			
 
				+def get_before_date(
			
 
				+    current_date,
			
 
				+    days,
			
 
				+    current_date_format="%Y-%m-%d %H:%M:%S",
			
 
				+    return_date_format="%Y-%m-%d %H:%M:%S",
			
 
				+):
			
 
				+    """
			
 
				+    @summary: 获取之前时间
			
 
				+    ---------
			
 
				+    @param current_date: 当前时间 str类型
			
 
				+    @param days: 时间间隔 -1 表示前一天 1 表示后一天
			
 
				+    @param days: 返回的时间格式
			
 
				+    ---------
			
 
				+    @result: 字符串
			
 
				+    """
			
 
				+
			
 
				+    current_date = to_date(current_date, current_date_format)
			
 
				+    date_obj = current_date + datetime.timedelta(days=days)
			
 
				+    return datetime.datetime.strftime(date_obj, return_date_format)
			
 
				+
			
 
				+
			
 
				+def delay_time(sleep_time=60):
			
 
				+    """
			
 
				+    @summary: 睡眠  默认1分钟
			
 
				+    ---------
			
 
				+    @param sleep_time: 以秒为单位
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    time.sleep(sleep_time)
			
 
				+
			
 
				+
			
 
				+def format_seconds(seconds):
			
 
				+    """
			
 
				+    @summary: 将秒转为时分秒
			
 
				+    ---------
			
 
				+    @param seconds:
			
 
				+    ---------
			
 
				+    @result: 2天3小时2分49秒
			
 
				+    """
			
 
				+
			
 
				+    seconds = int(seconds + 0.5)  # 向上取整
			
 
				+
			
 
				+    m, s = divmod(seconds, 60)
			
 
				+    h, m = divmod(m, 60)
			
 
				+    d, h = divmod(h, 24)
			
 
				+
			
 
				+    times = ""
			
 
				+    if d:
			
 
				+        times += "{}天".format(d)
			
 
				+    if h:
			
 
				+        times += "{}小时".format(h)
			
 
				+    if m:
			
 
				+        times += "{}分".format(m)
			
 
				+    if s:
			
 
				+        times += "{}秒".format(s)
			
 
				+
			
 
				+    return times
			
 
				+
			
 
				+
			
 
				+################################################
			
 
				+def get_md5(*args):
			
 
				+    """
			
 
				+    @summary: 获取唯一的32位md5
			
 
				+    ---------
			
 
				+    @param *args: 参与联合去重的值
			
 
				+    ---------
			
 
				+    @result: 7c8684bcbdfcea6697650aa53d7b1405
			
 
				+    """
			
 
				+
			
 
				+    m = hashlib.md5()
			
 
				+    for arg in args:
			
 
				+        m.update(str(arg).encode())
			
 
				+
			
 
				+    return m.hexdigest()
			
 
				+
			
 
				+
			
 
				+def get_sha1(*args):
			
 
				+    """
			
 
				+    @summary: 获取唯一的40位值， 用于获取唯一的id
			
 
				+    ---------
			
 
				+    @param *args: 参与联合去重的值
			
 
				+    ---------
			
 
				+    @result: ba4868b3f277c8e387b55d9e3d0be7c045cdd89e
			
 
				+    """
			
 
				+
			
 
				+    sha1 = hashlib.sha1()
			
 
				+    for arg in args:
			
 
				+        sha1.update(str(arg).encode())
			
 
				+    return sha1.hexdigest()  # 40位
			
 
				+
			
 
				+
			
 
				+def get_base64(secret, message):
			
 
				+    """
			
 
				+    @summary: 数字证书签名算法是："HMAC-SHA256"
			
 
				+              参考：https://www.jokecamp.com/blog/examples-of-creating-base64-hashes-using-hmac-sha256-in-different-languages/
			
 
				+    ---------
			
 
				+    @param secret: 秘钥
			
 
				+    @param message: 消息
			
 
				+    ---------
			
 
				+    @result: 签名输出类型是："base64"
			
 
				+    """
			
 
				+
			
 
				+    import hashlib
			
 
				+    import hmac
			
 
				+    import base64
			
 
				+
			
 
				+    message = bytes(message, "utf-8")
			
 
				+    secret = bytes(secret, "utf-8")
			
 
				+
			
 
				+    signature = base64.b64encode(
			
 
				+        hmac.new(secret, message, digestmod=hashlib.sha256).digest()
			
 
				+    ).decode("utf8")
			
 
				+    return signature
			
 
				+
			
 
				+
			
 
				+def get_uuid(key1="", key2=""):
			
 
				+    """
			
 
				+    @summary: 计算uuid值
			
 
				+    可用于将两个字符串组成唯一的值。如可将域名和新闻标题组成uuid，形成联合索引
			
 
				+    ---------
			
 
				+    @param key1:str
			
 
				+    @param key2:str
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    uuid_object = ""
			
 
				+
			
 
				+    if not key1 and not key2:
			
 
				+        uuid_object = uuid.uuid1()
			
 
				+    else:
			
 
				+        hash = md5(bytes(key1, "utf-8") + bytes(key2, "utf-8")).digest()
			
 
				+        uuid_object = uuid.UUID(bytes=hash[:16], version=3)
			
 
				+
			
 
				+    return str(uuid_object)
			
 
				+
			
 
				+
			
 
				+def get_hash(text):
			
 
				+    return hash(text)
			
 
				+
			
 
				+
			
 
				+##################################################
			
 
				+
			
 
				+
			
 
				+def cut_string(text, length):
			
 
				+    """
			
 
				+    @summary: 将文本按指定长度拆分
			
 
				+    ---------
			
 
				+    @param text: 文本
			
 
				+    @param length: 拆分长度
			
 
				+    ---------
			
 
				+    @result: 返回按指定长度拆分后形成的list
			
 
				+    """
			
 
				+
			
 
				+    text_list = re.findall(".{%d}" % length, text, re.S)
			
 
				+    leave_text = text[len(text_list) * length :]
			
 
				+    if leave_text:
			
 
				+        text_list.append(leave_text)
			
 
				+
			
 
				+    return text_list
			
 
				+
			
 
				+
			
 
				+def get_random_string(length=1):
			
 
				+    random_string = "".join(random.sample(string.ascii_letters + string.digits, length))
			
 
				+    return random_string
			
 
				+
			
 
				+
			
 
				+def get_random_password(length=8, special_characters=""):
			
 
				+    """
			
 
				+    @summary: 创建随机密码 默认长度为8，包含大写字母、小写字母、数字
			
 
				+    ---------
			
 
				+    @param length: 密码长度 默认8
			
 
				+    @param special_characters: 特殊字符
			
 
				+    ---------
			
 
				+    @result: 指定长度的密码
			
 
				+    """
			
 
				+
			
 
				+    while True:
			
 
				+        random_password = "".join(
			
 
				+            random.sample(
			
 
				+                string.ascii_letters + string.digits + special_characters, length
			
 
				+            )
			
 
				+        )
			
 
				+        if (
			
 
				+            re.search("[0-9]", random_password)
			
 
				+            and re.search("[A-Z]", random_password)
			
 
				+            and re.search("[a-z]", random_password)
			
 
				+        ):
			
 
				+            if not special_characters:
			
 
				+                break
			
 
				+            elif set(random_password).intersection(special_characters):
			
 
				+                break
			
 
				+
			
 
				+    return random_password
			
 
				+
			
 
				+
			
 
				+def get_random_email(length=None, email_types: list = None, special_characters=""):
			
 
				+    """
			
 
				+    随机生成邮箱
			
 
				+    :param length: 邮箱长度
			
 
				+    :param email_types: 邮箱类型
			
 
				+    :param special_characters: 特殊字符
			
 
				+    :return:
			
 
				+    """
			
 
				+    if not length:
			
 
				+        length = random.randint(4, 12)
			
 
				+    if not email_types:
			
 
				+        email_types = [
			
 
				+            "qq.com",
			
 
				+            "163.com",
			
 
				+            "gmail.com",
			
 
				+            "yahoo.com",
			
 
				+            "hotmail.com",
			
 
				+            "yeah.net",
			
 
				+            "126.com",
			
 
				+            "139.com",
			
 
				+            "sohu.com",
			
 
				+        ]
			
 
				+
			
 
				+    email_body = get_random_password(length, special_characters)
			
 
				+    email_type = random.choice(email_types)
			
 
				+
			
 
				+    email = email_body + "@" + email_type
			
 
				+    return email
			
 
				+
			
 
				+
			
 
				+#################################
			
 
				+
			
 
				+
			
 
				+def dumps_obj(obj):
			
 
				+    return pickle.dumps(obj)
			
 
				+
			
 
				+
			
 
				+def loads_obj(obj_str):
			
 
				+    return pickle.loads(obj_str)
			
 
				+
			
 
				+
			
 
				+def get_method(obj, name):
			
 
				+    name = str(name)
			
 
				+    try:
			
 
				+        return getattr(obj, name)
			
 
				+    except AttributeError:
			
 
				+        log.error("Method %r not found in: %s" % (name, obj))
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def witch_workspace(project_path):
			
 
				+    """
			
 
				+    @summary:
			
 
				+    ---------
			
 
				+    @param project_path:
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    os.chdir(project_path)  # 切换工作路经
			
 
				+
			
 
				+
			
 
				+############### 数据库相关 #######################
			
 
				+def format_sql_value(value):
			
 
				+    if isinstance(value, str):
			
 
				+        value = value.strip()
			
 
				+
			
 
				+    elif isinstance(value, (list, dict)):
			
 
				+        value = dumps_json(value, indent=None)
			
 
				+
			
 
				+    elif isinstance(value, (datetime.date, datetime.time)):
			
 
				+        value = str(value)
			
 
				+
			
 
				+    elif isinstance(value, bool):
			
 
				+        value = int(value)
			
 
				+
			
 
				+    return value
			
 
				+
			
 
				+
			
 
				+def list2str(datas):
			
 
				+    """
			
 
				+    列表转字符串
			
 
				+    :param datas: [1, 2]
			
 
				+    :return: (1, 2)
			
 
				+    """
			
 
				+    data_str = str(tuple(datas))
			
 
				+    data_str = re.sub(",\)$", ")", data_str)
			
 
				+    return data_str
			
 
				+
			
 
				+
			
 
				+def make_insert_sql(
			
 
				+    table, data, auto_update=False, update_columns=(), insert_ignore=False
			
 
				+):
			
 
				+    """
			
 
				+    @summary: 适用于mysql， oracle数据库时间需要to_date 处理（TODO）
			
 
				+    ---------
			
 
				+    @param table:
			
 
				+    @param data: 表数据 json格式
			
 
				+    @param auto_update: 使用的是replace into， 为完全覆盖已存在的数据
			
 
				+    @param update_columns: 需要更新的列 默认全部，当指定值时，auto_update设置无效，当duplicate key冲突时更新指定的列
			
 
				+    @param insert_ignore: 数据存在忽略
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    keys = ["`{}`".format(key) for key in data.keys()]
			
 
				+    keys = list2str(keys).replace("'", "")
			
 
				+
			
 
				+    values = [format_sql_value(value) for value in data.values()]
			
 
				+    values = list2str(values)
			
 
				+
			
 
				+    if update_columns:
			
 
				+        if not isinstance(update_columns, (tuple, list)):
			
 
				+            update_columns = [update_columns]
			
 
				+        update_columns_ = ", ".join(
			
 
				+            ["{key}=values({key})".format(key=key) for key in update_columns]
			
 
				+        )
			
 
				+        sql = (
			
 
				+            "insert%s into `{table}` {keys} values {values} on duplicate key update %s"
			
 
				+            % (" ignore" if insert_ignore else "", update_columns_)
			
 
				+        )
			
 
				+
			
 
				+    elif auto_update:
			
 
				+        sql = "replace into `{table}` {keys} values {values}"
			
 
				+    else:
			
 
				+        sql = "insert%s into `{table}` {keys} values {values}" % (
			
 
				+            " ignore" if insert_ignore else ""
			
 
				+        )
			
 
				+
			
 
				+    sql = sql.format(table=table, keys=keys, values=values).replace("None", "null")
			
 
				+    return sql
			
 
				+
			
 
				+
			
 
				+def make_update_sql(table, data, condition):
			
 
				+    """
			
 
				+    @summary: 适用于mysql， oracle数据库时间需要to_date 处理（TODO）
			
 
				+    ---------
			
 
				+    @param table:
			
 
				+    @param data: 表数据 json格式
			
 
				+    @param condition: where 条件
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+    key_values = []
			
 
				+
			
 
				+    for key, value in data.items():
			
 
				+        value = format_sql_value(value)
			
 
				+        if isinstance(value, str):
			
 
				+            key_values.append("`{}`={}".format(key, repr(value)))
			
 
				+        elif value is None:
			
 
				+            key_values.append("`{}`={}".format(key, "null"))
			
 
				+        else:
			
 
				+            key_values.append("`{}`={}".format(key, value))
			
 
				+
			
 
				+    key_values = ", ".join(key_values)
			
 
				+
			
 
				+    sql = "update `{table}` set {key_values} where {condition}"
			
 
				+    sql = sql.format(table=table, key_values=key_values, condition=condition)
			
 
				+    return sql
			
 
				+
			
 
				+
			
 
				+def make_batch_sql(
			
 
				+    table, datas, auto_update=False, update_columns=(), update_columns_value=()
			
 
				+):
			
 
				+    """
			
 
				+    @summary: 生产批量的sql
			
 
				+    ---------
			
 
				+    @param table:
			
 
				+    @param datas: 表数据 [{...}]
			
 
				+    @param auto_update: 使用的是replace into， 为完全覆盖已存在的数据
			
 
				+    @param update_columns: 需要更新的列 默认全部，当指定值时，auto_update设置无效，当duplicate key冲突时更新指定的列
			
 
				+    @param update_columns_value: 需要更新的列的值 默认为datas里边对应的值, 注意 如果值为字符串类型 需要主动加单引号， 如 update_columns_value=("'test'",)
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+    if not datas:
			
 
				+        return
			
 
				+
			
 
				+    keys = list(datas[0].keys())
			
 
				+    values_placeholder = ["%s"] * len(keys)
			
 
				+
			
 
				+    values = []
			
 
				+    for data in datas:
			
 
				+        value = []
			
 
				+        for key in keys:
			
 
				+            current_data = data.get(key)
			
 
				+            current_data = format_sql_value(current_data)
			
 
				+
			
 
				+            value.append(current_data)
			
 
				+
			
 
				+        values.append(value)
			
 
				+
			
 
				+    keys = ["`{}`".format(key) for key in keys]
			
 
				+    keys = list2str(keys).replace("'", "")
			
 
				+
			
 
				+    values_placeholder = list2str(values_placeholder).replace("'", "")
			
 
				+
			
 
				+    if update_columns:
			
 
				+        if not isinstance(update_columns, (tuple, list)):
			
 
				+            update_columns = [update_columns]
			
 
				+        if update_columns_value:
			
 
				+            update_columns_ = ", ".join(
			
 
				+                [
			
 
				+                    "`{key}`={value}".format(key=key, value=value)
			
 
				+                    for key, value in zip(update_columns, update_columns_value)
			
 
				+                ]
			
 
				+            )
			
 
				+        else:
			
 
				+            update_columns_ = ", ".join(
			
 
				+                ["`{key}`=values(`{key}`)".format(key=key) for key in update_columns]
			
 
				+            )
			
 
				+        sql = "insert into `{table}` {keys} values {values_placeholder} on duplicate key update {update_columns}".format(
			
 
				+            table=table,
			
 
				+            keys=keys,
			
 
				+            values_placeholder=values_placeholder,
			
 
				+            update_columns=update_columns_,
			
 
				+        )
			
 
				+    elif auto_update:
			
 
				+        sql = "replace into `{table}` {keys} values {values_placeholder}".format(
			
 
				+            table=table, keys=keys, values_placeholder=values_placeholder
			
 
				+        )
			
 
				+    else:
			
 
				+        sql = "insert ignore into `{table}` {keys} values {values_placeholder}".format(
			
 
				+            table=table, keys=keys, values_placeholder=values_placeholder
			
 
				+        )
			
 
				+
			
 
				+    return sql, values
			
 
				+
			
 
				+
			
 
				+############### json相关 #######################
			
 
				+
			
 
				+
			
 
				+def key2underline(key: str, strict=True):
			
 
				+    """
			
 
				+    >>> key2underline("HelloWord")
			
 
				+    'hello_word'
			
 
				+    >>> key2underline("SHData", strict=True)
			
 
				+    's_h_data'
			
 
				+    >>> key2underline("SHData", strict=False)
			
 
				+    'sh_data'
			
 
				+    >>> key2underline("SHDataHi", strict=False)
			
 
				+    'sh_data_hi'
			
 
				+    >>> key2underline("SHDataHi", strict=True)
			
 
				+    's_h_data_hi'
			
 
				+    >>> key2underline("dataHi", strict=True)
			
 
				+    'data_hi'
			
 
				+    """
			
 
				+    regex = "[A-Z]*" if not strict else "[A-Z]"
			
 
				+    capitals = re.findall(regex, key)
			
 
				+
			
 
				+    if capitals:
			
 
				+        for capital in capitals:
			
 
				+            if not capital:
			
 
				+                continue
			
 
				+            if key.startswith(capital):
			
 
				+                if len(capital) > 1:
			
 
				+                    key = key.replace(
			
 
				+                        capital, capital[:-1].lower() + "_" + capital[-1].lower(), 1
			
 
				+                    )
			
 
				+                else:
			
 
				+                    key = key.replace(capital, capital.lower(), 1)
			
 
				+            else:
			
 
				+                if len(capital) > 1:
			
 
				+                    key = key.replace(capital, "_" + capital.lower() + "_", 1)
			
 
				+                else:
			
 
				+                    key = key.replace(capital, "_" + capital.lower(), 1)
			
 
				+
			
 
				+    return key.strip("_")
			
 
				+
			
 
				+
			
 
				+def key2hump(key):
			
 
				+    """
			
 
				+    下划线试变成首字母大写
			
 
				+    """
			
 
				+    return key.title().replace("_", "")
			
 
				+
			
 
				+
			
 
				+def format_json_key(json_data):
			
 
				+    json_data_correct = {}
			
 
				+    for key, value in json_data.items():
			
 
				+        key = key2underline(key)
			
 
				+        json_data_correct[key] = value
			
 
				+
			
 
				+    return json_data_correct
			
 
				+
			
 
				+
			
 
				+def quick_to_json(text):
			
 
				+    """
			
 
				+    @summary: 可快速将浏览器上的header转为json格式
			
 
				+    ---------
			
 
				+    @param text:
			
 
				+    ---------
			
 
				+    @result:
			
 
				+    """
			
 
				+
			
 
				+    contents = text.split("\n")
			
 
				+    json = {}
			
 
				+    for content in contents:
			
 
				+        if content == "\n":
			
 
				+            continue
			
 
				+
			
 
				+        content = content.strip()
			
 
				+        regex = ["(:?.*?):(.*)", "(.*?):? +(.*)", "([^:]*)"]
			
 
				+
			
 
				+        result = get_info(content, regex)
			
 
				+        result = result[0] if isinstance(result[0], tuple) else result
			
 
				+        try:
			
 
				+            json[result[0]] = eval(result[1].strip())
			
 
				+        except:
			
 
				+            json[result[0]] = result[1].strip()
			
 
				+
			
 
				+    return json
			
 
				+
			
 
				+
			
 
				+##############################
			
 
				+
			
 
				+
			
 
				+def print_pretty(object):
			
 
				+    pprint(object)
			
 
				+
			
 
				+
			
 
				+def print_params2json(url):
			
 
				+    params_json = {}
			
 
				+    params = url.split("?")[-1].split("&")
			
 
				+    for param in params:
			
 
				+        key_value = param.split("=", 1)
			
 
				+        params_json[key_value[0]] = key_value[1]
			
 
				+
			
 
				+    print(dumps_json(params_json))
			
 
				+
			
 
				+
			
 
				+def print_cookie2json(cookie_str_or_list):
			
 
				+    if isinstance(cookie_str_or_list, str):
			
 
				+        cookie_json = {}
			
 
				+        cookies = cookie_str_or_list.split("; ")
			
 
				+        for cookie in cookies:
			
 
				+            name, value = cookie.split("=")
			
 
				+            cookie_json[name] = value
			
 
				+    else:
			
 
				+        cookie_json = get_cookies_from_selenium_cookie(cookie_str_or_list)
			
 
				+
			
 
				+    print(dumps_json(cookie_json))
			
 
				+
			
 
				+
			
 
				+###############################
			
 
				+
			
 
				+
			
 
				+def flatten(x):
			
 
				+    """flatten(sequence) -> list
			
 
				+    Returns a single, flat list which contains all elements retrieved
			
 
				+    from the sequence and all recursively contained sub-sequences
			
 
				+    (iterables).
			
 
				+    Examples:
			
 
				+    >>> [1, 2, [3,4], (5,6)]
			
 
				+    [1, 2, [3, 4], (5, 6)]
			
 
				+    >>> flatten([[[1,2,3], (42,None)], [4,5], [6], 7, (8,9,10)])
			
 
				+    [1, 2, 3, 42, None, 4, 5, 6, 7, 8, 9, 10]
			
 
				+    >>> flatten(["foo", "bar"])
			
 
				+    ['foo', 'bar']
			
 
				+    >>> flatten(["foo", ["baz", 42], "bar"])
			
 
				+    ['foo', 'baz', 42, 'bar']
			
 
				+    """
			
 
				+    return list(iflatten(x))
			
 
				+
			
 
				+
			
 
				+def iflatten(x):
			
 
				+    """iflatten(sequence) -> iterator
			
 
				+    Similar to ``.flatten()``, but returns iterator instead"""
			
 
				+    for el in x:
			
 
				+        if _is_listlike(el):
			
 
				+            for el_ in flatten(el):
			
 
				+                yield el_
			
 
				+        else:
			
 
				+            yield el
			
 
				+
			
 
				+
			
 
				+def _is_listlike(x):
			
 
				+    """
			
 
				+    >>> _is_listlike("foo")
			
 
				+    False
			
 
				+    >>> _is_listlike(5)
			
 
				+    False
			
 
				+    >>> _is_listlike(b"foo")
			
 
				+    False
			
 
				+    >>> _is_listlike([b"foo"])
			
 
				+    True
			
 
				+    >>> _is_listlike((b"foo",))
			
 
				+    True
			
 
				+    >>> _is_listlike({})
			
 
				+    True
			
 
				+    >>> _is_listlike(set())
			
 
				+    True
			
 
				+    >>> _is_listlike((x for x in range(3)))
			
 
				+    True
			
 
				+    >>> _is_listlike(six.moves.xrange(5))
			
 
				+    True
			
 
				+    """
			
 
				+    return hasattr(x, "__iter__") and not isinstance(x, (six.text_type, bytes))
			
 
				+
			
 
				+
			
 
				+###################
			
 
				+
			
 
				+
			
 
				+def re_def_supper_class(obj, supper_class):
			
 
				+    """
			
 
				+    重新定义父类
			
 
				+    @param obj: 类 如 class A: 则obj为A 或者 A的实例 a.__class__
			
 
				+    @param supper_class: 父类
			
 
				+    @return:
			
 
				+    """
			
 
				+    obj.__bases__ = (supper_class,)
			
 
				+
			
 
				+
			
 
				+###################
			
 
				+freq_limit_record = {}
			
 
				+
			
 
				+
			
 
				+def reach_freq_limit(rate_limit, *key):
			
 
				+    """
			
 
				+    频率限制
			
 
				+    :param rate_limit: 限制时间 单位秒
			
 
				+    :param key: 频率限制的key
			
 
				+    :return: True / False
			
 
				+    """
			
 
				+    if rate_limit == 0:
			
 
				+        return False
			
 
				+
			
 
				+    msg_md5 = get_md5(*key)
			
 
				+    key = "rate_limit:{}".format(msg_md5)
			
 
				+    try:
			
 
				+        if get_redisdb().get(key):
			
 
				+            return True
			
 
				+
			
 
				+        get_redisdb().set(key, time.time(), ex=rate_limit)
			
 
				+    except redis.exceptions.ConnectionError as e:
			
 
				+        # 使用内存做频率限制
			
 
				+        global freq_limit_record
			
 
				+
			
 
				+        if key not in freq_limit_record:
			
 
				+            freq_limit_record[key] = time.time()
			
 
				+            return False
			
 
				+
			
 
				+        if time.time() - freq_limit_record.get(key) < rate_limit:
			
 
				+            return True
			
 
				+        else:
			
 
				+            freq_limit_record[key] = time.time()
			
 
				+
			
 
				+    return False
			
 
				+
			
 
				+
			
 
				+def dingding_warning(
			
 
				+    message, message_prefix=None, rate_limit=None, url=None, user_phone=None
			
 
				+):
			
 
				+    # 为了加载最新的配置
			
 
				+    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
			
 
				+    url = url or setting.DINGDING_WARNING_URL
			
 
				+    user_phone = user_phone or setting.DINGDING_WARNING_PHONE
			
 
				+
			
 
				+    if not all([url, message]):
			
 
				+        return
			
 
				+
			
 
				+    if reach_freq_limit(rate_limit, url, user_phone, message_prefix or message):
			
 
				+        log.info("报警时间间隔过短，此次报警忽略。 内容 {}".format(message))
			
 
				+        return
			
 
				+
			
 
				+    if isinstance(user_phone, str):
			
 
				+        user_phone = [user_phone] if user_phone else []
			
 
				+
			
 
				+    data = {
			
 
				+        "msgtype": "text",
			
 
				+        "text": {"content": message},
			
 
				+        "at": {"atMobiles": user_phone, "isAtAll": setting.DINGDING_WARNING_ALL},
			
 
				+    }
			
 
				+
			
 
				+    headers = {"Content-Type": "application/json"}
			
 
				+
			
 
				+    try:
			
 
				+        response = requests.post(
			
 
				+            url, headers=headers, data=json.dumps(data).encode("utf8")
			
 
				+        )
			
 
				+        result = response.json()
			
 
				+        response.close()
			
 
				+        if result.get("errcode") == 0:
			
 
				+            return True
			
 
				+        else:
			
 
				+            raise Exception(result.get("errmsg"))
			
 
				+    except Exception as e:
			
 
				+        log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def email_warning(
			
 
				+    message,
			
 
				+    title,
			
 
				+    message_prefix=None,
			
 
				+    email_sender=None,
			
 
				+    email_password=None,
			
 
				+    email_receiver=None,
			
 
				+    email_smtpserver=None,
			
 
				+    rate_limit=None,
			
 
				+):
			
 
				+    # 为了加载最新的配置
			
 
				+    email_sender = email_sender or setting.EMAIL_SENDER
			
 
				+    email_password = email_password or setting.EMAIL_PASSWORD
			
 
				+    email_receiver = email_receiver or setting.EMAIL_RECEIVER
			
 
				+    email_smtpserver = email_smtpserver or setting.EMAIL_SMTPSERVER
			
 
				+    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
			
 
				+
			
 
				+    if not all([message, email_sender, email_password, email_receiver]):
			
 
				+        return
			
 
				+
			
 
				+    if reach_freq_limit(
			
 
				+        rate_limit, email_receiver, email_sender, message_prefix or message
			
 
				+    ):
			
 
				+        log.info("报警时间间隔过短，此次报警忽略。 内容 {}".format(message))
			
 
				+        return
			
 
				+
			
 
				+    if isinstance(email_receiver, str):
			
 
				+        email_receiver = [email_receiver]
			
 
				+
			
 
				+    with EmailSender(
			
 
				+        username=email_sender, password=email_password, smtpserver=email_smtpserver
			
 
				+    ) as email:
			
 
				+        return email.send(receivers=email_receiver, title=title, content=message)
			
 
				+
			
 
				+
			
 
				+def linkedsee_warning(message, rate_limit=3600, message_prefix=None, token=None):
			
 
				+    """
			
 
				+    灵犀电话报警
			
 
				+    Args:
			
 
				+        message:
			
 
				+        rate_limit:
			
 
				+        message_prefix:
			
 
				+        token:
			
 
				+
			
 
				+    Returns:
			
 
				+
			
 
				+    """
			
 
				+    if not token:
			
 
				+        log.info("未设置灵犀token，不支持报警")
			
 
				+        return
			
 
				+
			
 
				+    if reach_freq_limit(rate_limit, token, message_prefix or message):
			
 
				+        log.info("报警时间间隔过短，此次报警忽略。 内容 {}".format(message))
			
 
				+        return
			
 
				+
			
 
				+    headers = {"servicetoken": token, "Content-Type": "application/json"}
			
 
				+
			
 
				+    url = "http://www.linkedsee.com/alarm/zabbix"
			
 
				+
			
 
				+    data = {"content": message}
			
 
				+    response = requests.post(url, data=json.dumps(data), headers=headers)
			
 
				+    return response
			
 
				+
			
 
				+
			
 
				+def wechat_warning(
			
 
				+    message,
			
 
				+    message_prefix=None,
			
 
				+    rate_limit=None,
			
 
				+    url=None,
			
 
				+    user_phone=None,
			
 
				+    all_users: bool = None,
			
 
				+):
			
 
				+    """企业微信报警"""
			
 
				+
			
 
				+    # 为了加载最新的配置
			
 
				+    rate_limit = rate_limit if rate_limit is not None else setting.WARNING_INTERVAL
			
 
				+    url = url or setting.WECHAT_WARNING_URL
			
 
				+    user_phone = user_phone or setting.WECHAT_WARNING_PHONE
			
 
				+    all_users = all_users if all_users is not None else setting.WECHAT_WARNING_ALL
			
 
				+
			
 
				+    if isinstance(user_phone, str):
			
 
				+        user_phone = [user_phone] if user_phone else []
			
 
				+
			
 
				+    if all_users is True or not user_phone:
			
 
				+        user_phone = ["@all"]
			
 
				+
			
 
				+    if not all([url, message]):
			
 
				+        return
			
 
				+
			
 
				+    if reach_freq_limit(rate_limit, url, user_phone, message_prefix or message):
			
 
				+        log.info("报警时间间隔过短，此次报警忽略。 内容 {}".format(message))
			
 
				+        return
			
 
				+
			
 
				+    data = {
			
 
				+        "msgtype": "text",
			
 
				+        "text": {"content": message, "mentioned_mobile_list": user_phone},
			
 
				+    }
			
 
				+
			
 
				+    headers = {"Content-Type": "application/json"}
			
 
				+
			
 
				+    try:
			
 
				+        response = requests.post(
			
 
				+            url, headers=headers, data=json.dumps(data).encode("utf8")
			
 
				+        )
			
 
				+        result = response.json()
			
 
				+        response.close()
			
 
				+        if result.get("errcode") == 0:
			
 
				+            return True
			
 
				+        else:
			
 
				+            raise Exception(result.get("errmsg"))
			
 
				+    except Exception as e:
			
 
				+        log.error("报警发送失败。 报警内容 {}, error: {}".format(message, e))
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def send_msg(msg, level="debug", message_prefix=""):
			
 
				+    if setting.WARNING_LEVEL == "ERROR":
			
 
				+        if level != "error":
			
 
				+            return
			
 
				+
			
 
				+    if setting.DINGDING_WARNING_URL:
			
 
				+        keyword = "feapder报警系统\n"
			
 
				+        dingding_warning(keyword + msg, message_prefix=message_prefix)
			
 
				+
			
 
				+    if setting.EMAIL_RECEIVER:
			
 
				+        title = message_prefix or msg
			
 
				+        if len(title) > 50:
			
 
				+            title = title[:50] + "..."
			
 
				+        email_warning(msg, message_prefix=message_prefix, title=title)
			
 
				+
			
 
				+    if setting.WECHAT_WARNING_URL:
			
 
				+        keyword = "feapder报警系统\n"
			
 
				+        wechat_warning(keyword + msg, message_prefix=message_prefix)
			
 
				+
			
 
				+
			
 
				+###################
			
 
				+
			
 
				+
			
 
				+def make_item(cls, data: dict):
			
 
				+    """提供Item类与原数据，快速构建Item实例
			
 
				+    :param cls: Item类
			
 
				+    :param data: 字典格式的数据
			
 
				+    """
			
 
				+    item = cls()
			
 
				+    for key, val in data.items():
			
 
				+        setattr(item, key, val)
			
 
				+    return item
			
 
				+
			
 
				+
			
 
				+###################
			
 
				+
			
 
				+
			
 
				+def aio_wrap(loop=None, executor=None):
			
 
				+    """
			
 
				+    wrap a normal sync version of a function to an async version
			
 
				+    """
			
 
				+    outer_loop = loop
			
 
				+    outer_executor = executor
			
 
				+
			
 
				+    def wrap(fn):
			
 
				+        @wraps(fn)
			
 
				+        async def run(*args, loop=None, executor=None, **kwargs):
			
 
				+            if loop is None:
			
 
				+                if outer_loop is None:
			
 
				+                    loop = asyncio.get_event_loop()
			
 
				+                else:
			
 
				+                    loop = outer_loop
			
 
				+            if executor is None:
			
 
				+                executor = outer_executor
			
 
				+            pfunc = partial(fn, *args, **kwargs)
			
 
				+            return await loop.run_in_executor(executor, pfunc)
			
 
				+
			
 
				+        return run
			
 
				+
			
 
				+    return wrap
			
 
				+
			
 
				+
			
 
				+######### number ##########
			
 
				+
			
 
				+
			
 
				+def ensure_int(n):
			
 
				+    """
			
 
				+    >>> ensure_int(None)
			
 
				+    0
			
 
				+    >>> ensure_int(False)
			
 
				+    0
			
 
				+    >>> ensure_int(12)
			
 
				+    12
			
 
				+    >>> ensure_int("72")
			
 
				+    72
			
 
				+    >>> ensure_int('')
			
 
				+    0
			
 
				+    >>> ensure_int('1')
			
 
				+    1
			
 
				+    """
			
 
				+    if not n:
			
 
				+        return 0
			
 
				+    return int(n)
			
 
				+
			
 
				+
			
 
				+def ensure_float(n):
			
 
				+    """
			
 
				+    >>> ensure_float(None)
			
 
				+    0.0
			
 
				+    >>> ensure_float(False)
			
 
				+    0.0
			
 
				+    >>> ensure_float(12)
			
 
				+    12.0
			
 
				+    >>> ensure_float("72")
			
 
				+    72.0
			
 
				+    """
			
 
				+    if not n:
			
 
				+        return 0.0
			
 
				+    return float(n)
			
--- a/FworkSpider/feapder/utils/webdriver.py
+++ b/FworkSpider/feapder/utils/webdriver.py
@@ -0,0 +1,334 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021/3/18 4:59 下午
			
 
				+---------
			
 
				+@summary:
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email: boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import queue
			
 
				+import threading
			
 
				+import os
			
 
				+from selenium import webdriver
			
 
				+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
			
 
				+from selenium.webdriver.remote.webdriver import WebDriver as RemoteWebDriver
			
 
				+
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.tools import Singleton
			
 
				+
			
 
# User-Agent string that WebDriver falls back to when the caller passes none.
DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
			
 
				+
			
 
				+
			
 
				+class WebDriver(RemoteWebDriver):
			
 
				+    CHROME = "CHROME"
			
 
				+    PHANTOMJS = "PHANTOMJS"
			
 
				+    FIREFOX = "FIREFOX"
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        load_images=True,
			
 
				+        user_agent=None,
			
 
				+        proxy=None,
			
 
				+        headless=False,
			
 
				+        driver_type=CHROME,
			
 
				+        timeout=16,
			
 
				+        window_size=(1024, 800),
			
 
				+        executable_path=None,
			
 
				+        custom_argument=None,
			
 
				+        **kwargs
			
 
				+    ):
			
 
				+        """
			
 
				+        webdirver 封装，支持chrome、phantomjs 和 firefox
			
 
				+        Args:
			
 
				+            load_images: 是否加载图片
			
 
				+            user_agent: 字符串 或 无参函数，返回值为user_agent
			
 
				+            proxy: xxx.xxx.xxx.xxx:xxxx 或 无参函数，返回值为代理地址
			
 
				+            headless: 是否启用无头模式
			
 
				+            driver_type: CHROME 或 PHANTOMJS,FIREFOX
			
 
				+            timeout: 请求超时时间
			
 
				+            window_size: # 窗口大小
			
 
				+            executable_path: 浏览器路径，默认为默认路径
			
 
				+            **kwargs:
			
 
				+        """
			
 
				+        self._load_images = load_images
			
 
				+        self._user_agent = user_agent or DEFAULT_USERAGENT
			
 
				+        self._proxy = proxy
			
 
				+        self._headless = headless
			
 
				+        self._timeout = timeout
			
 
				+        self._window_size = window_size
			
 
				+        self._executable_path = executable_path
			
 
				+        self._custom_argument = custom_argument
			
 
				+
			
 
				+        self.proxies = {}
			
 
				+        self.user_agent = None
			
 
				+
			
 
				+        if driver_type == WebDriver.CHROME:
			
 
				+            self.driver = self.chrome_driver()
			
 
				+
			
 
				+        elif driver_type == WebDriver.PHANTOMJS:
			
 
				+            self.driver = self.phantomjs_driver()
			
 
				+
			
 
				+        elif driver_type == WebDriver.FIREFOX:
			
 
				+            self.driver = self.firefox_driver()
			
 
				+
			
 
				+        else:
			
 
				+            raise TypeError(
			
 
				+                "dirver_type must be one of CHROME or PHANTOMJS or FIREFOX, but received {}".format(
			
 
				+                    type(driver_type)
			
 
				+                )
			
 
				+            )
			
 
				+
			
 
				+        # driver.get(url)一直不返回，但也不报错的问题，这时程序会卡住，设置超时选项能解决这个问题。
			
 
				+        self.driver.set_page_load_timeout(self._timeout)
			
 
				+        # 设置10秒脚本超时时间
			
 
				+        self.driver.set_script_timeout(self._timeout)
			
 
				+
			
 
				+    def __enter__(self):
			
 
				+        return self
			
 
				+
			
 
				+    def __exit__(self, exc_type, exc_val, exc_tb):
			
 
				+        if exc_val:
			
 
				+            log.error(exc_val)
			
 
				+
			
 
				+        self.quit()
			
 
				+        return True
			
 
				+
			
 
				+    def get_driver(self):
			
 
				+        return self.driver
			
 
				+
			
 
				+    def firefox_driver(self):
			
 
				+        firefox_profile = webdriver.FirefoxProfile()
			
 
				+        firefox_options = webdriver.FirefoxOptions()
			
 
				+        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
			
 
				+        firefox_profile.set_preference("dom.webdriver.enabled",False)
			
 
				+        if self._proxy:
			
 
				+            proxy = self._proxy() if callable(self._proxy) else self._proxy
			
 
				+            proxy = proxy.replace("socks5://","")
			
 
				+            # 使用socks5 代理
			
 
				+            firefox_profile.set_preference('network.proxy.type', 1)  # 不使用代理：0, 使用代理：1
			
 
				+            firefox_profile.set_preference('network.proxy.socks', proxy.split(":")[0])
			
 
				+            firefox_profile.set_preference('network.proxy.socks_port', int(proxy.split(":")[-1]))
			
 
				+            # firefox_capabilities["marionette"] = True  # http代理的使用
			
 
				+            # firefox_capabilities["proxy"] = {
			
 
				+            #     "proxyType": "MANUAL",
			
 
				+            #     "httpProxy": proxy,
			
 
				+            #     "ftpProxy": proxy,
			
 
				+            #     "sslProxy": proxy,
			
 
				+            # }
			
 
				+
			
 
				+        if self._user_agent:
			
 
				+            firefox_profile.set_preference(
			
 
				+                "general.useragent.override",
			
 
				+                self._user_agent() if callable(self._user_agent) else self._user_agent,
			
 
				+            )
			
 
				+
			
 
				+        if not self._load_images:
			
 
				+            firefox_profile.set_preference("permissions.default.image", 2)
			
 
				+
			
 
				+        if self._headless:
			
 
				+            firefox_options.add_argument("--headless")
			
 
				+            firefox_options.add_argument("--disable-gpu")
			
 
				+
			
 
				+        # 添加自定义的配置参数
			
 
				+        if self._custom_argument:
			
 
				+            for arg in self._custom_argument:
			
 
				+                firefox_options.add_argument(arg)
			
 
				+
			
 
				+        if self._executable_path:
			
 
				+            driver = webdriver.Firefox(
			
 
				+                capabilities=firefox_capabilities,
			
 
				+                options=firefox_options,
			
 
				+                firefox_profile=firefox_profile,
			
 
				+                executable_path=self._executable_path,
			
 
				+            )
			
 
				+        else:
			
 
				+            driver = webdriver.Firefox(
			
 
				+                capabilities=firefox_capabilities,
			
 
				+                options=firefox_options,
			
 
				+                firefox_profile=firefox_profile,
			
 
				+            )
			
 
				+
			
 
				+        if self._window_size:
			
 
				+            driver.set_window_size(*self._window_size)
			
 
				+
			
 
				+        return driver
			
 
				+
			
 
				+    def chrome_driver(self):
			
 
				+        chrome_options = webdriver.ChromeOptions()
			
 
				+        # 此步骤很重要，设置为开发者模式，防止被各大网站识别出来使用了Selenium
			
 
				+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
			
 
				+        chrome_options.add_experimental_option("useAutomationExtension", False)
			
 
				+        # docker 里运行需要
			
 
				+        chrome_options.add_argument("--no-sandbox")
			
 
				+
			
 
				+        if self._proxy:
			
 
				+            chrome_options.add_argument(
			
 
				+                "--proxy-server={}".format(
			
 
				+                    self._proxy() if callable(self._proxy) else self._proxy
			
 
				+                )
			
 
				+            )
			
 
				+        if self._user_agent:
			
 
				+            chrome_options.add_argument(
			
 
				+                "user-agent={}".format(
			
 
				+                    self._user_agent()
			
 
				+                    if callable(self._user_agent)
			
 
				+                    else self._user_agent
			
 
				+                )
			
 
				+            )
			
 
				+        if not self._load_images:
			
 
				+            chrome_options.add_experimental_option(
			
 
				+                "prefs", {"profile.managed_default_content_settings.images": 2}
			
 
				+            )
			
 
				+
			
 
				+        if self._headless:
			
 
				+            chrome_options.add_argument("--headless")
			
 
				+            chrome_options.add_argument("--disable-gpu")
			
 
				+
			
 
				+        if self._window_size:
			
 
				+            chrome_options.add_argument(
			
 
				+                "--window-size={},{}".format(self._window_size[0], self._window_size[1])
			
 
				+            )
			
 
				+
			
 
				+        # 添加自定义的配置参数
			
 
				+        if self._custom_argument:
			
 
				+            for arg in self._custom_argument:
			
 
				+                chrome_options.add_argument(arg)
			
 
				+
			
 
				+        if self._executable_path:
			
 
				+            driver = webdriver.Chrome(
			
 
				+                chrome_options=chrome_options, executable_path=self._executable_path
			
 
				+            )
			
 
				+        else:
			
 
				+            driver = webdriver.Chrome(chrome_options=chrome_options)
			
 
				+
			
 
				+        # 隐藏浏览器特征
			
 
				+        with open(os.path.join(os.path.dirname(__file__), "./js/stealth.min.js")) as f:
			
 
				+            js = f.read()
			
 
				+        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
			
 
				+
			
 
				+        return driver
			
 
				+
			
 
				+    def phantomjs_driver(self):
			
 
				+        import warnings
			
 
				+
			
 
				+        warnings.filterwarnings("ignore")
			
 
				+
			
 
				+        service_args = []
			
 
				+        dcap = DesiredCapabilities.PHANTOMJS
			
 
				+
			
 
				+        if self._proxy:
			
 
				+            service_args.append(
			
 
				+                "--proxy=%s" % self._proxy() if callable(self._proxy) else self._proxy
			
 
				+            )
			
 
				+        if self._user_agent:
			
 
				+            dcap["phantomjs.page.settings.userAgent"] = (
			
 
				+                self._user_agent() if callable(self._user_agent) else self._user_agent
			
 
				+            )
			
 
				+        if not self._load_images:
			
 
				+            service_args.append("--load-images=no")
			
 
				+
			
 
				+        # 添加自定义的配置参数
			
 
				+        if self._custom_argument:
			
 
				+            for arg in self._custom_argument:
			
 
				+                service_args.append(arg)
			
 
				+
			
 
				+        if self._executable_path:
			
 
				+            driver = webdriver.PhantomJS(
			
 
				+                service_args=service_args,
			
 
				+                desired_capabilities=dcap,
			
 
				+                executable_path=self._executable_path,
			
 
				+            )
			
 
				+        else:
			
 
				+            driver = webdriver.PhantomJS(
			
 
				+                service_args=service_args, desired_capabilities=dcap
			
 
				+            )
			
 
				+
			
 
				+        if self._window_size:
			
 
				+            driver.set_window_size(self._window_size[0], self._window_size[1])
			
 
				+
			
 
				+        del warnings
			
 
				+
			
 
				+        return driver
			
 
				+
			
 
				+    @property
			
 
				+    def cookies(self):
			
 
				+        cookies_json = {}
			
 
				+        for cookie in self.driver.get_cookies():
			
 
				+            cookies_json[cookie["name"]] = cookie["value"]
			
 
				+
			
 
				+        return cookies_json
			
 
				+
			
 
				+    @cookies.setter
			
 
				+    def cookies(self, val: dict):
			
 
				+        """
			
 
				+        设置cookie
			
 
				+        Args:
			
 
				+            val: {"key":"value", "key2":"value2"}
			
 
				+
			
 
				+        Returns:
			
 
				+
			
 
				+        """
			
 
				+        for key, value in val.items():
			
 
				+            self.driver.add_cookie({"name": key, "value": value})
			
 
				+
			
 
				+    def __getattr__(self, name):
			
 
				+        if self.driver:
			
 
				+            return getattr(self.driver, name)
			
 
				+        else:
			
 
				+            raise AttributeError
			
 
				+
			
 
				+    # def __del__(self):
			
 
				+    #     self.quit()
			
 
				+
			
 
				+
			
 
				+@Singleton
			
 
				+class WebDriverPool:
			
 
				+    def __init__(self, pool_size=5, **kwargs):
			
 
				+        self.queue = queue.Queue(maxsize=pool_size)
			
 
				+        self.kwargs = kwargs
			
 
				+        self.lock = threading.RLock()
			
 
				+        self.driver_count = 0
			
 
				+
			
 
				+    @property
			
 
				+    def is_full(self):
			
 
				+        return self.driver_count >= self.queue.maxsize
			
 
				+
			
 
				+    def get(self, user_agent: str = None, proxy: str = None) -> WebDriver:
			
 
				+        """
			
 
				+        获取webdriver
			
 
				+        当webdriver为新实例时会使用 user_agen, proxy, cookie参数来创建
			
 
				+        Args:
			
 
				+            user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36
			
 
				+            proxy: xxx.xxx.xxx.xxx
			
 
				+        Returns:
			
 
				+
			
 
				+        """
			
 
				+        if not self.is_full:
			
 
				+            with self.lock:
			
 
				+                if not self.is_full:
			
 
				+                    kwargs = self.kwargs.copy()
			
 
				+                    if user_agent:
			
 
				+                        kwargs["user_agent"] = user_agent
			
 
				+                    if proxy:
			
 
				+                        kwargs["proxy"] = proxy
			
 
				+                    driver = WebDriver(**kwargs)
			
 
				+                    self.queue.put(driver)
			
 
				+                    self.driver_count += 1
			
 
				+
			
 
				+        driver = self.queue.get()
			
 
				+        return driver
			
 
				+
			
 
				+    def put(self, driver):
			
 
				+        self.queue.put(driver)
			
 
				+
			
 
				+    def remove(self, driver):
			
 
				+        driver.quit()
			
 
				+        self.driver_count -= 1
			
 
				+
			
 
				+    def close(self):
			
 
				+        while not self.queue.empty():
			
 
				+            driver = self.queue.get()
			
 
				+            driver.quit()
			
 
				+            self.driver_count -= 1
			
--- a/FworkSpider/items/__init__.py
+++ b/FworkSpider/items/__init__.py
--- a/FworkSpider/items/spider_item.py
+++ b/FworkSpider/items/spider_item.py
@@ -0,0 +1,125 @@
 
				+from feapder import Item
			
 
				+from untils.tools import int2long,substitute,text_search
			
 
				+import time
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.tools import get_current_date
			
 
				+from crawlab import save_item
			
 
				+from datetime import datetime
			
 
				+import os
			
 
				+from feapder import setting
			
 
				+class DataBakItem(Item):
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.title = ""  # 文章标题
			
 
				+        self.publishtime = ""   # 文章发布时间（日期格式 xxxx-xx-xx）
			
 
				+        self.spidercode = ""   # 爬虫代码（编辑器爬虫平台定义）
			
 
				+        self.site = ""   # 采集的站点（编辑器爬虫平台定义）
			
 
				+        self.channel = ""   # 采集的版块（编辑器爬虫平台定义）
			
 
				+        self.area = "全国"   # 省
			
 
				+        self.city = ""   # 市
			
 
				+        self.competehref = None   # 竞品快照页地址
			
 
				+        self.href = ""   # 非竞品快照页地址
			
 
				+        self.publishdept = ""
			
 
				+        self.iscompete=True
			
 
				+        self.type = ""
			
 
				+        self.T = "bidding"
			
 
				+        self.l_np_publishtime = ""  # 发布时间的时间戳（秒级）, 需定义为long型
			
 
				+        self.comeintime = ""  # 入库时间戳（秒级）, 需定义为long型
			
 
				+        self.sendflag = "false"
			
 
				+        self._d = "comeintime"
			
 
				+        self.contenthtml = ""  # 快照页源码
			
 
				+        self.detail = ""  # 快照页源码清洗之后招投标文本
			
 
				+        self.projectinfo = None  # 快照页源码清洗之后招投标文本
			
 
				+    def stop(self):
			
 
				+        print(self.cc_err)
			
 
				+
			
 
				+    def pre_to_db(self):
			
 
				+        # 生成入库时间戳（秒级）, 定义为long型
			
 
				+        self.comeintime = int2long(time.time())
			
 
				+        # 根据文章发布时间 生成发布时间的时间戳（秒级）, 定义为long型
			
 
				+        '''如果无法解析到发布时间、可以考虑补一个发布时间
			
 
				+        '''
			
 
				+        if ":" in self.publishtime:
			
 
				+            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d %H:%M:%S"))))
			
 
				+        else:
			
 
				+            self.l_np_publishtime = int2long(int(time.mktime(time.strptime(self.publishtime, "%Y-%m-%d"))))
			
 
				+
			
 
				+        # 数据获取失败处理：输出错误日志
			
 
				+        if self.contenthtml is None and self.projectinfo is None:
			
 
				+            log.error(f"{self.href},此链接数据正文抓取失败")
			
 
				+            # self.sendflag = "true"
			
 
				+            self.stop()
			
 
				+        if not self.title or not self.publishtime or not self.href:
			
 
				+            # self.sendflag = "true"
			
 
				+            log.error(f"部分数据抓取失败，数据详情：\n 链接：{self.href}\n 发布时间：{self.publishtime}\n标题:{self.title}")
			
 
				+            self.stop()
			
 
				+        # html处理正文
			
 
				+        if self.contenthtml is not None and self.detail =='':
			
 
				+            self.detail = substitute(self.contenthtml)
			
 
				+            '''
			
 
				+            detail:去头、去尾
			
 
				+            '''
			
 
				+            if text_search(self.detail).total == 0:
			
 
				+                # 无正文内容时，该内容直接标记true, 不在被统计
			
 
				+                self.sendflag = "true"
			
 
				+        save_item({"site": self.site, "title": self.title,"error":False,
			
 
				+                   "spidercode":self.spidercode,"channel":self.channel,
			
 
				+                   })
			
 
				+
			
 
				+
			
 
				+class MgpListItem(Item):
			
 
				+    def __init__(self):
			
 
				+        # self.__table_name__='ggg_list'
			
 
				+
			
 
				+        self.parse = "" # 需要调用的方法名称
			
 
				+        self.item = "" # 传过来的参数
			
 
				+        self.parser_name = "" # 处理详情页的爬虫名
			
 
				+        self.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') # 当前日期时间
			
 
				+        self.deal_detail = [] # 定义解析详情页主页内容的解析，detail_get是一个xpath列表，detail_post 则是一段处理代码
			
 
				+        self.create_time = None # 定义解析详情页发布时间的xpath，列表页无发布时间时应用
			
 
				+        self.parse_url = "" # 定义解析详情页主页内容的xpath
			
 
				+        self.request_params = {} # 定义callback所需的参数，诸如render，headers，method，data，params等等，
			
 
				+                                # 必须与requests请求的参数名称对应，否则无法识别
			
 
				+        self.failed = 0 # 定义callback所需的参数，诸如render，headers，method等等
			
 
				+        self.author = "开发及维护人员" # 开发及维护人员
			
 
				+        self.ex_js = ''  # 定义需要执行的python代码时所需的参数、js_str、js文件路径 等
			
 
				+        self.ex_python = None # 定义需要执行的python代码，生成params/date，如header和cookie特殊，最好使用特殊定义法
			
 
				+        self.pri = 1 # 爬虫报警级 可分9级
			
 
				+        self.proxies = True # 爬虫报警级 可分9级
			
 
				+        self.files = False # 附件采集配置
			
 
				+        self.error = None
			
 
				+        # self.error_info =
			
 
				+    def pre_to_db(self):
			
 
				+        # 生成入库时间戳（秒级）, 定义为long型
			
 
				+        self.author = setting.author.get(os.path.basename(os.getcwd()))
			
 
				+        save_item({"site": self.item.get("site"),"error":True,"author":self.author,
			
 
				+                   "spidercode":self.item.get("spidercode"),"channel":self.item.get("channel"),"state_code":"code",
			
 
				+                   "href":self.item.get("href"),"error_info":self.error})
			
 
				+        '''
			
 
				+        "site": "站点名", "error_type": "错误类型（detail/list/content/）", "author": "负责人",
			
 
				+         "spidercode": "", "channel": "", error_count:"错误数量"
			
 
				+         '''
			
 
				+
			
 
				+class ListItem(Item):
			
 
				+    def __init__(self):
			
 
				+        self.spidercode = ""  # 爬虫代码（编辑器爬虫平台定义）
			
 
				+        self.site = ""  # 采集的站点（编辑器爬虫平台定义）
			
 
				+        self.channel = ""  # 采集的版块（编辑器爬虫平台定义）
			
 
				+        self.url = ''
			
 
				+        self.count=0
			
 
				+        self.rel_count = 0
			
 
				+
			
 
				+    def pre_to_db(self):
			
 
				+        self.author = setting.author.get(os.path.basename(os.getcwd()))
			
 
				+        if self.author is None:
			
 
				+            self.author = os.path.basename(os.getcwd())
			
 
				+        self.runtime = get_current_date(date_format="%Y-%m-%d")
			
 
				+
			
 
				+
			
 
				+
			
 
class ErrorInfoItem(Item):
    """Item with four fields: ``parmars``, ``item``, ``parser_name`` and ``date``.

    NOTE(review): the field comments below are translated from the original and
    look copied from MgpListItem -- confirm they describe these fields.
    """

    def __init__(self):
        self.parmars = ""  # name of the method to call
        self.item = "111"  # arguments passed along
        self.parser_name = "111"  # name of the spider that handles the detail page
        self.date = time.time()  # time.time() value taken when the item is created
			
--- a/FworkSpider/login_pool/__init__.py
+++ b/FworkSpider/login_pool/__init__.py
--- a/FworkSpider/login_pool/zglbw.py
+++ b/FworkSpider/login_pool/zglbw.py
@@ -0,0 +1,95 @@
 
				+
			
 
				+import sys
			
 
				+sys.path.append('/app/spiders/sword_feapder/FworkSpider')
			
 
				+from untils.cookie_pool import LoginCookiePool
			
 
				+import requests
			
 
				+class ZglbwPool(LoginCookiePool):
			
 
				+
			
 
				+    def create_cookie(self, username, password):
			
 
				+        print(username,password)
			
 
				+        '''
			
 
				+        https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
			
 
				+        2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
			
 
				+        
			
 
				+        https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
			
 
				+        2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
			
 
				+        '''
			
 
				+        session = requests.Session()
			
 
				+        headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0"}
			
 
				+        url = 'https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=9d424669-5af6-4b3d-bed5-56cc06bd5ca6'
			
 
				+        data = {
			
 
				+            "clear": "",
			
 
				+            "BackURL": "null",
			
 
				+            "username": username,
			
 
				+            "password": password,
			
 
				+            "jcaptchaCode": "shmt"
			
 
				+        }
			
 
				+        session.get(url,headers=headers)
			
 
				+        session.post(url, data=data)
			
 
				+        # print(res.headers)
			
 
				+        ss = session.get(url='https://eproport.crecgec.com/getAuthentication')
			
 
				+        print(ss.text)
			
 
				+        cookies = requests.utils.dict_from_cookiejar(session.cookies)
			
 
				+        print(cookies)
			
 
				+        return cookies
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+# cookie_pool = ZglbwPool(username_key='username', password_key="password", table_userbase='zglbw',
			
 
				+#                               redis_key='zglbw')
			
 
				+# # cookie_pool.create_cookie('zuoshang123',"123qwe!A")
			
 
				+# # # res = requests.get('https://eproport.crecgec.com/getAuthentication',cookies=cookie)
			
 
				+# # # print(res.text)
			
 
				+# cookie_pool.del_cookie(cookie_pool.get_cookie())
			
 
				+
			
 
				+
			
 
				+# def create_cookie():
			
 
				+#     '''
			
 
				+#     https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
			
 
				+#     2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
			
 
				+#
			
 
				+#     https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&redirect_uri=https%3A%2F%2Fpassport.crecgec.com%
			
 
				+#     2FCAS%2Flogin%3Foauth_name%3DCasWrapperProvider20&response_type=code&param1=8d672ed8-ea12-450f-85f4-82310755b2e3
			
 
				+#     '''
			
 
				+#     session = requests.Session()
			
 
				+#     url = 'https://passport.crecgec.com/authorize?type=cas&client_id=1000000053&response_type=code'
			
 
				+#     data = {
			
 
				+#         "clear": "",
			
 
				+#         "BackURL": "null",
			
 
				+#         "username": "zuoshang123",
			
 
				+#         "password": "123qwe!A",
			
 
				+#         "jcaptchaCode": "shmt"
			
 
				+#     }
			
 
				+#     session.get(url)
			
 
				+#     res = session.post(url, data=data)
			
 
				+#
			
 
				+# create_cookie()
			
 
				+# # import requests
			
 
				+#
			
 
				+#
			
 
				+#
			
 
				+# # cookies = {
			
 
				+# #     "srv_id": "53069e9fd596ee2f1c7cf21d24bd170e",
			
 
				+# #     "uid": "e423da7f-1d30-4571-a011-429326f1cfd1",
			
 
				+# #     "Hm_lvt_89c053c39b2269b8a37c5881ca224223": "1642647201",
			
 
				+# #     "JSESSIONID": "752173C3FF0C519DB45BBF781CEC76CB",
			
 
				+# #     "Hm_lpvt_89c053c39b2269b8a37c5881ca224223": "1642661696"
			
 
				+# # }
			
 
				+# # url = "https://passport.crecgec.com/authorize"
			
 
				+# # params = {
			
 
				+# #     "type": "cas",
			
 
				+# #     "client_id": "10000000`53",
			
 
				+# #     "response_type": "code"
			
 
				+# # }
			
 
				+# # data = {
			
 
				+# #     "clear": "",
			
 
				+# #     "BackURL": "null",
			
 
				+# #     "username": "zuoshang123",
			
 
				+# #     "password": "123qwe!A",
			
 
				+# #     "jcaptchaCode": "shmt"
			
 
				+# # }
			
 
				+# # response = requests.post(url, headers=headers, cookies=cookies, params=params, data=data)
			
 
				+# #
			
 
				+# # print(response.text)
			
 
				+# # print(response)
			
--- a/FworkSpider/mongo_pipeline.py
+++ b/FworkSpider/mongo_pipeline.py
@@ -0,0 +1,96 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2021-04-18 14:12:21
			
 
				+---------
			
 
				+@summary: 导出数据
			
 
				+---------
			
 
				+@author: 马国鹏
			
 
				+@email:  305021384@qq.com
			
 
				+"""
			
 
				+from typing import Dict, List, Tuple
			
 
				+import time
			
 
				+from feapder.db.mongodb import MongoDB
			
 
				+from feapder.dedup import Dedup
			
 
				+from feapder.pipelines import BasePipeline
			
 
				+from feapder.utils.log import log
			
 
				+from untils.tools import *
			
 
				+from crawlab import save_item
			
 
				+
			
 
				+
			
 
				+
			
 
				+class MongoPipeline(BasePipeline):
			
 
				+    def __init__(self):
			
 
				+        self._to_db = None
			
 
				+
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = MongoDB()
			
 
				+
			
 
				+        return self._to_db
			
 
				+
			
 
				+    def save_items(self, table, items: List[Dict]) -> bool:
			
 
				+        """
			
 
				+        保存数据
			
 
				+        Args:
			
 
				+            table: 表名
			
 
				+            items: 数据，[{},{},...]
			
 
				+
			
 
				+        Returns: 是否保存成功 True / False
			
 
				+                 若False，不会将本批数据入到去重库，以便再次入库
			
 
				+        """
			
 
				+        try:
			
 
				+            add_count = self.to_db.add_batch(coll_name=table, datas=items)
			
 
				+            for item in items:
			
 
				+                dedup = Dedup(Dedup.BloomFilter)
			
 
				+                dedup.add([item.get("href")])
			
 
				+                # save_item({'count':item.get("href")})
			
 
				+            datas_size = len(items)
			
 
				+            log.info(
			
 
				+                "共导出 %s 条数据到 %s,  新增 %s条, 重复 %s 条"
			
 
				+                % (datas_size, table, add_count, datas_size - add_count)
			
 
				+            )
			
 
				+            # wechat_warning(f"{site}  数据导报\n共插入 {datas_size} 条数据到 {table}")
			
 
				+            # for i in range(add_count):
			
 
				+            if table == "mgp_list":
			
 
				+                save_item({"site": "失败回填", "title": add_count})
			
 
				+
			
 
				+            return True
			
 
				+        except Exception as e:
			
 
				+            log.exception(e)
			
 
				+            return False
			
 
				+
			
 
				+    def update_items(self, table, items: List[Dict], update_keys=Tuple) -> bool:
			
 
				+        """
			
 
				+        更新数据
			
 
				+        Args:
			
 
				+            table: 表名
			
 
				+            items: 数据，[{},{},...]
			
 
				+            update_keys: 更新的字段, 如 ("title", "publish_time")
			
 
				+
			
 
				+        Returns: 是否更新成功 True / False
			
 
				+                 若False，不会将本批数据入到去重库，以便再次入库
			
 
				+
			
 
				+        """
			
 
				+        try:
			
 
				+            add_count = self.to_db.add_batch(
			
 
				+                coll_name=table,
			
 
				+                datas=items,
			
 
				+                update_columns=update_keys or list(items[0].keys()),
			
 
				+            )
			
 
				+            datas_size = len(items)
			
 
				+            update_count = datas_size - add_count
			
 
				+            msg = "共导出 %s 条数据到 %s,  新增 %s 条, 更新 %s 条" % (
			
 
				+                datas_size,
			
 
				+                table,
			
 
				+                add_count,
			
 
				+                update_count,
			
 
				+            )
			
 
				+            if update_keys:
			
 
				+                msg += " 更新字段为 {}".format(update_keys)
			
 
				+            log.info(msg)
			
 
				+
			
 
				+            return True
			
 
				+        except Exception as e:
			
 
				+            log.exception(e)
			
 
				+            return False
			
--- a/FworkSpider/setting.py
+++ b/FworkSpider/setting.py
@@ -0,0 +1,163 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""爬虫配置文件"""
			
 
				+import os
			
 
				+import time
			
 
				+import sys
			
 
				+# from scoket_proxy import Socks5Proxy
			
 
				+#
			
 
				+# # MYSQL
			
 
				+# MYSQL_IP = "localhost"
			
 
				+# MYSQL_PORT = 3306
			
 
				+# MYSQL_DB = ""
			
 
				+# MYSQL_USER_NAME = ""
			
 
				+# MYSQL_USER_PASS = ""
			
 
				+#
			
 
# MONGODB
# MONGO_IP = "192.168.20.51"  # local docker environment
MONGO_IP = "127.0.0.1"  # local environment
# MONGO_PORT = 27017
MONGO_PORT = 27001

#

# MONGO_IP = "192.168.3.71"  # local environment
# MONGO_PORT = 27027


MONGO_DB = "py_spider"
# MONGO_USER_NAME = ""
# MONGO_USER_PASS = ""
#
# # REDIS
# # ip:port; several may be given as a list or comma separated,
# # e.g. ip1:port1,ip2:port2 or ["ip1:port1", "ip2:port2"]
# REDISDB_IP_PORTS = "192.168.20.51:6379"  # local docker environment
REDISDB_IP_PORTS = "127.0.0.1:6379"  # local environment
# REDISDB_IP_PORTS = "192.168.3.71:6379"  # local environment
# REDISDB_USER_PASS = ""
REDISDB_DB = 10
# # used for redis sentinel mode
REDISDB_SERVICE_NAME = "quchoong"
#
# # Pipelines that persist items; customisable, MysqlPipeline by default
ITEM_PIPELINES = [
    # "feapder.pipelines.mysql_pipeline.MysqlPipeline",
    # "feapder.pipelines.mongo_pipeline.MongoPipeline",
    "mongo_pipeline.MongoPipeline"
]
EXPORT_DATA_MAX_FAILED_TIMES = 5  # max export failures (save and update); an alert is raised beyond this
EXPORT_DATA_MAX_RETRY_TIMES = 5  # max export retries (save and update); retrying stops beyond this
#
# # Spider related
# # COLLECTOR
# COLLECTOR_SLEEP_TIME = 1  # interval between moving tasks from the task queue to the in-memory queue
# COLLECTOR_TASK_COUNT = 10  # number of tasks fetched each time
#
REDIS_KEY = "fwork"
			
 
				+# # SPIDER
			
 
				+# SPIDER_THREAD_COUNT = 10  # 爬虫并发数
			
 
				+# SPIDER_SLEEP_TIME = [2, 5] # 下载时间间隔 单位秒。 支持随机 如 SPIDER_SLEEP_TIME = [2, 5] 则间隔为 2~5秒之间的随机数，包含2和5
			
 
				+# SPIDER_TASK_COUNT = 1  # 每个parser从内存队列中获取任务的数量
			
 
				+SPIDER_MAX_RETRY_TIMES = 2  # 每个请求最大重试次数
			
 
				+# KEEP_ALIVE = False  # 爬虫是否常驻
			
 
				+#
			
 
				+# # 浏览器渲染
			
 
				+WEBDRIVER  = dict(
			
 
				+    pool_size=1,  # 浏览器的数量
			
 
				+    load_images=False,  # 是否加载图片
			
 
				+    # user_agent=None,  # 字符串 或 无参函数，返回值为user_agent
			
 
				+    proxy=None,  # xxx.xxx.xx.xxx:xxxx 或 无参函数，返回值为代理地址
			
 
				+    headless=False,  # 是否为无头浏览器
			
 
				+    driver_type="FIREFOX",  # CHROME、PHANTOMJS、FIREFOX
			
 
				+    timeout=30,  # 请求超时时间
			
 
				+    window_size=(1280, 800),  # 窗口大小
			
 
				+    executable_path="D:\\geckodriver.exe",  # 浏览器路径，默认为默认路径
			
 
				+    render_time=0,  # 渲染时长，即打开网页等待指定时间后再获取源码
			
 
				+    custom_argument=["--ignore-certificate-errors"],  # 自定义浏览器渲染参数
			
 
				+)
			
 
				+#wget https://github.com/mozilla/geckodriver/releases/download/v0.25.0/geckodriver-v0.25.0-linux64.tar.gz
			
 
				+# # 爬虫启动时，重新抓取失败的requests
			
 
				+# RETRY_FAILED_REQUESTS = False
			
 
				+# # 保存失败的request
			
 
				+# SAVE_FAILED_REQUEST = True
			
 
				+# # request防丢机制。（指定的REQUEST_LOST_TIMEOUT时间内request还没做完，会重新下发 重做）
			
 
				+# REQUEST_LOST_TIMEOUT = 600  # 10分钟
			
 
				+# # request网络请求超时时间
			
 
				+# REQUEST_TIMEOUT = 22  # 等待服务器响应的超时时间，浮点数，或(connect timeout, read timeout)元组
			
 
				+#
			
 
				+# # 下载缓存 利用redis缓存，但由于内存大小限制，所以建议仅供开发调试代码时使用，防止每次debug都需要网络请求
			
 
				+# RESPONSE_CACHED_ENABLE = False  # 是否启用下载缓存 成本高的数据或容易变需求的数据，建议设置为True
			
 
				+# RESPONSE_CACHED_EXPIRE_TIME = 3600  # 缓存时间 秒
			
 
				+# RESPONSE_CACHED_USED = False  # 是否使用缓存 补采数据时可设置为True
			
 
				+#
			
 
				+# # 设置代理
			
 
PROXY_EXTRACT_API = "http://socks.spdata.jianyu360.com/socks/getips?limit=100"  # proxy extraction API; returned proxies are separated by \r\n
PROXY_ENABLE = True
			
 
				+#
			
 
				+# # 随机headers
			
 
				+# RANDOM_HEADERS = True
			
 
				+# # UserAgent类型 支持 'chrome', 'opera', 'firefox', 'internetexplorer', 'safari'，'mobile' 若不指定则随机类型
			
 
				+# USER_AGENT_TYPE = "chrome"
			
 
				+# # 默认使用的浏览器头 RANDOM_HEADERS=True时不生效
			
 
				+# DEFAULT_USERAGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
			
 
				+# # requests 使用session
			
 
				+# USE_SESSION = False
			
 
				+#
			
 
				+# # 去重
			
 
				+# ITEM_FILTER_ENABLE = False  # item 去重
			
 
				+# REQUEST_FILTER_ENABLE = False  # request 去重
			
 
				+# ITEM_FILTER_SETTING = dict(
			
 
				+#     filter_type=1  # 永久去重（BloomFilter） = 1 、内存去重（MemoryFilter） = 2、 临时去重（ExpireFilter）= 3
			
 
				+# )
			
 
				+# REQUEST_FILTER_ENABLE = True  # request 去重
			
 
				+# REQUEST_FILTER_SETTING = dict(
			
 
				+#     filter_type=3,  # 永久去重（BloomFilter） = 1 、内存去重（MemoryFilter） = 2、 临时去重（ExpireFilter）= 3
			
 
				+#     expire_time=2592000,  # 过期时间1个月
			
 
				+# )
			
 
				+#
			
 
				+# # 报警 支持钉钉、企业微信、邮件
			
 
				+# # 钉钉报警
			
 
				+# DINGDING_WARNING_URL = ""  # 钉钉机器人api
			
 
				+# DINGDING_WARNING_PHONE = ""  # 报警人 支持列表，可指定多个
			
 
				+# DINGDING_WARNING_ALL = False # 是否提示所有人， 默认为False
			
 
				+# # 邮件报警
			
 
				+# EMAIL_SENDER = ""  # 发件人
			
 
				+# EMAIL_PASSWORD = ""  # 授权码
			
 
				+# EMAIL_RECEIVER = ""  # 收件人 支持列表，可指定多个
			
 
				+# EMAIL_SMTPSERVER = "smtp.163.com" # 邮件服务器 默认为163邮箱
			
 
				+# # 企业微信报警
			
 
				+# WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=89f0b1e9-8d08-4e26-a563-cd6b07b9db14"  # 企业微信机器人api
			
 
WECHAT_WARNING_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=-4e26-a563-cd6b07b9db14"  # WeCom (enterprise WeChat) robot webhook
WECHAT_WARNING_PHONE = "马国鹏"  # person to alert; mentioned (@) in the group. A list is supported for several people
WECHAT_WARNING_ALL = True  # whether to notify everyone; the framework default is False
# # Time interval
WARNING_INTERVAL = 360  # minimum interval between identical alerts, to avoid flooding; 0 disables de-duplication
WARNING_LEVEL = "DEBUG"  # alert level, DEBUG / ERROR
WARNING_FAILED_COUNT = 2  # an alert is raised when the number of failed tasks exceeds this value
			
 
				+#
			
 
				+LOG_NAME = os.path.basename(os.getcwd())
			
 
				+
			
 
				+DTIME = time.strftime("%Y-%m-%d", time.localtime(time.time()))
			
 
				+# LOG_NAME = os.path.split(sys.argv[0])[-1].split('.')[0]
			
 
				+# LOG_PATH = "log/%s/%s.log" %(DTIME,LOG_NAME)  # log存储路径
			
 
				+LOG_PATH = LOG_NAME  # log存储路径
			
 
				+LOG_LEVEL = "DEBUG"
			
 
				+LOG_COLOR = True  # 是否带有颜色
			
 
				+LOG_IS_WRITE_TO_CONSOLE = True # 是否打印到控制台
			
 
				+LOG_IS_WRITE_TO_FILE = True  # 是否写文件
			
 
				+LOG_MODE = "w"  # 写文件的模式
			
 
				+LOG_MAX_BYTES = 10 * 1024 * 1024  # 每个日志文件的最大字节数
			
 
				+LOG_BACKUP_COUNT = 20  # 日志文件保留数量
			
 
				+LOG_ENCODING = "utf8"  # 日志文件编码
			
 
				+OTHERS_LOG_LEVAL = "ERROR"  # 第三方库的log等级
			
 
				+#
			
 
				+# # 切换工作路径为当前项目路径
			
 
				+# project_path = os.path.abspath(os.path.dirname(__file__))
			
 
				+# os.chdir(project_path)  # 切换工作路经
			
 
				+# sys.path.insert(0, project_path)
			
 
				+# print('当前工作路径为 ' + os.getcwd())
			
 
				+jy_proxy = {'socks5': {'url': 'http://socks.spdata.jianyu360.com/socks/getips?limit=100', 'decrypt': 'ABNOPqrceQRSTklmUDEFGXYZabnopfghHVWdijstuvwCIJKLMxyz0123456789+/'}}
			
 
				+headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36', 'Accept': '*/*'}
			
 
				+oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing.aliyuncs.com', 'bucket_name': 'jy-datafile'}
			
 
				+# oss_={'key_id': 'LTAI4G5x9aoZx8dDamQ7vfZi', 'key_secret': 'Bk98FsbPYXcJe72n1bG3Ssf73acuNh', 'endpoint': 'oss-cn-beijing-internal.aliyuncs.com', 'bucket_name': 'jy-editor'}
			
 
				+
			
 
				+author = {"dzr":"董钊瑞",'mgp':"马国鹏","lzz":"李宗泽"}
			
--- a/FworkSpider/untils/__init__.py
+++ b/FworkSpider/untils/__init__.py
@@ -0,0 +1,22 @@
 
				+
			
 
				+
			
 
				+
			
 
				+'''
			
 
				+    时间一晃而过,转眼间已经入职近三个月，我有幸来到公司剑雨产品部工作,在这短暂的三个月中，在公司领导的亲切关怀和指导下,在同事们的热情帮助下我很快地熟悉了公司环境，
			
 
				+适应了新的工作岗位，现将我试用期的工作情况简要小结如下
			
 
				+    一、严格遵守公司各项规章制度。上班开始，我认真学习了公司《员工手册》及各项管理制度，并严格遵守，做到了无违规现象。
			
 
				+    二、主动学习、尽快适应，迅速熟悉环境和工作内容。首先从尽快熟悉工作环境和工作内容；其次，主动、虚心向主管、同事请教、学习，基本掌握了日常上班的工作内容，工作流程、工作方法。
			
 
				+    三、工作积极、认真、负责，通过不断学习、虚心请教，总结积累，较好的完成了领导安排的各项工作任务。
			
 
				+        1、开发爬虫管理平台
			
 
				+        2、搭建定制爬虫框架，开发通用模块、伪代码生成器，以达到提升开发效率的目标
			
 
				+        3、实现管理平台的线上部署与基础测试，目前已部署爬虫15个，且正常运行中
			
 
				+        4、编写发文档、在小组内进行相关人员的培训，让小组的人一起来对这个框架和管理平台进行测评
			
 
				+        5、日常数据采集，目前开发共三十个平台爬虫，涉及一百多个栏目，数据采集量达二十多万
			
 
				+    四、与同事之间和谐相处、加强沟通、团结协作，以尽快更好的融入团队。
			
 
				+    五、存在问题及解决办法：
			
 
				+        1、与同事间的沟通交流较少，以后要加强同事间的沟通交流
			
 
				+        2、js反爬比较能力不够强，以后多学习js相关知识，提高js反爬能力
			
 
				+        3、逻辑不够严谨，仔细仔细再仔细，
			
 
				+
			
 
				+
			
 
				+'''
			
--- a/FworkSpider/untils/aliyun.py
+++ b/FworkSpider/untils/aliyun.py
@@ -0,0 +1,24 @@
 
				+import oss2
			
 
				+
			
 
				+# from config.load import oss_conf
			
 
				+from feapder.setting import oss_ as oss_conf
			
 
				+
			
 
				+
			
 
				+class AliYunService:
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.__acc_key_id = oss_conf['key_id']
			
 
				+        self.__acc_key_secret = oss_conf['key_secret']
			
 
				+        self.__endpoint = oss_conf['endpoint']
			
 
				+        self.__bucket_name = oss_conf['bucket_name']
			
 
				+
			
 
				+    def push_oss_from_local(self, key, filename):
			
 
				+        """
			
 
				+        上传一个本地文件到OSS的普通文件
			
 
				+
			
 
				+        :param str key: 上传到OSS的文件名
			
 
				+        :param str filename: 本地文件名，需要有可读权限
			
 
				+        """
			
 
				+        auth = oss2.Auth(self.__acc_key_id, self.__acc_key_secret)
			
 
				+        bucket = oss2.Bucket(auth, self.__endpoint, self.__bucket_name)
			
 
				+        bucket.put_object_from_file(key, filename)
			
--- a/FworkSpider/untils/attachment.py
+++ b/FworkSpider/untils/attachment.py
@@ -0,0 +1,198 @@
 
				+import hashlib
			
 
				+import os
			
 
				+import re
			
 
				+import traceback
			
 
				+import uuid
			
 
				+from urllib.parse import urlparse, unquote
			
 
				+
			
 
				+import requests
			
 
				+import urllib3
			
 
				+
			
 
				+from feapder.setting import headers
			
 
				+from untils.execptions import AttachmentNullError
			
 
				+from untils.aliyun import AliYunService
			
 
				+from untils.proxy_pool import ProxyPool
			
 
				+
			
 
				+urllib3.disable_warnings()
			
 
				+
			
 
				+
			
 
				+def hex_sha1(val):
			
 
				+    sha1 = hashlib.sha1()
			
 
				+    if isinstance(val, bytes):
			
 
				+        sha1.update(str(val).encode("utf-8"))
			
 
				+    elif isinstance(val, str):
			
 
				+        sha1.update(val.encode("utf-8"))
			
 
				+    res = sha1.hexdigest()
			
 
				+    return res
			
 
				+
			
 
				+
			
 
				+def extract_file_type(text):
			
 
				+    if text is None:
			
 
				+        return None
			
 
				+
			
 
				+    file_types = {
			
 
				+        'pdf', 'doc', 'docx', 'rar', 'zip', 'gzzb', 'jpg', 'png'
			
 
				+    }
			
 
				+    for file_type in file_types:
			
 
				+        tmp = [file_type, file_type.upper()]
			
 
				+        for t in tmp:
			
 
				+            result = re.match(f'.*{t}$', text, re.S)
			
 
				+            if result is not None:
			
 
				+                return t
			
 
				+    else:
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def extract_file_name(href: str, file_type: str):
			
 
				+    # 中文标点符号:[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]
			
 
				+    # 中文字符:[\u4e00 -\u9fa5]
			
 
				+    zh_char_pattern = '[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5]+'
			
 
				+    parser = urlparse(href)
			
 
				+    query = (parser.query or parser.path)
			
 
				+    result = re.search(f'.*\\.{file_type}', query, re.S)
			
 
				+    if result is not None:
			
 
				+        encode_str = unquote(result.group())
			
 
				+        name = re.search(zh_char_pattern, encode_str)
			
 
				+        if name is not None:
			
 
				+            return unquote(name.group())
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+def verify_file_name(name):
			
 
				+    if extract_file_type(name) is None:
			
 
				+        raise ValueError
			
 
				+
			
 
				+
			
 
				+class AttachmentDownloader:
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        self.dir_name = 'file'
			
 
				+
			
 
				+    def create_dir(self):
			
 
				+        if not os.path.exists(self.dir_name):
			
 
				+            os.makedirs(self.dir_name, mode=0o777, exist_ok=True)
			
 
				+
			
 
				+    def create_file_path(self, filename, file_type):
			
 
				+        self.create_dir()
			
 
				+        sign = hex_sha1("{}_{}".format(filename, uuid.uuid4()))
			
 
				+        tmp_name = "{}.{}".format(sign, file_type)
			
 
				+        return "{}/{}".format(self.dir_name, tmp_name)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def create_fid(file_stream: bytes):
			
 
				+        return hex_sha1(file_stream)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def _fetch_attachment(
			
 
				+            url: str,
			
 
				+            file_path: str,
			
 
				+            enable_proxy=False,
			
 
				+            allow_show_exception=False,
			
 
				+            **kwargs
			
 
				+    ):
			
 
				+        request_params = {}
			
 
				+        request_params.setdefault('headers', kwargs.get('headers') or headers)
			
 
				+        request_params.setdefault('proxies', kwargs.get('proxies'))
			
 
				+        request_params.setdefault('timeout', kwargs.get('timeout') or 60)
			
 
				+        request_params.setdefault('stream', kwargs.get('stream') or True)
			
 
				+        request_params.setdefault('verify', kwargs.get('verify') or False)
			
 
				+        if enable_proxy:
			
 
				+            proxy = ProxyPool()
			
 
				+        else:
			
 
				+            proxy = {}
			
 
				+        retries = 0
			
 
				+        while retries < 3:
			
 
				+            try:
			
 
				+                with requests.get(url, **request_params) as req:
			
 
				+                    if req.status_code == 200:
			
 
				+                        stream = req.content
			
 
				+                        with open(file_path, 'wb') as f:
			
 
				+                            f.write(stream)
			
 
				+                        return stream
			
 
				+                    else:
			
 
				+                        retries += 1
			
 
				+            except requests.RequestException:
			
 
				+                if allow_show_exception:
			
 
				+                    traceback.print_exc()
			
 
				+                if enable_proxy:
			
 
				+                    request_params.update({'proxies': proxy.get()})
			
 
				+                retries += 1
			
 
				+        return b''
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def clean_attachment(file_path):
			
 
				+        os.remove(file_path)
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def getsize(file_path: str):
			
 
				+        def _getsize(filename):
			
 
				+            try:
			
 
				+                return os.path.getsize(filename)
			
 
				+            except:
			
 
				+                return 0
			
 
				+
			
 
				+        _kb = float(_getsize(file_path)) / 1024
			
 
				+        if _kb >= 1024:
			
 
				+            _M = _kb / 1024
			
 
				+            if _M >= 1024:
			
 
				+                _G = _M / 1024
			
 
				+                return "{:.1f} G".format(_G)
			
 
				+            else:
			
 
				+                return "{:.1f} M".format(_M)
			
 
				+        else:
			
 
				+            return "{:.1f} kb".format(_kb)
			
 
				+
			
 
				+    def fetch_attachment(
			
 
				+            self,
			
 
				+            file_name: str,
			
 
				+            file_type: str,
			
 
				+            download_url: str,
			
 
				+            enable_proxy=False,
			
 
				+            allow_request_exception=False,
			
 
				+            **kwargs
			
 
				+    ):
			
 
				+        if not file_name or not file_type or not download_url:
			
 
				+            raise AttachmentNullError
			
 
				+
			
 
				+        file_path = self.create_file_path(file_name, file_type)
			
 
				+        file_stream = self._fetch_attachment(
			
 
				+            download_url,
			
 
				+            file_path,
			
 
				+            enable_proxy,
			
 
				+            allow_request_exception,
			
 
				+            **kwargs
			
 
				+        )
			
 
				+        if len(file_stream) > 0:
			
 
				+            fid = self.create_fid(file_stream)
			
 
				+            '''上传/下载,无论失败成功都需要给出文件基础信息'''
			
 
				+            try:
			
 
				+                result = {
			
 
				+                    'filename': file_name,
			
 
				+                    'ftype': file_type,
			
 
				+                    'fid': "{}.{}".format(fid, file_type),
			
 
				+                    'org_url': download_url,
			
 
				+                    'size': self.getsize(file_path),
			
 
				+                    'url': 'oss',
			
 
				+                }
			
 
				+                AliYunService().push_oss_from_local(result['fid'], file_path)
			
 
				+            except Exception:
			
 
				+                result = {
			
 
				+                    'filename': file_name,
			
 
				+                    'org_url': download_url,
			
 
				+                }
			
 
				+            self.clean_attachment(file_path)
			
 
				+        else:
			
 
				+            result = {
			
 
				+                'filename': file_name,
			
 
				+                'org_url': download_url,
			
 
				+            }
			
 
				+        return result
			
 
				+
			
 
				+
			
 
				+# if __name__ == '__main__':
			
 
				+    # a = AttachmentDownloader().fetch_attachment(
			
 
				+    #     file_name='成建制移民村（五标段）合同',
			
 
				+    #     file_type='pdf',
			
 
				+    #     download_url='http://222.75.70.90/NXGPPSP_MG/downloadFileServlet?req=F&num=8b80b23f7e729b88017e758a1b03422c'
			
 
				+    # )
			
 
				+    # print(a)
			
--- a/FworkSpider/untils/chaojiying.py
+++ b/FworkSpider/untils/chaojiying.py
@@ -0,0 +1,61 @@
 
				+#!/usr/bin/env python
			
 
				+# coding:utf-8
			
 
				+
			
 
				+import requests
			
 
				+from hashlib import md5
			
 
				+
			
 
				+class Chaojiying_Client(object):
			
 
				+
			
 
				+    def __init__(self, username, password, soft_id):
			
 
				+        self.username = username
			
 
				+        password =  password.encode('utf8')
			
 
				+        self.password = md5(password).hexdigest()
			
 
				+        self.soft_id = soft_id
			
 
				+        self.base_params = {
			
 
				+            'user': self.username,
			
 
				+            'pass2': self.password,
			
 
				+            'softid': self.soft_id,
			
 
				+        }
			
 
				+        self.headers = {
			
 
				+            'Connection': 'Keep-Alive',
			
 
				+            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
			
 
				+        }
			
 
				+
			
 
				+    def PostPic(self, im, codetype):
			
 
				+        """
			
 
				+        im: 图片字节
			
 
				+        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
			
 
				+        """
			
 
				+        params = {
			
 
				+            'codetype': codetype,
			
 
				+        }
			
 
				+        params.update(self.base_params)
			
 
				+        files = {'userfile': ('ccc.jpg', im)}
			
 
				+        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
			
 
				+        return r.json()
			
 
				+
			
 
				+    def ReportError(self, im_id):
			
 
				+        """
			
 
				+        im_id:报错题目的图片ID
			
 
				+        """
			
 
				+        params = {
			
 
				+            'id': im_id,
			
 
				+        }
			
 
				+        params.update(self.base_params)
			
 
				+        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
			
 
				+        return r.json()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # chaojiying = Chaojiying_Client('ddddjy', 'ddddjy2021', '超级鹰')	#用户中心>>软件ID 生成一个替换 96001
			
 
				+    # im = open('a.jpg', 'rb').read()													#本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
			
 
				+    # # print(chaojiying.PostPic(im, 1902))
			
 
				+    # res = chaojiying.PostPic(im, 2004)
			
 
				+    # print(res)
			
 
				+    # if res.get("err_no") != 0:
			
 
				+    #     chaojiying.ReportError(res.get("pic_id"))
			
 
				+    # if res.get("")
			
 
				+    code = "haoho"
			
 
				+    url = 'http://www.ccgp-fujian.gov.cn/3500/noticelist/e8d2cd51915e4c338dc1c6ee2f02b127/?page={page}&verifycode=胡吃海喝'[:-4]+code
			
 
				+
			
 
				+    print(url)
			
--- a/FworkSpider/untils/cookie_pool.py
+++ b/FworkSpider/untils/cookie_pool.py
@@ -0,0 +1,788 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2018/12/27 11:32 AM
			
 
				+---------
			
 
				+@summary: cookie池
			
 
				+---------
			
 
				+@author: Boris
			
 
				+@email:  boris_liu@foxmail.com
			
 
				+"""
			
 
				+
			
 
				+import abc
			
 
				+import datetime
			
 
				+import random
			
 
				+import time
			
 
				+import warnings
			
 
				+from collections import Iterable
			
 
				+from enum import Enum, unique
			
 
				+import requests
			
 
				+from feapder.db.mongodb import MongoDB
			
 
				+
			
 
				+import feapder.utils.tools as tools
			
 
				+from feapder import setting
			
 
				+from feapder.network import user_agent
			
 
				+
			
 
				+from feapder.db.mysqldb import MysqlDB
			
 
				+from feapder.db.redisdb import RedisDB
			
 
				+from feapder.utils import metrics
			
 
				+from feapder.utils.log import log
			
 
				+from feapder.utils.redis_lock import RedisLock
			
 
				+from feapder.utils.tools import send_msg
			
 
				+from feapder.utils.webdriver import WebDriver
			
 
				+
			
 
				+
			
 
				+class CookiePoolInterface(metaclass=abc.ABCMeta):
			
 
				+    """
			
 
				+    cookie pool interface
			
 
				+    """
			
 
				+
			
 
				+    @abc.abstractmethod
			
 
				+    def create_cookie(self, *args, **kwargs):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    @abc.abstractmethod
			
 
				+    def get_cookie(self, *args, **kwargs):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    @abc.abstractmethod
			
 
				+    def del_cookie(self, *args, **kwargs):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    @abc.abstractmethod
			
 
				+    def run(self):
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+
			
 
				+class PageCookiePool(CookiePoolInterface):
			
 
				+    """
			
 
				+    由页面产生的cookie 不需要用户登陆
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        redis_key,
			
 
				+        page_url=None,
			
 
				+        min_cookies=10000,
			
 
				+        must_contained_keys=(),
			
 
				+        keep_alive=False,
			
 
				+        **kwargs,
			
 
				+    ):
			
 
				+        """
			
 
				+        @param redis_key: 项目名
			
 
				+        @param page_url: 生产cookie的url
			
 
				+        @param min_cookies: 最小cookie数
			
 
				+        @param must_contained_keys: cookie 必须包含的key
			
 
				+        @param keep_alive: 当cookie数量足够是是否保持随时待命，生产cookie的状态。False为否，满足则退出
			
 
				+        ---
			
 
				+        @param kwargs: WebDriver的一些参数
			
 
				+            load_images: 是否加载图片
			
 
				+            user_agent_pool: user-agent池 为None时不使用
			
 
				+            proxies_pool: ；代理池 为None时不使用
			
 
				+            headless: 是否启用无头模式
			
 
				+            driver_type: web driver 类型
			
 
				+            timeout: 请求超时时间 默认16s
			
 
				+            window_size: 屏幕分辨率 (width, height)
			
 
				+
			
 
				+        """
			
 
				+
			
 
				+        self._redisdb = RedisDB()
			
 
				+
			
 
				+        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
			
 
				+        self._tab_cookie_pool_last_count = "{}:str_cookie_pool_count".format(
			
 
				+            redis_key
			
 
				+        )  # 存储上一次统计cookie 数量的时间，格式为 时间戳:数量
			
 
				+        self._page_url = page_url
			
 
				+        self._min_cookies = min_cookies
			
 
				+        self._must_contained_keys = must_contained_keys
			
 
				+        self._keep_alive = keep_alive
			
 
				+
			
 
				+        self._kwargs = kwargs
			
 
				+        self._kwargs.setdefault("load_images", False)
			
 
				+        self._kwargs.setdefault("headless", True)
			
 
				+
			
 
				+    def create_cookie(self):
			
 
				+        """
			
 
				+        可能会重写
			
 
				+        @return:
			
 
				+        """
			
 
				+        print('ssssssssssssssss',self._kwargs)
			
 
				+        url = 'https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do'
			
 
				+        header = {
			
 
				+            "Upgrade-Insecure-Requests": "1",
			
 
				+            "User-Agent": user_agent.get()
			
 
				+        }
			
 
				+        res = requests.get(url, headers=header)
			
 
				+        cookies = requests.utils.dict_from_cookiejar(res.cookies)
			
 
				+        return cookies
			
 
				+
			
 
				+
			
 
				+    def add_cookies(self, cookies):
			
 
				+        log.info("添加cookie {}".format(cookies))
			
 
				+        self._redisdb.lpush(self._tab_cookie_pool, cookies)
			
 
				+    def run(self):
			
 
				+        while True:
			
 
				+            try:
			
 
				+                now_cookie_count = self._redisdb.lget_count(self._tab_cookie_pool)
			
 
				+                need_cookie_count = self._min_cookies - now_cookie_count
			
 
				+
			
 
				+                if need_cookie_count > 0:
			
 
				+                    log.info(
			
 
				+                        "当前cookie数为 {} 小于 {}, 生产cookie".format(
			
 
				+                            now_cookie_count, self._min_cookies
			
 
				+                        )
			
 
				+                    )
			
 
				+                    try:
			
 
				+                        cookies = self.create_cookie()
			
 
				+                        if cookies:
			
 
				+                            self.add_cookies(cookies)
			
 
				+                    except Exception as e:
			
 
				+                        log.exception(e)
			
 
				+                else:
			
 
				+                    log.info("当前cookie数为 {} 数量足够 暂不生产".format(now_cookie_count))
			
 
				+
			
 
				+                    # 判断cookie池近一分钟数量是否有变化，无变化则认为爬虫不再用了，退出
			
 
				+                    last_count_info = self._redisdb.strget(
			
 
				+                        self._tab_cookie_pool_last_count
			
 
				+                    )
			
 
				+                    if not last_count_info:
			
 
				+                        self._redisdb.strset(
			
 
				+                            self._tab_cookie_pool_last_count,
			
 
				+                            "{}:{}".format(time.time(), now_cookie_count),
			
 
				+                        )
			
 
				+                    else:
			
 
				+                        last_time, last_count = last_count_info.split(":")
			
 
				+                        last_time = float(last_time)
			
 
				+                        last_count = int(last_count)
			
 
				+
			
 
				+                        if time.time() - last_time > 60:
			
 
				+                            if now_cookie_count == last_count:
			
 
				+                                log.info("近一分钟，cookie池数量无变化，判定爬虫未使用，退出生产")
			
 
				+                                break
			
 
				+                            else:
			
 
				+                                self._redisdb.strset(
			
 
				+                                    self._tab_cookie_pool_last_count,
			
 
				+                                    "{}:{}".format(time.time(), now_cookie_count),
			
 
				+                                )
			
 
				+
			
 
				+                    if self._keep_alive:
			
 
				+                        log.info("sleep 10")
			
 
				+                        tools.delay_time(10)
			
 
				+                    else:
			
 
				+                        break
			
 
				+
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+                tools.delay_time(1)
			
 
				+
			
 
				+    def get_cookie(self, wait_when_null=True):
			
 
				+        while True:
			
 
				+            try:
			
 
				+                cookie_info = self._redisdb.rpoplpush(self._tab_cookie_pool)
			
 
				+                if not cookie_info and wait_when_null:
			
 
				+                    log.info("暂无cookie 生产中...")
			
 
				+                    self._keep_alive = False
			
 
				+                    self._min_cookies = 1
			
 
				+                    with RedisLock(
			
 
				+                        key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=5
			
 
				+                    ) as _lock:
			
 
				+                        if _lock.locked:
			
 
				+                            self.run()
			
 
				+                    continue
			
 
				+                return eval(cookie_info) if cookie_info else {}
			
 
				+            except Exception as e:
			
 
				+                log.exception(e)
			
 
				+                tools.delay_time(1)
			
 
				+
			
 
				+    def del_cookie(self, cookies):
			
 
				+        self._redisdb.lrem(self._tab_cookie_pool, cookies)
			
 
				+
			
 
				+# PageCookiePool('cookie_1',page_url="https://www.whzbtb.com/V2PRTS/PrequalificationPublicityInfoListInit.do").create_cookie()
			
 
				+class User:
			
 
				+    def __init__(self, username, cookie):
			
 
				+        self.username = username
			
 
				+        self.cookie = cookie
			
 
				+
			
 
				+
			
 
				+class LoginCookiePool(CookiePoolInterface):
			
 
				+    """
			
 
				+    需要登陆的cookie池, 用户账号密码等信息用mysql保存
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        redis_key,
			
 
				+        *,
			
 
				+        table_userbase,
			
 
				+        login_state_key="login_state",
			
 
				+        lock_state_key="lock_state",
			
 
				+        username_key="username",
			
 
				+        password_key="password",
			
 
				+        login_retry_times=10,
			
 
				+    ):
			
 
				+        """
			
 
				+        @param redis_key: 项目名
			
 
				+        @param table_userbase: 用户表名
			
 
				+        @param login_state_key: 登录状态列名
			
 
				+        @param lock_state_key: 封锁状态列名
			
 
				+        @param username_key: 登陆名列名
			
 
				+        @param password_key: 密码列名
			
 
				+        @param login_retry_times: 登陆失败重试次数
			
 
				+        """
			
 
				+
			
 
				+        self._tab_cookie_pool = "{}:l_cookie_pool".format(redis_key)
			
 
				+        self._login_retry_times = login_retry_times
			
 
				+        self._table_userbase = table_userbase
			
 
				+        self._login_state_key = login_state_key
			
 
				+        self._lock_state_key = lock_state_key
			
 
				+        self._username_key = username_key
			
 
				+        self._password_key = password_key
			
 
				+
			
 
				+        self._redisdb = RedisDB()
			
 
				+        self._mongo = MongoDB(db='user_login')
			
 
				+
			
 
				+
			
 
				+    def create_cookie(self, username, password):
			
 
				+
			
 
				+        """
			
 
				+        创建cookie
			
 
				+        @param username: 用户名
			
 
				+        @param password: 密码
			
 
				+        @return: return cookie / None
			
 
				+        """
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    def get_user_info(self):
			
 
				+        """
			
 
				+        返回用户信息
			
 
				+        @return: yield username, password
			
 
				+        """
			
 
				+
			
 
				+        return self._mongo.find(self._table_userbase,{self._lock_state_key:0,self._login_state_key:0})
			
 
				+
			
 
    def handle_login_failed_user(self, username, password):
        """
        Hook for an account whose login produced no cookie.

        The base implementation does nothing; override it to react to failed
        logins.

        @param username: login name of the account
        @param password: password of the account
        @return: None
        """
        return None
			
 
				+
			
 
    def handel_exception(self, e):
        """
        Hook for an exception raised during a login attempt.

        The base implementation records the exception through ``log.exception``.
        (The method name keeps its original spelling because callers use it.)

        @param e: the exception that was caught
        @return: None
        """
        log.exception(e)
			
 
				+
			
 
    def save_cookie(self, username, cookie):
        """
        Put a new cookie into the pool and flag its account as logged in.

        @param username: login name the cookie belongs to
        @param cookie: the cookie to store
        @return: None
        """
        # The pool entry pairs the cookie with the account that owns it.
        self._redisdb.lpush(
            self._tab_cookie_pool, {"username": username, "cookie": cookie}
        )

        # Set the login-state field to 1 on the document matched by username.
        self._mongo.add(
            coll_name=self._table_userbase,
            data={self._login_state_key: 1},
            update_columns=self._username_key,
            update_columns_value=username,
        )
			
 
				+
			
 
    def get_cookie(self, wait_when_null=True) -> User:
        """
        Fetch one cookie from the pool.

        When the pool yields nothing and ``wait_when_null`` is true, ``login()``
        is run to produce cookies and the fetch is retried. Any exception is
        logged and the loop retries after a delay.

        @param wait_when_null: whether to produce cookies and retry when none is available
        @return: a ``User`` built from the stored entry, or None when the pool
                 is empty and ``wait_when_null`` is false
        """
        while True:
            try:
                raw_entry = self._redisdb.rpoplpush(self._tab_cookie_pool)
                if raw_entry:
                    # NOTE(review): eval() runs whatever text is stored in Redis;
                    # this is only safe while the pool key is written by trusted code.
                    return User(**eval(raw_entry))

                if not wait_when_null:
                    return None

                log.info("暂无cookie 生产中...")
                self.login()
            except Exception as e:
                log.exception(e)
                tools.delay_time(1)
			
 
				+
			
 
    def del_cookie(self, user: User):
        """
        Remove an expired cookie from the pool.

        @param user: object exposing ``username`` and ``cookie``
        @return: None
        """
        stale_entry = {"username": user.username, "cookie": user.cookie}
        self._redisdb.lrem(self._tab_cookie_pool, stale_entry)

        # NOTE(review): this writes login state 1, the same value save_cookie()
        # writes - confirm that 1 (not 0) is intended after a cookie is removed.
        self._mongo.add(
            coll_name=self._table_userbase,
            data={self._login_state_key: 1},
            update_columns=self._username_key,
            update_columns_value=user.username,
        )
			
 
				+
			
 
    def user_is_locked(self, user: User):
        """
        Flag the user's account as locked in the user collection.

        @param user: object exposing ``username``
        @return: None
        """
        locked_flag = {self._lock_state_key: 1}
        self._mongo.add(
            coll_name=self._table_userbase,
            data=locked_flag,
            update_columns=self._username_key,
            update_columns_value=user.username,
        )
			
 
				+
			
 
    def run(self):
        """
        Log in every candidate account and store the cookies that result.

        The work is done under a ``RedisLock`` on the cookie-pool key; when the
        lock is not obtained the method returns without doing anything.

        For each user returned by ``get_user_info()`` up to
        ``login_retry_times`` attempts are made:
          - a truthy cookie is handed to ``save_cookie``;
          - a falsy cookie is handed to ``handle_login_failed_user``;
          - an exception is passed to ``handel_exception`` and the attempt is repeated.
        When every attempt raised, ``handle_login_failed_user`` is called once.

        @return: None
        @raise ValueError: if ``get_user_info()`` returns a non-iterable value
        """
        with RedisLock(
            key=self._tab_cookie_pool, lock_timeout=3600, wait_timeout=100
        ) as _lock:
            if not _lock.locked:
                return

            user_infos = self.get_user_info()
            if not isinstance(user_infos, Iterable):
                raise ValueError("get_user_info 返回值必须可迭代")

            if not user_infos:
                log.info("无可用用户")

            for info in user_infos:
                # Read the credentials through the configured field names so a
                # custom username_key / password_key is honoured here as well.
                username = info.get(self._username_key)
                password = info.get(self._password_key)
                for _ in range(self._login_retry_times):
                    try:
                        cookie = self.create_cookie(username, password)
                        if cookie:
                            self.save_cookie(username, cookie)
                        else:
                            self.handle_login_failed_user(username, password)

                        break
                    except Exception as e:
                        self.handel_exception(e)

                else:
                    # Reached only when no attempt got as far as `break`.
                    self.handle_login_failed_user(username, password)

    # get_cookie() produces cookies through this alias.
    login = run
			
 
				+
			
 
				+
			
 
				+@unique
			
 
				+class LimitTimesUserStatus(Enum):
			
 
				+    # 使用状态
			
 
				+    USED = "used"
			
 
				+    SUCCESS = "success"
			
 
				+    OVERDUE = "overdue"  # cookie 过期
			
 
				+    SLEEP = "sleep"
			
 
				+    EXCEPTION = "exception"
			
 
				+    # 登陆状态
			
 
				+    LOGIN_SUCCESS = "login_success"
			
 
				+    LOGIN_FALIED = "login_failed"
			
 
				+
			
 
				+
			
 
				+class LimitTimesUser:
			
 
				+    """
			
 
				+    有次数限制的账户
			
 
				+    基于本地做的缓存，不支持多进程调用
			
 
				+    """
			
 
				+
			
 
				+    ACCOUNT_INFO_KEY = "accounts:h_account_info"  # 存储cookie的redis key
			
 
				+    SITE_NAME = ""  # 网站名
			
 
				+
			
 
				+    redisdb = None
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        username,
			
 
				+        password,
			
 
				+        max_search_times,
			
 
				+        proxies=None,
			
 
				+        search_interval=0,
			
 
				+        **kwargs,
			
 
				+    ):
			
 
				+        """
			
 
				+        @param username:
			
 
				+        @param password:
			
 
				+        @param max_search_times:
			
 
				+        @param proxies:
			
 
				+        @param search_interval: 调用时间间隔。 支持元组 指定间隔的时间范围 如（5，10）即5到10秒；或直接传整数
			
 
				+        """
			
 
				+        self.__dict__.update(kwargs)
			
 
				+        self.username = username
			
 
				+        self.password = password
			
 
				+        self.max_search_times = max_search_times
			
 
				+        self.proxies = proxies
			
 
				+        self.search_interval = search_interval
			
 
				+        self.delay_use = 0  # 延时使用，用于等待解封的用户
			
 
				+
			
 
				+        if isinstance(search_interval, (tuple, list)):
			
 
				+            if len(search_interval) != 2:
			
 
				+                raise ValueError("search_interval 需传递两个值的元组或列表。如（5，10）即5到10秒")
			
 
				+
			
 
				+            self.used_for_time_length = (
			
 
				+                search_interval[1] * 5
			
 
				+            )  # 抢占式爬虫独享cookie时间，这段时间内其他爬虫不可抢占
			
 
				+        else:
			
 
				+            self.used_for_time_length = (
			
 
				+                search_interval * 5
			
 
				+            )  # 抢占式爬虫独享cookie时间，这段时间内其他爬虫不可抢占
			
 
				+
			
 
				+        self.account_info = {
			
 
				+            "login_time": 0,
			
 
				+            "cookies": {},
			
 
				+            "search_times": 0,
			
 
				+            "last_search_time": 0,
			
 
				+            "used_for_spider_name": None,  # 只被某个爬虫使用 其他爬虫不可使用
			
 
				+            "init_search_times_time": 0,  # 初始化搜索次数的时间
			
 
				+        }
			
 
				+
			
 
				+        if not self.__class__.redisdb:
			
 
				+            self.__class__.redisdb = RedisDB()
			
 
				+
			
 
				+        self.sync_account_info_from_redis()
			
 
				+
			
 
				+        self.__init_metrics()
			
 
				+
			
 
				+    def __init_metrics(self):
			
 
				+        """
			
 
				+        初始化打点系统
			
 
				+        @return:
			
 
				+        """
			
 
				+        metrics.init(**setting.METRICS_OTHER_ARGS)
			
 
				+
			
 
				+    def record_user_status(self, status: LimitTimesUserStatus):
			
 
				+        metrics.emit_counter(f"{self.username}:{status.value}", 1, classify="users")
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "<LimitTimesUser {} | cookies:{}>".format(self.username, self.cookies)
			
 
				+
			
 
				+    def __eq__(self, other):
			
 
				+        return self.username == other.username
			
 
				+
			
 
				+    def sync_account_info_from_redis(self):
			
 
				+        account_info = self.redisdb.hget(self.ACCOUNT_INFO_KEY, self.username)
			
 
				+        if account_info:
			
 
				+            account_info = eval(account_info)
			
 
				+            self.account_info.update(account_info)
			
 
				+
			
 
				+    @property
			
 
				+    def cookies(self):
			
 
				+        cookies = self.account_info.get("cookies")
			
 
				+        return cookies
			
 
				+
			
 
				+    def set_cookies(self, cookies):
			
 
				+        self.account_info["cookies"] = cookies
			
 
				+        return self.redisdb.hset(
			
 
				+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
			
 
				+        )
			
 
				+
			
 
				+    def set_login_time(self, login_time=None):
			
 
				+        self.account_info["login_time"] = login_time or time.time()
			
 
				+        return self.redisdb.hset(
			
 
				+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
			
 
				+        )
			
 
				+
			
 
				+    def get_login_time(self):
			
 
				+        return self.account_info.get("login_time")
			
 
				+
			
 
				+    def is_time_to_login(self):
			
 
				+        return time.time() - self.get_login_time() > 40 * 60
			
 
				+
			
 
				+    def get_last_search_time(self):
			
 
				+        return self.account_info.get("last_search_time", 0)
			
 
				+
			
 
				+    def is_time_to_search(self):
			
 
				+        if self.delay_use:
			
 
				+            is_time = time.time() - self.get_last_search_time() > self.delay_use
			
 
				+            if is_time:
			
 
				+                self.delay_use = 0
			
 
				+
			
 
				+        else:
			
 
				+            is_time = time.time() - self.get_last_search_time() > (
			
 
				+                random.randint(*self.search_interval)
			
 
				+                if isinstance(self.search_interval, (tuple, list))
			
 
				+                else self.search_interval
			
 
				+            )
			
 
				+
			
 
				+        return is_time
			
 
				+
			
 
				+    @property
			
 
				+    def used_for_spider_name(self):
			
 
				+        return self.account_info.get("used_for_spider_name")
			
 
				+
			
 
				+    @used_for_spider_name.setter
			
 
				+    def used_for_spider_name(self, spider_name):
			
 
				+        self.account_info["used_for_spider_name"] = spider_name
			
 
				+
			
 
				+    def update_status(self):
			
 
				+        """
			
 
				+        更新search的一些状态
			
 
				+        @return:
			
 
				+        """
			
 
				+        self.account_info["search_times"] += 1
			
 
				+        self.account_info["last_search_time"] = time.time()
			
 
				+
			
 
				+        return self.redisdb.hset(
			
 
				+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
			
 
				+        )
			
 
				+
			
 
				+    @property
			
 
				+    def search_times(self):
			
 
				+        init_search_times_time = self.account_info.get("init_search_times_time")
			
 
				+        current_time = time.time()
			
 
				+        if (
			
 
				+            current_time - init_search_times_time >= 86400
			
 
				+        ):  # 如果距离上次初始化搜索次数时间大于1天，则搜索次数清清零
			
 
				+            self.account_info["search_times"] = 0
			
 
				+            self.account_info["init_search_times_time"] = current_time
			
 
				+
			
 
				+            self.redisdb.hset(self.ACCOUNT_INFO_KEY, self.username, self.account_info)
			
 
				+
			
 
				+        return self.account_info["search_times"]
			
 
				+
			
 
				+    def is_overwork(self):
			
 
				+        if self.search_times > self.max_search_times:
			
 
				+            log.warning("账号 {} 请求次数超限制".format(self.username))
			
 
				+            return True
			
 
				+
			
 
				+        return False
			
 
				+
			
 
				+    def is_at_work_time(self):
			
 
				+        if datetime.datetime.now().hour in list(range(7, 23)):
			
 
				+            return True
			
 
				+
			
 
				+        log.warning("账号 {} 不再工作时间内".format(self.username))
			
 
				+        return False
			
 
				+
			
 
				+    def del_cookie(self):
			
 
				+        self.account_info["cookies"] = {}
			
 
				+        return self.redisdb.hset(
			
 
				+            self.ACCOUNT_INFO_KEY, self.username, self.account_info
			
 
				+        )
			
 
				+
			
 
				+    def create_cookie(self):
			
 
				+        """
			
 
				+        生产cookie 有异常需要抛出
			
 
				+        @return: cookie_dict
			
 
				+        """
			
 
				+
			
 
				+        raise NotImplementedError
			
 
				+
			
 
				+    def login(self):
			
 
				+        """
			
 
				+        @return: 1 成功 0 失败
			
 
				+        """
			
 
				+
			
 
				+        try:
			
 
				+            # 预检查
			
 
				+            if not self.is_time_to_login():
			
 
				+                log.info("此账号尚未到登陆时间: {}".format(self.username))
			
 
				+                time.sleep(5)
			
 
				+                return 0
			
 
				+
			
 
				+            cookies = self.create_cookie()
			
 
				+            if not cookies:
			
 
				+                raise Exception("登陆失败 未获取到合法cookie")
			
 
				+
			
 
				+            if not isinstance(cookies, dict):
			
 
				+                raise Exception("cookie 必须为字典格式")
			
 
				+
			
 
				+            # 保存cookie
			
 
				+            self.set_login_time()
			
 
				+            self.set_cookies(cookies)
			
 
				+            log.info("登录成功 {}".format(self.username))
			
 
				+            self.record_user_status(LimitTimesUserStatus.LOGIN_SUCCESS)
			
 
				+            return 1
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            log.exception(e)
			
 
				+            send_msg(
			
 
				+                msg=f"{self.SITE_NAME} {self.username} 账号登陆异常 exception: {str(e)}",
			
 
				+                level="error",
			
 
				+                message_prefix=f"{self.SITE_NAME} {self.username} 账号登陆异常",
			
 
				+            )
			
 
				+
			
 
				+        log.info("登录失败 {}".format(self.username))
			
 
				+        self.record_user_status(LimitTimesUserStatus.LOGIN_FALIED)
			
 
				+        return 0
			
 
				+
			
 
				+
			
 
				+class LimitTimesUserPool:
			
 
				+    """
			
 
				+    限制查询次数的用户的User pool
			
 
				+    基于本地做的缓存，不支持多进程调用
			
 
				+    """
			
 
				+
			
 
				+    LOAD_USER_INTERVAL = 60
			
 
				+
			
 
				+    def __init__(self, *, accounts_dict, limit_user_class, support_more_client=True):
			
 
				+        """
			
 
				+        @param accounts_dic: 账户信息字典
			
 
				+            {
			
 
				+                "15011300228": {
			
 
				+                    "password": "300228",
			
 
				+                    "proxies": {},
			
 
				+                    "max_search_times": 500,
			
 
				+                    "search_interval": 1, # 使用时间间隔
			
 
				+                    # 其他携带信息
			
 
				+                }
			
 
				+            }
			
 
				+        @param limit_user_class: 用户重写的 limit_user_class
			
 
				+        @param support_more_client: 是否支持多客户端 即多线程 多进程模式 (可能在计数上及使用频率上有些误差)
			
 
				+        """
			
 
				+        self.accounts_dict = accounts_dict
			
 
				+        self.limit_user_class = limit_user_class
			
 
				+
			
 
				+        self.limit_times_users = []
			
 
				+        self.current_user_index = -1
			
 
				+
			
 
				+        self.support_more_client = support_more_client
			
 
				+
			
 
				+        self.last_load_user_time = 0
			
 
				+
			
 
				+    def __load_users(self, username=None):
			
 
				+        # 装载user
			
 
				+        log.info("更新可用用户")
			
 
				+
			
 
				+        for _username, detail in self.accounts_dict.items():
			
 
				+            if username and username != _username:
			
 
				+                continue
			
 
				+
			
 
				+            limit_times_users = self.limit_user_class(username=_username, **detail)
			
 
				+            if limit_times_users in self.limit_times_users:
			
 
				+                continue
			
 
				+
			
 
				+            if limit_times_users.is_overwork():
			
 
				+                continue
			
 
				+            else:
			
 
				+                if (
			
 
				+                    limit_times_users.cookies or limit_times_users.login()
			
 
				+                ):  # 如果有cookie 或者登陆成功 则添加到可用的user队列
			
 
				+                    self.limit_times_users.append(limit_times_users)
			
 
				+
			
 
				+        self.last_load_user_time = time.time()
			
 
				+
			
 
				+    def get_user(
			
 
				+        self,
			
 
				+        username=None,
			
 
				+        used_for_spider_name=None,
			
 
				+        wait_when_null=True,
			
 
				+        not_limit_frequence=False,
			
 
				+    ) -> LimitTimesUser:
			
 
				+        """
			
 
				+        @params username: 获取指定的用户
			
 
				+        @params used_for_spider_name: 独享式使用，独享爬虫的名字。其他爬虫不可抢占
			
 
				+        @params wait_when_null: 无用户时是否等待
			
 
				+        @params not_limit_frequence: 不限制使用频率
			
 
				+        @return: LimitTimesUser
			
 
				+        """
			
 
				+        if not self.support_more_client:
			
 
				+            warnings.warn(
			
 
				+                "LimitTimesUserCookiePool 取查询次数等信息时基于本地做的缓存，不支持多进程或多线程",
			
 
				+                category=Warning,
			
 
				+            )
			
 
				+            self._is_show_warning = True
			
 
				+
			
 
				+        while True:
			
 
				+            if (
			
 
				+                not self.limit_times_users
			
 
				+                or time.time() - self.last_load_user_time >= self.LOAD_USER_INTERVAL
			
 
				+            ):
			
 
				+                self.__load_users(username)
			
 
				+                if not self.limit_times_users:
			
 
				+                    log.warning("无可用的用户")
			
 
				+                    if wait_when_null:
			
 
				+                        time.sleep(1)
			
 
				+                        continue
			
 
				+                    else:
			
 
				+                        return None
			
 
				+
			
 
				+            self.current_user_index += 1
			
 
				+            self.current_user_index = self.current_user_index % len(
			
 
				+                self.limit_times_users
			
 
				+            )
			
 
				+
			
 
				+            limit_times_user = self.limit_times_users[self.current_user_index]
			
 
				+            if self.support_more_client:  # 需要先同步下最新数据
			
 
				+                limit_times_user.sync_account_info_from_redis()
			
 
				+
			
 
				+            if username and limit_times_user.username != username:
			
 
				+                log.info(
			
 
				+                    "{} 为非指定用户 {}, 获取下一个用户".format(limit_times_user.username, username)
			
 
				+                )
			
 
				+                time.sleep(1)
			
 
				+                continue
			
 
				+
			
 
				+            # 独占式使用，若为其他爬虫，检查等待使用时间是否超过独占时间，若超过则可以使用
			
 
				+            if (
			
 
				+                limit_times_user.used_for_spider_name
			
 
				+                and limit_times_user.used_for_spider_name != used_for_spider_name
			
 
				+            ):
			
 
				+                wait_time = time.time() - limit_times_user.get_last_search_time()
			
 
				+                if wait_time < limit_times_user.used_for_time_length:
			
 
				+                    log.info(
			
 
				+                        "用户{} 被 {} 爬虫独占，需等待 {} 秒后才可使用".format(
			
 
				+                            limit_times_user.username,
			
 
				+                            limit_times_user.used_for_spider_name,
			
 
				+                            limit_times_user.used_for_time_length - wait_time,
			
 
				+                        )
			
 
				+                    )
			
 
				+                    time.sleep(1)
			
 
				+                    continue
			
 
				+
			
 
				+            if (
			
 
				+                not limit_times_user.is_overwork()
			
 
				+                and limit_times_user.is_at_work_time()
			
 
				+            ):
			
 
				+                if not limit_times_user.cookies:
			
 
				+                    self.limit_times_users.remove(limit_times_user)
			
 
				+                    continue
			
 
				+
			
 
				+                if not_limit_frequence or limit_times_user.is_time_to_search():
			
 
				+                    limit_times_user.used_for_spider_name = used_for_spider_name
			
 
				+
			
 
				+                    limit_times_user.update_status()
			
 
				+                    log.info("使用用户 {}".format(limit_times_user.username))
			
 
				+                    limit_times_user.record_user_status(LimitTimesUserStatus.USED)
			
 
				+                    return limit_times_user
			
 
				+                else:
			
 
				+                    log.info("{} 用户使用间隔过短 查看下一个用户".format(limit_times_user.username))
			
 
				+                    time.sleep(1)
			
 
				+                    continue
			
 
				+            else:
			
 
				+                self.limit_times_users.remove(limit_times_user)
			
 
				+                self.current_user_index -= 1
			
 
				+
			
 
				+                if not limit_times_user.is_at_work_time():
			
 
				+                    log.warning("用户 {} 不在工作时间".format(limit_times_user.username))
			
 
				+                    if wait_when_null:
			
 
				+                        time.sleep(30)
			
 
				+                        continue
			
 
				+                    else:
			
 
				+                        return None
			
 
				+
			
 
				+    def del_user(self, username):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.del_cookie()
			
 
				+                self.limit_times_users.remove(limit_times_user)
			
 
				+                limit_times_user.record_user_status(LimitTimesUserStatus.OVERDUE)
			
 
				+                self.__load_users(username)
			
 
				+                break
			
 
				+
			
 
				+    def update_cookies(self, username, cookies):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.set_cookies(cookies)
			
 
				+                break
			
 
				+
			
 
				+    def delay_use(self, username, delay_seconds):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.delay_use = delay_seconds
			
 
				+                limit_times_user.record_user_status(LimitTimesUserStatus.SLEEP)
			
 
				+                break
			
 
				+
			
 
				+    def record_success_user(self, username):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.record_user_status(LimitTimesUserStatus.SUCCESS)
			
 
				+
			
 
				+    def record_exception_user(self, username):
			
 
				+        for limit_times_user in self.limit_times_users:
			
 
				+            if limit_times_user.username == username:
			
 
				+                limit_times_user.record_user_status(LimitTimesUserStatus.EXCEPTION)
			
--- a/FworkSpider/untils/create_menus.py
+++ b/FworkSpider/untils/create_menus.py
@@ -0,0 +1,33 @@
 
				+from feapder.db.mongodb import MongoDB
			
 
				+
			
 
				+
			
 
				+class Details:
			
 
				+    _to_db = None
			
 
				+    _to_db_xs = None
			
 
				+    db_name = 'mgp_list'
			
 
				+    # 定义mongo链接
			
 
				+    @property
			
 
				+    def to_db(self):
			
 
				+        if not self._to_db:
			
 
				+            self._to_db = MongoDB()
			
 
				+        return self._to_db
			
 
				+
			
 
				+    @property
			
 
				+    def to_db_xs(self):
			
 
				+        if not self._to_db_xs:
			
 
				+            self._to_db_xs = MongoDB(port=27001,db='editor')
			
 
				+        return self._to_db_xs
			
 
				+    def main(self,page):
			
 
				+        menus_list = []
			
 
				+        data = self.to_db_xs.find("luaconfig",{"modifyuser":"maguopeng","param_common":{"$elemMatch": {"$regex": "广东省政府采购网", "$options": "$i"}}})
			
 
				+        # print(data)
			
 
				+        for item in data:
			
 
				+            # print(item)
			
 
				+            channls = item.get("param_common")[2]
			
 
				+            code = item.get("code")
			
 
				+            href = item.get("param_common")[11]
			
 
				+            print("Menu"+"(",f"'{channls}',",f"'{code}',\n",f"'{href}',",page,"),")
			
 
				+        #     menus_list.append(f'''Menu({channls},{code},{href},{page})''')
			
 
				+        # print(menus_list)
			
 
				+
			
 
				+Details().main(2)
			
--- a/FworkSpider/untils/execptions.py
+++ b/FworkSpider/untils/execptions.py
@@ -0,0 +1,19 @@
 
				+
			
 
				+class CustomCheckError(Exception):
			
 
				+
			
 
				+    def __init__(self, code: int = 10002, reason: str = '特征条件检查失败'):
			
 
				+        self.code = code
			
 
				+        self.reason = reason
			
 
				+
			
 
				+
			
 
				+class AttachmentNullError(Exception):
			
 
				+
			
 
				+    def __init__(self, code: int = 10004, reason: str = '附件下载失败'):
			
 
				+        self.code = code
			
 
				+        self.reason = reason
			
 
				+
			
 
				+
			
 
				+class CustomAccountPrivilegeError(Exception):
			
 
				+
			
 
				+    def __init__(self, *args, **kwargs):
			
 
				+        pass